Vince's CSV Parser
csv_reader.hpp
Go to the documentation of this file.
1 
5 #pragma once
6 
7 #include <algorithm>
8 #include <deque>
9 #include <fstream>
10 #include <iterator>
11 #include <memory>
12 #include <mutex>
13 #include <thread>
14 #include <sstream>
15 #include <string>
16 #include <vector>
17 
18 #include "../external/mio.hpp"
19 #include "basic_csv_parser.hpp"
20 #include "common.hpp"
21 #include "data_type.hpp"
22 #include "csv_format.hpp"
23 
25 namespace csv {
/** Internal helper declarations used by the reader; the definitions are in csv_reader.cpp. */
27  namespace internals {
/** Formats a row of strings using `delim` (default ", ").
 *  NOTE(review): presumably joins the fields with the delimiter; confirm against the definition.
 */
28  std::string format_row(const std::vector<std::string>& row, csv::string_view delim = ", ");
29 
/** Return a CSV's column names.
 *  @param head   Text to extract the column names from
 *  @param format Format settings; defaults to CSVFormat::guess_csv()
 */
30  std::vector<std::string> _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv());
31 
/** Return type of calculate_score().
 *  NOTE(review): `score` looks like the rating of a candidate format and `header`
 *  like the associated header row index; confirm in csv_reader.cpp.
 */
32  struct GuessScore {
33  double score;
34  size_t header;
35  };
36 
/** Computes a GuessScore for `head` under the given `format`.
 *  NOTE(review): presumably used by _guess_format() to rank candidate delimiters; confirm.
 */
37  CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format);
38 
/** Guess the delimiter used by a delimiter-separated values file.
 *  @param head   Text to inspect
 *  @param delims Candidate delimiters; defaults to , | tab ; ^ ~
 */
39  CSVGuessResult _guess_format(csv::string_view head, const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
40  }
41 
/** Return a CSV's column names.
 *  @param filename Name of the CSV file to inspect
 *  @param format   Format settings; defaults to CSVFormat::guess_csv()
 */
42  std::vector<std::string> get_col_names(
43  csv::string_view filename,
44  const CSVFormat format = CSVFormat::guess_csv());
45 
/** Guess the delimiter used by a delimiter-separated values file.
 *  @param filename Name of the file to inspect
 *  @param delims   Candidate delimiters; defaults to , | tab ; ^ ~
 */
47  CSVGuessResult guess_format(csv::string_view filename,
48  const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
49 
/** Main class for parsing CSVs from files and in-memory sources.
 *
 *  Not copyable (copy operations are deleted); move operations are defaulted.
 *  The destructor joins the worker thread if it is still joinable.
 */
57  class CSVReader {
58  public:
/** An input iterator capable of handling large files.
 *
 *  Holds a pointer to the parent CSVReader, the current CSVRow and the
 *  index of that row.
 */
69  class iterator {
70  public:
71  #ifndef DOXYGEN_SHOULD_SKIP_THIS
72  using value_type = CSVRow;
73  using difference_type = std::ptrdiff_t;
74  using pointer = CSVRow * ;
75  using reference = CSVRow & ;
76  using iterator_category = std::input_iterator_tag;
77  #endif
78 
/** Default iterator: null parent pointer and row index 0. */
79  iterator() = default;
/** Construct an iterator attached to `reader`. */
80  iterator(CSVReader* reader) : daddy(reader) {};
82 
/** Access the CSVRow held by the iterator. */
84  CONSTEXPR_14 reference operator*() { return this->row; }
85 
/** Return a pointer to the CSVRow the iterator has stopped at. */
87  CONSTEXPR_14 pointer operator->() { return &(this->row); }
88 
/** Pre-increment iterator. */
89  iterator& operator++();
/** Post-increment iterator. */
90  iterator operator++(int);
/** Returns true if iterators were constructed from the same CSVReader
 *  and point to the same row.
 */
95  CONSTEXPR bool operator==(const iterator& other) const noexcept {
96  return (this->daddy == other.daddy) && (this->i == other.i);
97  }
98 
/** Negation of operator==. */
99  CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
100  private:
101  CSVReader * daddy = nullptr; // Pointer to parent
102  CSVRow row; // Current row
103  size_t i = 0; // Index of current row
104  };
105 
/** Reads an arbitrarily large CSV file using memory-mapped IO.
 *  @param filename Name of the CSV file
 *  @param format   Format settings; defaults to CSVFormat::guess_csv()
 */
110  CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv());
111 
/** Allows parsing stream sources such as std::stringstream or std::ifstream.
 *
 *  Only participates in overload resolution when TStream derives from std::istream.
 *  @param source Stream to read CSV data from
 *  @param format Format settings; defaults to a default-constructed CSVFormat
 */
118  template<typename TStream,
119  csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
120  CSVReader(TStream& source, CSVFormat format = CSVFormat()) : _format(format) {
121  using Parser = internals::StreamParser<TStream>;
122 
/* Column names supplied through the format are applied before the parser is created. */
123  if (!format.col_names.empty())
124  this->set_col_names(format.col_names);
125 
126  this->parser = std::unique_ptr<Parser>(
127  new Parser(source, format, col_names)); // For C++11
/* Runs the first read_csv() call; see initial_read(). */
128  this->initial_read();
129  }
131 
132  CSVReader(const CSVReader&) = delete; // No copy constructor
133  CSVReader(CSVReader&&) = default; // Move constructor
134  CSVReader& operator=(const CSVReader&) = delete; // No copy assignment
/* NOTE(review): std::thread's move assignment calls std::terminate when the
 * target thread is still joinable, so this defaulted operator is only safe
 * while read_csv_worker is not joinable. Confirm at call sites.
 */
135  CSVReader& operator=(CSVReader&& other) = default;
/** Waits for the worker thread to finish if it is still joinable. */
136  ~CSVReader() {
137  if (this->read_csv_worker.joinable()) {
138  this->read_csv_worker.join();
139  }
140  }
141 
/** Retrieve rows as CSVRow objects, returning true if more rows are available. */
144  bool read_row(CSVRow &row);
/** Return an iterator to the first row in the reader. */
145  iterator begin();
/** A placeholder for the imaginary past-the-end row in a CSV. */
146  HEDLEY_CONST iterator end() const noexcept;
147 
/** Returns true if we have reached end of file.
 *  Forwards to the parser, so `parser` must already have been assigned
 *  (its in-class initializer is nullptr).
 */
149  bool eof() const noexcept { return this->parser->eof(); };
151 
/** Return the format of the original raw CSV. */
154  CSVFormat get_format() const;
/** Return the CSV's column names as a vector of strings. */
155  std::vector<std::string> get_col_names() const;
/** Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise. */
156  int index_of(csv::string_view col_name) const;
158 
161 
/** Whether or not the file or stream contains valid CSV rows, not including the header.
 *  Implemented as n_rows() == 0, so it reflects the rows read so far.
 */
167  CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; }
168 
/** Retrieves the number of rows that have been read so far. */
170  CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; }
171 
/** Whether or not the CSV was prefixed with a UTF-8 BOM.
 *  Forwards to the parser, so `parser` must already have been assigned.
 */
173  bool utf8_bom() const noexcept { return this->parser->utf8_bom(); }
175 
176  protected:
/** Sets this reader's column names and associated data. */
185  void set_col_names(const std::vector<std::string>&);
186 
/** Format settings; the stream constructor stores its `format` argument here. */
189  CSVFormat _format;
191 
194 
/** Pointer to an object containing column information. */
195  internals::ColNamesPtr col_names = std::make_shared<internals::ColNames>();
196 
/** Helper class which actually does the parsing. */
198  std::unique_ptr<internals::IBasicCSVParser> parser = nullptr;
199 
/** Queue of parsed CSV rows.
 *  NOTE(review): the meaning of the constructor argument 100 is defined by
 *  RowCollection and is not visible here; confirm.
 */
201  std::unique_ptr<RowCollection> records{new RowCollection(100)};
202 
/** The number of columns in this CSV. */
203  size_t n_cols = 0;
/** How many rows (minus header) have been read so far. */
204  size_t _n_rows = 0;
/** Read a chunk of CSV data.
 *  @param bytes Chunk size; defaults to internals::ITERATION_CHUNK_SIZE
 */
208  bool read_csv(size_t bytes = internals::ITERATION_CHUNK_SIZE);
210 
213  private:
/* NOTE(review): presumably records whether trim_header() has already run; confirm in csv_reader.cpp. */
215  bool header_trimmed = false;
216 
/** Thread on which read_csv() is run. */
219  std::thread read_csv_worker;
221 
/** Runs read_csv() once on the worker thread and joins it immediately,
 *  so this call blocks until that read has returned.
 *  NOTE(review): an exception escaping read_csv() on the worker thread would end
 *  in std::terminate instead of reaching the caller; confirm read_csv() cannot throw.
 */
223  void initial_read() {
224  this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE);
225  this->read_csv_worker.join();
226  }
227 
/* NOTE(review): no description is visible for this helper; presumably related to header_trimmed. Confirm. */
228  void trim_header();
229  };
230 }
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
static CSVFormat guess_csv()
CSVFormat for guessing the delimiter.
Definition: csv_format.hpp:126
An input iterator capable of handling large files.
Definition: csv_reader.hpp:69
CONSTEXPR bool operator==(const iterator &other) const noexcept
Returns true if iterators were constructed from the same CSVReader and point to the same row.
Definition: csv_reader.hpp:95
iterator & operator++()
Pre-increment iterator.
CONSTEXPR_14 reference operator*()
Access the CSVRow held by the iterator.
Definition: csv_reader.hpp:84
CONSTEXPR_14 pointer operator->()
Return a pointer to the CSVRow the iterator has stopped at.
Definition: csv_reader.hpp:87
Main class for parsing CSVs from files and in-memory sources.
Definition: csv_reader.hpp:57
CONSTEXPR bool empty() const noexcept
Whether or not the file or stream contains valid CSV rows, not including the header.
Definition: csv_reader.hpp:167
bool utf8_bom() const noexcept
Whether or not the CSV was prefixed with a UTF-8 BOM.
Definition: csv_reader.hpp:173
CSVFormat get_format() const
Return the format of the original raw CSV.
Definition: csv_reader.cpp:174
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
Definition: csv_reader.cpp:198
HEDLEY_CONST iterator end() const noexcept
A placeholder for the imaginary past-the-end row in a CSV.
CONSTEXPR size_t n_rows() const noexcept
Retrieves the number of rows that have been read so far.
Definition: csv_reader.hpp:170
bool eof() const noexcept
Returns true if we have reached end of file.
Definition: csv_reader.hpp:149
CSVReader(TStream &source, CSVFormat format=CSVFormat())
Allows parsing stream sources such as std::stringstream or std::ifstream.
Definition: csv_reader.hpp:120
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
Definition: csv_reader.cpp:272
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Definition: csv_reader.cpp:187
iterator begin()
Return an iterator to the first row in the reader.
CSVReader(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Reads an arbitrarily large CSV file using memory-mapped IO.
Definition: csv_reader.cpp:154
Data structure for representing CSV rows.
Definition: csv_row.hpp:304
A class for parsing CSV data from a std::stringstream or a std::ifstream.
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition: common.hpp:117
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
Defines an object used to store CSV format settings.
Implements data type parsing functionality.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
Definition: csv_reader.hpp:201
size_t _n_rows
How many rows (minus header) have been read so far.
Definition: csv_reader.hpp:204
bool read_csv(size_t bytes=internals::ITERATION_CHUNK_SIZE)
Read a chunk of CSV data.
Definition: csv_reader.cpp:241
internals::ColNamesPtr col_names
Pointer to an object containing column information.
Definition: csv_reader.hpp:195
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
Definition: csv_reader.cpp:224
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
Definition: csv_reader.hpp:198
size_t n_cols
The number of columns in this CSV.
Definition: csv_reader.hpp:203
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
Definition: common.hpp:151
std::vector< std::string > _get_col_names(csv::string_view head, CSVFormat format)
Return a CSV's column names.
Definition: csv_reader.cpp:28
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition: csv_reader.cpp:9
CSVGuessResult _guess_format(csv::string_view head, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
Definition: csv_reader.cpp:93
The all-encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
Definition: csv_reader.cpp:125
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
Definition: csv_reader.cpp:138
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75
Stores the inferred format of a CSV file.
Definition: csv_format.hpp:28