Vince's CSV Parser
csv_reader.hpp
Go to the documentation of this file.
1 
5 #pragma once
6 
7 #include <algorithm>
8 #include <deque>
9 #include <fstream>
10 #include <iterator>
11 #include <memory>
12 #include <mutex>
13 #include <thread>
14 #include <sstream>
15 #include <string>
16 #include <vector>
17 
18 #include "../external/mio.hpp"
19 #include "basic_csv_parser.hpp"
20 #include "common.hpp"
21 #include "data_type.h"
22 #include "csv_format.hpp"
23 
25 namespace csv {
27  namespace internals {
28  std::string format_row(const std::vector<std::string>& row, csv::string_view delim = ", ");
29 
30  std::vector<std::string> _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv());
31 
32  struct GuessScore {
33  double score;
34  size_t header;
35  };
36 
37  CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format);
38 
39  CSVGuessResult _guess_format(csv::string_view head, const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
40  }
41 
42  std::vector<std::string> get_col_names(
43  csv::string_view filename,
44  const CSVFormat format = CSVFormat::guess_csv());
45 
47  CSVGuessResult guess_format(csv::string_view filename,
48  const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
49 
57  class CSVReader {
58  public:
69  class iterator {
70  public:
71  #ifndef DOXYGEN_SHOULD_SKIP_THIS
72  using value_type = CSVRow;
73  using difference_type = std::ptrdiff_t;
74  using pointer = CSVRow * ;
75  using reference = CSVRow & ;
76  using iterator_category = std::input_iterator_tag;
77  #endif
78 
79  iterator() = default;
80  iterator(CSVReader* reader) : daddy(reader) {};
82 
84  CONSTEXPR_14 reference operator*() { return this->row; }
85 
87  CONSTEXPR_14 pointer operator->() { return &(this->row); }
88 
89  iterator& operator++();
90  iterator operator++(int);
91  iterator& operator--();
92 
96  CONSTEXPR bool operator==(const iterator& other) const noexcept {
97  return (this->daddy == other.daddy) && (this->i == other.i);
98  }
99 
100  CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
101  private:
102  CSVReader * daddy = nullptr; // Pointer to parent
103  CSVRow row; // Current row
104  size_t i = 0; // Index of current row
105  };
106 
111  CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv());
112 
119  template<typename TStream,
120  csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
121  CSVReader(TStream& source, CSVFormat format = CSVFormat()) : _format(format) {
122  using Parser = internals::StreamParser<TStream>;
123 
124  if (!format.col_names.empty())
125  this->set_col_names(format.col_names);
126 
127  this->parser = std::unique_ptr<Parser>(
128  new Parser(source, format, col_names)); // For C++11
129  this->initial_read();
130  }
132 
133  CSVReader(const CSVReader&) = delete; // No copy constructor
134  CSVReader(CSVReader&&) = default; // Move constructor
135  CSVReader& operator=(const CSVReader&) = delete; // No copy assignment
136  CSVReader& operator=(CSVReader&& other) = default;
137  ~CSVReader() {
138  if (this->read_csv_worker.joinable()) {
139  this->read_csv_worker.join();
140  }
141  }
142 
145  bool read_row(CSVRow &row);
146  iterator begin();
147  HEDLEY_CONST iterator end() const noexcept;
148 
150  bool eof() const noexcept { return this->parser->eof(); };
152 
155  CSVFormat get_format() const;
156  std::vector<std::string> get_col_names() const;
157  int index_of(csv::string_view col_name) const;
159 
162 
168  CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; }
169 
171  CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; }
172 
174  bool utf8_bom() const noexcept { return this->parser->utf8_bom(); }
176 
177  protected:
186  void set_col_names(const std::vector<std::string>&);
187 
190  CSVFormat _format;
192 
195 
196  internals::ColNamesPtr col_names = std::make_shared<internals::ColNames>();
197 
199  std::unique_ptr<internals::IBasicCSVParser> parser = nullptr;
200 
202  std::unique_ptr<RowCollection> records{new RowCollection(100)};
203 
204  size_t n_cols = 0;
205  size_t _n_rows = 0;
209  bool read_csv(size_t bytes = internals::ITERATION_CHUNK_SIZE);
211 
214  private:
216  bool header_trimmed = false;
217 
220  std::thread read_csv_worker;
222 
224  void initial_read() {
225  this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE);
226  this->read_csv_worker.join();
227  }
228 
229  void trim_header();
230  };
231 }
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
static CSVFormat guess_csv()
CSVFormat for guessing the delimiter.
Definition: csv_format.hpp:126
An input iterator capable of handling large files.
Definition: csv_reader.hpp:69
CONSTEXPR bool operator==(const iterator &other) const noexcept
Returns true if iterators were constructed from the same CSVReader and point to the same row.
Definition: csv_reader.hpp:96
iterator & operator++()
Pre-increment iterator.
CONSTEXPR_14 reference operator*()
Access the CSVRow held by the iterator.
Definition: csv_reader.hpp:84
CONSTEXPR_14 pointer operator->()
Return a pointer to the CSVRow the iterator has stopped at.
Definition: csv_reader.hpp:87
Main class for parsing CSVs from files and in-memory sources.
Definition: csv_reader.hpp:57
CONSTEXPR bool empty() const noexcept
Returns true if no valid CSV rows (not including the header) have been read from the file or stream so far.
Definition: csv_reader.hpp:168
bool utf8_bom() const noexcept
Whether or not the CSV was prefixed with a UTF-8 BOM (byte order mark).
Definition: csv_reader.hpp:174
CSVFormat get_format() const
Return the format of the original raw CSV.
Definition: csv_reader.cpp:174
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
Definition: csv_reader.cpp:198
HEDLEY_CONST iterator end() const noexcept
A placeholder for the imaginary past-the-end row in a CSV.
CONSTEXPR size_t n_rows() const noexcept
Retrieves the number of rows that have been read so far.
Definition: csv_reader.hpp:171
bool eof() const noexcept
Returns true if we have reached end of file.
Definition: csv_reader.hpp:150
CSVReader(TStream &source, CSVFormat format=CSVFormat())
Allows parsing stream sources such as std::stringstream or std::ifstream
Definition: csv_reader.hpp:121
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
Definition: csv_reader.cpp:272
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Definition: csv_reader.cpp:187
iterator begin()
Return an iterator to the first row in the reader.
CSVReader(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Reads an arbitrarily large CSV file using memory-mapped IO.
Definition: csv_reader.cpp:154
Data structure for representing CSV rows.
Definition: csv_row.hpp:296
A class for parsing CSV data from a std::stringstream or a std::ifstream.
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition: common.hpp:117
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
Defines an object used to store CSV format settings.
Implements data type parsing functionality.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
Definition: csv_reader.hpp:202
size_t _n_rows
How many rows (minus header) have been read so far.
Definition: csv_reader.hpp:205
bool read_csv(size_t bytes=internals::ITERATION_CHUNK_SIZE)
Read a chunk of CSV data.
Definition: csv_reader.cpp:241
internals::ColNamesPtr col_names
Pointer to an object containing column information.
Definition: csv_reader.hpp:196
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
Definition: csv_reader.cpp:224
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
Definition: csv_reader.hpp:199
size_t n_cols
The number of columns in this CSV.
Definition: csv_reader.hpp:204
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
Definition: common.hpp:151
std::vector< std::string > _get_col_names(csv::string_view head, CSVFormat format)
Return a CSV's column names.
Definition: csv_reader.cpp:28
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition: csv_reader.cpp:9
CSVGuessResult _guess_format(csv::string_view head, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
Definition: csv_reader.cpp:93
The all encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
Definition: csv_reader.cpp:125
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
Definition: csv_reader.cpp:138
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75
Stores the inferred format of a CSV file.
Definition: csv_format.hpp:28