Vince's CSV Parser
csv_reader.cpp
Go to the documentation of this file.
1 
5 #include "csv_reader.hpp"
6 
7 namespace csv {
8  namespace internals {
9  CSV_INLINE std::string format_row(const std::vector<std::string>& row, csv::string_view delim) {
11  std::stringstream ret;
12  for (size_t i = 0; i < row.size(); i++) {
13  ret << row[i];
14  if (i + 1 < row.size()) ret << delim;
15  else ret << '\n';
16  }
17  ret.flush();
18 
19  return ret.str();
20  }
21 
28  CSV_INLINE std::vector<std::string> _get_col_names(csv::string_view head, CSVFormat format) {
29  // Parse the CSV
30  auto trim_chars = format.get_trim_chars();
31  std::stringstream source(head.data());
32  RowCollection rows;
33 
34  StreamParser<std::stringstream> parser(source, format);
35  parser.set_output(rows);
36  parser.next();
37 
38  return CSVRow(std::move(rows[format.get_header()]));
39  }
40 
41  CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) {
42  // Frequency counter of row length
43  std::unordered_map<size_t, size_t> row_tally = { { 0, 0 } };
44 
45  // Map row lengths to row num where they first occurred
46  std::unordered_map<size_t, size_t> row_when = { { 0, 0 } };
47 
48  // Parse the CSV
49  std::stringstream source(head.data());
50  RowCollection rows;
51 
52  StreamParser<std::stringstream> parser(source, format);
53  parser.set_output(rows);
54  parser.next();
55 
56  for (size_t i = 0; i < rows.size(); i++) {
57  auto& row = rows[i];
58 
59  // Ignore zero-length rows
60  if (row.size() > 0) {
61  if (row_tally.find(row.size()) != row_tally.end()) {
62  row_tally[row.size()]++;
63  }
64  else {
65  row_tally[row.size()] = 1;
66  row_when[row.size()] = i;
67  }
68  }
69  }
70 
71  double final_score = 0;
72  size_t header_row = 0;
73 
74  // Final score is equal to the largest
75  // row size times rows of that size
76  for (auto& pair : row_tally) {
77  auto row_size = pair.first;
78  auto row_count = pair.second;
79  double score = (double)(row_size * row_count);
80  if (score > final_score) {
81  final_score = score;
82  header_row = row_when[row_size];
83  }
84  }
85 
86  return {
87  final_score,
88  header_row
89  };
90  }
91 
93  CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector<char>& delims) {
100  CSVFormat format;
101  size_t max_score = 0,
102  header = 0;
103  char current_delim = delims[0];
104 
105  for (char cand_delim : delims) {
106  auto result = calculate_score(head, format.delimiter(cand_delim));
107 
108  if ((size_t)result.score > max_score) {
109  max_score = (size_t)result.score;
110  current_delim = cand_delim;
111  header = result.header;
112  }
113  }
114 
115  return { current_delim, (int)header };
116  }
117  }
118 
125  CSV_INLINE std::vector<std::string> get_col_names(csv::string_view filename, CSVFormat format) {
126  auto head = internals::get_csv_head(filename);
127 
129  if (format.guess_delim()) {
130  auto guess_result = guess_format(filename, format.get_possible_delims());
131  format.delimiter(guess_result.delim).header_row(guess_result.header_row);
132  }
133 
134  return internals::_get_col_names(head, format);
135  }
136 
138  CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector<char>& delims) {
139  auto head = internals::get_csv_head(filename);
140  return internals::_guess_format(head, delims);
141  }
142 
154  CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) {
155  auto head = internals::get_csv_head(filename);
156  using Parser = internals::MmapParser;
157 
159  if (format.guess_delim()) {
160  auto guess_result = internals::_guess_format(head, format.possible_delimiters);
161  format.delimiter(guess_result.delim);
162  format.header = guess_result.header_row;
163  this->_format = format;
164  }
165 
166  if (!format.col_names.empty())
167  this->set_col_names(format.col_names);
168 
169  this->parser = std::unique_ptr<Parser>(new Parser(filename, format, this->col_names)); // For C++11
170  this->initial_read();
171  }
172 
175  CSVFormat new_format = this->_format;
176 
177  // Since users are normally not allowed to set
178  // column names and header row simulatenously,
179  // we will set the backing variables directly here
180  new_format.col_names = this->col_names->get_col_names();
181  new_format.header = this->_format.header;
182 
183  return new_format;
184  }
185 
187  CSV_INLINE std::vector<std::string> CSVReader::get_col_names() const {
188  if (this->col_names) {
189  return this->col_names->get_col_names();
190  }
191 
192  return std::vector<std::string>();
193  }
194 
199  auto _col_names = this->get_col_names();
200  for (size_t i = 0; i < _col_names.size(); i++)
201  if (_col_names[i] == col_name) return (int)i;
202 
203  return CSV_NOT_FOUND;
204  }
205 
206  CSV_INLINE void CSVReader::trim_header() {
207  if (!this->header_trimmed) {
208  for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) {
209  if (i == this->_format.header && this->col_names->empty()) {
210  this->set_col_names(this->records->pop_front());
211  }
212  else {
213  this->records->pop_front();
214  }
215  }
216 
217  this->header_trimmed = true;
218  }
219  }
220 
224  CSV_INLINE void CSVReader::set_col_names(const std::vector<std::string>& names)
225  {
226  this->col_names->set_col_names(names);
227  this->n_cols = names.size();
228  }
229 
241  CSV_INLINE bool CSVReader::read_csv(size_t bytes) {
242  // Tell read_row() to listen for CSV rows
243  this->records->notify_all();
244 
245  this->parser->set_output(*this->records);
246  this->parser->next(bytes);
247 
248  if (!this->header_trimmed) {
249  this->trim_header();
250  }
251 
252  // Tell read_row() to stop waiting
253  this->records->kill_all();
254 
255  return true;
256  }
257 
273  while (true) {
274  if (this->records->empty()) {
275  if (this->records->is_waitable())
276  // Reading thread is currently active => wait for it to populate records
277  this->records->wait();
278  else if (this->parser->eof())
279  // End of file and no more records
280  return false;
281  else {
282  // Reading thread is not active => start another one
283  if (this->read_csv_worker.joinable())
284  this->read_csv_worker.join();
285 
286  this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE);
287  }
288  }
289  else if (this->records->front().size() != this->n_cols &&
290  this->_format.variable_column_policy != VariableColumnPolicy::KEEP) {
291  auto errored_row = this->records->pop_front();
292 
293  if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) {
294  if (errored_row.size() < this->n_cols)
295  throw std::runtime_error("Line too short " + internals::format_row(errored_row));
296 
297  throw std::runtime_error("Line too long " + internals::format_row(errored_row));
298  }
299  }
300  else {
301  row = this->records->pop_front();
302  this->_n_rows++;
303  return true;
304  }
305  }
306 
307  return false;
308  }
309 }
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
CSVFormat & delimiter(char delim)
Sets the delimiter of the CSV file.
Definition: csv_format.cpp:11
CSVFormat & header_row(int row)
Sets the header row.
Definition: csv_format.cpp:42
CSVFormat get_format() const
Return the format of the original raw CSV.
Definition: csv_reader.cpp:174
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
Definition: csv_reader.cpp:198
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
Definition: csv_reader.cpp:272
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Definition: csv_reader.cpp:187
CSVReader(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Reads an arbitrarily large CSV file using memory-mapped IO.
Definition: csv_reader.cpp:154
Data structure for representing CSV rows.
Definition: csv_row.hpp:304
Parser for memory-mapped files.
A class for parsing CSV data from a std::stringstream or an std::ifstream
void next(size_t bytes=ITERATION_CHUNK_SIZE) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it along wit...
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
Defines functionality needed for basic CSV parsing.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
Definition: csv_reader.hpp:201
size_t _n_rows
How many rows (minus header) have been read so far.
Definition: csv_reader.hpp:204
bool read_csv(size_t bytes=internals::ITERATION_CHUNK_SIZE)
Read a chunk of CSV data.
Definition: csv_reader.cpp:241
internals::ColNamesPtr col_names
Pointer to an object containing column information.
Definition: csv_reader.hpp:195
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
Definition: csv_reader.cpp:224
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
Definition: csv_reader.hpp:198
size_t n_cols
The number of columns in this CSV.
Definition: csv_reader.hpp:203
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
Definition: common.hpp:151
std::vector< std::string > _get_col_names(csv::string_view head, CSVFormat format)
Return a CSV's column names.
Definition: csv_reader.cpp:28
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition: csv_reader.cpp:9
CSVGuessResult _guess_format(csv::string_view head, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
Definition: csv_reader.cpp:93
The all encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
Definition: csv_reader.cpp:125
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
Definition: common.hpp:207
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
Definition: csv_reader.cpp:138
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75
Stores the inferred format of a CSV file.
Definition: csv_format.hpp:28