Vince's CSV Parser
basic_csv_parser.hpp
Go to the documentation of this file.
1 
5 #pragma once
6 #include <algorithm>
7 #include <array>
8 #include <condition_variable>
9 #include <deque>
10 #include <fstream>
11 #include <memory>
12 #include <mutex>
13 #include <unordered_map>
14 #include <unordered_set>
15 #include <thread>
16 #include <vector>
17 
18 #include "../external/mio.hpp"
19 #include "col_names.hpp"
20 #include "common.hpp"
21 #include "csv_format.hpp"
22 #include "csv_row.hpp"
23 
24 namespace csv {
25  namespace internals {
30  HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter) {
31  std::array<ParseFlags, 256> ret = {};
32  for (int i = -128; i < 128; i++) {
33  const int arr_idx = i + 128;
34  char ch = char(i);
35 
36  if (ch == delimiter)
37  ret[arr_idx] = ParseFlags::DELIMITER;
38  else if (ch == '\r' || ch == '\n')
39  ret[arr_idx] = ParseFlags::NEWLINE;
40  else
41  ret[arr_idx] = ParseFlags::NOT_SPECIAL;
42  }
43 
44  return ret;
45  }
46 
51  HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter, char quote_char) {
52  std::array<ParseFlags, 256> ret = make_parse_flags(delimiter);
53  ret[(size_t)quote_char + 128] = ParseFlags::QUOTE;
54  return ret;
55  }
56 
61  HEDLEY_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char* ws_chars, size_t n_chars) {
62  std::array<bool, 256> ret = {};
63  for (int i = -128; i < 128; i++) {
64  const int arr_idx = i + 128;
65  char ch = char(i);
66  ret[arr_idx] = false;
67 
68  for (size_t j = 0; j < n_chars; j++) {
69  if (ws_chars[j] == ch) {
70  ret[arr_idx] = true;
71  }
72  }
73  }
74 
75  return ret;
76  }
77 
78  inline WhitespaceMap make_ws_flags(const std::vector<char>& flags) {
79  return make_ws_flags(flags.data(), flags.size());
80  }
81 
// File helpers which are only declared here; their definitions are not visible in this listing.
// NOTE(review): presumably returns the size of the named file in bytes -- confirm against the definition.
82  CSV_INLINE size_t get_file_size(csv::string_view filename);
83 
// NOTE(review): presumably returns the leading portion of the named file as a string -- confirm.
84  CSV_INLINE std::string get_csv_head(csv::string_view filename);
85 
// Overload which additionally takes a file size supplied by the caller.
// NOTE(review): presumably lets the caller skip a second size lookup -- confirm.
87  CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size);
88 
/** A std::deque wrapper which serializes access through an internal mutex and
 *  lets reader threads block until enough items have been pushed.
 *
 *  Every member function takes the internal mutex for the duration of the
 *  call. References and iterators handed out by front(), operator[], begin()
 *  and end() are NOT protected once the call returns: callers must make sure
 *  that no other thread removes or invalidates the element while they use it.
 *
 *  @tparam T Type of the stored items
 */
template<typename T>
class ThreadSafeDeque {
public:
    /** @param[in] notify_size Number of queued items at which waiting threads are woken up */
    ThreadSafeDeque(size_t notify_size = 100) : _notify_size(notify_size) {}

    /** Copy the items and notify threshold of another deque.
     *
     *  The source is locked while its items are copied. The "waitable" state
     *  is not copied; the new deque starts out as not waitable.
     */
    ThreadSafeDeque(const ThreadSafeDeque& other) : _notify_size(other._notify_size) {
        std::lock_guard<std::mutex> lock{ other._lock };
        this->data = other.data;
    }

    /** Construct from an existing std::deque using the default notify threshold */
    ThreadSafeDeque(const std::deque<T>& source) : ThreadSafeDeque() {
        this->data = source;
    }

    /** Remove all items */
    void clear() noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        this->data.clear();
    }

    /** Whether the deque currently holds no items */
    bool empty() const noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->data.empty();
    }

    /** Reference to the first item. The deque must not be empty. */
    T& front() noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->data.front();
    }

    /** Reference to the n-th item. n must be less than size(). */
    T& operator[](size_t n) {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->data[n];
    }

    /** Append an item and wake up waiting threads once the notify threshold is reached */
    void push_back(T&& item) {
        std::lock_guard<std::mutex> lock{ this->_lock };
        this->data.push_back(std::move(item));

        // Read the deque directly: size() would try to re-acquire the mutex we already hold
        if (this->data.size() >= _notify_size) {
            this->_cond.notify_all();
        }
    }

    /** Remove and return the first item. The deque must not be empty. */
    T pop_front() noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        T item = std::move(data.front());
        data.pop_front();
        return item;
    }

    /** Number of items currently stored */
    size_t size() const noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->data.size();
    }

    /** Returns true if a thread is actively pushing items to this deque */
    bool is_waitable() const noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->_is_waitable;
    }

    /** Wait for an item to become available.
     *
     *  Returns immediately if the deque is not waitable. Otherwise blocks until
     *  the notify threshold has been reached or kill_all() has been called.
     */
    void wait() {
        std::unique_lock<std::mutex> lock{ this->_lock };

        // The predicate runs with the mutex held, so it reads the members
        // directly instead of going through the locking accessors.
        this->_cond.wait(lock, [this] {
            return !this->_is_waitable || this->data.size() >= this->_notify_size;
        });
    }

    /** Iterator to the first item; only valid while no other thread modifies the deque */
    typename std::deque<T>::iterator begin() noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->data.begin();
    }

    /** Past-the-end iterator; only valid while no other thread modifies the deque */
    typename std::deque<T>::iterator end() noexcept {
        std::lock_guard<std::mutex> lock{ this->_lock };
        return this->data.end();
    }

    /** Tell listeners that this deque is actively being pushed to */
    void notify_all() {
        std::lock_guard<std::mutex> lock{ this->_lock };
        this->_is_waitable = true;
        this->_cond.notify_all();
    }

    /** Tell all listeners to stop */
    void kill_all() {
        std::lock_guard<std::mutex> lock{ this->_lock };
        this->_is_waitable = false;
        this->_cond.notify_all();
    }

private:
    bool _is_waitable = false;      // Guarded by _lock
    size_t _notify_size;            // Fixed at construction
    mutable std::mutex _lock;       // mutable so that const accessors can lock
    std::condition_variable _cond;
    std::deque<T> data;             // Guarded by _lock
};
181 
// Sentinel used as the initial value of IBasicCSVParser::field_start.
182  constexpr const int UNINITIALIZED_FIELD = -1;
183  }
184 
187 
188  namespace internals {
196  public:
197  IBasicCSVParser() = default;
// Construct from a CSVFormat and column names (defined outside this header).
198  IBasicCSVParser(const CSVFormat&, const ColNamesPtr&);
// Construct directly from prebuilt lookup tables, which are copied into the parser.
199  IBasicCSVParser(const ParseFlagMap& parse_flags, const WhitespaceMap& ws_flags
200  ) : _parse_flags(parse_flags), _ws_flags(ws_flags) {};
201 
// Virtual so that derived parsers can be destroyed through a base pointer.
202  virtual ~IBasicCSVParser() {}
203 
// Whether or not we have reached the end of source.
205  bool eof() { return this->_eof; }
206 
// Parse the next block of data; implemented by each concrete parser.
208  virtual void next(size_t bytes) = 0;
209 
// Indicate the last block of data has been parsed.
211  void end_feed();
212 
213  CONSTEXPR_17 ParseFlags parse_flag(const char ch) const noexcept {
214  return _parse_flags.data()[ch + 128];
215  }
216 
217  CONSTEXPR_17 ParseFlags compound_parse_flag(const char ch) const noexcept {
218  return quote_escape_flag(parse_flag(ch), this->quote_escape);
219  }
220 
// Whether or not this CSV has a UTF-8 byte order mark.
222  CONSTEXPR bool utf8_bom() const { return this->_utf8_bom; }
223 
// Store the address of `rows` (not a copy) in _records.
224  void set_output(RowCollection& rows) { this->_records = &rows; }
225 
226  protected:
// Parser state which derived parsers can access.
229  CSVRow current_row;
230  RawCSVDataPtr data_ptr = nullptr;
231  ColNamesPtr _col_names = nullptr;
232  CSVFieldList* fields = nullptr;
// UNINITIALIZED_FIELD until set elsewhere (NOTE(review): presumably when a field starts -- confirm).
233  int field_start = UNINITIALIZED_FIELD;
234  size_t field_length = 0;
235 
239 
// Value returned by eof(); StreamParser::next() sets it to true on the final block.
242  bool _eof = false;
243 
// The size of the incoming CSV.
245  size_t source_size = 0;
247 
// Whether or not source needs to be read in chunks: true when the whole
// source is smaller than ITERATION_CHUNK_SIZE.
249  CONSTEXPR bool no_chunk() const { return this->source_size < ITERATION_CHUNK_SIZE; }
250 
// Parse the current chunk of data.
// NOTE(review): the meaning of the returned size_t is not visible here -- confirm against the definition.
255  size_t parse();
256 
// Create a new RawCSVDataPtr for a new chunk of data.
258  void reset_data_ptr();
259  private:
// Whitespace lookup table read by ws_flag().
263  WhitespaceMap _ws_flags;
// Quote-escape state passed to quote_escape_flag() by compound_parse_flag().
264  bool quote_escape = false;
265  bool field_has_double_quote = false;
266 
268  size_t data_pos = 0;
269 
271  bool unicode_bom_scan = false;
// Value returned by utf8_bom().
272  bool _utf8_bom = false;
273 
// Output row collection assigned through set_output(); null until then.
275  RowCollection* _records = nullptr;
276 
277  CONSTEXPR_17 bool ws_flag(const char ch) const noexcept {
278  return _ws_flags.data()[ch + 128];
279  }
280 
// Mutable reference to the data_start member of the row currently being built.
281  size_t& current_row_start() {
282  return this->current_row.data_start;
283  }
284 
// The helpers below are only declared here; their definitions are not visible in this listing.
285  void parse_field() noexcept;
286 
// NOTE(review): presumably finishes the current field -- confirm against the definition.
288  void push_field();
289 
// NOTE(review): presumably finishes the current row -- confirm against the definition.
291  void push_row();
292 
// NOTE(review): presumably strips a leading UTF-8 byte order mark -- confirm against the definition.
294  void trim_utf8_bom();
295  };
296 
300  template<typename TStream>
303 
304  public:
// Construct from a stream plus a CSVFormat.
// Although `source` is taken by non-const reference, it is moved into the
// parser's own _source member, so the caller's stream object is left moved-from.
305  StreamParser(TStream& source,
306  const CSVFormat& format,
307  const ColNamesPtr& col_names = nullptr
308  ) : IBasicCSVParser(format, col_names), _source(std::move(source)) {};
309 
// Construct from a stream plus prebuilt lookup tables.
// As above, `source` is moved into _source.
310  StreamParser(
311  TStream& source,
312  internals::ParseFlagMap parse_flags,
313  internals::WhitespaceMap ws_flags) :
314  IBasicCSVParser(parse_flags, ws_flags),
315  _source(std::move(source))
316  {};
317 
318  ~StreamParser() {}
319 
320  void next(size_t bytes = ITERATION_CHUNK_SIZE) override {
321  if (this->eof()) return;
322 
323  this->reset_data_ptr();
324  this->data_ptr->_data = std::make_shared<std::string>();
325 
326  if (source_size == 0) {
327  const auto start = _source.tellg();
328  _source.seekg(0, std::ios::end);
329  const auto end = _source.tellg();
330  _source.seekg(0, std::ios::beg);
331 
332  source_size = end - start;
333  }
334 
335  // Read data into buffer
336  size_t length = std::min(source_size - stream_pos, bytes);
337  std::unique_ptr<char[]> buff(new char[length]);
338  _source.seekg(stream_pos, std::ios::beg);
339  _source.read(buff.get(), length);
340  stream_pos = _source.tellg();
341  ((std::string*)(this->data_ptr->_data.get()))->assign(buff.get(), length);
342 
343  // Create string_view
344  this->data_ptr->data = *((std::string*)this->data_ptr->_data.get());
345 
346  // Parse
347  this->current_row = CSVRow(this->data_ptr);
348  size_t remainder = this->parse();
349 
350  if (stream_pos == source_size || no_chunk()) {
351  this->_eof = true;
352  this->end_feed();
353  }
354  else {
355  this->stream_pos -= (length - remainder);
356  }
357  }
358 
359  private:
// The stream being parsed; owned by the parser (move-constructed from the constructor argument).
360  TStream _source;
// Offset, from the beginning of the stream, at which the next read starts.
361  size_t stream_pos = 0;
362  };
363 
373  class MmapParser : public IBasicCSVParser {
374  public:
375  MmapParser(csv::string_view filename,
376  const CSVFormat& format,
377  const ColNamesPtr& col_names = nullptr
378  ) : IBasicCSVParser(format, col_names) {
379  this->_filename = filename.data();
380  this->source_size = get_file_size(filename);
381  };
382 
383  ~MmapParser() {}
384 
385  void next(size_t bytes) override;
386 
387  private:
388  std::string _filename;
389  size_t mmap_pos = 0;
390  };
391  }
392 }
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
Data structure for representing CSV rows.
Definition: csv_row.hpp:296
Abstract base class which provides CSV parsing logic.
CONSTEXPR bool no_chunk() const
Whether or not source needs to be read in chunks.
ParseFlagMap _parse_flags
An array where the (i + 128)th slot gives the ParseFlags for ASCII character i.
void reset_data_ptr()
Create a new RawCSVDataPtr for a new chunk of data.
bool eof()
Whether or not we have reached the end of source.
void end_feed()
Indicate the last block of data has been parsed.
size_t parse()
Parse the current chunk of data.
virtual void next(size_t bytes)=0
Parse the next block of data.
size_t source_size
The size of the incoming CSV.
CONSTEXPR bool utf8_bom() const
Whether or not this CSV has a UTF-8 byte order mark.
Parser for memory-mapped files.
A class for parsing CSV data from a std::stringstream or an std::ifstream
void next(size_t bytes=ITERATION_CHUNK_SIZE) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it, along with letting read threads wait until the deque has been populated.
void wait()
Wait for an item to become available.
void notify_all()
Tell listeners that this deque is actively being pushed to.
constexpr bool is_waitable() const noexcept
Returns true if a thread is actively pushing items to this deque.
void kill_all()
Tell all listeners to stop.
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition: common.hpp:117
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
Defines an object used to store CSV format settings.
Defines the data type used for storing information about a CSV row.
std::array< ParseFlags, 256 > ParseFlagMap
An array which maps ASCII chars to a parsing flag.
Definition: common.hpp:200
std::array< bool, 256 > WhitespaceMap
An array which maps ASCII chars to a flag indicating if it is whitespace.
Definition: common.hpp:203
HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
Definition: common.hpp:151
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition: common.hpp:166
@ NOT_SPECIAL
Characters with no special meaning or escaped delimiters and newlines.
@ NEWLINE
Characters which signify a new row.
@ QUOTE
Characters which may signify a quote escape.
@ DELIMITER
Characters which signify a new field.
constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept
Transform the ParseFlags given the context of whether or not the current field is quote escaped.
Definition: common.hpp:176
HEDLEY_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
The all encompassing namespace.
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75