Vince's CSV Parser
basic_csv_parser.cpp
1 #include "basic_csv_parser.hpp"
2 
3 namespace csv {
4  namespace internals {
5  CSV_INLINE size_t get_file_size(csv::string_view filename) {
6  std::ifstream infile(std::string(filename), std::ios::binary);
7  const auto start = infile.tellg();
8  infile.seekg(0, std::ios::end);
9  const auto end = infile.tellg();
10 
11  return end - start;
12  }
13 
14  CSV_INLINE std::string get_csv_head(csv::string_view filename) {
15  return get_csv_head(filename, get_file_size(filename));
16  }
17 
18  CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) {
19  const size_t bytes = 500000;
20 
21  std::error_code error;
22  size_t length = std::min((size_t)file_size, bytes);
23  auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error);
24 
25  if (error) {
26  throw std::runtime_error("Cannot open file " + std::string(filename));
27  }
28 
29  return std::string(mmap.begin(), mmap.end());
30  }
31 
32 #ifdef _MSC_VER
33 #pragma region IBasicCVParser
34 #endif
35  CSV_INLINE IBasicCSVParser::IBasicCSVParser(
36  const CSVFormat& format,
37  const ColNamesPtr& col_names
38  ) : _col_names(col_names) {
39  if (format.no_quote) {
40  _parse_flags = internals::make_parse_flags(format.get_delim());
41  }
42  else {
43  _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char);
44  }
45 
46  _ws_flags = internals::make_ws_flags(
47  format.trim_chars.data(), format.trim_chars.size()
48  );
49  }
50 
51  CSV_INLINE void IBasicCSVParser::end_feed() {
53 
54  bool empty_last_field = this->data_ptr
55  && this->data_ptr->_data
56  && !this->data_ptr->data.empty()
57  && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER
58  || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE);
59 
60  // Push field
61  if (this->field_length > 0 || empty_last_field) {
62  this->push_field();
63  }
64 
65  // Push row
66  if (this->current_row.size() > 0)
67  this->push_row();
68  }
69 
70  CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
72  auto& in = this->data_ptr->data;
73 
74  // Trim off leading whitespace
75  while (data_pos < in.size() && ws_flag(in[data_pos]))
76  data_pos++;
77 
78  if (field_start == UNINITIALIZED_FIELD)
79  field_start = (int)(data_pos - current_row_start());
80 
81  // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
82  // sequences, use the loop below to avoid having to go through the outer
83  // switch statement as much as possible
84  while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL)
85  data_pos++;
86 
87  field_length = data_pos - (field_start + current_row_start());
88 
89  // Trim off trailing whitespace, this->field_length constraint matters
90  // when field is entirely whitespace
91  for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--)
92  this->field_length--;
93  }
94 
95  CSV_INLINE void IBasicCSVParser::push_field()
96  {
97  // Update
98  if (field_has_double_quote) {
99  fields->emplace_back(
100  field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
101  field_length,
102  true
103  );
104  field_has_double_quote = false;
105 
106  }
107  else {
108  fields->emplace_back(
109  field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
110  field_length
111  );
112  }
113 
114  current_row.row_length++;
115 
116  // Reset field state
117  field_start = UNINITIALIZED_FIELD;
118  field_length = 0;
119  }
120 
122  CSV_INLINE size_t IBasicCSVParser::parse()
123  {
124  using internals::ParseFlags;
125 
126  this->quote_escape = false;
127  this->data_pos = 0;
128  this->current_row_start() = 0;
129  this->trim_utf8_bom();
130 
131  auto& in = this->data_ptr->data;
132  while (this->data_pos < in.size()) {
133  switch (compound_parse_flag(in[this->data_pos])) {
134  case ParseFlags::DELIMITER:
135  this->push_field();
136  this->data_pos++;
137  break;
138 
139  case ParseFlags::NEWLINE:
140  this->data_pos++;
141 
142  // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines)
143  while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE)
144  this->data_pos++;
145 
146  // End of record -> Write record
147  this->push_field();
148  this->push_row();
149 
150  // Reset
151  this->current_row = CSVRow(data_ptr, this->data_pos, fields->size());
152  break;
153 
154  case ParseFlags::NOT_SPECIAL:
155  this->parse_field();
156  break;
157 
158  case ParseFlags::QUOTE_ESCAPE_QUOTE:
159  if (data_pos + 1 == in.size()) return this->current_row_start();
160  else if (data_pos + 1 < in.size()) {
161  auto next_ch = parse_flag(in[data_pos + 1]);
162  if (next_ch >= ParseFlags::DELIMITER) {
163  quote_escape = false;
164  data_pos++;
165  break;
166  }
167  else if (next_ch == ParseFlags::QUOTE) {
168  // Case: Escaped quote
169  data_pos += 2;
170  this->field_length += 2;
171  this->field_has_double_quote = true;
172  break;
173  }
174  }
175 
176  // Case: Unescaped single quote => not strictly valid but we'll keep it
177  this->field_length++;
178  data_pos++;
179 
180  break;
181 
182  default: // Quote (currently not quote escaped)
183  if (this->field_length == 0) {
184  quote_escape = true;
185  data_pos++;
186  if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos]))
187  field_start = (int)(data_pos - current_row_start());
188  break;
189  }
190 
191  // Case: Unescaped quote
192  this->field_length++;
193  data_pos++;
194 
195  break;
196  }
197  }
198 
199  return this->current_row_start();
200  }
201 
202  CSV_INLINE void IBasicCSVParser::push_row() {
203  current_row.row_length = fields->size() - current_row.fields_start;
204  this->_records->push_back(std::move(current_row));
205  }
206 
207  CSV_INLINE void IBasicCSVParser::reset_data_ptr() {
208  this->data_ptr = std::make_shared<RawCSVData>();
209  this->data_ptr->parse_flags = this->_parse_flags;
210  this->data_ptr->col_names = this->_col_names;
211  this->fields = &(this->data_ptr->fields);
212  }
213 
214  CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
215  auto& data = this->data_ptr->data;
216 
217  if (!this->unicode_bom_scan && data.size() >= 3) {
218  if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') {
219  this->data_pos += 3; // Remove BOM from input string
220  this->_utf8_bom = true;
221  }
222 
223  this->unicode_bom_scan = true;
224  }
225  }
226 #ifdef _MSC_VER
227 #pragma endregion
228 #endif
229 
230 #ifdef _MSC_VER
231 #pragma region Specializations
232 #endif
233  CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) {
234  // Reset parser state
235  this->field_start = UNINITIALIZED_FIELD;
236  this->field_length = 0;
237  this->reset_data_ptr();
238 
239  // Create memory map
240  size_t length = std::min(this->source_size - this->mmap_pos, bytes);
241  std::error_code error;
242  this->data_ptr->_data = std::make_shared<mio::basic_mmap_source<char>>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error));
243  this->mmap_pos += length;
244  if (error) throw error;
245 
246  auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr->_data.get());
247 
248  // Create string view
249  this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length());
250 
251  // Parse
252  this->current_row = CSVRow(this->data_ptr);
253  size_t remainder = this->parse();
254 
255  if (this->mmap_pos == this->source_size || no_chunk()) {
256  this->_eof = true;
257  this->end_feed();
258  }
259 
260  this->mmap_pos -= (length - remainder);
261  }
262 #ifdef _MSC_VER
263 #pragma endregion
264 #endif
265  }
266 }
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
Data structure for representing CSV rows.
Definition: csv_row.hpp:304
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
Definition: common.hpp:151
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition: common.hpp:166
HEDLEY_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
The all encompassing namespace.
CSVReader parse(csv::string_view in, CSVFormat format)
Shorthand function for parsing an in-memory CSV string.
Definition: csv_utility.cpp:14
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75