Vince's CSV Parser
basic_csv_parser.cpp
1 #include "basic_csv_parser.hpp"
2 
3 namespace csv {
4  namespace internals {
5  CSV_INLINE size_t get_file_size(csv::string_view filename) {
6  std::ifstream infile(std::string(filename), std::ios::binary);
7  const auto start = infile.tellg();
8  infile.seekg(0, std::ios::end);
9  const auto end = infile.tellg();
10 
11  return end - start;
12  }
13 
14  CSV_INLINE std::string get_csv_head(csv::string_view filename) {
15  return get_csv_head(filename, get_file_size(filename));
16  }
17 
18  CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) {
19  const size_t bytes = 500000;
20 
21  std::error_code error;
22  size_t length = std::min((size_t)file_size, bytes);
23  auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error);
24 
25  if (error) {
26  throw std::runtime_error("Cannot open file " + std::string(filename));
27  }
28 
29  return std::string(mmap.begin(), mmap.end());
30  }
31 
32 #ifdef _MSC_VER
33 #pragma region IBasicCSVParser
34 #endif
35  CSV_INLINE IBasicCSVParser::IBasicCSVParser(
36  const CSVFormat& format,
37  const ColNamesPtr& col_names
38  ) : _col_names(col_names) {
39  if (format.no_quote) {
40  _parse_flags = internals::make_parse_flags(format.get_delim());
41  }
42  else {
43  _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char);
44  }
45 
46  _ws_flags = internals::make_ws_flags(
47  format.trim_chars.data(), format.trim_chars.size()
48  );
49  }
50 
51  CSV_INLINE void IBasicCSVParser::end_feed() {
53 
54  bool empty_last_field = this->data_ptr
55  && this->data_ptr->_data
56  && !this->data_ptr->data.empty()
57  && parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER;
58 
59  // Push field
60  if (this->field_length > 0 || empty_last_field) {
61  this->push_field();
62  }
63 
64  // Push row
65  if (this->current_row.size() > 0)
66  this->push_row();
67  }
68 
69  CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
71  auto& in = this->data_ptr->data;
72 
73  // Trim off leading whitespace
74  while (data_pos < in.size() && ws_flag(in[data_pos]))
75  data_pos++;
76 
77  if (field_start == UNINITIALIZED_FIELD)
78  field_start = (int)(data_pos - current_row_start());
79 
80  // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
81  // sequences, use the loop below to avoid having to go through the outer
82  // switch statement as much as possible
83  while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL)
84  data_pos++;
85 
86  field_length = data_pos - (field_start + current_row_start());
87 
88  // Trim off trailing whitespace, this->field_length constraint matters
89  // when field is entirely whitespace
90  for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--)
91  this->field_length--;
92  }
93 
94  CSV_INLINE void IBasicCSVParser::push_field()
95  {
96  // Update
97  if (field_has_double_quote) {
98  fields->emplace_back(
99  field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
100  field_length,
101  true
102  );
103  field_has_double_quote = false;
104 
105  }
106  else {
107  fields->emplace_back(
108  field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
109  field_length
110  );
111  }
112 
113  current_row.row_length++;
114 
115  // Reset field state
116  field_start = UNINITIALIZED_FIELD;
117  field_length = 0;
118  }
119 
121  CSV_INLINE size_t IBasicCSVParser::parse()
122  {
123  using internals::ParseFlags;
124 
125  this->quote_escape = false;
126  this->data_pos = 0;
127  this->current_row_start() = 0;
128  this->trim_utf8_bom();
129 
130  auto& in = this->data_ptr->data;
131  while (this->data_pos < in.size()) {
132  switch (compound_parse_flag(in[this->data_pos])) {
133  case ParseFlags::DELIMITER:
134  this->push_field();
135  this->data_pos++;
136  break;
137 
138  case ParseFlags::NEWLINE:
139  this->data_pos++;
140 
141  // Catches CRLF (or LFLF)
142  if (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE)
143  this->data_pos++;
144 
145  // End of record -> Write record
146  this->push_field();
147  this->push_row();
148 
149  // Reset
150  this->current_row = CSVRow(data_ptr, this->data_pos, fields->size());
151  break;
152 
153  case ParseFlags::NOT_SPECIAL:
154  this->parse_field();
155  break;
156 
157  case ParseFlags::QUOTE_ESCAPE_QUOTE:
158  if (data_pos + 1 == in.size()) return this->current_row_start();
159  else if (data_pos + 1 < in.size()) {
160  auto next_ch = parse_flag(in[data_pos + 1]);
161  if (next_ch >= ParseFlags::DELIMITER) {
162  quote_escape = false;
163  data_pos++;
164  break;
165  }
166  else if (next_ch == ParseFlags::QUOTE) {
167  // Case: Escaped quote
168  data_pos += 2;
169  this->field_length += 2;
170  this->field_has_double_quote = true;
171  break;
172  }
173  }
174 
175  // Case: Unescaped single quote => not strictly valid but we'll keep it
176  this->field_length++;
177  data_pos++;
178 
179  break;
180 
181  default: // Quote (currently not quote escaped)
182  if (this->field_length == 0) {
183  quote_escape = true;
184  data_pos++;
185  if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos]))
186  field_start = (int)(data_pos - current_row_start());
187  break;
188  }
189 
190  // Case: Unescaped quote
191  this->field_length++;
192  data_pos++;
193 
194  break;
195  }
196  }
197 
198  return this->current_row_start();
199  }
200 
201  CSV_INLINE void IBasicCSVParser::push_row() {
202  current_row.row_length = fields->size() - current_row.fields_start;
203  this->_records->push_back(std::move(current_row));
204  }
205 
206  CSV_INLINE void IBasicCSVParser::reset_data_ptr() {
207  this->data_ptr = std::make_shared<RawCSVData>();
208  this->data_ptr->parse_flags = this->_parse_flags;
209  this->data_ptr->col_names = this->_col_names;
210  this->fields = &(this->data_ptr->fields);
211  }
212 
213  CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
214  auto& data = this->data_ptr->data;
215 
216  if (!this->unicode_bom_scan && data.size() >= 3) {
217  if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') {
218  this->data_pos += 3; // Remove BOM from input string
219  this->_utf8_bom = true;
220  }
221 
222  this->unicode_bom_scan = true;
223  }
224  }
225 #ifdef _MSC_VER
226 #pragma endregion
227 #endif
228 
229 #ifdef _MSC_VER
230 #pragma region Specializations
231 #endif
232  CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) {
233  // Reset parser state
234  this->field_start = UNINITIALIZED_FIELD;
235  this->field_length = 0;
236  this->reset_data_ptr();
237 
238  // Create memory map
239  size_t length = std::min(this->source_size - this->mmap_pos, bytes);
240  std::error_code error;
241  this->data_ptr->_data = std::make_shared<mio::basic_mmap_source<char>>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error));
242  this->mmap_pos += length;
243  if (error) throw error;
244 
245  auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr->_data.get());
246 
247  // Create string view
248  this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length());
249 
250  // Parse
251  this->current_row = CSVRow(this->data_ptr);
252  size_t remainder = this->parse();
253 
254  if (this->mmap_pos == this->source_size || no_chunk()) {
255  this->_eof = true;
256  this->end_feed();
257  }
258 
259  this->mmap_pos -= (length - remainder);
260  }
261 #ifdef _MSC_VER
262 #pragma endregion
263 #endif
264  }
265 }
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
Data structure for representing CSV rows.
Definition: csv_row.hpp:296
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
Definition: common.hpp:151
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition: common.hpp:166
HEDLEY_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
The all encompassing namespace.
CSVReader parse(csv::string_view in, CSVFormat format)
Shorthand function for parsing an in-memory CSV string.
Definition: csv_utility.cpp:14
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75