Vince's CSV Parser
csv_row.hpp
Go to the documentation of this file.
1 
5 #pragma once
6 #include <cmath>
7 #include <iterator>
8 #include <memory> // For CSVField
9 #include <limits> // For CSVField
10 #include <unordered_map>
11 #include <unordered_set>
12 #include <string>
13 #include <sstream>
14 #include <vector>
15 
16 #include "common.hpp"
17 #include "data_type.h"
18 #include "col_names.hpp"
19 
20 namespace csv {
21  namespace internals {
22  class IBasicCSVParser;
23 
/** Messages carried by the std::runtime_error thrown from CSVField::get<T>(). */

/** The field does not hold a numeric value. */
static const std::string ERROR_NAN("Not a number.");

/** The value does not fit in the requested type. */
static const std::string ERROR_OVERFLOW("Overflow error.");

/** An integral type was requested for a floating point field. */
static const std::string ERROR_FLOAT_TO_INT(
    "Attempted to convert a floating point value to an integral type.");

/** An unsigned type was requested for a negative value. */
static const std::string ERROR_NEG_TO_UNSIGNED(
    "Negative numbers cannot be converted to unsigned types.");
29 
30  std::string json_escape_string(csv::string_view s) noexcept;
31 
/**
 * A barebones class used for describing CSV fields.
 *
 * Only records where a field sits and how long it is; it does not own or
 * reference the character data itself.
 */
struct RawCSVField {
    /** Default construction leaves all members uninitialized. */
    RawCSVField() = default;

    /**
     * @param _start         Value stored in `start`
     * @param _length        Value stored in `length`
     * @param _double_quote  Value stored in `has_double_quote` (defaults to false)
     */
    RawCSVField(size_t _start, size_t _length, bool _double_quote = false)
        : start(_start), length(_length), has_double_quote(_double_quote) {}

    /** The start of the field, relative to the beginning of the row */
    size_t start;

    /** The length of the field, ignoring quote escape characters */
    size_t length;

    /** Whether or not the field contains an escaped quote */
    bool has_double_quote;
};
50 
62  class CSVFieldList {
63  public:
65  CSVFieldList(size_t single_buffer_capacity = (size_t)(internals::PAGE_SIZE / sizeof(RawCSVField))) :
66  _single_buffer_capacity(single_buffer_capacity) {
67  this->allocate();
68  }
69 
70  // No copy constructor
71  CSVFieldList(const CSVFieldList& other) = delete;
72 
73  // CSVFieldArrays may be moved
74  CSVFieldList(CSVFieldList&& other) :
75  _single_buffer_capacity(other._single_buffer_capacity) {
76  buffers = std::move(other.buffers);
77  _current_buffer_size = other._current_buffer_size;
78  _back = other._back;
79  }
80 
81  ~CSVFieldList() {
82  for (auto& buffer : buffers)
83  delete[] buffer;
84  }
85 
86  template <class... Args>
87  void emplace_back(Args&&... args) {
88  if (this->_current_buffer_size == this->_single_buffer_capacity) {
89  this->allocate();
90  }
91 
92  *(_back++) = RawCSVField(std::forward<Args>(args)...);
93  _current_buffer_size++;
94  }
95 
96  size_t size() const noexcept {
97  return this->_current_buffer_size + ((this->buffers.size() - 1) * this->_single_buffer_capacity);
98  }
99 
100  RawCSVField& operator[](size_t n) const;
101 
102  private:
103  const size_t _single_buffer_capacity;
104 
105  std::vector<RawCSVField*> buffers = {};
106 
108  size_t _current_buffer_size = 0;
109 
111  RawCSVField* _back = nullptr;
112 
114  void allocate();
115  };
116 
117 
119  struct RawCSVData {
120  std::shared_ptr<void> _data = nullptr;
121  csv::string_view data = "";
122 
124 
125  std::unordered_set<size_t> has_double_quotes = {};
126 
127  // TODO: Consider replacing with a more thread-safe structure
128  std::unordered_map<size_t, std::string> double_quote_fields = {};
129 
130  internals::ColNamesPtr col_names = nullptr;
131  internals::ParseFlagMap parse_flags;
132  internals::WhitespaceMap ws_flags;
133  };
134 
135  using RawCSVDataPtr = std::shared_ptr<RawCSVData>;
136  }
137 
143  class CSVField {
144  public:
146  constexpr explicit CSVField(csv::string_view _sv) noexcept : sv(_sv) { };
147 
148  operator std::string() const {
149  return std::string("<CSVField> ") + std::string(this->sv);
150  }
151 
180  template<typename T = std::string> T get() {
181  IF_CONSTEXPR(std::is_arithmetic<T>::value) {
182  // Note: this->type() also converts the CSV value to float
183  if (this->type() <= DataType::CSV_STRING) {
184  throw std::runtime_error(internals::ERROR_NAN);
185  }
186  }
187 
188  IF_CONSTEXPR(std::is_integral<T>::value) {
189  // Note: this->is_float() also converts the CSV value to float
190  if (this->is_float()) {
191  throw std::runtime_error(internals::ERROR_FLOAT_TO_INT);
192  }
193 
194  IF_CONSTEXPR(std::is_unsigned<T>::value) {
195  if (this->value < 0) {
196  throw std::runtime_error(internals::ERROR_NEG_TO_UNSIGNED);
197  }
198  }
199  }
200 
201  // Allow fallthrough from previous if branch
202  IF_CONSTEXPR(!std::is_floating_point<T>::value) {
203  IF_CONSTEXPR(std::is_unsigned<T>::value) {
204  // Quick hack to perform correct unsigned integer boundary checks
205  if (this->value > internals::get_uint_max<sizeof(T)>()) {
206  throw std::runtime_error(internals::ERROR_OVERFLOW);
207  }
208  }
209  else if (internals::type_num<T>() < this->_type) {
210  throw std::runtime_error(internals::ERROR_OVERFLOW);
211  }
212  }
213 
214  return static_cast<T>(this->value);
215  }
216 
218  bool try_parse_hex(int& parsedValue);
219 
233  template<typename T>
234  CONSTEXPR_14 bool operator==(T other) const noexcept
235  {
236  static_assert(std::is_arithmetic<T>::value,
237  "T should be a numeric value.");
238 
239  if (this->_type != DataType::UNKNOWN) {
240  if (this->_type == DataType::CSV_STRING) {
241  return false;
242  }
243 
244  return internals::is_equal(value, static_cast<long double>(other), 0.000001L);
245  }
246 
247  long double out = 0;
248  if (internals::data_type(this->sv, &out) == DataType::CSV_STRING) {
249  return false;
250  }
251 
252  return internals::is_equal(out, static_cast<long double>(other), 0.000001L);
253  }
254 
256  CONSTEXPR csv::string_view get_sv() const noexcept { return this->sv; }
257 
259  CONSTEXPR_14 bool is_null() noexcept { return type() == DataType::CSV_NULL; }
260 
262  CONSTEXPR_14 bool is_str() noexcept { return type() == DataType::CSV_STRING; }
263 
265  CONSTEXPR_14 bool is_num() noexcept { return type() >= DataType::CSV_INT8; }
266 
268  CONSTEXPR_14 bool is_int() noexcept {
269  return (type() >= DataType::CSV_INT8) && (type() <= DataType::CSV_INT64);
270  }
271 
273  CONSTEXPR_14 bool is_float() noexcept { return type() == DataType::CSV_DOUBLE; };
274 
276  CONSTEXPR_14 DataType type() noexcept {
277  this->get_value();
278  return _type;
279  }
280 
281  private:
282  long double value = 0;
283  csv::string_view sv = "";
284  DataType _type = DataType::UNKNOWN;
285  CONSTEXPR_14 void get_value() noexcept {
286  /* Check to see if value has been cached previously, if not
287  * evaluate it
288  */
289  if ((int)_type < 0) {
290  this->_type = internals::data_type(this->sv, &this->value);
291  }
292  }
293  };
294 
296  class CSVRow {
297  public:
299 
300  CSVRow() = default;
301 
303  CSVRow(internals::RawCSVDataPtr _data) : data(_data) {}
304  CSVRow(internals::RawCSVDataPtr _data, size_t _data_start, size_t _field_bounds)
305  : data(_data), data_start(_data_start), fields_start(_field_bounds) {}
306 
308  CONSTEXPR bool empty() const noexcept { return this->size() == 0; }
309 
311  CONSTEXPR size_t size() const noexcept { return row_length; }
312 
315  CSVField operator[](size_t n) const;
316  CSVField operator[](const std::string&) const;
317  std::string to_json(const std::vector<std::string>& subset = {}) const;
318  std::string to_json_array(const std::vector<std::string>& subset = {}) const;
319 
321  std::vector<std::string> get_col_names() const {
322  return this->data->col_names->get_col_names();
323  }
324 
329  operator std::vector<std::string>() const;
331 
335  class iterator {
336  public:
337 #ifndef DOXYGEN_SHOULD_SKIP_THIS
338  using value_type = CSVField;
339  using difference_type = int;
340 
341  // Using CSVField * as pointer type causes segfaults in MSVC debug builds
342  // but using shared_ptr as pointer type won't compile in g++
343 #ifdef _MSC_BUILD
344  using pointer = std::shared_ptr<CSVField>;
345 #else
346  using pointer = CSVField * ;
347 #endif
348 
349  using reference = CSVField & ;
350  using iterator_category = std::random_access_iterator_tag;
351 #endif
352  iterator(const CSVRow*, int i);
353 
354  reference operator*() const;
355  pointer operator->() const;
356 
357  iterator operator++(int);
358  iterator& operator++();
359  iterator operator--(int);
360  iterator& operator--();
361  iterator operator+(difference_type n) const;
362  iterator operator-(difference_type n) const;
363 
365  CONSTEXPR bool operator==(const iterator& other) const noexcept {
366  return this->i == other.i;
367  };
368 
369  CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
370 
371 #ifndef NDEBUG
372  friend CSVRow;
373 #endif
374 
375  private:
376  const CSVRow * daddy = nullptr; // Pointer to parent
377  std::shared_ptr<CSVField> field = nullptr; // Current field pointed at
378  int i = 0; // Index of current field
379  };
380 
382  using reverse_iterator = std::reverse_iterator<iterator>;
383 
388  iterator begin() const;
389  iterator end() const noexcept;
390  reverse_iterator rbegin() const noexcept;
391  reverse_iterator rend() const;
393 
394  private:
396  csv::string_view get_field(size_t index) const;
397 
398  internals::RawCSVDataPtr data;
399 
401  size_t data_start = 0;
402 
404  size_t fields_start = 0;
405 
407  size_t row_length = 0;
408  };
409 
410 #ifdef _MSC_VER
411 #pragma region CSVField::get Specializations
412 #endif
414  template<>
415  inline std::string CSVField::get<std::string>() {
416  return std::string(this->sv);
417  }
418 
424  template<>
425  CONSTEXPR_14 csv::string_view CSVField::get<csv::string_view>() {
426  return this->sv;
427  }
428 
430  template<>
431  CONSTEXPR_14 long double CSVField::get<long double>() {
432  if (!is_num())
433  throw std::runtime_error(internals::ERROR_NAN);
434 
435  return this->value;
436  }
437 #ifdef _MSC_VER
438 #pragma endregion CSVField::get Specializations
439 #endif
440 
442  template<>
443  CONSTEXPR bool CSVField::operator==(const char * other) const noexcept
444  {
445  return this->sv == other;
446  }
447 
449  template<>
451  {
452  return this->sv == other;
453  }
454 }
455 
456 inline std::ostream& operator << (std::ostream& os, csv::CSVField const& value) {
457  os << std::string(value);
458  return os;
459 }
Data type representing individual CSV values.
Definition: csv_row.hpp:143
CONSTEXPR_14 bool is_num() noexcept
Returns true if field is an integer or float.
Definition: csv_row.hpp:265
CONSTEXPR_14 bool is_str() noexcept
Returns true if field is a non-numeric, non-empty string.
Definition: csv_row.hpp:262
CONSTEXPR_14 bool is_int() noexcept
Returns true if field is an integer.
Definition: csv_row.hpp:268
CONSTEXPR_14 bool is_null() noexcept
Returns true if field is an empty string or string of whitespace characters.
Definition: csv_row.hpp:259
constexpr CSVField(csv::string_view _sv) noexcept
Constructs a CSVField from a string_view.
Definition: csv_row.hpp:146
CONSTEXPR_14 DataType type() noexcept
Return the type of the underlying CSV data.
Definition: csv_row.hpp:276
T get()
Returns the value casted to the requested type, performing type checking before.
Definition: csv_row.hpp:180
CONSTEXPR_14 bool operator==(T other) const noexcept
Compares the contents of this field to a numeric value.
Definition: csv_row.hpp:234
CONSTEXPR_14 bool is_float() noexcept
Returns true if field is a floating point value.
Definition: csv_row.hpp:273
CONSTEXPR csv::string_view get_sv() const noexcept
Return a string view over the field's contents.
Definition: csv_row.hpp:256
bool try_parse_hex(int &parsedValue)
Parse a hexadecimal value, returning false if the value is not hex.
Definition: csv_row.cpp:101
A random access iterator over the contents of a CSV row.
Definition: csv_row.hpp:335
CONSTEXPR bool operator==(const iterator &other) const noexcept
Two iterators are equal if they point to the same field.
Definition: csv_row.hpp:365
Data structure for representing CSV rows.
Definition: csv_row.hpp:296
iterator end() const noexcept
Return an iterator pointing to just after the end of the CSVRow.
Definition: csv_row.cpp:180
std::reverse_iterator< iterator > reverse_iterator
A reverse iterator over the contents of a CSVRow.
Definition: csv_row.hpp:382
std::string to_json(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON object.
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
Definition: csv_row.hpp:308
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON array.
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
Definition: csv_row.hpp:311
std::vector< std::string > get_col_names() const
Retrieve this row's associated column names.
Definition: csv_row.hpp:321
CSVField operator[](size_t n) const
Return a CSVField object corresponding to the nth value in the row.
Definition: csv_row.cpp:34
iterator begin() const
Return an iterator pointing to the first field.
Definition: csv_row.cpp:171
CSVRow(internals::RawCSVDataPtr _data)
Construct a CSVRow from a RawCSVDataPtr.
Definition: csv_row.hpp:303
A class used for efficiently storing RawCSVField objects and expanding as necessary.
Definition: csv_row.hpp:62
CSVFieldList(size_t single_buffer_capacity=(size_t)(internals::PAGE_SIZE/sizeof(RawCSVField)))
Construct a CSVFieldList which allocates blocks of a certain size.
Definition: csv_row.hpp:65
Abstract base class which provides CSV parsing logic.
A standalone header file containing shared code.
#define IF_CONSTEXPR
Expands to if constexpr in C++17 and if otherwise.
Definition: common.hpp:84
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition: common.hpp:117
Implements data type parsing functionality.
std::array< ParseFlags, 256 > ParseFlagMap
An array which maps ASCII chars to a parsing flag.
Definition: common.hpp:200
std::array< bool, 256 > WhitespaceMap
An array which maps ASCII chars to a flag indicating if it is whitespace.
Definition: common.hpp:203
bool is_equal(T a, T b, T epsilon=0.001)
Definition: common.hpp:154
const int PAGE_SIZE
Size of a memory page in bytes.
Definition: common.hpp:145
CONSTEXPR_14 DataType data_type(csv::string_view in, long double *const out)
Distinguishes numeric from other text values.
Definition: data_type.h:238
CONSTEXPR_14 long double get_uint_max()
Given a byte size, return the largest number that can be stored in an unsigned integer of that size.
Definition: data_type.h:134
The all encompassing namespace.
DataType
Enumerates the different CSV field types that are recognized by this library.
Definition: data_type.h:20
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_NULL
Empty string.
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-numeric string.
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75
A class for storing raw CSV data and associated metadata.
Definition: csv_row.hpp:119
A barebones class used for describing CSV fields.
Definition: csv_row.hpp:33
size_t start
The start of the field, relative to the beginning of the row.
Definition: csv_row.hpp:42
bool has_double_quote
Whether or not the field contains an escaped quote.
Definition: csv_row.hpp:48
size_t length
The length of the field, ignoring quote escape characters.
Definition: csv_row.hpp:45