Vince's CSV Parser
data_type.hpp
Go to the documentation of this file.
1 
5 #pragma once
6 #include <cmath>
7 #include <cctype>
8 #include <string>
9 #include <cassert>
10 
11 #include "common.hpp"
12 
13 namespace csv {
/** Enumerates the different CSV field types that are recognized by this library.
 *
 *  The numeric ordering of the enumerators is relied upon (see the
 *  static_asserts below): strings sort before every numeric type, narrower
 *  integers sort before wider ones, and all integers sort before doubles.
 */
enum class DataType {
    UNKNOWN    = -1,
    CSV_NULL   = 0,  /**< Empty string */
    CSV_STRING = 1,  /**< Non-numeric string */
    CSV_INT8   = 2,  /**< 8-bit integer */
    CSV_INT16  = 3,  /**< 16-bit integer (short on MSVC/GCC) */
    CSV_INT32  = 4,  /**< 32-bit integer (int on MSVC/GCC) */
    CSV_INT64  = 5,  /**< 64-bit integer (long long on MSVC/GCC) */
    CSV_BIGINT = 6,  /**< Value too big to fit in a 64-bit integer */
    CSV_DOUBLE = 7   /**< Floating point value */
};

// Compile-time guards for the ordering that range checks on DataType depend on.
static_assert(
    DataType::CSV_STRING < DataType::CSV_INT8,
    "String type should come before numeric types.");
static_assert(
    DataType::CSV_INT8 < DataType::CSV_INT64,
    "Smaller integer types should come before larger integer types.");
static_assert(
    DataType::CSV_INT64 < DataType::CSV_DOUBLE,
    "Integer types should come before floating point value types.");
35 
36  namespace internals {
38  template<typename T>
39  HEDLEY_CONST CONSTEXPR_14
40  long double pow10(const T& n) noexcept {
41  long double multiplicand = n > 0 ? 10 : 0.1,
42  ret = 1;
43 
44  // Make all numbers positive
45  T iterations = n > 0 ? n : -n;
46 
47  for (T i = 0; i < iterations; i++) {
48  ret *= multiplicand;
49  }
50 
51  return ret;
52  }
53 
55  template<>
56  HEDLEY_CONST CONSTEXPR_14
57  long double pow10(const unsigned& n) noexcept {
58  long double multiplicand = n > 0 ? 10 : 0.1,
59  ret = 1;
60 
61  for (unsigned i = 0; i < n; i++) {
62  ret *= multiplicand;
63  }
64 
65  return ret;
66  }
67 
68 #ifndef DOXYGEN_SHOULD_SKIP_THIS
70  constexpr DataType int_type_arr[8] = {
71  DataType::CSV_INT8, // 1
73  DataType::UNKNOWN,
75  DataType::UNKNOWN,
76  DataType::UNKNOWN,
77  DataType::UNKNOWN,
79  };
80 
81  template<typename T>
82  inline DataType type_num() {
83  static_assert(std::is_integral<T>::value, "T should be an integral type.");
84  static_assert(sizeof(T) <= 8, "Byte size must be no greater than 8.");
85  return int_type_arr[sizeof(T) - 1];
86  }
87 
88  template<> inline DataType type_num<float>() { return DataType::CSV_DOUBLE; }
89  template<> inline DataType type_num<double>() { return DataType::CSV_DOUBLE; }
90  template<> inline DataType type_num<long double>() { return DataType::CSV_DOUBLE; }
91  template<> inline DataType type_num<std::nullptr_t>() { return DataType::CSV_NULL; }
92  template<> inline DataType type_num<std::string>() { return DataType::CSV_STRING; }
93 
94  CONSTEXPR_14 DataType data_type(csv::string_view in, long double* const out = nullptr,
95  const char decimalsymbol = '.');
96 #endif
97 
104  template<size_t Bytes>
105  CONSTEXPR_14 long double get_int_max() {
106  static_assert(Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8,
107  "Bytes must be a power of 2 below 8.");
108 
109  IF_CONSTEXPR (sizeof(signed char) == Bytes) {
110  return (long double)std::numeric_limits<signed char>::max();
111  }
112 
113  IF_CONSTEXPR (sizeof(short) == Bytes) {
114  return (long double)std::numeric_limits<short>::max();
115  }
116 
117  IF_CONSTEXPR (sizeof(int) == Bytes) {
118  return (long double)std::numeric_limits<int>::max();
119  }
120 
121  IF_CONSTEXPR (sizeof(long int) == Bytes) {
122  return (long double)std::numeric_limits<long int>::max();
123  }
124 
125  IF_CONSTEXPR (sizeof(long long int) == Bytes) {
126  return (long double)std::numeric_limits<long long int>::max();
127  }
128 
129  HEDLEY_UNREACHABLE();
130  }
131 
135  template<size_t Bytes>
136  CONSTEXPR_14 long double get_uint_max() {
137  static_assert(Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8,
138  "Bytes must be a power of 2 below 8.");
139 
140  IF_CONSTEXPR(sizeof(unsigned char) == Bytes) {
141  return (long double)std::numeric_limits<unsigned char>::max();
142  }
143 
144  IF_CONSTEXPR(sizeof(unsigned short) == Bytes) {
145  return (long double)std::numeric_limits<unsigned short>::max();
146  }
147 
148  IF_CONSTEXPR(sizeof(unsigned int) == Bytes) {
149  return (long double)std::numeric_limits<unsigned int>::max();
150  }
151 
152  IF_CONSTEXPR(sizeof(unsigned long int) == Bytes) {
153  return (long double)std::numeric_limits<unsigned long int>::max();
154  }
155 
156  IF_CONSTEXPR(sizeof(unsigned long long int) == Bytes) {
157  return (long double)std::numeric_limits<unsigned long long int>::max();
158  }
159 
160  HEDLEY_UNREACHABLE();
161  }
162 
164  CONSTEXPR_VALUE_14 long double CSV_INT8_MAX = get_int_max<1>();
165 
167  CONSTEXPR_VALUE_14 long double CSV_INT16_MAX = get_int_max<2>();
168 
170  CONSTEXPR_VALUE_14 long double CSV_INT32_MAX = get_int_max<4>();
171 
173  CONSTEXPR_VALUE_14 long double CSV_INT64_MAX = get_int_max<8>();
174 
176  CONSTEXPR_VALUE_14 long double CSV_UINT8_MAX = get_uint_max<1>();
177 
179  CONSTEXPR_VALUE_14 long double CSV_UINT16_MAX = get_uint_max<2>();
180 
182  CONSTEXPR_VALUE_14 long double CSV_UINT32_MAX = get_uint_max<4>();
183 
185  CONSTEXPR_VALUE_14 long double CSV_UINT64_MAX = get_uint_max<8>();
186 
191  HEDLEY_PRIVATE CONSTEXPR_14
193  csv::string_view exponential_part,
194  const long double& coeff,
195  long double * const out) {
196  long double exponent = 0;
197  auto result = data_type(exponential_part, &exponent);
198 
199  // Exponents in scientific notation should not be decimal numbers
200  if (result >= DataType::CSV_INT8 && result < DataType::CSV_DOUBLE) {
201  if (out) *out = coeff * pow10(exponent);
202  return DataType::CSV_DOUBLE;
203  }
204 
205  return DataType::CSV_STRING;
206  }
207 
211  HEDLEY_PRIVATE HEDLEY_PURE CONSTEXPR_14
212  DataType _determine_integral_type(const long double& number) noexcept {
213  // We can assume number is always non-negative
214  assert(number >= 0);
215 
216  if (number <= internals::CSV_INT8_MAX)
217  return DataType::CSV_INT8;
218  else if (number <= internals::CSV_INT16_MAX)
219  return DataType::CSV_INT16;
220  else if (number <= internals::CSV_INT32_MAX)
221  return DataType::CSV_INT32;
222  else if (number <= internals::CSV_INT64_MAX)
223  return DataType::CSV_INT64;
224  else // Conversion to long long will cause an overflow
225  return DataType::CSV_BIGINT;
226  }
227 
241  CONSTEXPR_14
242  DataType data_type(csv::string_view in, long double* const out, const char decimalSymbol) {
243  // Empty string --> NULL
244  if (in.size() == 0)
245  return DataType::CSV_NULL;
246 
247  bool ws_allowed = true,
248  dot_allowed = true,
249  digit_allowed = true,
250  is_negative = false,
251  has_digit = false,
252  prob_float = false;
253 
254  unsigned places_after_decimal = 0;
255  long double integral_part = 0,
256  decimal_part = 0;
257 
258  for (size_t i = 0, ilen = in.size(); i < ilen; i++) {
259  const char& current = in[i];
260 
261  switch (current) {
262  case ' ':
263  if (!ws_allowed) {
264  if (isdigit(in[i - 1])) {
265  digit_allowed = false;
266  ws_allowed = true;
267  }
268  else {
269  // Ex: '510 123 4567'
270  return DataType::CSV_STRING;
271  }
272  }
273  break;
274  case '+':
275  if (!ws_allowed) {
276  return DataType::CSV_STRING;
277  }
278 
279  break;
280  case '-':
281  if (!ws_allowed) {
282  // Ex: '510-123-4567'
283  return DataType::CSV_STRING;
284  }
285 
286  is_negative = true;
287  break;
288  // case decimalSymbol: not allowed because decimalSymbol is not a literal,
289  // it is handled in the default block
290  case 'e':
291  case 'E':
292  // Process scientific notation
293  if (prob_float || (i && i + 1 < ilen && isdigit(in[i - 1]))) {
294  size_t exponent_start_idx = i + 1;
295  prob_float = true;
296 
297  // Strip out plus sign
298  if (in[i + 1] == '+') {
299  exponent_start_idx++;
300  }
301 
303  in.substr(exponent_start_idx),
304  is_negative ? -(integral_part + decimal_part) : integral_part + decimal_part,
305  out
306  );
307  }
308 
309  return DataType::CSV_STRING;
310  break;
311  default:
312  short digit = static_cast<short>(current - '0');
313  if (digit >= 0 && digit <= 9) {
314  // Process digit
315  has_digit = true;
316 
317  if (!digit_allowed)
318  return DataType::CSV_STRING;
319  else if (ws_allowed) // Ex: '510 456'
320  ws_allowed = false;
321 
322  // Build current number
323  if (prob_float)
324  decimal_part += digit / pow10(++places_after_decimal);
325  else
326  integral_part = (integral_part * 10) + digit;
327  }
328  // case decimalSymbol: not allowed because decimalSymbol is not a literal.
329  else if (dot_allowed && current == decimalSymbol) {
330  dot_allowed = false;
331  prob_float = true;
332  }
333  else {
334  return DataType::CSV_STRING;
335  }
336  }
337  }
338 
339  // No non-numeric/non-whitespace characters found
340  if (has_digit) {
341  long double number = integral_part + decimal_part;
342  if (out) {
343  *out = is_negative ? -number : number;
344  }
345 
346  return prob_float ? DataType::CSV_DOUBLE : _determine_integral_type(number);
347  }
348 
349  // Just whitespace
350  return DataType::CSV_NULL;
351  }
352  }
353 }
A standalone header file containing shared code.
#define IF_CONSTEXPR
Expands to if constexpr in C++17 and if otherwise.
Definition: common.hpp:84
CONSTEXPR_VALUE_14 long double CSV_INT16_MAX
Largest number that can be stored in a 16-bit integer.
Definition: data_type.hpp:167
CONSTEXPR_VALUE_14 long double CSV_INT32_MAX
Largest number that can be stored in a 32-bit integer.
Definition: data_type.hpp:170
CONSTEXPR_VALUE_14 long double CSV_UINT16_MAX
Largest number that can be stored in a 16-bit unsigned integer.
Definition: data_type.hpp:179
CONSTEXPR_14 DataType data_type(csv::string_view in, long double *const out, const char decimalSymbol)
Distinguishes numeric from other text values.
Definition: data_type.hpp:242
CONSTEXPR_VALUE_14 long double CSV_UINT32_MAX
Largest number that can be stored in a 32-bit unsigned integer.
Definition: data_type.hpp:182
HEDLEY_PRIVATE CONSTEXPR_14 DataType _process_potential_exponential(csv::string_view exponential_part, const long double &coeff, long double *const out)
Given the start of what is (possibly) the exponential part of a number written in scientific notation ...
Definition: data_type.hpp:192
CONSTEXPR_VALUE_14 long double CSV_INT64_MAX
Largest number that can be stored in a 64-bit integer.
Definition: data_type.hpp:173
CONSTEXPR_VALUE_14 long double CSV_INT8_MAX
Largest number that can be stored in an 8-bit integer.
Definition: data_type.hpp:164
CONSTEXPR_VALUE_14 long double CSV_UINT64_MAX
Largest number that can be stored in a 64-bit unsigned integer.
Definition: data_type.hpp:185
CONSTEXPR_VALUE_14 long double CSV_UINT8_MAX
Largest number that can be stored in an 8-bit unsigned integer.
Definition: data_type.hpp:176
HEDLEY_CONST CONSTEXPR_14 long double pow10(const T &n) noexcept
Compute 10 to the power of n.
Definition: data_type.hpp:40
HEDLEY_PRIVATE HEDLEY_PURE CONSTEXPR_14 DataType _determine_integral_type(const long double &number) noexcept
Given the absolute value of an integer, determine what numeric type it fits in.
Definition: data_type.hpp:212
CONSTEXPR_14 long double get_uint_max()
Given a byte size, return the largest number that can be stored in an unsigned integer of that size.
Definition: data_type.hpp:136
CONSTEXPR_14 long double get_int_max()
Given a byte size, return the largest number that can be stored in an integer of that size.
Definition: data_type.hpp:105
The all encompassing namespace.
DataType
Enumerates the different CSV field types that are recognized by this library.
Definition: data_type.hpp:20
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_NULL
Empty string.
@ CSV_BIGINT
Value too big to fit in a 64-bit integer.
@ CSV_INT16
16-bit integer (short on MSVC/GCC)
@ CSV_INT32
32-bit integer (int on MSVC/GCC)
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-numeric string.
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75