Vince's CSV Parser
csv_stat.cpp
Go to the documentation of this file.
1 
#include <exception>
#include <string>
#include "csv_stat.hpp"
7 
8 namespace csv {
14  reader(filename, format) {
15  this->calc();
16  }
17 
19  CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) :
20  reader(stream, format) {
21  this->calc();
22  }
23 
25  CSV_INLINE std::vector<long double> CSVStat::get_mean() const {
26  std::vector<long double> ret;
27  for (size_t i = 0; i < this->get_col_names().size(); i++) {
28  ret.push_back(this->rolling_means[i]);
29  }
30  return ret;
31  }
32 
34  CSV_INLINE std::vector<long double> CSVStat::get_variance() const {
35  std::vector<long double> ret;
36  for (size_t i = 0; i < this->get_col_names().size(); i++) {
37  ret.push_back(this->rolling_vars[i]/(this->n[i] - 1));
38  }
39  return ret;
40  }
41 
43  CSV_INLINE std::vector<long double> CSVStat::get_mins() const {
44  std::vector<long double> ret;
45  for (size_t i = 0; i < this->get_col_names().size(); i++) {
46  ret.push_back(this->mins[i]);
47  }
48  return ret;
49  }
50 
52  CSV_INLINE std::vector<long double> CSVStat::get_maxes() const {
53  std::vector<long double> ret;
54  for (size_t i = 0; i < this->get_col_names().size(); i++) {
55  ret.push_back(this->maxes[i]);
56  }
57  return ret;
58  }
59 
61  CSV_INLINE std::vector<CSVStat::FreqCount> CSVStat::get_counts() const {
62  std::vector<FreqCount> ret;
63  for (size_t i = 0; i < this->get_col_names().size(); i++) {
64  ret.push_back(this->counts[i]);
65  }
66  return ret;
67  }
68 
70  CSV_INLINE std::vector<CSVStat::TypeCount> CSVStat::get_dtypes() const {
71  std::vector<TypeCount> ret;
72  for (size_t i = 0; i < this->get_col_names().size(); i++) {
73  ret.push_back(this->dtypes[i]);
74  }
75  return ret;
76  }
77 
78  CSV_INLINE void CSVStat::calc_chunk() {
80  if (dtypes.empty()) {
82  for (size_t i = 0; i < this->get_col_names().size(); i++) {
83  dtypes.push_back({});
84  counts.push_back({});
85  rolling_means.push_back(0);
86  rolling_vars.push_back(0);
87  mins.push_back(NAN);
88  maxes.push_back(NAN);
89  n.push_back(0);
90  }
91  }
92 
93  // Start threads
94  std::vector<std::thread> pool;
95  for (size_t i = 0; i < this->get_col_names().size(); i++)
96  pool.push_back(std::thread(&CSVStat::calc_worker, this, i));
97 
98  // Block until done
99  for (auto& th : pool)
100  th.join();
101 
102  this->records.clear();
103  }
104 
105  CSV_INLINE void CSVStat::calc() {
106  constexpr size_t CALC_CHUNK_SIZE = 5000;
107 
108  for (auto& row : reader) {
109  this->records.push_back(std::move(row));
110 
112  if (this->records.size() == CALC_CHUNK_SIZE) {
113  calc_chunk();
114  }
115  }
116 
117  if (!this->records.empty()) {
118  calc_chunk();
119  }
120  }
121 
122  CSV_INLINE void CSVStat::calc_worker(const size_t &i) {
128  auto current_record = this->records.begin();
129 
130  for (size_t processed = 0; current_record != this->records.end(); processed++) {
131  if (current_record->size() == this->get_col_names().size()) {
132  auto current_field = (*current_record)[i];
133 
134  // Optimization: Don't count() if there's too many distinct values in the first 1000 rows
135  if (processed < 1000 || this->counts[i].size() <= 500)
136  this->count(current_field, i);
137 
138  this->dtype(current_field, i);
139 
140  // Numeric Stuff
141  if (current_field.is_num()) {
142  long double x_n = current_field.get<long double>();
143 
144  // This actually calculates mean AND variance
145  this->variance(x_n, i);
146  this->min_max(x_n, i);
147  }
148  }
149  else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
150  throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record));
151  }
152 
153  ++current_record;
154  }
155  }
156 
157  CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) {
163  auto type = data.type();
164  if (this->dtypes[i].find(type) !=
165  this->dtypes[i].end()) {
166  // Increment count
167  this->dtypes[i][type]++;
168  } else {
169  // Initialize count
170  this->dtypes[i].insert(std::make_pair(type, 1));
171  }
172  }
173 
174  CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) {
180  auto item = data.get<std::string>();
181 
182  if (this->counts[i].find(item) !=
183  this->counts[i].end()) {
184  // Increment count
185  this->counts[i][item]++;
186  } else {
187  // Initialize count
188  this->counts[i].insert(std::make_pair(item, 1));
189  }
190  }
191 
192  CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) {
197  if (std::isnan(this->mins[i]))
198  this->mins[i] = x_n;
199  if (std::isnan(this->maxes[i]))
200  this->maxes[i] = x_n;
201 
202  if (x_n < this->mins[i])
203  this->mins[i] = x_n;
204  else if (x_n > this->maxes[i])
205  this->maxes[i] = x_n;
206  }
207 
208  CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) {
214  long double& current_rolling_mean = this->rolling_means[i];
215  long double& current_rolling_var = this->rolling_vars[i];
216  long double& current_n = this->n[i];
217  long double delta;
218  long double delta2;
219 
220  current_n++;
221 
222  if (current_n == 1) {
223  current_rolling_mean = x_n;
224  } else {
225  delta = x_n - current_rolling_mean;
226  current_rolling_mean += delta/current_n;
227  delta2 = x_n - current_rolling_mean;
228  current_rolling_var += delta*delta2;
229  }
230  }
231 
240  CSV_INLINE std::unordered_map<std::string, DataType> csv_data_types(const std::string& filename) {
241  CSVStat stat(filename);
242  std::unordered_map<std::string, DataType> csv_dtypes;
243 
244  auto col_names = stat.get_col_names();
245  auto temp = stat.get_dtypes();
246 
247  for (size_t i = 0; i < stat.get_col_names().size(); i++) {
248  auto& col = temp[i];
249  auto& col_name = col_names[i];
250 
251  if (col[DataType::CSV_STRING])
252  csv_dtypes[col_name] = DataType::CSV_STRING;
253  else if (col[DataType::CSV_INT64])
254  csv_dtypes[col_name] = DataType::CSV_INT64;
255  else if (col[DataType::CSV_INT32])
256  csv_dtypes[col_name] = DataType::CSV_INT32;
257  else if (col[DataType::CSV_INT16])
258  csv_dtypes[col_name] = DataType::CSV_INT16;
259  else if (col[DataType::CSV_INT8])
260  csv_dtypes[col_name] = DataType::CSV_INT8;
261  else
262  csv_dtypes[col_name] = DataType::CSV_DOUBLE;
263  }
264 
265  return csv_dtypes;
266  }
267 }
Stores information about how to parse a CSV file.
Definition: csv_format.hpp:36
Class for calculating statistics from CSV files and in-memory sources.
Definition: csv_stat.hpp:18
std::vector< long double > get_mean() const
Return current means.
Definition: csv_stat.cpp:25
std::vector< long double > get_variance() const
Return current variances.
Definition: csv_stat.cpp:34
CSVStat(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Calculate statistics for an arbitrarily large file.
Definition: csv_stat.cpp:13
std::vector< long double > get_mins() const
Return current mins.
Definition: csv_stat.cpp:43
std::vector< TypeCount > get_dtypes() const
Get data type counts for each column.
Definition: csv_stat.cpp:70
std::vector< long double > get_maxes() const
Return current maxes.
Definition: csv_stat.cpp:52
std::vector< FreqCount > get_counts() const
Get counts for each column.
Definition: csv_stat.cpp:61
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition: common.hpp:26
Calculates statistics from CSV files.
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition: csv_reader.cpp:9
The all-encompassing namespace.
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_INT16
16-bit integer (short on MSVC/GCC)
@ CSV_INT32
32-bit integer (int on MSVC/GCC)
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-numeric string.
std::unordered_map< std::string, DataType > csv_data_types(const std::string &filename)
Useful for uploading CSV files to SQL databases.
Definition: csv_stat.cpp:240
nonstd::string_view string_view
The string_view class used by this library.
Definition: common.hpp:75