14 reader(filename, format) {
20 reader(stream, format) {
26 std::vector<long double> ret;
27 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
28 ret.push_back(this->rolling_means[i]);
35 std::vector<long double> ret;
36 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
37 ret.push_back(this->rolling_vars[i]/(this->n[i] - 1));
44 std::vector<long double> ret;
45 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
46 ret.push_back(this->mins[i]);
53 std::vector<long double> ret;
54 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
55 ret.push_back(this->maxes[i]);
62 std::vector<FreqCount> ret;
63 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
64 ret.push_back(this->counts[i]);
71 std::vector<TypeCount> ret;
72 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
73 ret.push_back(this->dtypes[i]);
82 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
85 rolling_means.push_back(0);
86 rolling_vars.push_back(0);
94 std::vector<std::thread> pool;
95 for (
size_t i = 0; i < this->get_col_names().size(); i++)
96 pool.push_back(std::thread(&CSVStat::calc_worker,
this, i));
102 this->records.clear();
106 constexpr
size_t CALC_CHUNK_SIZE = 5000;
108 for (
auto& row : reader) {
109 this->records.push_back(std::move(row));
112 if (this->records.size() == CALC_CHUNK_SIZE) {
117 if (!this->records.empty()) {
/**
 * Update the statistics of column `i` from the rows buffered in this->records.
 *
 * For every buffered row whose field count equals the number of column names,
 * the i-th field is passed to count() (subject to the cap described below)
 * and to dtype(); numeric fields are additionally passed to variance() and
 * min_max().
 *
 * @param[in] i Index of the column to process.
 * @throws std::runtime_error when the reader's variable-column policy is
 *         VariableColumnPolicy::THROW and the `else if` branch below is taken.
 *
 * NOTE(review): this listing omits several lines of the function (the step
 * that advances current_record and all closing braces are not visible), so
 * the comments here describe only the statements that are shown.
 */
122 CSV_INLINE void CSVStat::calc_worker(
const size_t &i) {
// Start at the first buffered row; `processed` counts loop iterations.
128 auto current_record = this->records.begin();
130 for (
size_t processed = 0; current_record != this->records.end(); processed++) {
// Only rows with exactly one field per column are analysed.
131 if (current_record->size() == this->get_col_names().size()) {
132 auto current_field = (*current_record)[i];
// Frequency counting is capped: it always runs during the first 1000
// iterations, and afterwards only while this column's count table holds
// at most 500 entries.
135 if (processed < 1000 || this->counts[i].size() <= 500)
136 this->count(current_field, i);
// The data-type tally is updated for every analysed field.
138 this->dtype(current_field, i);
// Numeric fields also feed the rolling mean/variance and the min/max.
141 if (current_field.is_num()) {
142 long double x_n = current_field.get<
long double>();
145 this->variance(x_n, i);
146 this->min_max(x_n, i);
// NOTE(review): the braces closing the blocks above are elided; judging by
// the error message, this branch presumably pairs with the row-length check
// at listing line 131 - confirm against the full source.
149 else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
150 throw std::runtime_error(
"Line has different length than the others " +
internals::format_row(*current_record));
/**
 * Record the data type of one field in the type tally of column `i`.
 *
 * The key is data.type(); the tally lives in this->dtypes[i].
 *
 * @param[in] data Field whose type is tallied.
 * @param[in] i    Index of the column the field belongs to.
 *
 * NOTE(review): the line separating the two branches (presumably an `else`)
 * and the closing braces are elided from this listing.
 */
157 CSV_INLINE void CSVStat::dtype(CSVField& data,
const size_t &i) {
163 auto type = data.type();
// Type already present for this column: increment its counter.
164 if (this->dtypes[i].find(type) !=
165 this->dtypes[i].end()) {
167 this->dtypes[i][type]++;
// Presumably the not-found branch: add the type with an initial count of 1.
170 this->dtypes[i].insert(std::make_pair(type, 1));
/**
 * Record one occurrence of a field's value in the frequency table of
 * column `i`.
 *
 * The key is the field converted with data.get<std::string>(); the table
 * lives in this->counts[i].
 *
 * @param[in] data Field whose value is counted.
 * @param[in] i    Index of the column the field belongs to.
 *
 * NOTE(review): the line separating the two branches (presumably an `else`)
 * and the closing braces are elided from this listing.
 */
174 CSV_INLINE void CSVStat::count(CSVField& data,
const size_t &i) {
180 auto item = data.get<std::string>();
// Value already present for this column: increment its counter.
182 if (this->counts[i].find(item) !=
183 this->counts[i].end()) {
185 this->counts[i][item]++;
// Presumably the not-found branch: add the value with an initial count of 1.
188 this->counts[i].insert(std::make_pair(item, 1));
/**
 * Update the recorded minimum and maximum of column `i` with a new value.
 *
 * @param[in] x_n Numeric observation.
 * @param[in] i   Index of the column to update.
 *
 * NOTE(review): listing lines 198 and 203 (the statements controlled by the
 * two `mins` conditions) are elided; by symmetry with the `maxes` branches
 * they presumably assign x_n to this->mins[i] - confirm against the full
 * source.
 */
192 CSV_INLINE void CSVStat::min_max(
const long double &x_n,
const size_t &i) {
// A NaN entry is treated as "no value recorded yet": the max is seeded
// with the observation (the matching statement for the min is elided).
197 if (std::isnan(this->mins[i]))
199 if (std::isnan(this->maxes[i]))
200 this->maxes[i] = x_n;
// Ordinary update: a value below the current min or above the current max
// replaces it.
202 if (x_n < this->mins[i])
204 else if (x_n > this->maxes[i])
205 this->maxes[i] = x_n;
/**
 * Fold one observation into the rolling mean and the rolling variance
 * accumulator of column `i`.
 *
 * @param[in] x_n Numeric observation.
 * @param[in] i   Index of the column to update.
 *
 * NOTE(review): listing lines 217-221 and 224 are elided. They presumably
 * declare delta/delta2, update current_n, and hold the `else` that separates
 * the first-observation case from the general update - confirm against the
 * full source.
 */
208 CSV_INLINE void CSVStat::variance(
const long double &x_n,
const size_t &i) {
// References, so the assignments below write straight into the per-column
// state held by this object.
214 long double& current_rolling_mean = this->rolling_means[i];
215 long double& current_rolling_var = this->rolling_vars[i];
216 long double& current_n = this->n[i];
// When the observation count is 1 the mean is simply the observation.
222 if (current_n == 1) {
223 current_rolling_mean = x_n;
// Incremental update: delta uses the mean before it is adjusted, delta2 the
// mean after; their product is added to the accumulator. This has the form
// of Welford's online algorithm, so rolling_vars[i] holds a sum of squared
// deviations rather than a finished variance.
225 delta = x_n - current_rolling_mean;
226 current_rolling_mean += delta/current_n;
227 delta2 = x_n - current_rolling_mean;
228 current_rolling_var += delta*delta2;
242 std::unordered_map<std::string, DataType> csv_dtypes;
244 auto col_names = stat.get_col_names();
247 for (
size_t i = 0; i < stat.get_col_names().size(); i++) {
249 auto& col_name = col_names[i];
Class for calculating statistics from CSV files and in-memory sources.
std::vector< long double > get_mean() const
Return current means.
std::vector< long double > get_variance() const
Return current variances.
CSVStat(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Calculate statistics for an arbitrarily large file.
std::vector< long double > get_mins() const
Return current mins.
std::vector< TypeCount > get_dtypes() const
Get data type counts for each column.
std::vector< long double > get_maxes() const
Return current maxes.
std::vector< FreqCount > get_counts() const
Get counts for each column.
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Calculates statistics from CSV files.
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
The all-encompassing namespace.
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_INT16
16-bit integer (short on MSVC/GCC)
@ CSV_INT32
32-bit integer (int on MSVC/GCC)
@ CSV_STRING
Non-numeric string.
std::unordered_map< std::string, DataType > csv_data_types(const std::string &filename)
Useful for uploading CSV files to SQL databases.
nonstd::string_view string_view
The string_view class used by this library.