11 std::stringstream ret;
12 for (
size_t i = 0; i < row.size(); i++) {
14 if (i + 1 < row.size()) ret << delim;
30 auto trim_chars = format.get_trim_chars();
31 std::stringstream source(head.data());
35 parser.set_output(rows);
38 return CSVRow(std::move(rows[format.get_header()]));
43 std::unordered_map<size_t, size_t> row_tally = { { 0, 0 } };
46 std::unordered_map<size_t, size_t> row_when = { { 0, 0 } };
49 std::stringstream source(head.data());
52 StreamParser<std::stringstream> parser(source, format);
53 parser.set_output(rows);
56 for (
size_t i = 0; i < rows.size(); i++) {
61 if (row_tally.find(row.size()) != row_tally.end()) {
62 row_tally[row.size()]++;
65 row_tally[row.size()] = 1;
66 row_when[row.size()] = i;
71 double final_score = 0;
72 size_t header_row = 0;
76 for (
auto& pair : row_tally) {
77 auto row_size = pair.first;
78 auto row_count = pair.second;
79 double score = (double)(row_size * row_count);
80 if (score > final_score) {
82 header_row = row_when[row_size];
101 size_t max_score = 0,
103 char current_delim = delims[0];
105 for (
char cand_delim : delims) {
106 auto result = calculate_score(head, format.
delimiter(cand_delim));
108 if ((
size_t)result.score > max_score) {
109 max_score = (size_t)result.score;
110 current_delim = cand_delim;
111 header = result.header;
115 return { current_delim, (int)header };
126 auto head = internals::get_csv_head(filename);
129 if (format.guess_delim()) {
130 auto guess_result =
guess_format(filename, format.get_possible_delims());
139 auto head = internals::get_csv_head(filename);
155 auto head = internals::get_csv_head(filename);
159 if (format.guess_delim()) {
162 format.header = guess_result.header_row;
163 this->_format = format;
166 if (!format.col_names.empty())
169 this->
parser = std::unique_ptr<Parser>(
new Parser(filename, format, this->
col_names));
170 this->initial_read();
180 new_format.col_names = this->
col_names->get_col_names();
181 new_format.header = this->_format.header;
192 return std::vector<std::string>();
200 for (
size_t i = 0; i < _col_names.size(); i++)
201 if (_col_names[i] == col_name)
return (
int)i;
207 if (!this->header_trimmed) {
208 for (
int i = 0; i <= this->_format.header && !this->
records->empty(); i++) {
209 if (i == this->_format.header && this->col_names->empty()) {
217 this->header_trimmed =
true;
227 this->
n_cols = names.size();
246 this->
parser->next(bytes);
248 if (!this->header_trimmed) {
275 if (this->
records->is_waitable())
278 else if (this->
parser->eof())
283 if (this->read_csv_worker.joinable())
284 this->read_csv_worker.join();
289 else if (this->
records->front().size() != this->n_cols &&
290 this->_format.variable_column_policy != VariableColumnPolicy::KEEP) {
291 auto errored_row = this->
records->pop_front();
293 if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) {
294 if (errored_row.size() < this->n_cols)
301 row = this->
records->pop_front();
CSVFormat get_format() const
Return the format of the original raw CSV.
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
CSVReader(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Reads an arbitrarily large CSV file using memory-mapped IO.
Data structure for representing CSV rows.
Parser for memory-mapped files.
A class for parsing CSV data from a std::stringstream or a std::ifstream.
void next(size_t bytes=ITERATION_CHUNK_SIZE) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it, along with providing read threads the ability to wait for the deque to become populated.
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Defines functionality needed for basic CSV parsing.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
size_t _n_rows
How many rows (minus header) have been read so far.
bool read_csv(size_t bytes=internals::ITERATION_CHUNK_SIZE)
Read a chunk of CSV data.
internals::ColNamesPtr col_names
Pointer to an object containing column information.
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
size_t n_cols
The number of columns in this CSV.
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazy load a large CSV, this determines how many bytes are read at a time.
std::vector< std::string > _get_col_names(csv::string_view head, CSVFormat format)
Return a CSV's column names.
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
CSVGuessResult _guess_format(csv::string_view head, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
The all encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
nonstd::string_view string_view
The string_view class used by this library.
Stores the inferred format of a CSV file.