6 std::ifstream infile(std::string(filename), std::ios::binary);
7 const auto start = infile.tellg();
8 infile.seekg(0, std::ios::end);
9 const auto end = infile.tellg();
15 return get_csv_head(filename, get_file_size(filename));
19 const size_t bytes = 500000;
21 std::error_code error;
22 size_t length = std::min((
size_t)file_size, bytes);
23 auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error);
26 throw std::runtime_error(
"Cannot open file " + std::string(filename));
29 return std::string(mmap.begin(), mmap.end());
33 #pragma region IBasicCVParser
37 const ColNamesPtr& col_names
38 ) : _col_names(col_names) {
39 if (format.no_quote) {
47 format.trim_chars.data(), format.trim_chars.size()
54 bool empty_last_field = this->data_ptr
55 && this->data_ptr->_data
56 && !this->data_ptr->data.empty()
57 && parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER;
60 if (this->field_length > 0 || empty_last_field) {
65 if (this->current_row.size() > 0)
69 CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
71 auto& in = this->data_ptr->data;
74 while (data_pos < in.size() && ws_flag(in[data_pos]))
77 if (field_start == UNINITIALIZED_FIELD)
78 field_start = (int)(data_pos - current_row_start());
83 while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL)
86 field_length = data_pos - (field_start + current_row_start());
90 for (
size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--)
97 if (field_has_double_quote) {
99 field_start == UNINITIALIZED_FIELD ? 0 : (
unsigned int)field_start,
103 field_has_double_quote =
false;
107 fields->emplace_back(
108 field_start == UNINITIALIZED_FIELD ? 0 : (
unsigned int)field_start,
113 current_row.row_length++;
116 field_start = UNINITIALIZED_FIELD;
125 this->quote_escape =
false;
127 this->current_row_start() = 0;
128 this->trim_utf8_bom();
130 auto& in = this->data_ptr->data;
131 while (this->data_pos < in.size()) {
132 switch (compound_parse_flag(in[this->data_pos])) {
133 case ParseFlags::DELIMITER:
138 case ParseFlags::NEWLINE:
142 if (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE)
150 this->current_row =
CSVRow(data_ptr, this->data_pos, fields->size());
153 case ParseFlags::NOT_SPECIAL:
157 case ParseFlags::QUOTE_ESCAPE_QUOTE:
158 if (data_pos + 1 == in.size())
return this->current_row_start();
159 else if (data_pos + 1 < in.size()) {
160 auto next_ch = parse_flag(in[data_pos + 1]);
161 if (next_ch >= ParseFlags::DELIMITER) {
162 quote_escape =
false;
166 else if (next_ch == ParseFlags::QUOTE) {
169 this->field_length += 2;
170 this->field_has_double_quote =
true;
176 this->field_length++;
182 if (this->field_length == 0) {
185 if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos]))
186 field_start = (int)(data_pos - current_row_start());
191 this->field_length++;
198 return this->current_row_start();
202 current_row.row_length = fields->size() - current_row.fields_start;
203 this->_records->push_back(std::move(current_row));
207 this->data_ptr = std::make_shared<RawCSVData>();
208 this->data_ptr->parse_flags = this->_parse_flags;
209 this->data_ptr->col_names = this->_col_names;
210 this->fields = &(this->data_ptr->fields);
213 CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
214 auto& data = this->data_ptr->data;
216 if (!this->unicode_bom_scan && data.size() >= 3) {
217 if (data[0] ==
'\xEF' && data[1] ==
'\xBB' && data[2] ==
'\xBF') {
219 this->_utf8_bom =
true;
222 this->unicode_bom_scan =
true;
230 #pragma region Specializations
234 this->field_start = UNINITIALIZED_FIELD;
235 this->field_length = 0;
236 this->reset_data_ptr();
239 size_t length = std::min(this->source_size - this->mmap_pos, bytes);
240 std::error_code error;
241 this->data_ptr->_data = std::make_shared<mio::basic_mmap_source<char>>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error));
242 this->mmap_pos += length;
243 if (error)
throw error;
245 auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr->_data.get());
248 this->data_ptr->data =
csv::string_view(mmap_ptr->data(), mmap_ptr->length());
251 this->current_row =
CSVRow(this->data_ptr);
252 size_t remainder = this->
parse();
254 if (this->mmap_pos == this->source_size || no_chunk()) {
259 this->mmap_pos -= (length - remainder);
Contains the main CSV parsing algorithm and various utility functions.
Data structure for representing CSV rows.
#define CSV_INLINE
Helper macro that should be #defined as "inline" in the single-header version.
HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
constexpr size_t ITERATION_CHUNK_SIZE
For functions that lazily load a large CSV, this determines how many bytes are read at a time.
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
HEDLEY_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
The all-encompassing namespace.
CSVReader parse(csv::string_view in, CSVFormat format)
Shorthand function for parsing an in-memory CSV string.
nonstd::string_view string_view
The string_view class used by this library.