22 using namespace impala;
25 int num_cols,
int num_partition_keys,
const bool* is_materialized_col,
26 char tuple_delim,
char field_delim,
char collection_item_delim,
char escape_char)
28 field_delim_(field_delim),
29 process_escapes_(escape_char !=
'\0'),
30 escape_char_(escape_char),
31 collection_item_delim_(collection_item_delim),
32 tuple_delim_(tuple_delim),
33 current_column_has_escape_(false),
34 last_char_is_escape_(false),
35 last_row_delim_offset_(-1),
37 num_partition_keys_(num_partition_keys),
38 is_materialized_col_(is_materialized_col),
40 unfinished_tuple_(false){
43 DCHECK(escape_char ==
'\0' || escape_char != tuple_delim);
44 DCHECK(escape_char ==
'\0' || escape_char != field_delim);
45 DCHECK(escape_char ==
'\0' || escape_char != collection_item_delim);
49 memset(search_chars, 0,
sizeof(search_chars));
60 for (
int i = 1; i < 16; ++i) {
63 for (
int i = 14; i >= 0; --i) {
71 if (tuple_delim !=
'\0') {
79 if (field_delim !=
'\0' || collection_item_delim !=
'\0') {
99 char** byte_buffer_ptr,
char** row_end_locations,
101 int* num_tuples,
int* num_fields,
char** next_column_start) {
103 *next_column_start = *byte_buffer_ptr;
114 ParseSse<true>(max_tuples, &remaining_len, byte_buffer_ptr, row_end_locations,
115 field_locations, num_tuples, num_fields, next_column_start);
117 ParseSse<false>(max_tuples, &remaining_len, byte_buffer_ptr, row_end_locations,
118 field_locations, num_tuples, num_fields, next_column_start);
122 if (*num_tuples == max_tuples)
return Status::OK;
125 while (remaining_len > 0) {
126 bool new_tuple =
false;
127 bool new_col =
false;
151 ++*next_column_start;
153 AddColumn<true>(*byte_buffer_ptr - *next_column_start,
154 next_column_start, num_fields, field_locations);
155 FillColumns<false>(0, NULL, num_fields, field_locations);
157 row_end_locations[*num_tuples] = *byte_buffer_ptr;
162 if (*num_tuples == max_tuples) {
168 }
else if (new_col) {
169 AddColumn<true>(*byte_buffer_ptr - *next_column_start,
170 next_column_start, num_fields, field_locations);
180 DCHECK_EQ(remaining_len, 0);
181 AddColumn<true>(*byte_buffer_ptr - *next_column_start,
182 next_column_start, num_fields, field_locations);
183 FillColumns<false>(0, NULL, num_fields, field_locations);
196 const char* buffer_start = buffer;
202 if (*buffer_start ==
'\n')
return 1;
209 __m128i xmm_buffer, xmm_tuple_mask;
214 xmm_buffer = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer));
217 int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
218 if (tuple_mask != 0) {
222 tuple_start += i + 1;
234 for (; tuple_start < len; ++tuple_start) {
244 if (!found)
return -1;
253 int num_escape_chars = 0;
254 int before_tuple_end = tuple_start - 2;
258 for (; before_tuple_end >= 0; --before_tuple_end) {
268 if (before_tuple_end < 0) {
269 static bool warning_logged =
false;
270 if (!warning_logged) {
271 LOG(WARNING) <<
"Unhandled code path. This might cause a tuple to be "
272 <<
"skipped or repeated.";
273 warning_logged =
true;
279 if (num_escape_chars % 2 != 0)
goto restart;
282 if (tuple_start == len - 1 && buffer_start[tuple_start] ==
'\r') {
287 if (tuple_start < len && buffer_start[tuple_start] ==
'\n' &&
288 buffer_start[tuple_start - 1] ==
'\r') {
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER]
Precomputed mask values up to 16 bits.
uint16_t low_mask_[16]
Precomputed masks to process escape characters.
__m128i xmm_delim_search_
SSE(xmm) register containing the delimiter search character.
DelimitedTextParser(int num_cols, int num_partition_keys, const bool *is_materialized_col, char tuple_delim, char field_delim_= '\0', char collection_item_delim= '^', char escape_char= '\0')
num_cols is the total number of columns including partition keys.
char tuple_delim_
Character delimiting tuples.
int num_delims_
The number of delimiters contained in xmm_delim_search_, i.e. its length.
__m128i xmm_escape_search_
SSE(xmm) register containing the escape search character.
bool last_char_is_escape_
Whether or not the previous character was the escape character.
char collection_item_delim_
Character delimiting collection items (to become slots).
static const int STRCHR_MODE
__m128i xmm_tuple_search_
SSE(xmm) register containing the tuple search character.
char escape_char_
Escape character. Only used if process_escapes_ is true.
int FindFirstInstance(const char *buffer, int len)
static const int64_t SSE4_2
char field_delim_
Character delimiting fields (to become slots).
Status ParseFieldLocations(int max_tuples, int64_t remaining_len, char **byte_buffer_ptr, char **row_end_locations, FieldLocation *field_locations, int *num_tuples, int *num_fields, char **next_column_start)
bool current_column_has_escape_
bool process_escapes_
True if this parser should handle escape characters.
bool unfinished_tuple_
True if the last tuple is unfinished (not ended with tuple delimiter).
static SSE_ALWAYS_INLINE __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2, const int mode)
void ParserReset()
Called to initialize parser at beginning of scan range.
int32_t last_row_delim_offset_
static bool IsSupported(long flag)
Returns whether or not the CPU supports this flag.
int column_idx_
Index to keep track of the current column in the current file.
int num_partition_keys_
Number of partition columns in the table.
static const int CHARS_PER_128_BIT_REGISTER