16 #ifndef IMPALA_EXEC_DELIMITED_TEXT_PARSER_INLINE_H
17 #define IMPALA_EXEC_DELIMITED_TEXT_PARSER_INLINE_H
29 uint16_t* delim_mask) {
31 bool first_char_is_escape = *last_char_is_escape;
32 bool escape_next = first_char_is_escape;
41 *last_char_is_escape = escape_mask &
46 escape_mask = escape_mask << 1 | (first_char_is_escape ? 1 : 0);
49 *delim_mask &= ~escape_mask;
52 template <
bool process_escapes>
57 field_locations[*num_fields].
start = *next_column_start;
59 field_locations[*num_fields].
len = -len;
61 field_locations[*num_fields].
len = len;
66 *next_column_start += len + 1;
70 template <
bool process_escapes>
75 if (last_column == NULL) last_column = &dummy;
77 AddColumn<process_escapes>(len, last_column, num_fields, field_locations);
97 template <
bool process_escapes>
99 int64_t* remaining_len,
char** byte_buffer_ptr,
100 char** row_end_locations,
102 int* num_tuples,
int* num_fields,
char** next_column_start) {
123 __m128i xmm_buffer, xmm_delim_mask, xmm_escape_mask;
127 xmm_buffer = _mm_loadu_si128(reinterpret_cast<__m128i*>(*byte_buffer_ptr));
137 uint16_t delim_mask = _mm_extract_epi16(xmm_delim_mask, 0);
139 uint16_t escape_mask = 0;
141 if (process_escapes) {
145 escape_mask = _mm_extract_epi16(xmm_escape_mask, 0);
149 char* last_char = *byte_buffer_ptr + 15;
150 bool last_char_is_unescaped_delim = delim_mask >> 15;
154 int last_col_idx = 0;
157 while (delim_mask != 0) {
159 int n = ffs(delim_mask) - 1;
165 if (process_escapes) {
172 char* delim_ptr = *byte_buffer_ptr + n;
175 AddColumn<process_escapes>(delim_ptr - *next_column_start,
176 next_column_start, num_fields, field_locations);
184 ++*next_column_start;
188 AddColumn<process_escapes>(delim_ptr - *next_column_start,
189 next_column_start, num_fields, field_locations);
190 FillColumns<false>(0, NULL, num_fields, field_locations);
192 row_end_locations[*num_tuples] = delim_ptr;
196 if (
UNLIKELY(*num_tuples == max_tuples)) {
197 (*byte_buffer_ptr) += (n + 1);
199 *remaining_len -= (n + 1);
208 if (process_escapes) {
220 template <
bool process_escapes>
223 char* next_column_start = buffer;
224 __m128i xmm_buffer, xmm_delim_mask, xmm_escape_mask;
231 xmm_buffer = _mm_loadu_si128(reinterpret_cast<__m128i*>(buffer));
235 uint16_t delim_mask = _mm_extract_epi16(xmm_delim_mask, 0);
237 uint16_t escape_mask = 0;
239 if (process_escapes) {
243 escape_mask = _mm_extract_epi16(xmm_escape_mask, 0);
247 int last_col_idx = 0;
250 while (delim_mask != 0) {
252 int n = ffs(delim_mask) - 1;
256 if (process_escapes) {
266 AddColumn<process_escapes>(buffer + n - next_column_start,
267 &next_column_start, num_fields, field_locations);
270 if (process_escapes) {
281 while (remaining_len > 0) {
291 AddColumn<process_escapes>(buffer - next_column_start,
292 &next_column_start, num_fields, field_locations);
301 FillColumns<process_escapes>(buffer - next_column_start,
302 &next_column_start, num_fields, field_locations);
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER]
Precomputed mask values up to 16 bits.
uint16_t low_mask_[16]
Precomputed masks to process escape characters.
__m128i xmm_delim_search_
SSE(xmm) register containing the delimiter search character.
char tuple_delim_
Character delimiting tuples.
void AddColumn(int len, char **next_column_start, int *num_fields, FieldLocation *field_locations)
int num_delims_
The number of delimiters contained in xmm_delim_search_, i.e. its length.
__m128i xmm_escape_search_
SSE(xmm) register containing the escape search character.
bool last_char_is_escape_
Whether or not the previous character was the escape character.
char collection_item_delim_
Character delimiting collection items (to become slots).
static const int STRCHR_MODE
int num_cols_
Number of columns in the table (including partition columns)
char escape_char_
Escape character. Only used if process_escapes_ is true.
bool ReturnCurrentColumn() const
void FillColumns(int len, char **last_column, int *num_fields, impala::FieldLocation *field_locations)
static const int64_t SSE4_2
char field_delim_
Character delimiting fields (to become slots).
void ParseSse(int max_tuples, int64_t *remaining_len, char **byte_buffer_ptr, char **row_end_locations_, FieldLocation *field_locations, int *num_tuples, int *num_fields, char **next_column_start)
void ParseSingleTuple(int64_t len, char *buffer, FieldLocation *field_locations, int *num_fields)
Simplified version of ParseSse which does not handle tuple delimiters.
bool current_column_has_escape_
void ProcessEscapeMask(uint16_t escape_mask, bool *last_char_is_escape, uint16_t *delim_mask)
bool unfinished_tuple_
True if the last tuple is unfinished (not ended with tuple delimiter).
static SSE_ALWAYS_INLINE __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2, const int mode)
int32_t last_row_delim_offset_
static bool IsSupported(long flag)
Returns whether or not the CPU supports this flag.
int column_idx_
Index to keep track of the current column in the current file.
int num_partition_keys_
Number of partition columns in the table.
static const int CHARS_PER_128_BIT_REGISTER