Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
delimited-text-parser.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_DELIMITED_TEXT_PARSER_H
17 #define IMPALA_EXEC_DELIMITED_TEXT_PARSER_H
18 
19 #include "exec/hdfs-scanner.h"
20 #include "exec/hdfs-scan-node.h"
21 #include "util/sse-util.h"
22 
23 namespace impala {
24 
26  public:
27 
34  //
36  //
40  //
45  int num_cols, int num_partition_keys, const bool* is_materialized_col,
46  char tuple_delim, char field_delim_ = '\0', char collection_item_delim = '^',
47  char escape_char = '\0');
48 
50  void ParserReset();
51 
54 
55  char escape_char() const { return escape_char_; }
56 
76  Status ParseFieldLocations(int max_tuples, int64_t remaining_len,
77  char** byte_buffer_ptr, char** row_end_locations,
78  FieldLocation* field_locations,
79  int* num_tuples, int* num_fields, char** next_column_start);
80 
88  template <bool process_escapes>
89  void ParseSingleTuple(int64_t len, char* buffer, FieldLocation* field_locations,
90  int* num_fields);
91 
97  int FindFirstInstance(const char* buffer, int len);
98 
102  bool ReturnCurrentColumn() const {
104  }
105 
115  template <bool process_escapes>
116  void FillColumns(int len, char** last_column,
117  int* num_fields, impala::FieldLocation* field_locations);
118 
122 
123  private:
125  void ParserInit(HdfsScanNode* scan_node);
126 
137  template <bool process_escapes>
138  void AddColumn(int len, char** next_column_start, int* num_fields,
139  FieldLocation* field_locations);
140 
147  template <bool process_escapes>
148  void ParseSse(int max_tuples, int64_t* remaining_len,
149  char** byte_buffer_ptr, char** row_end_locations_,
150  FieldLocation* field_locations,
151  int* num_tuples, int* num_fields, char** next_column_start);
152 
155 
158 
161 
164 
167 
170 
173 
176 
179 
183 
186 
195 
197  uint16_t low_mask_[16];
198  uint16_t high_mask_[16];
199 
202 
205 
208  const bool* is_materialized_col_;
209 
212 
215 };
216 
217 }// namespace impala
218 #endif// IMPALA_EXEC_DELIMITED_TEXT_PARSER_H
uint16_t low_mask_[16]
Precomputed masks to process escape characters.
__m128i xmm_delim_search_
SSE(xmm) register containing the delimiter search character.
DelimitedTextParser(int num_cols, int num_partition_keys, const bool *is_materialized_col, char tuple_delim, char field_delim_= '\0', char collection_item_delim= '^', char escape_char= '\0')
num_cols is the total number of columns including partition keys.
char tuple_delim_
Character delimiting tuples.
void AddColumn(int len, char **next_column_start, int *num_fields, FieldLocation *field_locations)
int num_delims_
The number of delimiters contained in xmm_delim_search_, i.e. its length.
__m128i xmm_escape_search_
SSE(xmm) register containing the escape search character.
bool last_char_is_escape_
Whether or not the previous character was the escape character.
char collection_item_delim_
Character delimiting collection items (to become slots).
__m128i xmm_tuple_search_
SSE(xmm) register containing the tuple search character.
void ParserInit(HdfsScanNode *scan_node)
Initialize the parser state.
int num_cols_
Number of columns in the table (including partition columns).
char escape_char_
Escape character. Only used if process_escapes_ is true.
void FillColumns(int len, char **last_column, int *num_fields, impala::FieldLocation *field_locations)
bool AtTupleStart()
Check if we are at the start of a tuple.
int FindFirstInstance(const char *buffer, int len)
char field_delim_
Character delimiting fields (to become slots).
void ParseSse(int max_tuples, int64_t *remaining_len, char **byte_buffer_ptr, char **row_end_locations_, FieldLocation *field_locations, int *num_tuples, int *num_fields, char **next_column_start)
Status ParseFieldLocations(int max_tuples, int64_t remaining_len, char **byte_buffer_ptr, char **row_end_locations, FieldLocation *field_locations, int *num_tuples, int *num_fields, char **next_column_start)
void ParseSingleTuple(int64_t len, char *buffer, FieldLocation *field_locations, int *num_fields)
Simplified version of ParseSse() which does not handle tuple delimiters.
bool process_escapes_
True if this parser should handle escape characters.
bool unfinished_tuple_
True if the last tuple is unfinished (not ended with tuple delimiter).
void ParserReset()
Called to initialize parser at beginning of scan range.
int column_idx_
Index to keep track of the current column in the current file.
int num_partition_keys_
Number of partition columns in the table.