Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-rcfile-scanner.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HDFS_RCFILE_SCANNER_H
17 #define IMPALA_EXEC_HDFS_RCFILE_SCANNER_H
18 
23 //
26 //
30 //
40 //
47 //
50 //
53 //
56 //
59 //
62 //
65 //
67 //
72 //
74 //
79 //
81 //
89 //
91 //
95 //
97 //
105 //
110 //
116 //
121 //
124 //
126 //
129 //
131 //
135 //
137 //
139 //
141 //
143 //
145 //
147 //
149 //
152 //
154 //
159 //
165 //
167 //
169 //
172 //
174 //
176 //
178 //
189 //
191 //
193 //
199 //
201 //
203 //
208 //
210 //
212 //
220 
222 
223 namespace impala {
224 
225 struct HdfsFileDesc;
226 class HdfsScanNode;
227 class TupleDescriptor;
228 class Tuple;
229 
232  public:
233  HdfsRCFileScanner(HdfsScanNode* scan_node, RuntimeState* state);
234  virtual ~HdfsRCFileScanner();
235 
236  virtual Status Prepare(ScannerContext* context);  // One-time initialisation of state that is constant across scan ranges.
237 
238  void DebugString(int indentation_level, std::stringstream* out) const;
239 
240  private:
243  static const char* const RCFILE_KEY_CLASS_NAME;
244 
247  static const char* const RCFILE_VALUE_CLASS_NAME;
248 
251  static const char* const RCFILE_METADATA_KEY_NUM_COLS;
252 
255  static const uint8_t RCFILE_VERSION_HEADER[4];
256 
258  virtual FileHeader* AllocateFileHeader();  // Implementation of superclass functions.
259  virtual Status ReadFileHeader();
260  virtual Status InitNewRange();  // Reset internal state for a new scan range.
261  virtual Status ProcessRange();
262 
263  virtual THdfsFileFormat::type file_format() const {  // Returns type of scanner: e.g. rcfile, seqfile.
264  return THdfsFileFormat::RC_FILE;  // This scanner always reports RC_FILE.
265  }
266 
270 
277 
283 
296  void GetCurrentKeyBuffer(int col_idx, bool skip_col_data, uint8_t** key_buf_ptr);
297 
302 
311  Status NextField(int col_idx);
312 
319 
321  void ResetRowGroup();
322 
326  Status NextRow();
327 
328  enum Version {  // File versions this scanner tells apart; the two differ in the header they use.
329  SEQ6, // Version for sequence file and pre hive-0.9 rc files
330  RCF1 // The version post hive-0.9 which uses a new header
331  };
332 
337 
340  int num_cols;
341  };
342 
345  struct ColumnInfo {  // Per-column state; the scanner stores these in its columns_ vector.
348 
350  int32_t buffer_len;  // Byte length for this column. NOTE(review): the member index describes "uncompressed and compressed byte lengths", so a companion length field is probably missing from this listing - confirm against the real header.
352 
354  int32_t key_buffer_len;  // Length of the key for this column.
356  uint8_t* key_buffer;  // Start of the key: a ptr into the scanner's key_buffer_ for this column.
357 
359  int32_t key_buffer_pos;  // Current position in the key buffer.
360 
362  int32_t start_offset;  // Offset into row_group_buffer_ for the start of this column.
363 
365  int32_t buffer_pos;  // Offset from the start of the column for the next field in the column.
366 
371  };
372 
376  std::vector<ColumnInfo> columns_;  // ColumnInfo entries held by the scanner.
377 
379  std::vector<uint8_t> key_buffer_;  // Buffer for copying key buffers. This buffer is reused between row groups.
380 
383 
386  int row_pos_;
387 
391 
395 
400 
404 
408 
412 };
413 
414 }
415 
416 #endif
virtual FileHeader * AllocateFileHeader()
Implementation of superclass functions.
int32_t current_field_len_rep
RLE: Repetition count of the current field.
virtual THdfsFileFormat::type file_format() const
Returns type of scanner: e.g. rcfile, seqfile.
std::vector< uint8_t > key_buffer_
Buffer for copying key buffers. This buffer is reused between row groups.
void DebugString(int indentation_level, std::stringstream *out) const
A scanner for reading RCFiles into tuples.
int32_t buffer_pos
Offset from the start of the column for the next field in the column.
int num_rows_
number of rows in this rowgroup object
int32_t buffer_len
Uncompressed and compressed byte lengths for this column.
HdfsRCFileScanner(HdfsScanNode *scan_node, RuntimeState *state)
int32_t current_field_len
RLE: Length of the current field.
bool materialize_column
If true, this column should be materialized, otherwise, it can be skipped.
void ResetRowGroup()
Reset state for a new row group.
virtual Status Prepare(ScannerContext *context)
One-time initialisation of state that is constant across scan ranges.
static const char *const RCFILE_VALUE_CLASS_NAME
int32_t key_buffer_len
Length and start of the key for this column.
uint8_t * key_buffer
This is a ptr into the scanner's key_buffer_ for this column.
virtual Status InitNewRange()
Reset internal state for a new scan range.
void GetCurrentKeyBuffer(int col_idx, bool skip_col_data, uint8_t **key_buf_ptr)
int32_t start_offset
Offset into row_group_buffer_ for the start of this column.
std::vector< ColumnInfo > columns_
static const uint8_t RCFILE_VERSION_HEADER[4]
static const char *const RCFILE_METADATA_KEY_NUM_COLS
int32_t key_buffer_pos
Current position in the key buffer.
static const char *const RCFILE_KEY_CLASS_NAME
Data that is fixed across headers. This struct is shared between scan ranges.