Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-sequence-scanner.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HDFS_SEQUENCE_SCANNER_H
17 #define IMPALA_EXEC_HDFS_SEQUENCE_SCANNER_H
18 
22 //
26 //
31 //
34 //
38 //
42 //
52 //
54 //
56 //
59 //
61 //
64 //
67 //
69 //
71 //
73 //
78 //
80 //
85 //
87 //
92 //
94 //
96 //
101 //
107 //
113 //
124 //
129 //
132 //
135 //
138 //
140 //
145 //
147 //
149 
151 
152 namespace impala {
153 
154 class DelimitedTextParser;
155 
157  public:
160  static const uint8_t SEQFILE_VERSION_HEADER[4];
161 
162  HdfsSequenceScanner(HdfsScanNode* scan_node, RuntimeState* state);
163 
164  virtual ~HdfsSequenceScanner();
165 
167  virtual Status Prepare(ScannerContext* context);
168 
170  static llvm::Function* Codegen(HdfsScanNode*,
171  const std::vector<ExprContext*>& conjunct_ctxs);
172 
173  protected:
175  virtual FileHeader* AllocateFileHeader();
176  virtual Status ReadFileHeader();
177  virtual Status InitNewRange();
178  virtual Status ProcessRange();
179 
180  virtual THdfsFileFormat::type file_format() const {  // Reports the file format this scanner handles.
181  return THdfsFileFormat::SEQUENCE_FILE;  // Always the sequence-file format for this scanner.
182  }
183 
184  private:
187  const static int MAX_BLOCK_SIZE = (1024 * 1024 * 1024);
188 
191  static const char* const SEQFILE_VALUE_CLASS_NAME;
192 
197 
201 
205 
209 
215  Status GetRecord(uint8_t** record_ptr, int64_t *record_len);
216 
219  virtual void LogRowParseError(int row_idx, std::stringstream*);
220 
222  boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
223  std::vector<FieldLocation> field_locations_;
224 
229  };
230 
232  struct RecordLocation {  // Location and length of a single record in a compressed block.
233  uint8_t* record;  // Presumably points at the first byte of the record -- confirm against the scanner implementation.
234  int64_t len;  // Length of the record; presumably in bytes -- confirm.
235  };
236 
240  std::vector<RecordLocation> record_locations_;
241 
244 
247 
250 
253 
256 };
257 
258 } // namespace impala
259 
260 #endif // IMPALA_EXEC_HDFS_SEQUENCE_SCANNER_H
Struct for record locations and lengths in compressed blocks.
boost::scoped_ptr< DelimitedTextParser > delimited_text_parser_
Helper class for picking fields and rows from delimited text.
Status GetRecord(uint8_t **record_ptr, int64_t *record_len)
uint8_t * next_record_in_compressed_block_
Next record from block compressed data.
std::vector< FieldLocation > field_locations_
virtual void LogRowParseError(int row_idx, std::stringstream *)
int64_t num_buffered_records_in_compressed_block_
Number of buffered records in unparsed_data_buffer_ from block compressed data.
int current_key_length_
Length of the current key. This is specified as 4 bytes in the format description.
virtual THdfsFileFormat::type file_format() const
Returns type of scanner: e.g. rcfile, seqfile.
virtual FileHeader * AllocateFileHeader()
Implementation of sequence container super class methods.
static const uint8_t SEQFILE_VERSION_HEADER[4]
uint8_t * unparsed_data_buffer_
Buffer for data read from HDFS or from decompressing the HDFS data.
virtual Status Prepare(ScannerContext *context)
Implementation of HdfsScanner interface.
static const char *const SEQFILE_VALUE_CLASS_NAME
std::vector< RecordLocation > record_locations_
Data that is fixed across headers. This struct is shared between scan ranges.
virtual Status InitNewRange()
Reset internal state for a new scan range.
static llvm::Function * Codegen(HdfsScanNode *, const std::vector< ExprContext * > &conjunct_ctxs)
Codegen writing tuples and evaluating predicates.
int current_block_length_
Length of the current sequence file block (or record).
bool is_row_compressed
If true, the file uses row compression.
HdfsSequenceScanner(HdfsScanNode *scan_node, RuntimeState *state)