Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-text-scanner.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HDFS_TEXT_SCANNER_H
17 #define IMPALA_EXEC_HDFS_TEXT_SCANNER_H
18 
19 #include "exec/hdfs-scanner.h"
20 #include "runtime/string-buffer.h"
21 
22 namespace impala {
23 
24 class DelimitedTextParser;
25 class ScannerContext;
26 struct HdfsFileDesc;
27 
// HdfsScanner subclass declared in hdfs-text-scanner.h.
// NOTE(review): the original doc comments and several member declarations are
// absent from this listing (see the gaps in the listing numbers); the comments
// below restate only the descriptions given in the member index of this page.
30 class HdfsTextScanner : public HdfsScanner {
31  public:
32  HdfsTextScanner(HdfsScanNode* scan_node, RuntimeState* state);
33  virtual ~HdfsTextScanner();
34 
    // Implementation of HdfsScanner interface.
36  virtual Status Prepare(ScannerContext* context);
37  virtual Status ProcessSplit();
38  virtual void Close();
39 
    // Issue io manager byte ranges for 'files'.
41  static Status IssueInitialRanges(HdfsScanNode* scan_node,
42  const std::vector<HdfsFileDesc*>& files);
43 
    // Codegen writing tuples and evaluating predicates.
45  static llvm::Function* Codegen(HdfsScanNode*,
46  const std::vector<ExprContext*>& conjunct_ctxs);
47 
    // Suffix for lzo index files.
49  const static std::string LZO_INDEX_SUFFIX;
50 
51  static const char* LLVM_CLASS_NAME;
52 
53  protected:
    // NOTE(review): no protected member declarations are visible in this
    // listing; confirm against the original header before relying on this section.
57 
60 
63 
66 
69 
70  private:
71  const static int NEXT_BLOCK_READ_SIZE = 1024; //bytes
72 
75  virtual Status InitNewRange();
76 
82  Status FindFirstTuple(bool* tuple_found);
83 
89  Status ProcessRange(int* num_tuples, bool past_scan_range);
90 
93 
98  virtual Status FillByteBuffer(bool* eosr, int num_bytes = 0);
99 
103 
108  Status FillByteBufferGzip(bool* eosr);
109 
116 
125  int WriteFields(MemPool*, TupleRow* tuple_row_mem, int num_fields, int num_tuples);
126 
130  int WritePartialTuple(FieldLocation*, int num_fields, bool copy_strings);
131 
134  virtual void LogRowParseError(int row_idx, std::stringstream*);
135 
    // Mem pool for boundary_row_ and boundary_column_.
137  boost::scoped_ptr<MemPool> boundary_pool_;
138 
143 
146 
149 
    // Helper class for picking fields and rows from delimited text.
151  boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
152 
    // Return field locations from the Delimited Text Parser.
154  std::vector<FieldLocation> field_locations_;
155 
158  std::vector<char*> row_end_locations_;
159 
163 
169 
174 
178 
181 };
182 
183 }
184 
185 #endif
Status ProcessRange(int *num_tuples, bool past_scan_range)
virtual Status InitNewRange()
std::vector< char * > row_end_locations_
virtual void LogRowParseError(int row_idx, std::stringstream *)
StringBuffer boundary_column_
Helper string for dealing with columns that span file blocks.
char * byte_buffer_end_
Ending position of HDFS buffer.
Status FillByteBufferCompressedFile(bool *eosr)
A tuple with 0 materialised slots is represented as NULL.
Definition: tuple.h:48
static const int NEXT_BLOCK_READ_SIZE
Status FinishScanRange()
Reads past the end of the scan range for the next tuple end.
std::vector< FieldLocation > field_locations_
Return field locations from the Delimited Text Parser.
int WritePartialTuple(FieldLocation *, int num_fields, bool copy_strings)
virtual Status Prepare(ScannerContext *context)
Implementation of HdfsScanner interface.
bool only_parsing_header_
True if we are parsing the header for this scanner.
static Status IssueInitialRanges(HdfsScanNode *scan_node, const std::vector< HdfsFileDesc * > &files)
Issue io manager byte ranges for 'files'.
boost::scoped_ptr< DelimitedTextParser > delimited_text_parser_
Helper class for picking fields and rows from delimited text.
ObjectPool pool
static const char * LLVM_CLASS_NAME
Status FindFirstTuple(bool *tuple_found)
HdfsTextScanner(HdfsScanNode *scan_node, RuntimeState *state)
static llvm::Function * Codegen(HdfsScanNode *, const std::vector< ExprContext * > &conjunct_ctxs)
Codegen writing tuples and evaluating predicates.
int slot_idx_
Index into materialized_slots_ for the next slot to output for the current tuple. ...
RuntimeProfile::Counter * parse_delimiter_timer_
Time parsing text files.
int WriteFields(MemPool *, TupleRow *tuple_row_mem, int num_fields, int num_tuples)
Status FillByteBufferGzip(bool *eosr)
boost::scoped_ptr< MemPool > boundary_pool_
Mem pool for boundary_row_ and boundary_column_.
virtual Status ProcessSplit()
char * byte_buffer_ptr_
Current position in byte buffer.
void CopyBoundaryField(FieldLocation *data, MemPool *pool)
static const std::string LZO_INDEX_SUFFIX
Suffix for lzo index files.
int64_t byte_buffer_read_size_
Actual bytes received from last file read.
virtual Status FillByteBuffer(bool *eosr, int num_bytes=0)