Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-parquet-scanner.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HDFS_PARQUET_SCANNER_H
17 #define IMPALA_EXEC_HDFS_PARQUET_SCANNER_H
18 
19 #include "exec/hdfs-scanner.h"
20 #include "exec/parquet-common.h"
21 
22 namespace impala {
23 
24 struct HdfsFileDesc;
25 
30 //
39 //
44  public:
/// Constructs a scanner from the owning scan node and the runtime state.
45  HdfsParquetScanner(HdfsScanNode* scan_node, RuntimeState* state);
46 
47  virtual ~HdfsParquetScanner();
/// One-time initialisation of state that is constant across scan ranges.
48  virtual Status Prepare(ScannerContext* context);
49  virtual void Close();
/// NOTE(review): the member index describes parse_status_ as "Returned in
/// ProcessSplit"; confirm against the definition in the .cc file.
50  virtual Status ProcessSplit();
51 
/// Static function taking the scan node and the list of file descriptors.
/// NOTE(review): presumably issues the first scan ranges for `files`; the
/// original doc comment is not present in this listing -- confirm.
54  static Status IssueInitialRanges(HdfsScanNode* scan_node,
55  const std::vector<HdfsFileDesc*>& files);
56 
/// Describes the application, and the version of that application, that wrote
/// a file.
57  struct FileVersion {
/// Application that wrote the file. e.g. "IMPALA".
59  std::string application;
60 
/// Version of the writing application, split into its <major>.<minor>.<patch>
/// components.
67  struct {
68  int major;
69  int minor;
70  int patch;
71  } version;
72 
75 
77 
/// Builds a FileVersion from a `created_by` string.
/// NOTE(review): presumably the created_by field of the file metadata, parsed
/// into `application` and `version` -- confirm against the definition.
79  FileVersion(const std::string& created_by);
80 
/// Returns true if version is strictly less than <major>.<minor>.<patch>.
/// `minor` and `patch` default to 0 when omitted.
82  bool VersionLt(int major, int minor = 0, int patch = 0) const;
83 
/// Returns true if version is equal to <major>.<minor>.<patch>.
85  bool VersionEq(int major, int minor, int patch) const;
86  };
87 
88  private:
/// One node of the schema tree filled in by CreateSchemaTree(); nested nodes
/// hang off `children`.
90  struct SchemaNode {
/// The corresponding schema element defined in the file metadata.
92  const parquet::SchemaElement* element;
93 
/// Initialised to -1 by the default constructor.
/// NOTE(review): presumably the index of the column this node maps to, with -1
/// meaning "not set" -- confirm.
96  int col_idx;
97 
101 
/// Any nested schema nodes. Empty for non-nested types.
103  std::vector<SchemaNode> children;
104 
106 
/// Default-initialises col_idx and max_def_level to -1 and slot_desc to NULL.
/// NOTE(review): the declarations of max_def_level and slot_desc are not
/// visible in this listing (they appear to have been dropped along with the
/// doc comments); `element` is not initialised here.
107  SchemaNode() : col_idx(-1), max_def_level(-1), slot_desc(NULL) { }
/// Returns a string description of this node; `indent` defaults to 0.
108  std::string DebugString(int indent = 0) const;
109  };
110 
/// 100 * 1024 = 102400.
/// NOTE(review): presumably the number of bytes read from the end of the file
/// when looking for the footer -- confirm against ProcessFooter().
113  static const int FOOTER_SIZE = 100 * 1024;
114 
/// The column reader classes are friends, so they can access this class's
/// private members.
117  friend class BaseColumnReader;
118 
/// Forward declaration of the reader template so it can be befriended below.
119  template<typename T> class ColumnReader;
120  template<typename T> friend class ColumnReader;
122  friend class BoolColumnReader;
123 
/// Column readers, one for each materialized column of this file.
125  std::vector<BaseColumnReader*> column_readers_;
126 
/// File metadata thrift object.
128  parquet::FileMetaData file_metadata_;
129 
/// NOTE(review): the member index for this class also lists file_version_,
/// metadata_range_, schema_, parse_status_, assemble_rows_timer_ and
/// num_cols_counter_; their declarations are not visible in this listing.
132 
135 
138 
141 
/// Memory pool held through a scoped_ptr.
/// NOTE(review): presumably backs dictionary data used by the column
/// readers -- the original doc comment is not present here; confirm.
144  boost::scoped_ptr<MemPool> dictionary_pool_;
145 
148 
151 
/// Takes the index of a row group and returns a Status.
/// NOTE(review): presumably materializes the rows of that row group (see
/// assemble_rows_timer_ in the member index) -- confirm.
154  Status AssembleRows(int row_group_idx);
155 
/// Returns a Status and writes a flag through `eosr`.
/// NOTE(review): the meaning of `eosr` is not documented in this listing --
/// confirm against the definition.
159  Status ProcessFooter(bool* eosr);
160 
/// NOTE(review): the member index also lists ValidateFileMetadata()
/// ("Validates the file metadata."), InitNewRange() ("Part of the HdfsScanner
/// interface, not used in Parquet.") and CreateReader(const SchemaNode&);
/// their declarations are not visible in this listing.
167 
171 
/// Takes the index of a row group and returns a Status.
174  Status InitColumns(int row_group_idx);
175 
178 
/// Takes a column reader and a row group index and returns a Status.
181  Status ValidateColumn(const BaseColumnReader& col_reader, int row_group_idx);
182 
185 
/// Fills in `node` from the flat list of schema elements `schema`.
189  Status CreateSchemaTree(const std::vector<parquet::SchemaElement>& schema,
190  SchemaNode* node) const;
191 
/// Overload with extra state: `max_def_level`, plus `idx` and `col_idx`
/// passed by pointer.
/// NOTE(review): presumably the recursive helper behind the two-argument
/// overload, advancing *idx / *col_idx as it walks `schema` -- confirm.
193  Status CreateSchemaTree(const std::vector<parquet::SchemaElement>& schema,
194  int max_def_level, int* idx, int* col_idx, SchemaNode* node) const;
195 };
196 
197 } // namespace impala
198 
199 #endif
Internal representation of a column schema (including nested-type columns).
Status ValidateFileMetadata()
Validates the file metadata.
bool VersionEq(int major, int minor, int patch) const
Returns true if version is equal to <major>.<minor>.<patch>
std::vector< SchemaNode > children
Any nested schema nodes. Empty for non-nested types.
SchemaNode schema_
The root schema node for this file.
Status parse_status_
Returned in ProcessSplit.
virtual Status Prepare(ScannerContext *context)
One-time initialisation of state that is constant across scan ranges.
const DiskIoMgr::ScanRange * metadata_range_
Scan range for the metadata.
parquet::FileMetaData file_metadata_
File metadata thrift object.
std::vector< BaseColumnReader * > column_readers_
Column readers, one for each materialized column of this file.
std::string application
Application that wrote the file. e.g. "IMPALA".
Status AssembleRows(int row_group_idx)
boost::scoped_ptr< MemPool > dictionary_pool_
FileVersion file_version_
Version of the application that wrote this file.
ScopedTimer< MonotonicStopWatch > assemble_rows_timer_
Timer for materializing rows. This ignores time getting the next buffer.
HdfsParquetScanner(HdfsScanNode *scan_node, RuntimeState *state)
struct impala::HdfsParquetScanner::FileVersion::@7 version
Status ValidateColumn(const BaseColumnReader &col_reader, int row_group_idx)
Status InitColumns(int row_group_idx)
RuntimeProfile::Counter * num_cols_counter_
Number of cols that need to be read.
const parquet::SchemaElement * element
The corresponding schema element defined in the file metadata.
Status InitNewRange()
Part of the HdfsScanner interface, not used in Parquet.
BaseColumnReader * CreateReader(const SchemaNode &node)
static const Status OK
Definition: status.h:87
static Status IssueInitialRanges(HdfsScanNode *scan_node, const std::vector< HdfsFileDesc * > &files)
std::string DebugString(int indent=0) const
bool is_impala_internal
If true, this file was generated by an Impala internal release.
bool VersionLt(int major, int minor=0, int patch=0) const
Returns true if version is strictly less than <major>.<minor>.<patch>
Status CreateSchemaTree(const std::vector< parquet::SchemaElement > &schema, SchemaNode *node) const