Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-avro-scanner.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HDFS_AVRO_SCANNER_H
17 #define IMPALA_EXEC_HDFS_AVRO_SCANNER_H
18 
22 //
27 //
40 //
41 //
47 //
53 //
61 //
67 
69 
70 #include <avro/basics.h>
71 #include "runtime/tuple.h"
72 #include "runtime/tuple-row.h"
73 
75 struct avro_obj_t;
76 typedef struct avro_obj_t* avro_schema_t;
77 
78 namespace impala {
79 
81  public:
84  static const uint8_t AVRO_VERSION_HEADER[4];
85 
86  HdfsAvroScanner(HdfsScanNode* scan_node, RuntimeState* state);
87 
89  static llvm::Function* Codegen(HdfsScanNode*,
90  const std::vector<ExprContext*>& conjunct_ctxs);
91 
92  protected:
94  virtual FileHeader* AllocateFileHeader();
96  virtual Status ReadFileHeader();
97  virtual Status InitNewRange();
98  virtual Status ProcessRange();
99 
100  virtual THdfsFileFormat::type file_format() const {
101  return THdfsFileFormat::AVRO;
102  }
103 
104  private:
109 
111 
114 
115  avro_schema_t operator->() const { return schema; }
116  avro_schema_t get() const { return schema; }
117 
118  private:
121  };
122 
126  struct SchemaElement {
129 
131  std::vector<SchemaElement> children;
132 
139 
143  };
144 
147  std::vector<SchemaElement> schema;
148 
154 
158  };
159 
161 
163  static const std::string AVRO_SCHEMA_KEY;
164  static const std::string AVRO_CODEC_KEY;
165 
167  static const std::string AVRO_NULL_CODEC;
168  static const std::string AVRO_SNAPPY_CODEC;
169  static const std::string AVRO_DEFLATE_CODEC;
170 
171  typedef int (*DecodeAvroDataFn)(HdfsAvroScanner*, int, MemPool*, uint8_t**,
172  Tuple*, TupleRow*);
173 
176 
179 
183  Status ResolveSchemas(const avro_schema_t& table_root,
184  const avro_schema_t& file_root);
185 
187  static SchemaElement ConvertSchema(const avro_schema_t& schema);
188 
191  Status VerifyTypesMatch(SlotDescriptor* slot_desc, avro_obj_t* schema);
192 
200  int DecodeAvroData(int max_tuples, MemPool* pool, uint8_t** data,
201  Tuple* tuple, TupleRow* tuple_row);
202 
204  void MaterializeTuple(MemPool* pool, uint8_t** data, Tuple* tuple);
205 
208  static llvm::Function* CodegenDecodeAvroData(
209  RuntimeState* state, llvm::Function* materialize_tuple_fn,
210  const std::vector<ExprContext*>& conjunct_ctxs);
211 
215  static llvm::Function* CodegenMaterializeTuple(HdfsScanNode* node,
216  LlvmCodeGen* codegen);
217 
227  void ReadAvroBoolean(
228  PrimitiveType type, uint8_t** data, bool write_slot, void* slot, MemPool* pool);
229  void ReadAvroInt32(
230  PrimitiveType type, uint8_t** data, bool write_slot, void* slot, MemPool* pool);
231  void ReadAvroInt64(
232  PrimitiveType type, uint8_t** data, bool write_slot, void* slot, MemPool* pool);
233  void ReadAvroFloat(
234  PrimitiveType type, uint8_t** data, bool write_slot, void* slot, MemPool* pool);
235  void ReadAvroDouble(
236  PrimitiveType type, uint8_t** data, bool write_slot, void* slot, MemPool* pool);
237  void ReadAvroVarchar(
238  PrimitiveType type, int max_len, uint8_t** data, bool write_slot, void* slot,
239  MemPool* pool);
240  void ReadAvroChar(
241  PrimitiveType type, int max_len, uint8_t** data, bool write_slot, void* slot,
242  MemPool* pool);
243  void ReadAvroString( PrimitiveType type, uint8_t** data, bool write_slot, void* slot,
244  MemPool* pool);
245 
251  void ReadAvroDecimal(
252  int slot_byte_size, uint8_t** data, bool write_slot, void* slot, MemPool* pool);
253 
256  bool ReadUnionType(int null_union_position, uint8_t** data);
257 
258  static const char* LLVM_CLASS_NAME;
259 };
260 } // namespace impala
261 
262 #endif // IMPALA_EXEC_HDFS_AVRO_SCANNER_H
int(* DecodeAvroDataFn)(HdfsAvroScanner *, int, MemPool *, uint8_t **, Tuple *, TupleRow *)
virtual Status ProcessRange()
void ReadAvroVarchar(PrimitiveType type, int max_len, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
virtual Status ReadFileHeader()
TODO: check that file schema matches metadata schema.
ScopedAvroSchemaT & operator=(const ScopedAvroSchemaT &)
static const std::string AVRO_SNAPPY_CODEC
struct avro_obj_t * avro_schema_t
A tuple with 0 materialised slots is represented as NULL.
Definition: tuple.h:48
void ReadAvroBoolean(PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
static SchemaElement ConvertSchema(const avro_schema_t &schema)
Utility function that maps the Avro library's type representation to our own.
virtual FileHeader * AllocateFileHeader()
Implementation of BaseSequenceScanner super class methods.
Status VerifyTypesMatch(SlotDescriptor *slot_desc, avro_obj_t *schema)
Status ParseMetadata()
Utility function for decoding and parsing file header metadata.
LLVM code generator. This is the top level object to generate jitted code.
Definition: llvm-codegen.h:107
void ReadAvroDecimal(int slot_byte_size, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
void MaterializeTuple(MemPool *pool, uint8_t **data, Tuple *tuple)
Materializes a single tuple from serialized record data.
static llvm::Function * CodegenMaterializeTuple(HdfsScanNode *node, LlvmCodeGen *codegen)
std::vector< SchemaElement > children
Complex types, e.g. records, may have nested child types.
static const uint8_t AVRO_VERSION_HEADER[4]
PrimitiveType
Definition: types.h:27
void ReadAvroDouble(PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
ObjectPool pool
int DecodeAvroData(int max_tuples, MemPool *pool, uint8_t **data, Tuple *tuple, TupleRow *tuple_row)
avro_schema_t schema
If not NULL, this owns a reference to schema.
Wrapper for avro_schema_t's that handles decrementing the ref count.
virtual Status InitNewRange()
Reset internal state for a new scan range.
void ReadAvroChar(PrimitiveType type, int max_len, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
static const std::string AVRO_SCHEMA_KEY
Metadata keys.
void ReadAvroFloat(PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
static const char * LLVM_CLASS_NAME
HdfsAvroScanner(HdfsScanNode *scan_node, RuntimeState *state)
void ReadAvroInt64(PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
static llvm::Function * Codegen(HdfsScanNode *, const std::vector< ExprContext * > &conjunct_ctxs)
Codegen parsing records, writing tuples and evaluating predicates.
void ReadAvroInt32(PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
virtual THdfsFileFormat::type file_format() const
Returns type of scanner: e.g. rcfile, seqfile.
static llvm::Function * CodegenDecodeAvroData(RuntimeState *state, llvm::Function *materialize_tuple_fn, const std::vector< ExprContext * > &conjunct_ctxs)
DecodeAvroDataFn codegend_decode_avro_data_
The codegen'd version of DecodeAvroData() if available, NULL otherwise.
ScopedAvroSchemaT schema
The record field schema from the file.
void ReadAvroString(PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)
Status ResolveSchemas(const avro_schema_t &table_root, const avro_schema_t &file_root)
static const std::string AVRO_NULL_CODEC
Supported codecs, as they appear in the metadata.
std::vector< SchemaElement > schema
List of SchemaElements corresponding to the fields of the file schema.
static const std::string AVRO_CODEC_KEY
static const std::string AVRO_DEFLATE_CODEC
AvroFileHeader * avro_header_
bool ReadUnionType(int null_union_position, uint8_t **data)