Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-parquet-table-writer.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HDFS_PARQUET_TABLE_WRITER_H
17 #define IMPALA_EXEC_HDFS_PARQUET_TABLE_WRITER_H
18 
19 #include "exec/data-sink.h"
20 
21 #include <hdfs.h>
22 #include <map>
23 #include <boost/scoped_ptr.hpp>
24 
25 #include "util/compress.h"
26 #include "runtime/descriptors.h"
27 #include "exec/hdfs-table-writer.h"
28 #include "exec/parquet-common.h"
29 
30 namespace impala {
31 
32 class Expr;
33 struct OutputPartition;
34 class RuntimeState;
35 class ThriftSerializer;
36 class TupleRow;
37 
48 
50  public:
52  RuntimeState* state, OutputPartition* output_partition,
53  const HdfsPartitionDescriptor* part_desc,
54  const HdfsTableDescriptor* table_desc,
55  const std::vector<ExprContext*>& output_expr_ctxs);
56 
58 
60  virtual Status Init();
61 
64  virtual Status InitNewFile();
65 
67  virtual Status AppendRowBatch(RowBatch* batch,
68  const std::vector<int32_t>& row_group_indices,
69  bool* new_file);
70 
72  virtual Status Finalize();
73 
74  virtual void Close();
75 
77  virtual uint64_t default_block_size() const;
78 
79  virtual std::string file_extension() const { return "parq"; }
80 
81  private:
83  static const int DEFAULT_DATA_PAGE_SIZE = 64 * 1024;
84 
87  static const int64_t MAX_DATA_PAGE_SIZE = 1024 * 1024 * 1024;
88 
90  static const int HDFS_BLOCK_SIZE = 256 * 1024 * 1024;
91 
93  static const int HDFS_BLOCK_ALIGNMENT = 1024 * 1024;
94 
96  static const int ROW_GROUP_SIZE = HDFS_BLOCK_SIZE;
97 
99  static const int HDFS_MIN_FILE_SIZE = 8 * 1024 * 1024;
100 
104  friend class BaseColumnWriter;
105 
106  template<typename T> class ColumnWriter;
107  template<typename T> friend class ColumnWriter;
109  friend class BoolColumnWriter;
110 
112  int64_t MinBlockSize() const;
113 
117 
120 
123 
127 
131 
134  boost::scoped_ptr<ThriftSerializer> thrift_serializer_;
135 
137  parquet::FileMetaData file_metadata_;
138 
140  parquet::RowGroup* current_row_group_;
141 
143  std::vector<BaseColumnWriter*> columns_;
144 
146  int64_t row_count_;
147 
154 
157 
161  int64_t file_pos_;
162 
165  boost::scoped_ptr<MemPool> reusable_col_mem_pool_;
166 
169  boost::scoped_ptr<MemPool> per_file_mem_pool_;
170 
174  int row_idx_;
175 
178  std::vector<uint8_t> compression_staging_buffer_;
179 
181  TParquetInsertStats parquet_stats_;
182 };
183 
184 }
185 #endif
boost::scoped_ptr< ThriftSerializer > thrift_serializer_
boost::scoped_ptr< MemPool > reusable_col_mem_pool_
int64_t file_size_limit_
Limit on the total size of the file.
std::vector< uint8_t > compression_staging_buffer_
parquet::FileMetaData file_metadata_
File metadata thrift description.
static const int DEFAULT_DATA_PAGE_SIZE
Default data page size. In bytes.
parquet::RowGroup * current_row_group_
The current row group being written to.
virtual Status Init()
Initialize column information.
virtual void Close()
Called once when this writer should cleanup any resources.
int64_t row_count_
Number of rows in current file.
static const int HDFS_BLOCK_SIZE
Default hdfs block size. In bytes.
static const int HDFS_MIN_FILE_SIZE
Minimum file size. If the configured size is less, fail.
static const int HDFS_BLOCK_ALIGNMENT
Align block sizes to this constant. In bytes.
virtual Status Finalize()
Write out all the data.
HdfsParquetTableWriter(HdfsTableSink *parent, RuntimeState *state, OutputPartition *output_partition, const HdfsPartitionDescriptor *part_desc, const HdfsTableDescriptor *table_desc, const std::vector< ExprContext * > &output_expr_ctxs)
virtual std::string file_extension() const
Returns the file extension for this writer.
Status WriteFileHeader()
Write the file header information to the output file.
static const int ROW_GROUP_SIZE
Default row group size. In bytes.
std::vector< BaseColumnWriter * > columns_
Array of pointers to column information.
boost::scoped_ptr< MemPool > per_file_mem_pool_
virtual uint64_t default_block_size() const
Returns the target HDFS block size to use.
Metadata for a single partition inside an Hdfs table.
Definition: descriptors.h:177
virtual Status AppendRowBatch(RowBatch *batch, const std::vector< int32_t > &row_group_indices, bool *new_file)
Appends parquet representation of rows in the batch to the current file.
Status WriteFileFooter()
Write the file metadata and footer.
TParquetInsertStats parquet_stats_
For each column, the on-disk size written.
int64_t MinBlockSize() const
Minimum allowable block size in bytes. This is a function of the number of columns.