Impala
Impala is the open source, native analytic database for Apache Hadoop.
hdfs-text-table-writer.cc
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "exec/hdfs-text-table-writer.h"
#include "exec/exec-node.h"
#include "exprs/expr.h"
#include "exprs/expr-context.h"
#include "runtime/raw-value.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-state.h"
#include "runtime/hdfs-fs-cache.h"
#include "util/codec.h"
#include "util/compress.h"
#include "util/hdfs-util.h"

#include <hdfs.h>
#include <stdlib.h>

#include "common/names.h"

// Hdfs block size for compressed text.
static const int64_t COMPRESSED_BLOCK_SIZE = 64 * 1024 * 1024;

// Size to buffer before compression. We want this to be less than the block size
// (compressed text is not splittable).
static const int64_t COMPRESSED_BUFFERED_SIZE = 60 * 1024 * 1024;

namespace impala {

HdfsTextTableWriter::HdfsTextTableWriter(HdfsTableSink* parent,
    RuntimeState* state, OutputPartition* output,
    const HdfsPartitionDescriptor* partition,
    const HdfsTableDescriptor* table_desc,
    const vector<ExprContext*>& output_expr_ctxs)
    : HdfsTableWriter(
        parent, state, output, partition, table_desc, output_expr_ctxs) {
  tuple_delim_ = partition->line_delim();
  field_delim_ = partition->field_delim();
  escape_char_ = partition->escape_char();
  flush_size_ = HDFS_FLUSH_WRITE_SIZE;

  // The default stringstream output precision is not very high, making it impossible
  // to properly output doubles (they get rounded to ints). Set a more reasonable
  // precision.
  rowbatch_stringstream_.precision(RawValue::ASCII_PRECISION);
}

Status HdfsTextTableWriter::Init() {
  const TQueryOptions& query_options = state_->query_options();
  codec_ = THdfsCompression::NONE;
  if (query_options.__isset.compression_codec) {
    codec_ = query_options.compression_codec;
    if (codec_ == THdfsCompression::SNAPPY) {
      // hadoop.io.codec always means SNAPPY_BLOCKED. Alias the two.
      codec_ = THdfsCompression::SNAPPY_BLOCKED;
    }
  }

  if (codec_ != THdfsCompression::NONE) {
    mem_pool_.reset(new MemPool(parent_->mem_tracker()));
    RETURN_IF_ERROR(Codec::CreateCompressor(
        mem_pool_.get(), true, codec_, &compressor_));
    flush_size_ = COMPRESSED_BUFFERED_SIZE;
  } else {
    flush_size_ = HDFS_FLUSH_WRITE_SIZE;
  }
  parent_->mem_tracker()->Consume(flush_size_);
  return Status::OK;
}

void HdfsTextTableWriter::Close() {
  parent_->mem_tracker()->Release(flush_size_);
  if (mem_pool_.get() != NULL) mem_pool_->FreeAll();
}

uint64_t HdfsTextTableWriter::default_block_size() const {
  return compressor_.get() == NULL ? 0 : COMPRESSED_BLOCK_SIZE;
}

string HdfsTextTableWriter::file_extension() const {
  if (compressor_.get() == NULL) return "";
  return compressor_->file_extension();
}

Status HdfsTextTableWriter::AppendRowBatch(RowBatch* batch,
    const vector<int32_t>& row_group_indices,
    bool* new_file) {
  int32_t limit;
  if (row_group_indices.empty()) {
    limit = batch->num_rows();
  } else {
    limit = row_group_indices.size();
  }
  COUNTER_ADD(parent_->rows_inserted_counter(), limit);

  bool all_rows = row_group_indices.empty();
  int num_non_partition_cols =
      table_desc_->num_cols() - table_desc_->num_clustering_cols();
  DCHECK_GE(output_expr_ctxs_.size(), num_non_partition_cols) << parent_->DebugString();

  {
    SCOPED_TIMER(parent_->encode_timer());
    for (int row_idx = 0; row_idx < limit; ++row_idx) {
      TupleRow* current_row = all_rows ?
          batch->GetRow(row_idx) : batch->GetRow(row_group_indices[row_idx]);

      // There might be a select expr for partition cols as well, but we shouldn't be
      // writing their values to the row. Since there must be at least
      // num_non_partition_cols select exprs, and we assume that by convention any
      // partition col exprs are the last in output exprs, it's ok to just write
      // the first num_non_partition_cols values.
      for (int j = 0; j < num_non_partition_cols; ++j) {
        void* value = output_expr_ctxs_[j]->GetValue(current_row);
        if (value != NULL) {
          const ColumnType& type = output_expr_ctxs_[j]->root()->type();
          if (type.type == TYPE_CHAR) {
            char* val_ptr = StringValue::CharSlotToPtr(value, type);
            StringValue sv(val_ptr, StringValue::UnpaddedCharLength(val_ptr, type.len));
            PrintEscaped(&sv);
          } else if (type.IsVarLen()) {
            PrintEscaped(reinterpret_cast<const StringValue*>(value));
          } else {
            output_expr_ctxs_[j]->PrintValue(value, &rowbatch_stringstream_);
          }
        } else {
          // NULLs in hive are encoded based on the 'serialization.null.format' property.
          rowbatch_stringstream_ << table_desc_->null_column_value();
        }
        // Append field delimiter.
        if (j + 1 < num_non_partition_cols) {
          rowbatch_stringstream_ << field_delim_;
        }
      }
      // Append tuple delimiter.
      rowbatch_stringstream_ << tuple_delim_;
      ++output_->num_rows;
    }
  }

  *new_file = false;
  if (rowbatch_stringstream_.tellp() >= flush_size_) {
    RETURN_IF_ERROR(Flush());

    // If compressed, start a new file (compressed data is not splittable).
    *new_file = compressor_.get() != NULL;
  }

  return Status::OK;
}

Status HdfsTextTableWriter::Finalize() {
  return Flush();
}

Status HdfsTextTableWriter::Flush() {
  string rowbatch_string = rowbatch_stringstream_.str();
  rowbatch_stringstream_.str(string());
  const uint8_t* uncompressed_data =
      reinterpret_cast<const uint8_t*>(rowbatch_string.data());
  int64_t uncompressed_len = rowbatch_string.size();
  const uint8_t* data = uncompressed_data;
  int64_t len = uncompressed_len;

  if (compressor_.get() != NULL) {
    SCOPED_TIMER(parent_->compress_timer());
    uint8_t* compressed_data;
    int64_t compressed_len;
    RETURN_IF_ERROR(compressor_->ProcessBlock(false,
        uncompressed_len, uncompressed_data,
        &compressed_len, &compressed_data));
    data = compressed_data;
    len = compressed_len;
  }

  {
    SCOPED_TIMER(parent_->hdfs_write_timer());
    RETURN_IF_ERROR(Write(data, len));
  }

  return Status::OK;
}

inline void HdfsTextTableWriter::PrintEscaped(const StringValue* str_val) {
  for (int i = 0; i < str_val->len; ++i) {
    if (UNLIKELY(str_val->ptr[i] == field_delim_ || str_val->ptr[i] == escape_char_)) {
      rowbatch_stringstream_ << escape_char_;
    }
    rowbatch_stringstream_ << str_val->ptr[i];
  }
}

}
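
As a reading aid, here is a minimal standalone sketch (not Impala code; EscapeField is a hypothetical name) of the escaping rule PrintEscaped implements: any byte equal to the field delimiter or the escape character is prefixed with the escape character, so fields can be split unambiguously when the file is read back.

#include <iostream>
#include <sstream>
#include <string>

// Hypothetical helper mirroring PrintEscaped: escape the field delimiter
// and the escape character itself; all other bytes pass through unchanged.
static std::string EscapeField(const std::string& val, char field_delim,
    char escape_char) {
  std::stringstream out;
  for (char c : val) {
    if (c == field_delim || c == escape_char) out << escape_char;
    out << c;
  }
  return out.str();
}

int main() {
  // With ',' as the field delimiter and '\' as the escape character,
  // "a,b\c" is emitted as "a\,b\\c".
  std::cout << EscapeField("a,b\\c", ',', '\\') << std::endl;
  return 0;
}

Note that, matching the source above, the tuple delimiter itself is not escaped by PrintEscaped.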
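
The constructor's comment about stringstream precision is easy to reproduce in isolation. This minimal sketch (plain C++, independent of Impala; the value 16 is only illustrative of what RawValue::ASCII_PRECISION buys) shows the default six-significant-digit formatting that would mangle doubles:

#include <iostream>
#include <sstream>

int main() {
  std::stringstream ss;
  ss << 123456789.25 << '\n';  // default precision (6): "1.23457e+08"
  ss.precision(16);            // illustrative higher precision
  ss << 123456789.25 << '\n';  // "123456789.25"
  std::cout << ss.str();
  return 0;
}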
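
Finally, the sizing constants at the top encode the writer's flush policy: because compressed text is not splittable, rows are buffered up to COMPRESSED_BUFFERED_SIZE, each flush is compressed as one unit, and a new file is started so that a compressed file stays within a single COMPRESSED_BLOCK_SIZE HDFS block. A hedged sketch of that buffering pattern, with hypothetical names and no real compression or HDFS I/O:

#include <cstdint>
#include <sstream>
#include <string>

// Hypothetical buffered writer illustrating the flush policy used above:
// accumulate rows in a stringstream and flush once the buffered size
// reaches a threshold chosen to sit below the HDFS block size.
class BufferedTextWriter {
 public:
  explicit BufferedTextWriter(int64_t flush_size) : flush_size_(flush_size) {}

  // Returns true if the caller should start a new file after this append,
  // mirroring the *new_file flag in AppendRowBatch.
  bool AppendRow(const std::string& row) {
    buffer_ << row << '\n';
    if (static_cast<int64_t>(buffer_.tellp()) >= flush_size_) {
      Flush();
      return true;  // compressed output: one file per flushed block
    }
    return false;
  }

  void Flush() {
    // In the real writer this is where the buffered text is (optionally)
    // compressed and handed to the HDFS file; here we just reset the buffer.
    buffer_.str(std::string());
  }

 private:
  std::stringstream buffer_;
  int64_t flush_size_;
};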