doc/html/hdfs-scanner_8cc_source.html

 // Copyright 2012 Cloudera Inc.

 //

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 //

 // http://www.apache.org/licenses/LICENSE-2.0

 //

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.


 #include "exec/hdfs-scanner.h"


 #include <sstream>

 #include <boost/algorithm/string.hpp>


 #include "codegen/codegen-anyval.h"

 #include "codegen/llvm-codegen.h"

 #include "common/logging.h"

 #include "common/object-pool.h"

 #include "exec/text-converter.h"

 #include "exec/hdfs-scan-node.h"

 #include "exec/read-write-util.h"

 #include "exec/text-converter.inline.h"

 #include "exprs/expr-context.h"

 #include "runtime/descriptors.h"

 #include "runtime/hdfs-fs-cache.h"

 #include "runtime/runtime-state.h"

 #include "runtime/mem-pool.h"

 #include "runtime/raw-value.h"

 #include "runtime/row-batch.h"

 #include "runtime/string-value.h"

 #include "runtime/tuple-row.h"

 #include "runtime/tuple.h"

 #include "util/codec.h"

 #include "util/debug-util.h"

 #include "util/runtime-profile.h"

 #include "util/sse-util.h"

 #include "util/string-parser.h"

 #include "gen-cpp/PlanNodes_types.h"


 #include "common/names.h"


 using namespace impala;

 using namespace llvm;


 // Names of the LLVM IR types corresponding to these C++ types.  They are passed to
 // LlvmCodeGen::GetType() when building the codegen'd WriteCompleteTuple signature.
 const char* FieldLocation::LLVM_CLASS_NAME = "struct.impala::FieldLocation";

 const char* HdfsScanner::LLVM_CLASS_NAME = "class.impala::HdfsScanner";


 // Creates a scanner bound to 'scan_node' and 'state'.  The tuple byte size and the
 // number of null bytes are cached from the scan node's tuple descriptor, and a MemPool
 // tracked by the scan node's MemTracker is created for data buffers.  context_, tuple_,
 // batch_ and write_tuples_fn_ start out NULL; context_ is assigned in Prepare().
 HdfsScanner::HdfsScanner(HdfsScanNode* scan_node, RuntimeState* state)

     : scan_node_(scan_node),

       state_(state),

       context_(NULL),

       tuple_byte_size_(scan_node->tuple_desc()->byte_size()),

       tuple_(NULL),

       batch_(NULL),

       num_errors_in_file_(0),

       num_null_bytes_(scan_node->tuple_desc()->num_null_bytes()),

       decompression_type_(THdfsCompression::NONE),

       data_buffer_pool_(new MemPool(scan_node->mem_tracker())),

       write_tuples_fn_(NULL) {

 }


 // The current row batch must have been handed off before destruction:
 // AddFinalRowBatch() is the call that resets batch_ to NULL.
 HdfsScanner::~HdfsScanner() {

   DCHECK(batch_ == NULL);

 }


 // Binds this scanner to 'context' and readies it to produce rows:
 //   - caches the context and its stream,
 //   - fetches the scan node's conjunct contexts,
 //   - builds the template tuple from the partition key value exprs,
 //   - starts the first row batch and registers the "DecompressionTime" timer.
 // Returns the error from GetConjunctCtxs() if that call fails, Status::OK otherwise.
 Status HdfsScanner::Prepare(ScannerContext* context) {
   stream_ = context->GetStream();
   context_ = context;

   RETURN_IF_ERROR(scan_node_->GetConjunctCtxs(&conjunct_ctxs_));

   template_tuple_ = scan_node_->InitTemplateTuple(
       state_, context_->partition_descriptor()->partition_key_value_ctxs());
   StartNewRowBatch();

   decompress_timer_ = ADD_TIMER(scan_node_->runtime_profile(), "DecompressionTime");
   return Status::OK;
 }


 void HdfsScanner::Close() {

   if (decompressor_.get() != NULL) decompressor_->Close();

   Expr::Close(conjunct_ctxs_, state_);

 }


 // Attempts to install the codegen'd write-tuples function for file format 'type' and
 // bumps the scan node's codegen enabled/disabled scanner counter accordingly.
 // Codegen is not used when the tuple has string slots and 'partition' defines an
 // escape character, or when the scan node has no codegen'd function for 'type'.
 // 'scanner_name' is only used for logging.  Always returns Status::OK.
 Status HdfsScanner::InitializeWriteTuplesFn(HdfsPartitionDescriptor* partition,
     THdfsFileFormat::type type, const string& scanner_name) {
   const bool has_string_slots = !scan_node_->tuple_desc()->string_slots().empty();
   if (has_string_slots && partition->escape_char() != '\0') {
     // Cannot use codegen if there are strings slots and we need to
     // compact (i.e. copy) the data.
     scan_node_->IncNumScannersCodegenDisabled();
     return Status::OK;
   }

   write_tuples_fn_ = reinterpret_cast<WriteTuplesFn>(scan_node_->GetCodegenFn(type));
   if (write_tuples_fn_ != NULL) {
     VLOG(2) << scanner_name << "(node_id=" << scan_node_->id()
             << ") using llvm codegend functions.";
     scan_node_->IncNumScannersCodegenEnabled();
   } else {
     scan_node_->IncNumScannersCodegenDisabled();
   }
   return Status::OK;
 }


 void HdfsScanner::StartNewRowBatch() {

   batch_ = new RowBatch(scan_node_->row_desc(), state_->batch_size(),

       scan_node_->mem_tracker());

   tuple_mem_ =

       batch_->tuple_data_pool()->Allocate(state_->batch_size() * tuple_byte_size_);

 }


 // Hands the caller the memory it should write the next rows into:
 //   *pool          - the current batch's tuple data pool
 //   *tuple_mem     - the current position in the batch's tuple memory
 //   *tuple_row_mem - the row returned by adding a row to the current batch
 // Returns how many more rows the current batch can take (capacity - num_rows).
 int HdfsScanner::GetMemory(MemPool** pool, Tuple** tuple_mem, TupleRow** tuple_row_mem) {
   DCHECK(batch_ != NULL);
   DCHECK_GT(batch_->capacity(), batch_->num_rows());

   const int next_row_idx = batch_->AddRow();
   *tuple_row_mem = batch_->GetRow(next_row_idx);
   *tuple_mem = reinterpret_cast<Tuple*>(tuple_mem_);
   *pool = batch_->tuple_data_pool();

   const int rows_left = batch_->capacity() - batch_->num_rows();
   return rows_left;
 }


 // Commits 'num_rows' rows to the current batch and advances tuple_mem_ past their
 // tuples.  If the batch is at capacity, or the context has completed io buffers, the
 // batch is passed on to the scan node and a new one is started.
 // Returns Status::CANCELLED if the context was cancelled, the query state's error if
 // there is one, and Status::OK otherwise.
 Status HdfsScanner::CommitRows(int num_rows) {
   DCHECK(batch_ != NULL);
   DCHECK_LE(num_rows, batch_->capacity() - batch_->num_rows());

   batch_->CommitRows(num_rows);
   tuple_mem_ += scan_node_->tuple_desc()->byte_size() * num_rows;

   // We need to pass the row batch to the scan node if there is too much memory attached,
   // which can happen if the query is very selective.
   const bool hand_off_batch =
       batch_->AtCapacity() || context_->num_completed_io_buffers() > 0;
   if (hand_off_batch) {
     context_->ReleaseCompletedResources(batch_, /* done */ false);
     scan_node_->AddMaterializedRowBatch(batch_);
     StartNewRowBatch();
   }

   if (context_->cancelled()) {
     return Status::CANCELLED;
   }
   RETURN_IF_ERROR(state_->CheckQueryState());

   // Free local expr allocations for this thread
   ExprContext::FreeLocalAllocations(conjunct_ctxs_);
   return Status::OK;
 }


 void HdfsScanner::AddFinalRowBatch() {

   DCHECK(batch_ != NULL);

   context_->ReleaseCompletedResources(batch_, /* done */ true);

   scan_node_->AddMaterializedRowBatch(batch_);

   batch_ = NULL;

 }


 // In this code path, no slots were materialized from the input files.  The only
 // slots are from partition keys.  This lets us simplify writing out the batches.
 //   1. template_tuple_ is the complete tuple.
 //   2. Eval conjuncts against the tuple.
 //   3. If it passes, stamp out 'num_tuples' copies of it into the row_batch.
 //
 // 'num_tuples' must be > 0 and must fit in the remaining capacity of 'row_batch'.
 // Returns the number of rows committed to 'row_batch': 'num_tuples', or 0 if the
 // template tuple does not pass the conjuncts.
 int HdfsScanner::WriteEmptyTuples(RowBatch* row_batch, int num_tuples) {
   DCHECK_GT(num_tuples, 0);

   if (template_tuple_ == NULL) {
     // No slots from partitions keys or slots.  This is count(*).  Just add the
     // number of rows to the batch.
     row_batch->AddRows(num_tuples);
     row_batch->CommitRows(num_tuples);
     return num_tuples;
   }

   // Make the first row and evaluate the conjuncts against it.  Every row holds the
   // same template tuple, so this single evaluation decides for all of them.
   int row_idx = row_batch->AddRow();
   TupleRow* current_row = row_batch->GetRow(row_idx);
   current_row->SetTuple(scan_node_->tuple_idx(), template_tuple_);
   if (!EvalConjuncts(current_row)) return 0;
   // Add first tuple
   row_batch->CommitLastRow();

   // Stamp out the remaining copies.  The count is kept in a separate variable so that
   // the value returned below still includes the first row committed above.
   const int num_remaining = num_tuples - 1;
   DCHECK_LE(num_remaining, row_batch->capacity() - row_batch->num_rows());

   for (int n = 0; n < num_remaining; ++n) {
     DCHECK(!row_batch->AtCapacity());
     row_idx = row_batch->AddRow();
     DCHECK(row_idx != RowBatch::INVALID_ROW_INDEX);
     current_row = row_batch->GetRow(row_idx);
     current_row->SetTuple(scan_node_->tuple_idx(), template_tuple_);
     row_batch->CommitLastRow();
   }
   return num_tuples;
 }


 // Variant of WriteEmptyTuples() that writes into rows starting at 'row' instead of
 // adding rows to a RowBatch.  No slots were materialized from the input files, so the
 // template tuple (if any) is the complete tuple:
 //   1. Point 'row' at template_tuple_ (skipped when there is no template tuple).
 //   2. Eval conjuncts against 'row'; return 0 if they fail.
 //   3. Point the following 'num_tuples' - 1 rows at template_tuple_ as well.
 // Returns 'num_tuples' when the conjuncts pass (or when 'num_tuples' is 0).
 int HdfsScanner::WriteEmptyTuples(ScannerContext* context,
     TupleRow* row, int num_tuples) {
   DCHECK_GE(num_tuples, 0);
   if (num_tuples == 0) return 0;

   const bool has_template = (template_tuple_ != NULL);
   if (has_template) {
     row->SetTuple(scan_node_->tuple_idx(), template_tuple_);
   }

   // Without a template tuple there must be conjuncts on constant exprs.
   if (!EvalConjuncts(row)) return 0;
   if (!has_template) return num_tuples;

   // The first row is already set; fill in the rest.
   int rows_to_fill = num_tuples - 1;
   while (rows_to_fill > 0) {
     row = next_row(row);
     row->SetTuple(scan_node_->tuple_idx(), template_tuple_);
     --rows_to_fill;
   }
   return num_tuples;
 }


 // Materializes one tuple from the parsed 'fields' and evaluates the conjuncts on it.
 //   pool           - passed through to the text converter when writing slots
 //   fields         - one FieldLocation per materialized slot
 //   tuple          - output tuple; initialized from 'template_tuple' first
 //   tuple_row      - row that is pointed at 'tuple' before evaluating conjuncts
 //   error_fields   - per-slot output: set to true where the slot failed to convert
 //   error_in_row   - output: true if any slot failed to convert
 // Returns the result of evaluating the conjuncts against 'tuple_row'.
 bool HdfsScanner::WriteCompleteTuple(MemPool* pool, FieldLocation* fields,
     Tuple* tuple, TupleRow* tuple_row, Tuple* template_tuple,
     uint8_t* error_fields, uint8_t* error_in_row) {
   *error_in_row = false;
   // Initialize tuple before materializing slots
   InitTuple(template_tuple, tuple);

   const vector<SlotDescriptor*>& slots = scan_node_->materialized_slots();
   for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
     // A negative length marks a field that needs unescaping; the magnitude is the
     // actual length.
     int field_len = fields[slot_idx].len;
     bool need_escape = false;
     if (UNLIKELY(field_len < 0)) {
       need_escape = true;
       field_len = -field_len;
     }

     const bool slot_failed = !text_converter_->WriteSlot(slots[slot_idx], tuple,
         fields[slot_idx].start, field_len, false, need_escape, pool);
     error_fields[slot_idx] = slot_failed;
     *error_in_row |= slot_failed;
   }

   tuple_row->SetTuple(scan_node_->tuple_idx(), tuple);
   return EvalConjuncts(tuple_row);
 }


 // Codegen for WriteCompleteTuple (above).  The signature matches WriteCompleteTuple

 // (except for the this* first argument).  Example IR for writing out and evaluating

 // a single string slot:

 // define i1 @WriteCompleteTuple(%"class.impala::HdfsScanner"* %this,

 //                               %"class.impala::MemPool"* %pool,

 //                               %"struct.impala::FieldLocation"* %fields,

 //                               %"class.impala::Tuple"* %tuple,

 //                               %"class.impala::TupleRow"* %tuple_row,

 //                               %"class.impala::Tuple"* %template,

 //                               i8* %error_fields, i8* %error_in_row) #20 {

 // entry:

 //   %tuple_ptr = bitcast %"class.impala::Tuple"* %tuple

 //                to { i8, %"struct.impala::StringValue" }*

 //   %tuple_ptr1 = bitcast %"class.impala::Tuple"* %template

 //                 to { i8, %"struct.impala::StringValue" }*

 //   %null_byte = getelementptr inbounds

 //                { i8, %"struct.impala::StringValue" }* %tuple_ptr, i32 0, i32 0

 //   store i8 0, i8* %null_byte

 //   %0 = bitcast %"class.impala::TupleRow"* %tuple_row

 //        to { i8, %"struct.impala::StringValue" }**

 //   %1 = getelementptr { i8, %"struct.impala::StringValue" }** %0, i32 0

 //   store { i8, %"struct.impala::StringValue" }* %tuple_ptr,

 //         { i8, %"struct.impala::StringValue" }** %1

 //   br label %parse

 //

 // parse:                                            ; preds = %entry

 //   %data_ptr = getelementptr %"struct.impala::FieldLocation"* %fields, i32 0, i32 0

 //   %len_ptr = getelementptr %"struct.impala::FieldLocation"* %fields, i32 0, i32 1

 //   %slot_error_ptr = getelementptr i8* %error_fields, i32 0

 //   %data = load i8** %data_ptr

 //   %len = load i32* %len_ptr

 //   %2 = call i1 @WriteSlot({ i8, %"struct.impala::StringValue" }* %tuple_ptr,

 //                           i8* %data, i32 %len)

 //   %slot_parse_error = xor i1 %2, true

 //   %error_in_row2 = or i1 false, %slot_parse_error

 //   %3 = zext i1 %slot_parse_error to i8

 //   store i8 %3, i8* %slot_error_ptr

 //   %4 = call %"class.impala::ExprContext"* @GetConjunctCtx(

 //       %"class.impala::HdfsScanner"* %this, i32 0)

 //   %conjunct_eval = call i16 @Eq_StringVal_StringValWrapper1(

 //       %"class.impala::ExprContext"* %4, %"class.impala::TupleRow"* %tuple_row)

 //   %5 = ashr i16 %conjunct_eval, 8

 //   %6 = trunc i16 %5 to i8

 //   %val = trunc i8 %6 to i1

 //   br i1 %val, label %parse3, label %eval_fail

 //

 // parse3:                                           ; preds = %parse

 //   %7 = zext i1 %error_in_row2 to i8

 //   store i8 %7, i8* %error_in_row

 //   ret i1 true

 //

 // eval_fail:                                        ; preds = %parse

 //   ret i1 false

 // }

 // Generates an LLVM function with the signature of WriteCompleteTuple() (plus the
 // leading 'this' argument), specialized for the materialized slots of 'node' and for
 // 'conjunct_ctxs'.  Slots are parsed lazily: each slot is materialized right before
 // the first conjunct that needs it, and the function returns false as soon as a
 // conjunct fails.
 // Returns NULL when codegen is not possible: a materialized slot is a TIMESTAMP or
 // DECIMAL, a per-slot write function or the tuple's llvm struct cannot be generated,
 // or a conjunct's compute function fails to codegen (that failure is also logged on
 // the runtime state).
 Function* HdfsScanner::CodegenWriteCompleteTuple(

     HdfsScanNode* node, LlvmCodeGen* codegen, const vector<ExprContext*>& conjunct_ctxs) {

   SCOPED_TIMER(codegen->codegen_timer());

   RuntimeState* state = node->runtime_state();


   // TODO: Timestamp and decimal slots are not yet supported; give up on codegen.

   for (int i = 0; i < node->materialized_slots().size(); ++i) {

     SlotDescriptor* slot_desc = node->materialized_slots()[i];

     if (slot_desc->type().type == TYPE_TIMESTAMP) return NULL;

     if (slot_desc->type().type == TYPE_DECIMAL) return NULL;

   }


   // Cast away const-ness.  The codegen only sets the cached typed llvm struct.

   TupleDescriptor* tuple_desc = const_cast<TupleDescriptor*>(node->tuple_desc());

   // One codegen'd WriteSlot function per materialized slot, indexed like
   // materialized_slots().
   vector<Function*> slot_fns;

   for (int i = 0; i < node->materialized_slots().size(); ++i) {

     SlotDescriptor* slot_desc = node->materialized_slots()[i];

     Function* fn = TextConverter::CodegenWriteSlot(codegen, tuple_desc, slot_desc,

         node->hdfs_table()->null_column_value().data(),

         node->hdfs_table()->null_column_value().size(), true);

     if (fn == NULL) return NULL;

     slot_fns.push_back(fn);

   }


   // Compute order to materialize slots.  BE assumes that conjuncts should

   // be evaluated in the order specified (optimization is already done by FE)

   vector<int> materialize_order;

   node->ComputeSlotMaterializationOrder(&materialize_order);


   // Get types to construct matching function signature to WriteCompleteTuple

   PointerType* uint8_ptr_type = PointerType::get(codegen->GetType(TYPE_TINYINT), 0);


   StructType* field_loc_type = reinterpret_cast<StructType*>(

       codegen->GetType(FieldLocation::LLVM_CLASS_NAME));

   Type* tuple_row_type = codegen->GetType(TupleRow::LLVM_CLASS_NAME);

   Type* tuple_opaque_type = codegen->GetType(Tuple::LLVM_CLASS_NAME);

   Type* mem_pool_type = codegen->GetType(MemPool::LLVM_CLASS_NAME);

   Type* hdfs_scanner_type = codegen->GetType(HdfsScanner::LLVM_CLASS_NAME);


   DCHECK(tuple_opaque_type != NULL);

   DCHECK(tuple_row_type != NULL);

   DCHECK(field_loc_type != NULL);

   DCHECK(hdfs_scanner_type != NULL);


   PointerType* field_loc_ptr_type = PointerType::get(field_loc_type, 0);

   PointerType* tuple_opaque_ptr_type = PointerType::get(tuple_opaque_type, 0);

   PointerType* tuple_row_ptr_type = PointerType::get(tuple_row_type, 0);

   PointerType* mem_pool_ptr_type = PointerType::get(mem_pool_type, 0);

   PointerType* hdfs_scanner_ptr_type = PointerType::get(hdfs_scanner_type, 0);


   // Generate the typed llvm struct for the output tuple

   StructType* tuple_type = tuple_desc->GenerateLlvmStruct(codegen);

   if (tuple_type == NULL) return NULL;

   PointerType* tuple_ptr_type = PointerType::get(tuple_type, 0);


   // Initialize the function prototype.  This needs to match

   // HdfsScanner::WriteCompleteTuple's signature identically.

   LlvmCodeGen::FnPrototype prototype(

       codegen, "WriteCompleteTuple", codegen->GetType(TYPE_BOOLEAN));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("this", hdfs_scanner_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("pool", mem_pool_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("fields", field_loc_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple", tuple_opaque_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple_row", tuple_row_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("template", tuple_opaque_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("error_fields", uint8_ptr_type));

   prototype.AddArgument(LlvmCodeGen::NamedVariable("error_in_row", uint8_ptr_type));


   LLVMContext& context = codegen->context();

   LlvmCodeGen::LlvmBuilder builder(context);

   // Sized to the eight arguments added to the prototype above.
   Value* args[8];

   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);


   BasicBlock* parse_block = BasicBlock::Create(context, "parse", fn);

   BasicBlock* eval_fail_block = BasicBlock::Create(context, "eval_fail", fn);


   // Extract the input args

   // args[1] ('pool') is not referenced by the generated body.
   Value* this_arg = args[0];

   Value* fields_arg = args[2];

   Value* tuple_arg = builder.CreateBitCast(args[3], tuple_ptr_type, "tuple_ptr");

   Value* tuple_row_arg = args[4];

   Value* template_arg = builder.CreateBitCast(args[5], tuple_ptr_type, "tuple_ptr");

   Value* errors_arg = args[6];

   Value* error_in_row_arg = args[7];


   // Codegen for function body

   Value* error_in_row = codegen->false_value();

   // Initialize tuple

   if (node->num_materialized_partition_keys() == 0) {

     // No partition key slots, just zero the NULL bytes.

     for (int i = 0; i < tuple_desc->num_null_bytes(); ++i) {

       Value* null_byte = builder.CreateStructGEP(tuple_arg, i, "null_byte");

       builder.CreateStore(codegen->GetIntConstant(TYPE_TINYINT, 0), null_byte);

     }

   } else {

     // Copy template tuple.

     // TODO: only copy what's necessary from the template tuple.

     codegen->CodegenMemcpy(&builder, tuple_arg, template_arg, tuple_desc->byte_size());

   }


   // Put tuple in tuple_row

   Value* tuple_row_typed =

       builder.CreateBitCast(tuple_row_arg, PointerType::get(tuple_ptr_type, 0));

   Value* tuple_row_idxs[] = { codegen->GetIntConstant(TYPE_INT, node->tuple_idx()) };

   Value* tuple_in_row_addr = builder.CreateGEP(tuple_row_typed, tuple_row_idxs);

   builder.CreateStore(tuple_arg, tuple_in_row_addr);

   builder.CreateBr(parse_block);


   // Loop through all the conjuncts in order and materialize slots as necessary to

   // evaluate the conjuncts (e.g. conjunct_ctxs[0] will have the slots it references

   // first).

   // materialize_order[slot_idx] represents the first conjunct which needs that slot.

   // Slots are only materialized if its order matches the current conjunct being

   // processed.  This guarantees that each slot is materialized once when it is first

   // needed and that at the end of the materialize loop, the conjunct has everything

   // it needs (either from this iteration or previous iterations).

   builder.SetInsertPoint(parse_block);

   // Note the '<=': the extra final iteration (conjunct_idx == conjunct_ctxs.size())
   // materializes the slots that no conjunct references and emits the 'return true'.
   for (int conjunct_idx = 0; conjunct_idx <= conjunct_ctxs.size(); ++conjunct_idx) {

     for (int slot_idx = 0; slot_idx < materialize_order.size(); ++slot_idx) {

       // If they don't match, it means either the slot has already been

       // materialized for a previous conjunct or will be materialized later for

       // another conjunct.  Either case, the slot does not need to be materialized

       // yet.

       if (materialize_order[slot_idx] != conjunct_idx) continue;


       // Materialize slots[slot_idx] to evaluate conjunct_ctxs[conjunct_idx]

       // All slots[i] with materialize_order[i] < conjunct_idx have already been

       // materialized by prior iterations through the outer loop


       // Extract ptr/len from fields

       Value* data_idxs[] = {

         codegen->GetIntConstant(TYPE_INT, slot_idx),

         codegen->GetIntConstant(TYPE_INT, 0),

       };

       Value* len_idxs[] = {

         codegen->GetIntConstant(TYPE_INT, slot_idx),

         codegen->GetIntConstant(TYPE_INT, 1),

       };

       Value* error_idxs[] = {

         codegen->GetIntConstant(TYPE_INT, slot_idx),

       };

       Value* data_ptr = builder.CreateGEP(fields_arg, data_idxs, "data_ptr");

       Value* len_ptr = builder.CreateGEP(fields_arg, len_idxs, "len_ptr");

       Value* error_ptr = builder.CreateGEP(errors_arg, error_idxs, "slot_error_ptr");

       Value* data = builder.CreateLoad(data_ptr, "data");

       Value* len = builder.CreateLoad(len_ptr, "len");


       // Call slot parse function

       Function* slot_fn = slot_fns[slot_idx];

       Value* slot_parsed = builder.CreateCall3(slot_fn, tuple_arg, data, len);

       Value* slot_error = builder.CreateNot(slot_parsed, "slot_parse_error");

       error_in_row = builder.CreateOr(error_in_row, slot_error, "error_in_row");

       slot_error = builder.CreateZExt(slot_error, codegen->GetType(TYPE_TINYINT));

       builder.CreateStore(slot_error, error_ptr);

     }


     if (conjunct_idx == conjunct_ctxs.size()) {

       // In this branch, we've just materialized slots not referenced by any conjunct.

       // These slots are the last to get materialized.  If we are in this branch, the

       // tuple passed all conjuncts and should be added to the row batch.

       Value* error_ret = builder.CreateZExt(error_in_row, codegen->GetType(TYPE_TINYINT));

       builder.CreateStore(error_ret, error_in_row_arg);

       builder.CreateRet(codegen->true_value());

     } else {

       // All slots for conjunct_ctxs[conjunct_idx] are materialized, evaluate the partial

       // tuple against that conjunct and start a new parse_block for the next conjunct

       parse_block = BasicBlock::Create(context, "parse", fn, eval_fail_block);

       Function* conjunct_fn;

       Status status =

           conjunct_ctxs[conjunct_idx]->root()->GetCodegendComputeFn(state, &conjunct_fn);

       if (!status.ok()) {

         stringstream ss;

         ss << "Failed to codegen conjunct: " << status.GetDetail();

         state->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()));

         // Discard the partially built function before giving up.
         fn->eraseFromParent();

         return NULL;

       }


       Function* get_ctx_fn =

           codegen->GetFunction(IRFunction::HDFS_SCANNER_GET_CONJUNCT_CTX);

       Value* ctx = builder.CreateCall2(

           get_ctx_fn, this_arg, codegen->GetIntConstant(TYPE_INT, conjunct_idx));


       Value* conjunct_args[] = {ctx, tuple_row_arg};

       CodegenAnyVal result = CodegenAnyVal::CreateCallWrapped(

           codegen, &builder, TYPE_BOOLEAN, conjunct_fn, conjunct_args, "conjunct_eval");

       builder.CreateCondBr(result.GetVal(), parse_block, eval_fail_block);

       builder.SetInsertPoint(parse_block);

     }

   }


   // Block if eval failed.

   builder.SetInsertPoint(eval_fail_block);

   builder.CreateRet(codegen->false_value());


   codegen->OptimizeFunctionWithExprs(fn);

   return codegen->FinalizeFunction(fn);

 }


 // Produces a WriteAlignedTuples function whose call to "WriteCompleteTuple" is
 // replaced by the codegen'd 'write_complete_tuple_fn' (which must not be NULL).
 // Exactly one call site is expected to be replaced.  Returns the finalized function.
 Function* HdfsScanner::CodegenWriteAlignedTuples(HdfsScanNode* node,
     LlvmCodeGen* codegen, Function* write_complete_tuple_fn) {
   SCOPED_TIMER(codegen->codegen_timer());
   DCHECK(write_complete_tuple_fn != NULL);

   Function* aligned_tuples_fn =
       codegen->GetFunction(IRFunction::HDFS_SCANNER_WRITE_ALIGNED_TUPLES);
   DCHECK(aligned_tuples_fn != NULL);

   int num_replaced = 0;
   Function* patched_fn = codegen->ReplaceCallSites(aligned_tuples_fn, false,
       write_complete_tuple_fn, "WriteCompleteTuple", &num_replaced);
   DCHECK_EQ(num_replaced, 1) << "One call site should be replaced.";
   DCHECK(patched_fn != NULL);

   return codegen->FinalizeFunction(patched_fn);
 }


 // Makes decompressor_ match 'compression'.  Does nothing if 'compression' equals the
 // most recently used decompression type.  Otherwise any existing decompressor is
 // closed and released, and a new one is created unless 'compression' is NONE or LZO.
 // Returns the error from Codec::CreateDecompressor() if creation fails; in that case
 // the scanner is left with no decompressor and decompression type NONE.
 Status HdfsScanner::UpdateDecompressor(const THdfsCompression::type& compression) {
   // Check whether the file in the stream has different compression from the last one.
   if (compression == decompression_type_) return Status::OK;

   // Close the previous decompressor before creating a new one.  Test the pointer
   // rather than the previous type: no decompressor exists when the previous type was
   // LZO (see below), so dereferencing it unconditionally would be invalid.
   if (decompressor_.get() != NULL) {
     decompressor_->Close();
     decompressor_.reset(NULL);
   }
   // Keep the recorded type in sync with decompressor_ so that a failed creation below
   // does not leave the old type paired with no decompressor.
   decompression_type_ = THdfsCompression::NONE;

   // The LZO-compression scanner is implemented in a dynamically linked library and it
   // is not created at Codec::CreateDecompressor().
   if (compression != THdfsCompression::NONE && compression != THdfsCompression::LZO) {
     RETURN_IF_ERROR(Codec::CreateDecompressor(data_buffer_pool_.get(),
         scan_node_->tuple_desc()->string_slots().empty(), compression, &decompressor_));
   }
   decompression_type_ = compression;
   return Status::OK;
 }


 // Name-based overload: looks 'codec' up in Codec::CODEC_MAP and forwards the mapped
 // compression type to the overload above.  Returns an error status built from
 // Codec::UNKNOWN_CODEC_ERROR followed by the name if 'codec' is not in the map.
 Status HdfsScanner::UpdateDecompressor(const string& codec) {
   map<const string, const THdfsCompression::type>::const_iterator
     entry = Codec::CODEC_MAP.find(codec);

   if (entry != Codec::CODEC_MAP.end()) {
     return UpdateDecompressor(entry->second);
   }

   stringstream ss;
   ss << Codec::UNKNOWN_CODEC_ERROR << codec;
   return Status(ss.str());
 }


 // Reports the parse errors recorded for one row.
 //   fields  - field locations of the row, one per materialized slot
 //   errors  - per-slot error flags; every set flag is reported and then cleared
 //   row_idx - forwarded to LogRowParseError() to log the offending record
 // Also bumps the per-file error count and, when abort_on_error is set, reports one
 // file error for the current stream's file.
 // Returns true if parsing may continue (parse_status_ is still ok).
 bool HdfsScanner::ReportTupleParseError(FieldLocation* fields, uint8_t* errors,
     int row_idx) {
   const vector<SlotDescriptor*>& slots = scan_node_->materialized_slots();
   for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
     if (!errors[slot_idx]) continue;
     const SlotDescriptor* bad_slot = slots[slot_idx];
     ReportColumnParseError(bad_slot, fields[slot_idx].start, fields[slot_idx].len);
     errors[slot_idx] = false;
   }

   // Call into subclass to log a more accurate error message.
   if (state_->LogHasSpace()) {
     stringstream row_msg;
     row_msg << "file: " << stream_->filename() << endl << "record: ";
     LogRowParseError(row_idx, &row_msg);
     state_->LogError(ErrorMsg(TErrorCode::GENERAL, row_msg.str()));
   }

   ++num_errors_in_file_;
   if (state_->abort_on_error()) {
     state_->ReportFileErrors(stream_->filename(), 1);
     DCHECK(!parse_status_.ok());
   }
   return parse_status_.ok();
 }


 // Base-class placeholder: writes nothing to 'ss' and trips a DCHECK if it is reached.
 void HdfsScanner::LogRowParseError(int row_idx, stringstream* ss) {

   // This is only called for text and seq files which should override this function.

   DCHECK(false);

 }


 void HdfsScanner::ReportColumnParseError(const SlotDescriptor* desc,

     const char* data, int len) {

   // len < 0 is used to indicate the data contains escape characters.  We don't care

   // about that here and can just output the raw string.

   if (len < 0) len = -len;


   if (state_->LogHasSpace() || state_->abort_on_error()) {

     stringstream ss;

     ss << "Error converting column: "

        << desc->col_pos() - scan_node_->num_partition_keys()

        << " TO " << desc->type()

        << " (Data is: " << string(data,len) << ")";

     if (state_->LogHasSpace()) {

       state_->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()));

     }


     if (state_->abort_on_error() && parse_status_.ok()) parse_status_ = Status(ss.str());

   }

 }

impala::HdfsScanNode::materialized_slots
const std::vector< SlotDescriptor * > & materialized_slots() const
Definition: hdfs-scan-node.h:119

impala::HdfsTableDescriptor::null_column_value
const std::string & null_column_value() const
Definition: descriptors.h:233

row-batch.h

impala::TupleDescriptor
Definition: descriptors.h:298

impala::Codec::CODEC_MAP
static const CodecMap CODEC_MAP
Definition: codec.h:52

impala::ExecNode::id
int id() const
Definition: exec-node.h:154

impala::HdfsScanner::decompressor_
boost::scoped_ptr< Codec > decompressor_
Decompressor class to use, if any.
Definition: hdfs-scanner.h:198

impala::HdfsScanner::ReportColumnParseError
void ReportColumnParseError(const SlotDescriptor *desc, const char *data, int len)
Definition: hdfs-scanner.cc:577

impala::CodegenAnyVal
Definition: codegen-anyval.h:52

impala::RuntimeState::CheckQueryState
Status CheckQueryState()
Definition: runtime-state.cc:286

impala::FieldLocation::LLVM_CLASS_NAME
static const char * LLVM_CLASS_NAME
Definition: hdfs-scanner.h:61

impala::RowBatch::num_rows
int num_rows() const
Definition: row-batch.h:215

impala::HdfsScanner::scan_node_
HdfsScanNode * scan_node_
The scan node that started this scanner.
Definition: hdfs-scanner.h:141

impala::HdfsScanner::LogRowParseError
virtual void LogRowParseError(int row_idx, std::stringstream *)
Definition: hdfs-scanner.cc:572

impala::Status::GetDetail
const std::string GetDetail() const
Definition: status.cc:184

impala::RowBatch::AddRows
int AddRows(int n)
Definition: row-batch.h:94

impala::HdfsScanner::LLVM_CLASS_NAME
static const char * LLVM_CLASS_NAME
Definition: hdfs-scanner.h:137

impala::CodegenAnyVal::CreateCallWrapped
static CodegenAnyVal CreateCallWrapped(LlvmCodeGen *cg, LlvmCodeGen::LlvmBuilder *builder, const ColumnType &type, llvm::Function *fn, llvm::ArrayRef< llvm::Value * > args, const char *name="", llvm::Value *result_ptr=NULL)
Same as above but wraps the result in a CodegenAnyVal.
Definition: codegen-anyval.cc:148

hdfs-scan-node.h

impala::HdfsScanNode::num_partition_keys
int num_partition_keys() const
Returns number of partition keys in the table, including non-materialized slots.
Definition: hdfs-scan-node.h:127

impala::FieldLocation
Definition: hdfs-scanner.h:52

impala::HdfsScanner::context_
ScannerContext * context_
Context for this scanner.
Definition: hdfs-scanner.h:147

impala::LlvmCodeGen::codegen_timer
RuntimeProfile::Counter * codegen_timer()
Definition: llvm-codegen.h:135

runtime-profile.h

hdfs-scanner.h

codegen-anyval.h

impala::HdfsScanner::tuple_byte_size_
int tuple_byte_size_
Fixed size of each tuple, in bytes.
Definition: hdfs-scanner.h:167

impala::HdfsScanner::data_buffer_pool_
boost::scoped_ptr< MemPool > data_buffer_pool_
Definition: hdfs-scanner.h:205

impala::HdfsScanner::CodegenWriteCompleteTuple
static llvm::Function * CodegenWriteCompleteTuple(HdfsScanNode *, LlvmCodeGen *, const std::vector< ExprContext * > &conjunct_ctxs)
Definition: hdfs-scanner.cc:296

impala::ExecNode::mem_tracker
MemTracker * mem_tracker()
Definition: exec-node.h:162

impala::Codec::CreateDecompressor
static Status CreateDecompressor(MemPool *mem_pool, bool reuse, THdfsCompression::type format, boost::scoped_ptr< Codec > *decompressor)

impala::HdfsScanner::text_converter_
boost::scoped_ptr< TextConverter > text_converter_
Helper class for converting text to other types.
Definition: hdfs-scanner.h:186

text-converter.h

impala::HdfsScanner::write_tuples_fn_
WriteTuplesFn write_tuples_fn_
Jitted write tuples function pointer. Null if codegen is disabled.
Definition: hdfs-scanner.h:215

impala::LlvmCodeGen::NamedVariable
Utility struct that wraps a variable name and llvm type.
Definition: llvm-codegen.h:149

impala::HdfsScanner::tuple_mem_
uint8_t * tuple_mem_
The tuple memory of batch_.
Definition: hdfs-scanner.h:180

impala::HdfsScanNode::hdfs_table
const HdfsTableDescriptor * hdfs_table()
Definition: hdfs-scan-node.h:134

impala::Tuple
A tuple with 0 materialised slots is represented as NULL.
Definition: tuple.h:48

impala::ScannerContext::num_completed_io_buffers
int num_completed_io_buffers() const
Definition: scanner-context.h:277

impala::HdfsScanner::parse_status_
Status parse_status_
Definition: hdfs-scanner.h:195

impala::ScannerContext::ReleaseCompletedResources
void ReleaseCompletedResources(RowBatch *batch, bool done)
Definition: scanner-context.cc:45

RETURN_IF_ERROR
#define RETURN_IF_ERROR(stmt)
some generally useful macros
Definition: status.h:242

raw-value.h

impala::HdfsScanner::conjunct_ctxs_
std::vector< ExprContext * > conjunct_ctxs_
Definition: hdfs-scanner.h:154

impala::RowBatch::GetRow
TupleRow * GetRow(int row_idx)
Definition: row-batch.h:140

impala::HdfsScanner::WriteEmptyTuples
int WriteEmptyTuples(RowBatch *row_batch, int num_tuples)
Definition: hdfs-scanner.cc:157

impala::RuntimeState::LogHasSpace
bool LogHasSpace()
Returns true if the error log has not reached max_errors_.
Definition: runtime-state.h:211

codec.h

ADD_TIMER
#define ADD_TIMER(profile, name)
Definition: runtime-profile.h:50

sse-util.h

expr-context.h

impala::HdfsScanner::AddFinalRowBatch
void AddFinalRowBatch()
Definition: hdfs-scanner.cc:145

impala::HdfsScanner::~HdfsScanner
virtual ~HdfsScanner()
Definition: hdfs-scanner.cc:67

impala::RowBatch::AtCapacity
bool AtCapacity()
Definition: row-batch.h:120

impala::TupleDescriptor::num_null_bytes
int num_null_bytes() const
Definition: descriptors.h:301

impala::TupleDescriptor::byte_size
int byte_size() const
Definition: descriptors.h:300

impala::HdfsScanner::template_tuple_
Tuple * template_tuple_
Definition: hdfs-scanner.h:164

llvm::IRBuilder
Definition: llvm-codegen.h:60

impala::SlotDescriptor
Definition: descriptors.h:75

impala::ScannerContext::cancelled
bool cancelled() const
If true, the ScanNode has been cancelled and the scanner thread should finish up. ...
Definition: scanner-context.cc:282

impala::TupleDescriptor::GenerateLlvmStruct
llvm::StructType * GenerateLlvmStruct(LlvmCodeGen *codegen)
Definition: descriptors.cc:556

impala::ExecNode::row_desc
const RowDescriptor & row_desc() const
Definition: exec-node.h:156

impala::ScannerContext
Definition: scanner-context.h:55

llvm-codegen.h

impala::HdfsScanner::next_row
TupleRow * next_row(TupleRow *r) const
Definition: hdfs-scanner.h:368

logging.h

impala::TupleRow
Definition: tuple-row.h:28

impala::TYPE_TIMESTAMP
Definition: types.h:37

impala::ScannerContext::Stream::filename
const char * filename()
Definition: scanner-context.h:118

SCOPED_TIMER
#define SCOPED_TIMER(c)
Definition: runtime-profile.h:53

hdfs-fs-cache.h

impala::Expr::Close
static void Close(const std::vector< ExprContext * > &ctxs, RuntimeState *state)
Convenience function for closing multiple expr trees.

impala::HdfsScanner::StartNewRowBatch
void StartNewRowBatch()
Set batch_ to a new row batch and update tuple_mem_ accordingly.
Definition: hdfs-scanner.cc:108

impala::HdfsScanner::decompression_type_
THdfsCompression::type decompression_type_
The most recently used decompression type.
Definition: hdfs-scanner.h:201

object-pool.h

impala::TupleDescriptor::string_slots
const std::vector< SlotDescriptor * > & string_slots() const
Definition: descriptors.h:303

impala::HdfsScanner::Close
virtual void Close()
Definition: hdfs-scanner.cc:82

impala::HdfsScanNode::IncNumScannersCodegenEnabled
void IncNumScannersCodegenEnabled()
Definition: hdfs-scan-node.h:168

impala::LlvmCodeGen
LLVM code generator. This is the top level object to generate jitted code.
Definition: llvm-codegen.h:107

impala::HdfsPartitionDescriptor::escape_char
char escape_char() const
Definition: descriptors.h:183

impala::Status
Definition: status.h:81

impala::HdfsScanner::state_
RuntimeState * state_
RuntimeState for error reporting.
Definition: hdfs-scanner.h:144

impala::TupleRow::LLVM_CLASS_NAME
static const char * LLVM_CLASS_NAME
Definition: tuple-row.h:76

impala::TYPE_INT
Definition: types.h:33

impala::ColumnType::type
PrimitiveType type
Definition: types.h:60

impala::LlvmCodeGen::FnPrototype::AddArgument
void AddArgument(const NamedVariable &var)
Add argument.
Definition: llvm-codegen.h:171

impala::MemPool
Definition: mem-pool.h:77

impala::HdfsScanner::UpdateDecompressor
Status UpdateDecompressor(const THdfsCompression::type &compression)
Definition: hdfs-scanner.cc:513

impala::Tuple::LLVM_CLASS_NAME
static const char * LLVM_CLASS_NAME
For C++/IR interop, we need to be able to look up types by name.
Definition: tuple.h:134

impala::RuntimeState::LogError
bool LogError(const ErrorMsg &msg)
Definition: runtime-state.cc:224

impala::HdfsScanner::InitTuple
void InitTuple(Tuple *template_tuple, Tuple *tuple)
Definition: hdfs-scanner.h:355

impala::HdfsScanner::GetMemory
int GetMemory(MemPool **pool, Tuple **tuple_mem, TupleRow **tuple_row_mem)
Definition: hdfs-scanner.cc:115

impala::RuntimeState
Definition: runtime-state.h:69

string-parser.h

impala::HdfsScanNode::ComputeSlotMaterializationOrder
void ComputeSlotMaterializationOrder(std::vector< int > *order) const
Definition: hdfs-scan-node.cc:956

impala::SlotDescriptor::type
const ColumnType & type() const
Definition: descriptors.h:78

impala::HdfsScanner::num_errors_in_file_
int num_errors_in_file_
Number of errors in the current file.
Definition: hdfs-scanner.h:183

impala::RuntimeState::ReportFileErrors
void ReportFileErrors(const std::string &file_name, int num_errors)
Report that num_errors occurred while parsing file_name.
Definition: runtime-state.cc:219

impala::LlvmCodeGen::CodegenMemcpy
void CodegenMemcpy(LlvmBuilder *, llvm::Value *dst, llvm::Value *src, int size)
Definition: llvm-codegen.cc:933

debug-util.h

pool
ObjectPool pool
Definition: expr-benchmark.cc:89

impala::HdfsScanner::CommitRows
Status CommitRows(int num_rows)
Definition: hdfs-scanner.cc:124

impala::HdfsScanner::HdfsScanner
HdfsScanner(HdfsScanNode *scan_node, RuntimeState *state)
Definition: hdfs-scanner.cc:53

impala::LlvmCodeGen::GetFunction
llvm::Function * GetFunction(IRFunction::Type)
Definition: llvm-codegen.cc:421

impala::HdfsScanNode
Definition: hdfs-scan-node.h:104

impala::SlotDescriptor::col_pos
int col_pos() const
Definition: descriptors.h:84

impala::HdfsScanNode::runtime_state
RuntimeState * runtime_state()
Definition: hdfs-scan-node.h:136

impala::HdfsScanNode::AddMaterializedRowBatch
void AddMaterializedRowBatch(RowBatch *row_batch)
Definition: hdfs-scan-node.cc:688

impala::TYPE_BOOLEAN
Definition: types.h:30

impala::HdfsScanner::WriteCompleteTuple
bool WriteCompleteTuple(MemPool *pool, FieldLocation *fields, Tuple *tuple, TupleRow *tuple_row, Tuple *template_tuple, uint8_t *error_fields, uint8_t *error_in_row)
Definition: hdfs-scanner.cc:217

impala::Codec::UNKNOWN_CODEC_ERROR
static const char *const UNKNOWN_CODEC_ERROR
Definition: codec.h:48

impala::RowBatch::CommitLastRow
void CommitLastRow()
Definition: row-batch.h:109

impala::MemPool::LLVM_CLASS_NAME
static const char * LLVM_CLASS_NAME
Definition: mem-pool.h:177

impala::HdfsScanNode::GetCodegenFn
void * GetCodegenFn(THdfsFileFormat::type)
Definition: hdfs-scan-node.cc:224

impala::LlvmCodeGen::true_value
llvm::Value * true_value()
Returns true/false constants (bool type)
Definition: llvm-codegen.h:380

runtime-state.h

impala::RowBatch
Definition: row-batch.h:66

impala::Status::CANCELLED
static const Status CANCELLED
Definition: status.h:88

impala::RuntimeState::batch_size
int batch_size() const
Definition: runtime-state.h:98

impala::HdfsScanner::EvalConjuncts
bool IR_ALWAYS_INLINE EvalConjuncts(TupleRow *row)
Definition: hdfs-scanner.h:266

impala::RowBatch::tuple_data_pool
MemPool * tuple_data_pool()
Definition: row-batch.h:148

impala::HdfsScanner::CodegenWriteAlignedTuples
static llvm::Function * CodegenWriteAlignedTuples(HdfsScanNode *, LlvmCodeGen *, llvm::Function *write_tuple_fn)
Definition: hdfs-scanner.cc:495

impala::TupleRow::SetTuple
void SetTuple(int tuple_idx, Tuple *tuple)
Definition: tuple-row.h:34

impala::ErrorMsg
Definition: error-util.h:47

read-write-util.h

impala::ExprContext::FreeLocalAllocations
void FreeLocalAllocations()
Definition: expr-context.cc:109

impala::RowBatch::capacity
int capacity() const
Definition: row-batch.h:216

impala::RowBatch::AddRow
int AddRow()
Definition: row-batch.h:100

UNLIKELY
#define UNLIKELY(expr)
Definition: compiler-util.h:33

impala::FieldLocation::len
int len
Definition: hdfs-scanner.h:59

impala::LlvmCodeGen::false_value
llvm::Value * false_value()
Definition: llvm-codegen.h:381

impala::CodegenAnyVal::GetVal
llvm::Value * GetVal(const char *name="val")
Definition: codegen-anyval.cc:258

impala::Status::OK
static const Status OK
Definition: status.h:87

tuple.h

impala::LlvmCodeGen::GetType
llvm::Type * GetType(const ColumnType &type)
Returns llvm type for the column type.
Definition: llvm-codegen.cc:312

mem-pool.h

impala::HdfsPartitionDescriptor
Metadata for a single partition inside an Hdfs table.
Definition: descriptors.h:177

names.h

impala::LlvmCodeGen::GetIntConstant
llvm::Value * GetIntConstant(PrimitiveType type, int64_t val)
Returns the constant 'val' of 'type'.
Definition: llvm-codegen.cc:371

impala::LlvmCodeGen::FinalizeFunction
llvm::Function * FinalizeFunction(llvm::Function *function)
Definition: llvm-codegen.cc:596

tuple-row.h

impala::ScannerContext::GetStream
Stream * GetStream(int idx=0)
Definition: scanner-context.h:246

text-converter.inline.h

impala::HdfsScanNode::tuple_idx
int tuple_idx() const
Definition: hdfs-scan-node.h:124

impala::RuntimeState::abort_on_error
bool abort_on_error() const
Definition: runtime-state.h:99

impala::LlvmCodeGen::ReplaceCallSites
llvm::Function * ReplaceCallSites(llvm::Function *caller, bool update_in_place, llvm::Function *new_fn, const std::string &target_name, int *num_replaced)
Definition: llvm-codegen.cc:489

impala::HdfsScanner::batch_
RowBatch * batch_
Definition: hdfs-scanner.h:177

impala::Status::ok
bool ok() const
Definition: status.h:172

impala::HdfsScanner::decompress_timer_
RuntimeProfile::Counter * decompress_timer_
Time spent decompressing bytes.
Definition: hdfs-scanner.h:208

impala::TYPE_TINYINT
Definition: types.h:31

impala::LlvmCodeGen::context
llvm::LLVMContext & context()
Definition: llvm-codegen.h:214

descriptors.h

impala::HdfsScanner::InitializeWriteTuplesFn
Status InitializeWriteTuplesFn(HdfsPartitionDescriptor *partition, THdfsFileFormat::type type, const std::string &scanner_name)
Definition: hdfs-scanner.cc:87

impala::ScannerContext::partition_descriptor
HdfsPartitionDescriptor * partition_descriptor()
Definition: scanner-context.h:278

impala::LlvmCodeGen::OptimizeFunctionWithExprs
llvm::Function * OptimizeFunctionWithExprs(llvm::Function *fn)
Definition: llvm-codegen.cc:583

impala::HdfsScanNode::num_materialized_partition_keys
int num_materialized_partition_keys() const
Returns number of materialized partition key slots.
Definition: hdfs-scan-node.h:130

impala::RowBatch::CommitRows
void CommitRows(int n)
Definition: row-batch.h:102

string-value.h

impala::HdfsScanNode::InitTemplateTuple
Tuple * InitTemplateTuple(RuntimeState *state, const std::vector< ExprContext * > &value_ctxs)
Definition: hdfs-scan-node.cc:270

impala::HdfsScanNode::IncNumScannersCodegenDisabled
void IncNumScannersCodegenDisabled()
Definition: hdfs-scan-node.h:172

impala::HdfsPartitionDescriptor::partition_key_value_ctxs
const std::vector< ExprContext * > & partition_key_value_ctxs() const
Definition: descriptors.h:185

impala::HdfsScanner::ReportTupleParseError
bool ReportTupleParseError(FieldLocation *fields, uint8_t *errors, int row_idx)
Definition: hdfs-scanner.cc:546

impala::HdfsScanner::stream_
ScannerContext::Stream * stream_
The first stream for context_.
Definition: hdfs-scanner.h:150

impala::MemPool::Allocate
uint8_t * Allocate(int size)
Definition: mem-pool.h:92

impala::HdfsScanner::WriteTuplesFn
int(* WriteTuplesFn)(HdfsScanner *, MemPool *, TupleRow *, int, FieldLocation *, int, int, int, int)
Definition: hdfs-scanner.h:212

impala::HdfsScanNode::tuple_desc
const TupleDescriptor * tuple_desc()
Definition: hdfs-scan-node.h:132

impala::HdfsScanner::Prepare
virtual Status Prepare(ScannerContext *context)
One-time initialisation of state that is constant across scan ranges.
Definition: hdfs-scanner.cc:71

impala::LlvmCodeGen::FnPrototype
Definition: llvm-codegen.h:161

impala::TYPE_DECIMAL
Definition: types.h:42

impala::ExecNode::runtime_profile
RuntimeProfile * runtime_profile()
Definition: exec-node.h:161

impala::HdfsScanNode::GetConjunctCtxs
Status GetConjunctCtxs(std::vector< ExprContext * > *ctxs)
Definition: hdfs-scan-node.cc:692

impala::TextConverter::CodegenWriteSlot
static llvm::Function * CodegenWriteSlot(LlvmCodeGen *codegen, TupleDescriptor *tuple_desc, SlotDescriptor *slot_desc, const char *null_col_val, int len, bool check_null)
Definition: text-converter.cc:99

impala::RowBatch::INVALID_ROW_INDEX
static const int INVALID_ROW_INDEX
Definition: row-batch.h:87