Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hdfs-text-scanner.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "exec/hdfs-text-scanner.h"
16 
17 #include "codegen/llvm-codegen.h"
21 #include "exec/hdfs-scan-node.h"
23 #include "exec/text-converter.h"
25 #include "runtime/row-batch.h"
26 #include "runtime/runtime-state.h"
27 #include "util/codec.h"
28 #include "util/decompress.h"
29 #include "util/cpu-info.h"
30 #include "util/debug-util.h"
31 
32 #include "common/names.h"
33 
34 using boost::algorithm::ends_with;
35 using boost::algorithm::to_lower;
36 using namespace impala;
37 using namespace llvm;
38 
// Debug-only escape hatch: disables the streaming gzip decompression path
// (used in FillByteBuffer's GZIP branch); the flag text says it will be removed.
39 DEFINE_bool(debug_disable_streaming_gzip, false, "Debug flag, will be removed. Disables "
40  "streaming gzip decompression.");
41 
// LLVM struct-type name used by codegen to reference this class from generated IR.
42 const char* HdfsTextScanner::LLVM_CLASS_NAME = "class.impala::HdfsTextScanner";
43 
44 // Suffix for lzo index file: hdfs-filename.index
45 const string HdfsTextScanner::LZO_INDEX_SUFFIX = ".index";
46 
47 // Number of bytes to read when the previous attempt to streaming decompress did not make
48 // progress.
49 const int64_t GZIP_FIXED_READ_SIZE = 1 * 1024 * 1024;
50 
// Constructor initializer list. NOTE(review): this Doxygen listing drops source
// line 51, the constructor signature — presumably
// HdfsTextScanner::HdfsTextScanner(HdfsScanNode* scan_node, RuntimeState* state);
// confirm against the header. Byte buffers start NULL/empty; boundary_pool_ is
// charged to the scan node's MemTracker and backs both boundary string buffers.
52  : HdfsScanner(scan_node, state),
53  byte_buffer_ptr_(NULL),
54  byte_buffer_end_(NULL),
55  byte_buffer_read_size_(0),
56  only_parsing_header_(false),
57  boundary_pool_(new MemPool(scan_node->mem_tracker())),
58  boundary_row_(boundary_pool_.get()),
59  boundary_column_(boundary_pool_.get()),
60  slot_idx_(0),
61  error_in_row_(false) {
62 }
63 
// NOTE(review): closing brace of the (empty) destructor; its signature
// (source line 64) is missing from this listing.
65 }
66 
// Issues the initial disk IO ranges for a set of text files, dispatching on the
// file's compression: uncompressed files are issued split-by-split; gzip/snappy/
// bzip2 files are issued as one whole-file range (owned by whoever holds the
// offset-0 split); LZO files are handed off to HdfsLzoTextScanner.
// NOTE(review): source line 67 — the function signature, per the member index
// "static Status IssueInitialRanges(HdfsScanNode* scan_node,
//  const std::vector<HdfsFileDesc*>& files)" — is missing from this listing.
68  const vector<HdfsFileDesc*>& files) {
69  vector<DiskIoMgr::ScanRange*> compressed_text_scan_ranges;
70  vector<HdfsFileDesc*> lzo_text_files;
71  bool warning_written = false;
72  for (int i = 0; i < files.size(); ++i) {
// Reset per file so each multi-block file gets exactly one warning.
73  warning_written = false;
74  THdfsCompression::type compression = files[i]->file_compression;
75  switch (compression) {
76  case THdfsCompression::NONE:
77  // For uncompressed text we just issue all ranges at once.
78  // TODO: Lz4 is splittable, should be treated similarly.
79  RETURN_IF_ERROR(scan_node->AddDiskIoRanges(files[i]));
80  break;
81 
82  case THdfsCompression::GZIP:
83  case THdfsCompression::SNAPPY:
84  case THdfsCompression::SNAPPY_BLOCKED:
85  case THdfsCompression::BZIP2:
86  for (int j = 0; j < files[i]->splits.size(); ++j) {
87  // In order to decompress gzip-, snappy- and bzip2-compressed text files, we
88  // need to read entire files. Only read a file if we're assigned the first split
89  // to avoid reading multi-block files with multiple scanners.
90  DiskIoMgr::ScanRange* split = files[i]->splits[j];
91 
92  // We only process the split that starts at offset 0.
93  if (split->offset() != 0) {
94  if (!warning_written) {
95  // We are expecting each file to be one hdfs block (so all the scan range
96  // offsets should be 0). This is not incorrect but we will issue a warning.
97  // We write a single warning per file per impalad to reduce the number of
98  // log warnings.
99  stringstream ss;
100  ss << "For better performance, snappy, gzip and bzip-compressed files "
101  << "should not be split into multiple hdfs-blocks. file="
102  << files[i]->filename << " offset " << split->offset();
103  scan_node->runtime_state()->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()));
104  warning_written = true;
105  }
106  // We assign the entire file to one scan range, so mark all but one split
107  // (i.e. the first split) as complete.
108  scan_node->RangeComplete(THdfsFileFormat::TEXT, compression);
109  continue;
110  }
111 
112  // Populate the list of compressed text scan ranges.
113  DCHECK_GT(files[i]->file_length, 0);
114  ScanRangeMetadata* metadata =
115  reinterpret_cast<ScanRangeMetadata*>(split->meta_data());
116  DiskIoMgr::ScanRange* file_range = scan_node->AllocateScanRange(
117  files[i]->fs, files[i]->filename.c_str(), files[i]->file_length, 0,
118  metadata->partition_id, split->disk_id(), split->try_cache(),
119  split->expected_local(), files[i]->mtime);
120  compressed_text_scan_ranges.push_back(file_range);
// High-water-mark counter: tracks the largest compressed text file seen.
121  scan_node->max_compressed_text_file_length()->Set(files[i]->file_length);
122  }
123  break;
124 
125  case THdfsCompression::LZO:
126  // lzo-compressed text need to be processed by the specialized HdfsLzoTextScanner.
127  // Note that any LZO_INDEX files (no matter what the case of their suffix) will be
128  // filtered by the planner.
129  {
130  #ifndef NDEBUG
131  // No straightforward way to do this in one line inside a DCHECK, so for once
132  // we'll explicitly use NDEBUG to avoid executing debug-only code.
133  string lower_filename = files[i]->filename;
134  to_lower(lower_filename);
135  DCHECK(!ends_with(lower_filename, LZO_INDEX_SUFFIX));
136  #endif
137  lzo_text_files.push_back(files[i]);
138  }
139  break;
140 
141  default:
142  DCHECK(false);
143  }
144  }
145 
146  if (compressed_text_scan_ranges.size() > 0) {
147  RETURN_IF_ERROR(scan_node->AddDiskIoRanges(compressed_text_scan_ranges));
148  }
149  if (lzo_text_files.size() > 0) {
150  // This will dlopen the lzo binary and can fail if the lzo binary is not present.
151  RETURN_IF_ERROR(HdfsLzoTextScanner::IssueInitialRanges(scan_node, lzo_text_files));
152  }
153  return Status::OK;
154 }
155 
// Per-scan-range driver: resets state, locates the first complete tuple in the
// range, processes the range, then (per the comment below) finishes by reading
// past the range end. NOTE(review): this listing drops source lines 156 (the
// signature — presumably ProcessSplit; confirm), 158, 171 and 178, so the body
// below is incomplete.
157  // Reset state for new scan range
159 
160  // Find the first tuple. If tuple_found is false, it means we went through the entire
161  // scan range without finding a single tuple. The bytes will be picked up by the scan
162  // range before.
163  bool tuple_found;
164  RETURN_IF_ERROR(FindFirstTuple(&tuple_found));
165 
166  if (tuple_found) {
167  // Update the decompressor depending on the compression type of the file in the
168  // context.
169  DCHECK(stream_->file_desc()->file_compression != THdfsCompression::SNAPPY)
170  << "FE should have generated SNAPPY_BLOCKED instead.";
172 
173  // Process the scan range.
174  int dummy_num_tuples;
175  RETURN_IF_ERROR(ProcessRange(&dummy_num_tuples, false));
176 
177  // Finish up reading past the scan range.
179  }
180 
181  // All done with this scan range.
182  return Status::OK;
183 }
184 
// Scanner teardown: closes/releases the decompressor first (its temp pool may
// hold memory), then attaches the data and boundary pools to the final batch,
// and marks the range complete unless we were only parsing a header.
// NOTE(review): this listing drops source lines 185 (the signature — the member
// index shows "virtual void Close()"), 194, 196 and 199; body is incomplete.
186  // Need to close the decompressor before releasing the resources at AddFinalRowBatch(),
187  // because in some cases there is memory allocated in decompressor_'s temp_memory_pool_.
188  if (decompressor_.get() != NULL) {
189  decompressor_->Close();
190  decompressor_.reset(NULL);
191  }
192  AttachPool(data_buffer_pool_.get(), false);
193  AttachPool(boundary_pool_.get(), false);
195  if (!only_parsing_header_) {
197  THdfsFileFormat::TEXT, stream_->file_desc()->file_compression);
198  }
200 }
201 
// Sets up per-range parsing state: picks the partition's delimiters (degenerating
// field/collection delimiters to '\0' when no slots are materialized, since field
// splitting is then unnecessary) and constructs the text converter.
// NOTE(review): this listing drops source lines 202 (the signature — member index
// shows "virtual Status InitNewRange()"), 206, 209, 217-218, 222 and 224; in
// particular the hdfs_partition declaration and the parser construction prefix
// are missing.
203  // Compressed text does not reference data in the io buffers directly. In such case, we
204  // can recycle the buffers in the stream_ more promptly.
205  if (stream_->file_desc()->file_compression != THdfsCompression::NONE) {
207  }
208 
210  char field_delim = hdfs_partition->field_delim();
211  char collection_delim = hdfs_partition->collection_delim();
212  if (scan_node_->materialized_slots().size() == 0) {
213  field_delim = '\0';
214  collection_delim = '\0';
215  }
216 
219  scan_node_->is_materialized_col(), hdfs_partition->line_delim(),
220  field_delim, collection_delim, hdfs_partition->escape_char()));
221  text_converter_.reset(new TextConverter(hdfs_partition->escape_char(),
223 
225  return Status::OK;
226 }
227 
// Resets per-range scanner state: clears the error flag, rewinds the slot index,
// resets the delimited-text parser and marks the partial tuple empty, then
// initializes the codegen'd write function. NOTE(review): this listing drops
// source lines 228 (the signature — presumably ResetScanner; confirm), 237-238,
// 242-245 and 248; body is incomplete.
229  error_in_row_ = false;
230 
231  // Note - this initialisation relies on the assumption that N partition keys will occupy
232  // entries 0 through N-1 in column_idx_to_slot_idx. If this changes, we will need
233  // another layer of indirection to map text-file column indexes onto the
234  // column_idx_to_slot_idx table used below.
235  slot_idx_ = 0;
236 
239  delimited_text_parser_->ParserReset();
240 
241  partial_tuple_empty_ = true;
243 
246 
247  // Initialize codegen fn
249  context_->partition_descriptor(), THdfsFileFormat::TEXT, "HdfsTextScanner"));
250  return Status::OK;
251 }
252 
// Reads past the end of the scan range to complete the final (possibly
// delimiter-less) tuple, flushing any boundary column/row state accumulated at
// the buffer edge. NOTE(review): this listing drops source lines 253 (the
// signature — member index shows "Status FinishScanRange()"), 260, 284,
// 296-297, 305 and 309; the else-if condition at 283 is visibly truncated.
254  if (scan_node_->ReachedLimit()) return Status::OK;
255 
256  // For text we always need to scan past the scan range to find the next delimiter
257  while (true) {
258  bool eosr = true;
259  Status status = Status::OK;
261 
262  // If compressed text, then there is nothing more to be read.
263  // TODO: calling FillByteBuffer() at eof() can cause
264  // ScannerContext::Stream::GetNextBuffer to DCHECK. Fix this.
265  if (decompressor_.get() == NULL && !stream_->eof()) {
266  status = FillByteBuffer(&eosr, NEXT_BLOCK_READ_SIZE);
267  }
268 
269  if (!status.ok() || byte_buffer_read_size_ == 0) {
270  if (status.IsCancelled()) return status;
271 
272  if (!status.ok()) {
273  stringstream ss;
274  ss << "Read failed while trying to finish scan range: " << stream_->filename()
275  << ":" << stream_->file_offset() << endl << status.GetDetail();
276  if (state_->LogHasSpace()) {
277  state_->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()));
278  }
279  if (state_->abort_on_error()) return Status(ss.str());
280  } else if (!partial_tuple_empty_ || !boundary_column_.Empty() ||
281  !boundary_row_.Empty() ||
282  (delimited_text_parser_->HasUnfinishedTuple() &&
283  (!scan_node_->materialized_slots().empty() ||
285  // Missing columns or row delimiter at end of the file is ok, fill the row in.
286  char* col = boundary_column_.str().ptr;
287  int num_fields = 0;
288  delimited_text_parser_->FillColumns<true>(boundary_column_.Size(),
289  &col, &num_fields, &field_locations_[0]);
290 
291  MemPool* pool;
292  TupleRow* tuple_row_mem;
293  int max_tuples = GetMemory(&pool, &tuple_, &tuple_row_mem);
294  DCHECK_GE(max_tuples, 1);
295  // Set variables for proper error outputting on boundary tuple
298  int num_tuples = WriteFields(pool, tuple_row_mem, num_fields, 1);
299  DCHECK_LE(num_tuples, 1);
300  DCHECK_GE(num_tuples, 0);
301  COUNTER_ADD(scan_node_->rows_read_counter(), num_tuples);
302  RETURN_IF_ERROR(CommitRows(num_tuples));
303  } else if (delimited_text_parser_->HasUnfinishedTuple()) {
304  DCHECK(scan_node_->materialized_slots().empty());
306  // If no fields are materialized we do not update partial_tuple_empty_,
307  // boundary_column_, or boundary_row_. However, we still need to handle the case
308  // of partial tuple due to missing tuple delimiter at the end of file.
310  }
311  break;
312  }
313 
314  DCHECK(eosr);
315 
316  int num_tuples;
317  RETURN_IF_ERROR(ProcessRange(&num_tuples, true));
318  if (num_tuples == 1) break;
319  DCHECK_EQ(num_tuples, 0);
320  }
321 
322  return Status::OK;
323 }
324 
// Core parse/materialize loop for a scan range. Repeatedly: refill the byte
// buffer if needed, parse delimiter locations into field_locations_, materialize
// tuples (or emit empty partition-key-only tuples for count(*)), stash any
// column/row that straddles a buffer boundary, and commit rows. When
// past_scan_range is true (called from FinishScanRange) at most one tuple is
// produced. NOTE(review): this listing drops source lines 330, 348, 352, 354,
// 366-367, 371, 374, 377, 379 and 385 — including the FillByteBuffer call at
// 330 and the boundary-column copy at 366-367 — so the body is incomplete.
325 Status HdfsTextScanner::ProcessRange(int* num_tuples, bool past_scan_range) {
326  bool eosr = past_scan_range || stream_->eosr();
327 
328  while (true) {
329  if (!eosr && byte_buffer_ptr_ == byte_buffer_end_) {
331  }
332 
333  MemPool* pool;
334  TupleRow* tuple_row_mem;
335  int max_tuples = GetMemory(&pool, &tuple_, &tuple_row_mem);
336 
337  if (past_scan_range) {
338  // byte_buffer_ptr_ is already set from FinishScanRange()
339  max_tuples = 1;
340  eosr = true;
341  }
342 
343  *num_tuples = 0;
344  int num_fields = 0;
345 
346  DCHECK_GT(max_tuples, 0);
347 
349  char* col_start = byte_buffer_ptr_;
350  {
351  // Parse the bytes for delimiters and store their offsets in field_locations_
353  RETURN_IF_ERROR(delimited_text_parser_->ParseFieldLocations(max_tuples,
355  &row_end_locations_[0],
356  &field_locations_[0], num_tuples, &num_fields, &col_start));
357  }
358 
359  // Materialize the tuples into the in memory format for this query
360  int num_tuples_materialized = 0;
361  if (scan_node_->materialized_slots().size() != 0 &&
362  (num_fields > 0 || *num_tuples > 0)) {
363  // There can be one partial tuple which returned no more fields from this buffer.
364  DCHECK_LE(*num_tuples, num_fields + 1);
365  if (!boundary_column_.Empty()) {
368  }
369  num_tuples_materialized = WriteFields(pool, tuple_row_mem, num_fields, *num_tuples);
370  DCHECK_GE(num_tuples_materialized, 0);
372  if (*num_tuples > 0) {
373  // If we saw any tuple delimiters, clear the boundary_row_.
375  }
376  } else if (*num_tuples != 0) {
378  // If we are doing count(*) then we return tuples only containing partition keys
380  num_tuples_materialized = WriteEmptyTuples(context_, tuple_row_mem, *num_tuples);
381  }
382 
383  // Save contents that are split across buffers if we are going to return this column
384  if (col_start != byte_buffer_ptr_ && delimited_text_parser_->ReturnCurrentColumn()) {
386  boundary_column_.Append(col_start, byte_buffer_ptr_ - col_start);
387  char* last_row = NULL;
388  if (*num_tuples == 0) {
389  last_row = batch_start_ptr_;
390  } else {
391  last_row = row_end_locations_[*num_tuples - 1] + 1;
392  }
393  boundary_row_.Append(last_row, byte_buffer_ptr_ - last_row);
394  }
395  COUNTER_ADD(scan_node_->rows_read_counter(), *num_tuples);
396 
397  // Commit the rows to the row batch and scan node
398  RETURN_IF_ERROR(CommitRows(num_tuples_materialized));
399 
400  // Done with this buffer and the scan range
401  if ((byte_buffer_ptr_ == byte_buffer_end_ && eosr) || past_scan_range) {
402  break;
403  }
404 
405  if (scan_node_->ReachedLimit()) return Status::OK;
406  }
407  return Status::OK;
408 }
409 
// Refills byte_buffer_ptr_/byte_buffer_read_size_ from the stream. Three paths:
// uncompressed (read num_bytes, or a whole buffer when num_bytes == 0),
// streaming gzip (unless disabled by the debug flag), and whole-file
// decompression for the remaining codecs. NOTE(review): this listing drops
// source lines 421, 427, 430 and 433 — the gzip/compressed-file helper calls in
// the two else-branches are among the missing lines.
410 Status HdfsTextScanner::FillByteBuffer(bool* eosr, int num_bytes) {
411  *eosr = false;
412  Status status;
413 
414  if (decompressor_.get() == NULL) {
415  if (num_bytes > 0) {
416  stream_->GetBytes(num_bytes, reinterpret_cast<uint8_t**>(&byte_buffer_ptr_),
417  &byte_buffer_read_size_, &status);
418  } else {
419  DCHECK_EQ(num_bytes, 0);
420  status = stream_->GetBuffer(false, reinterpret_cast<uint8_t**>(&byte_buffer_ptr_),
422  }
423  *eosr = stream_->eosr();
424  } else if (!FLAGS_debug_disable_streaming_gzip &&
425  decompression_type_ == THdfsCompression::GZIP) {
426  DCHECK_EQ(num_bytes, 0);
428  } else {
429  DCHECK_EQ(num_bytes, 0);
431  }
432 
434  return status;
435 }
436 
// Streaming gzip fill: peeks a compressed buffer from the stream, feeds it to
// ProcessBlockStreaming, advances the stream by the bytes actually consumed,
// and retries once with a 1MB fixed-size read (GZIP_FIXED_READ_SIZE) if no
// output was produced. NOTE(review): this listing drops source lines 437 (the
// signature — presumably the streaming-gzip FillByteBuffer helper; confirm
// against the header), 478 and 522.
438  // Attach any previously decompressed buffers to the row batch before decompressing
439  // any more data.
440  if (!decompressor_->reuse_output_buffer()) {
441  AttachPool(data_buffer_pool_.get(), false);
442  }
443 
444  // Gzip compressed text is decompressed as buffers are read from stream_ (unlike
445  // other codecs which decompress the entire file in a single call). A compressed
446  // buffer is passed to ProcessBlockStreaming but it may not consume all of the input.
447  // In the unlikely case that decompressed output is not produced, we attempt to try
448  // again with a reasonably large fixed size input buffer (explicitly calling
449  // GetBytes()) before failing.
450  bool try_read_fixed_size = false;
451  uint8_t* decompressed_buffer = NULL;
452  int64_t decompressed_len = 0;
453  do {
454  uint8_t* gzip_buffer_ptr = NULL;
455  int64_t gzip_buffer_size = 0;
456  // We don't know how many bytes ProcessBlockStreaming() will consume so we set
457  // peak=true and then later advance the stream using SkipBytes().
458  if (!try_read_fixed_size) {
459  RETURN_IF_ERROR(stream_->GetBuffer(true, &gzip_buffer_ptr, &gzip_buffer_size));
460  } else {
461  Status status;
462  stream_->GetBytes(GZIP_FIXED_READ_SIZE, &gzip_buffer_ptr, &gzip_buffer_size,
463  &status, true);
464  RETURN_IF_ERROR(status);
465  try_read_fixed_size = false;
466  }
467  if (gzip_buffer_size == 0) {
468  // If the compressed file was not properly ended, the decoder will not know that
469  // the last buffer should have been eos.
470  stringstream ss;
471  ss << "Unexpected end of file decompressing gzip. File may be malformed. ";
472  ss << "file: " << stream_->filename();
473  return Status(ss.str());
474  }
475 
476  int64_t gzip_buffer_bytes_read = 0;
477  {
479  RETURN_IF_ERROR(decompressor_->ProcessBlockStreaming(gzip_buffer_size,
480  gzip_buffer_ptr, &gzip_buffer_bytes_read, &decompressed_len,
481  &decompressed_buffer, eosr));
482  DCHECK_GE(gzip_buffer_size, gzip_buffer_bytes_read);
483  DCHECK_GE(decompressed_len, 0);
484  }
485 
486  // Skip the bytes in stream_ that were decompressed.
487  Status status;
488  stream_->SkipBytes(gzip_buffer_bytes_read, &status);
489  RETURN_IF_ERROR(status);
490 
491  if (!*eosr && decompressed_len == 0) {
492  // It's possible (but very unlikely) that ProcessBlockStreaming() wasn't able to
493  // make progress if the compressed buffer returned by GetBytes() is too small.
494  // (Note that this did not even occur in simple experiments where the input buffer
495  // is always 1 byte, but we need to handle this case to be defensive.) In this
496  // case, try again with a reasonably large fixed size buffer. If we still did not
497  // make progress, then return an error.
498  if (try_read_fixed_size) {
499  stringstream ss;
500  ss << "Unable to make progress decoding gzip text. ";
501  ss << "file: " << stream_->filename();
502  return Status(ss.str());
503  }
504  VLOG_FILE << "Unable to make progress decompressing gzip, trying again";
505  try_read_fixed_size = true;
506  }
507  } while (try_read_fixed_size);
508 
509  byte_buffer_ptr_ = reinterpret_cast<char*>(decompressed_buffer);
510  byte_buffer_read_size_ = decompressed_len;
511 
512  if (*eosr) {
513  if (!stream_->eosr()) {
514  // TODO: Add a test case that exercises this path.
515  stringstream ss;
516  ss << "Unexpected end of gzip stream before end of file: ";
517  ss << stream_->filename();
518  if (state_->LogHasSpace()) {
519  state_->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()));
520  }
521  if (state_->abort_on_error()) parse_status_ = Status(ss.str());
523  }
524 
525  context_->ReleaseCompletedResources(NULL, true);
526  }
527  return Status::OK;
528 }
529 
// Whole-file decompression path for non-gzip, non-LZO codecs: reads the entire
// file into the byte buffer, validates the read size against file_length,
// decompresses in one call, and repoints byte_buffer_ptr_ at the decompressed
// output. NOTE(review): this listing drops source lines 530 (the signature —
// the member index shows "Status FillByteBufferCompressedFile(bool* eosr)"),
// 534 (the file-desc lookup that defines 'desc'), 560 and 563 (the
// ProcessBlock call prefix).
531  // For other compressed text: attempt to read and decompress the entire file, point
532  // to the decompressed buffer, and then continue normal processing.
533  DCHECK(decompression_type_ != THdfsCompression::SNAPPY);
535  int64_t file_size = desc->file_length;
536  DCHECK_GT(file_size, 0);
537  Status status;
538  stream_->GetBytes(file_size, reinterpret_cast<uint8_t**>(&byte_buffer_ptr_),
539  &byte_buffer_read_size_, &status);
540  RETURN_IF_ERROR(status);
541 
542  // If didn't read anything, return.
543  if (byte_buffer_read_size_ == 0) {
544  *eosr = true;
545  return Status::OK;
546  }
547 
548  // Need to read the entire file.
549  if (file_size < byte_buffer_read_size_) {
550  stringstream ss;
551  ss << "Expected to read a compressed text file of size " << file_size << " bytes. "
552  << "But only read " << byte_buffer_read_size_ << " bytes. This may indicate "
553  << "data file corruption. (file: " << stream_->filename() << ").";
554  return Status(ss.str());
555  }
556 
557  // Decompress and adjust the byte_buffer_ptr_ and byte_buffer_read_size_ accordingly.
558  int64_t decompressed_len = 0;
559  uint8_t* decompressed_buffer = NULL;
561  // TODO: Once the writers are in, add tests with very large compressed files (4GB)
562  // that could overflow.
564  reinterpret_cast<uint8_t*>(byte_buffer_ptr_), &decompressed_len,
565  &decompressed_buffer));
566 
567  // Inform stream_ that the buffer with the compressed text can be released.
568  context_->ReleaseCompletedResources(NULL, true);
569 
570  VLOG_FILE << "Decompressed " << byte_buffer_read_size_ << " to " << decompressed_len;
571  byte_buffer_ptr_ = reinterpret_cast<char*>(decompressed_buffer);
572  byte_buffer_read_size_ = decompressed_len;
573  *eosr = stream_->eosr();
574  return Status::OK;
575 }
576 
// Locates the first complete tuple in the range. A range starting at offset 0
// is trusted to begin on a tuple boundary; otherwise we scan forward for the
// first tuple delimiter, leaving byte_buffer_ptr_ at the tuple start.
// NOTE(review): this listing drops source lines 577 (the signature — member
// index shows "Status FindFirstTuple(bool* tuple_found)"), 585 (the buffer
// refill), 588 and 590 (the FindFirstInstance argument list).
578  *tuple_found = true;
579  if (stream_->scan_range()->offset() != 0) {
580  *tuple_found = false;
581  // Offset may not point to tuple boundary, skip ahead to the first full tuple
582  // start.
583  while (true) {
584  bool eosr = false;
586 
587  delimited_text_parser_->ParserReset();
589  int first_tuple_offset = delimited_text_parser_->FindFirstInstance(
591 
592  if (first_tuple_offset == -1) {
593  // Didn't find tuple in this buffer, keep going with this scan range
594  if (!eosr) continue;
595  } else {
596  byte_buffer_ptr_ += first_tuple_offset;
597  *tuple_found = true;
598  }
599  break;
600  }
601  }
602  DCHECK(delimited_text_parser_->AtTupleStart());
603  return Status::OK;
604 }
605 
606 // Codegen for materializing parsed data into tuples. The function WriteCompleteTuple is
607 // codegen'd using the IRBuilder for the specific tuple description. This function
608 // is then injected into the cross-compiled driving function, WriteAlignedTuples().
// Returns NULL when codegen is disabled or unavailable, or when either codegen
// step fails. NOTE(review): source line 609 — the first line of the signature,
// per the member index "static llvm::Function* Codegen(HdfsScanNode*,
//  const std::vector<ExprContext*>& conjunct_ctxs)" — is missing from this
// listing.
610  const vector<ExprContext*>& conjunct_ctxs) {
611  if (!node->runtime_state()->codegen_enabled()) return NULL;
612  LlvmCodeGen* codegen;
613  if (!node->runtime_state()->GetCodegen(&codegen).ok()) return NULL;
614  Function* write_complete_tuple_fn =
615  CodegenWriteCompleteTuple(node, codegen, conjunct_ctxs);
616  if (write_complete_tuple_fn == NULL) return NULL;
617  return CodegenWriteAlignedTuples(node, codegen, write_complete_tuple_fn);
618 }
619 
// Scanner preparation: registers the DelimiterParseTime child timer and
// (per the comment) allocates the two-pass parsing scratch space.
// NOTE(review): this listing drops source lines 620-621 (the signature — member
// index shows "virtual Status Prepare(ScannerContext* context)"), 623 and
// 629-631 (the actual scratch-space allocation); body is incomplete.
622 
624  "DelimiterParseTime", ScanNode::SCANNER_THREAD_TOTAL_WALLCLOCK_TIME);
625 
626  // Allocate the scratch space for two pass parsing. The most fields we can go
627  // through in one parse pass is the batch size (tuples) * the number of fields per tuple
628  // TODO: This should probably be based on L2/L3 cache sizes (as should the batch size)
631 
632  return Status::OK;
633 }
634 
635 void HdfsTextScanner::LogRowParseError(int row_idx, stringstream* ss) {
636  DCHECK_LT(row_idx, row_end_locations_.size());
637  char* row_end = row_end_locations_[row_idx];
638  char* row_start;
639  if (row_idx == 0) {
640  row_start = batch_start_ptr_;
641  } else {
642  // Row start at 1 past the row end (i.e. the row delimiter) for the previous row
643  row_start = row_end_locations_[row_idx - 1] + 1;
644  }
645 
646  if (!boundary_row_.Empty()) {
647  // Log the beginning of the line from the previous file buffer(s)
648  *ss << boundary_row_.str();
649  }
650  // Log the erroneous line (or the suffix of a line if !boundary_line.empty()).
651  *ss << string(row_start, row_end - row_start);
652 }
653 
654 // This function writes fields in 'field_locations_' to the row_batch. This function
655 // deals with tuples that straddle batches. There are two cases:
656 // 1. There is already a partial tuple in flight from the previous time around.
657 // This tuple can either be fully materialized (all the materialized columns have
658 // been processed but we haven't seen the tuple delimiter yet) or only partially
659 // materialized. In this case num_tuples can be greater than num_fields
660 // 2. There is a non-fully materialized tuple at the end. The cols that have been
661 // parsed so far are written to 'tuple_' and the remained will be picked up (case 1)
662 // the next time around.
// Returns the number of tuples materialized (0 on parse error when aborting).
// NOTE(review): this listing drops source lines 663 (the first line of the
// signature — presumably "int HdfsTextScanner::WriteFields(MemPool* pool,
// TupleRow* tuple_row, ..."; confirm), 665, 692, 696-698, 708 and 746; the
// partial-tuple copy and row-commit lines are among those missing.
664  int num_fields, int num_tuples) {
666 
667  FieldLocation* fields = &field_locations_[0];
668 
669  int num_tuples_processed = 0;
670  int num_tuples_materialized = 0;
671  // Write remaining fields, if any, from the previous partial tuple.
672  if (slot_idx_ != 0) {
673  DCHECK(tuple_ != NULL);
674  int num_partial_fields = scan_node_->materialized_slots().size() - slot_idx_;
675  // Corner case where there will be no materialized tuples but at least one col
676  // worth of string data. In this case, make a deep copy and reuse the byte buffer.
677  bool copy_strings = num_partial_fields > num_fields;
678  num_partial_fields = min(num_partial_fields, num_fields);
679  WritePartialTuple(fields, num_partial_fields, copy_strings);
680 
681  // This handles case 1. If the tuple is complete and we've found a tuple delimiter
682  // this time around (i.e. num_tuples > 0), add it to the row batch. Otherwise,
683  // it will get picked up the next time around
684  if (slot_idx_ == scan_node_->materialized_slots().size() && num_tuples > 0) {
685  if (UNLIKELY(error_in_row_)) {
686  if (state_->LogHasSpace()) {
687  stringstream ss;
688  ss << "file: " << stream_->filename() << endl << "record: ";
689  LogRowParseError(0, &ss);
690  state_->LogError(ErrorMsg(TErrorCode::GENERAL, ss.str()));
691  }
693  if (!parse_status_.ok()) return 0;
694  error_in_row_ = false;
695  }
697 
699  partial_tuple_empty_ = true;
700  tuple_row->SetTuple(scan_node_->tuple_idx(), tuple_);
701 
702  slot_idx_ = 0;
703  ++num_tuples_processed;
704  --num_tuples;
705 
706  if (EvalConjuncts(tuple_row)) {
707  ++num_tuples_materialized;
709  tuple_row = next_row(tuple_row);
710  }
711  }
712 
713  num_fields -= num_partial_fields;
714  fields += num_partial_fields;
715  }
716 
717  // Write complete tuples. The current field, if any, is at the start of a tuple.
718  if (num_tuples > 0) {
719  int max_added_tuples = (scan_node_->limit() == -1) ?
720  num_tuples : scan_node_->limit() - scan_node_->rows_returned();
721  int tuples_returned = 0;
722  // Call jitted function if possible
723  if (write_tuples_fn_ != NULL) {
724  tuples_returned = write_tuples_fn_(this, pool, tuple_row,
725  batch_->row_byte_size(), fields, num_tuples, max_added_tuples,
726  scan_node_->materialized_slots().size(), num_tuples_processed);
727  } else {
728  tuples_returned = WriteAlignedTuples(pool, tuple_row,
729  batch_->row_byte_size(), fields, num_tuples, max_added_tuples,
730  scan_node_->materialized_slots().size(), num_tuples_processed);
731  }
// -1 signals a parse failure from the tuple writer; caller sees 0 rows.
732  if (tuples_returned == -1) return 0;
733  DCHECK_EQ(slot_idx_, 0);
734 
735  num_tuples_materialized += tuples_returned;
736  num_fields -= num_tuples * scan_node_->materialized_slots().size();
737  fields += num_tuples * scan_node_->materialized_slots().size();
738  }
739 
740  DCHECK_GE(num_fields, 0);
741  DCHECK_LE(num_fields, scan_node_->materialized_slots().size());
742 
743  // Write out the remaining slots (resulting in a partially materialized tuple)
744  if (num_fields != 0) {
745  DCHECK(tuple_ != NULL);
747  // If there have been no materialized tuples at this point, copy string data
748  // out of byte_buffer and reuse the byte_buffer. The copied data can be at
749  // most one tuple's worth.
750  WritePartialTuple(fields, num_fields, num_tuples_materialized == 0);
751  partial_tuple_empty_ = false;
752  }
753  DCHECK_LE(slot_idx_, scan_node_->materialized_slots().size());
754  return num_tuples_materialized;
755 }
756 
// Prepends the buffered boundary_column_ bytes to a field that straddled a
// buffer boundary, deep-copying the combined bytes into 'pool'. A negative
// StringValue length encodes "needs escape"; the sign is preserved on the
// combined length. NOTE(review): source line 757 — the signature, presumably
// CopyBoundaryField(FieldLocation* data, MemPool* pool); confirm against the
// header — is missing from this listing.
758  bool needs_escape = data->len < 0;
759  int copy_len = needs_escape ? -data->len : data->len;
760  int total_len = copy_len + boundary_column_.Size();
761  char* str_data = reinterpret_cast<char*>(pool->Allocate(total_len));
762  memcpy(str_data, boundary_column_.str().ptr, boundary_column_.Size());
763  memcpy(str_data + boundary_column_.Size(), data->start, copy_len);
764  data->start = str_data;
765  data->len = needs_escape ? -total_len : total_len;
766 }
767 
// Writes 'num_fields' parsed fields into the in-flight partial tuple, advancing
// slot_idx_ per field and flagging (not aborting on) per-column parse errors.
// Returns the byte offset just past the last field written (fields plus their
// delimiters). NOTE(review): this listing drops source lines 768 (the first
// signature line — member index shows "int WritePartialTuple(FieldLocation*,
// int num_fields, bool copy_strings)") and 780 (which presumably fetches the
// SlotDescriptor 'desc' and/or applies copy_strings; confirm).
769  int num_fields, bool copy_strings) {
770  int next_line_offset = 0;
771  for (int i = 0; i < num_fields; ++i) {
772  int need_escape = false;
773  int len = fields[i].len;
// Negative length is the parser's encoding for "field contains escapes".
774  if (len < 0) {
775  len = -len;
776  need_escape = true;
777  }
778  next_line_offset += (len + 1);
779 
781  if (!text_converter_->WriteSlot(desc, partial_tuple_,
782  fields[i].start, len, true, need_escape, data_buffer_pool_.get())) {
783  ReportColumnParseError(desc, fields[i].start, len);
784  error_in_row_ = true;
785  }
786  ++slot_idx_;
787  }
788  return next_line_offset;
789 }
const std::vector< SlotDescriptor * > & materialized_slots() const
const std::string & null_column_value() const
Definition: descriptors.h:233
Status ProcessRange(int *num_tuples, bool past_scan_range)
virtual Status InitNewRange()
boost::scoped_ptr< Codec > decompressor_
Decompressor class to use, if any.
Definition: hdfs-scanner.h:198
void ReportColumnParseError(const SlotDescriptor *desc, const char *data, int len)
std::vector< char * > row_end_locations_
const StringValue & str() const
Returns the underlying StringValue.
Definition: string-buffer.h:86
HdfsScanNode * scan_node_
The scan node that started this scanner.
Definition: hdfs-scanner.h:141
const std::string GetDetail() const
Definition: status.cc:184
virtual void LogRowParseError(int row_idx, std::stringstream *)
int Size() const
Returns the length of the current string.
Definition: string-buffer.h:81
int num_partition_keys() const
Returns number of partition keys in the table, including non-materialized slots.
Status GetBuffer(bool peek, uint8_t **buffer, int64_t *out_len)
ScannerContext * context_
Context for this scanner.
Definition: hdfs-scanner.h:147
void Append(const char *str, int len)
Append 'str' to the current string, allocating a new buffer as necessary.
Definition: string-buffer.h:44
StringBuffer boundary_column_
Helper string for dealing with columns that span file blocks.
RuntimeProfile::HighWaterMarkCounter * max_compressed_text_file_length()
boost::scoped_ptr< MemPool > data_buffer_pool_
Definition: hdfs-scanner.h:205
static llvm::Function * CodegenWriteCompleteTuple(HdfsScanNode *, LlvmCodeGen *, const std::vector< ExprContext * > &conjunct_ctxs)
std::string ErrorLog()
Returns the error log lines as a string joined with ' '.
boost::scoped_ptr< TextConverter > text_converter_
Helper class for converting text to other types;.
Definition: hdfs-scanner.h:186
WriteTuplesFn write_tuples_fn_
Jitted write tuples function pointer. Null if codegen is disabled.
Definition: hdfs-scanner.h:215
char * byte_buffer_end_
Ending position of HDFS buffer.
Status FillByteBufferCompressedFile(bool *eosr)
const HdfsTableDescriptor * hdfs_table()
void Clear()
Clear the underlying StringValue. The allocated buffer can be reused.
Definition: string-buffer.h:65
void ReleaseCompletedResources(RowBatch *batch, bool done)
#define RETURN_IF_ERROR(stmt)
some generally useful macros
Definition: status.h:242
bool Empty() const
Returns whether the current string is empty.
Definition: string-buffer.h:76
int WriteEmptyTuples(RowBatch *row_batch, int num_tuples)
bool LogHasSpace()
Returns true if the error log has not reached max_errors_.
static const int NEXT_BLOCK_READ_SIZE
RuntimeProfile::Counter * rows_read_counter() const
Definition: scan-node.h:96
int byte_size() const
Definition: descriptors.h:300
const int64_t GZIP_FIXED_READ_SIZE
Status FinishScanRange()
Reads past the end of the scan range for the next tuple end.
static Tuple * Create(int size, MemPool *pool)
initialize individual tuple with data residing in mem pool
Definition: tuple.h:51
TupleRow * next_row(TupleRow *r) const
Definition: hdfs-scanner.h:368
#define COUNTER_ADD(c, v)
int64_t partition_id
The partition id that this range is part of.
int row_byte_size()
Definition: row-batch.h:147
int64_t file_offset() const
Returns the buffer's current offset in the file.
bool ReachedLimit()
Definition: exec-node.h:159
std::vector< FieldLocation > field_locations_
Return field locations from the Delimited Text Parser.
void RangeComplete(const THdfsFileFormat::type &file_type, const THdfsCompression::type &compression_type)
#define ADD_CHILD_TIMER(profile, name, parent)
#define SCOPED_TIMER(c)
int WritePartialTuple(FieldLocation *, int num_fields, bool copy_strings)
virtual Status Prepare(ScannerContext *context)
Implementation of HdfsScanner interface.
bool only_parsing_header_
True if we are parsing the header for this scanner.
const bool * is_materialized_col()
THdfsCompression::type decompression_type_
The most recently used decompression type.
Definition: hdfs-scanner.h:201
static Status IssueInitialRanges(HdfsScanNode *scan_node, const std::vector< HdfsFileDesc * > &files)
Issue io manager byte ranges for 'files'.
virtual void Close()
Definition: hdfs-scanner.cc:82
bool GetBytes(int64_t requested_len, uint8_t **buffer, int64_t *out_len, Status *status, bool peek=false)
LLVM code generator. This is the top level object to generate jitted code.
Definition: llvm-codegen.h:107
RuntimeState * state_
RuntimeState for error reporting.
Definition: hdfs-scanner.h:144
THdfsCompression::type file_compression
const HdfsFileDesc * file_desc()
HdfsFileDesc * GetFileDesc(const std::string &filename)
Returns the file desc for 'filename'. Returns NULL if filename is invalid.
Status UpdateDecompressor(const THdfsCompression::type &compression)
bool LogError(const ErrorMsg &msg)
void InitTuple(Tuple *template_tuple, Tuple *tuple)
Definition: hdfs-scanner.h:355
int GetMemory(MemPool **pool, Tuple **tuple_mem, TupleRow **tuple_row_mem)
Status AddDiskIoRanges(const std::vector< DiskIoMgr::ScanRange * > &ranges)
Adds ranges to the io mgr queue and starts up new scanner threads if possible.
static Status IssueInitialRanges(HdfsScanNode *scan_node, const std::vector< HdfsFileDesc * > &files)
boost::scoped_ptr< DelimitedTextParser > delimited_text_parser_
Helper class for picking fields and rows from delimited text.
bool IsCancelled() const
Definition: status.h:174
bool eof() const
If true, the stream has reached the end of the file.
ObjectPool pool
static const char * LLVM_CLASS_NAME
Status FindFirstTuple(bool *tuple_found)
HdfsTextScanner(HdfsScanNode *scan_node, RuntimeState *state)
static llvm::Function * Codegen(HdfsScanNode *, const std::vector< ExprContext * > &conjunct_ctxs)
Codegen writing tuples and evaluating predicates.
bool SkipBytes(int64_t length, Status *)
Skip over the next length bytes in the specified HDFS file.
Status CommitRows(int num_rows)
RuntimeState * runtime_state()
int slot_idx_
Index into materialized_slots_ for the next slot to output for the current tuple. ...
int64_t rows_returned() const
Definition: exec-node.h:157
int batch_size() const
Definition: runtime-state.h:98
bool IR_ALWAYS_INLINE EvalConjuncts(TupleRow *row)
Definition: hdfs-scanner.h:266
RuntimeProfile::Counter * parse_delimiter_timer_
Time parsing text files.
static llvm::Function * CodegenWriteAlignedTuples(HdfsScanNode *, LlvmCodeGen *, llvm::Function *write_tuple_fn)
void AttachPool(MemPool *pool, bool commit_batch)
Definition: hdfs-scanner.h:256
int WriteFields(MemPool *, TupleRow *tuple_row_mem, int num_fields, int num_tuples)
Status FillByteBufferGzip(bool *eosr)
void SetTuple(int tuple_idx, Tuple *tuple)
Definition: tuple-row.h:34
boost::scoped_ptr< MemPool > boundary_pool_
Mem pool for boundary_row_ and boundary_column_.
#define UNLIKELY(expr)
Definition: compiler-util.h:33
bool codegen_enabled() const
Returns true if codegen is enabled for this query.
DEFINE_bool(debug_disable_streaming_gzip, false,"Debug flag, will be removed. Disables ""streaming gzip decompression.")
virtual Status ProcessSplit()
char * byte_buffer_ptr_
Current position in byte buffer.
DiskIoMgr::ScanRange * AllocateScanRange(hdfsFS fs, const char *file, int64_t len, int64_t offset, int64_t partition_id, int disk_id, bool try_cache, bool expected_local, int64_t mtime)
static const Status OK
Definition: status.h:87
int WriteAlignedTuples(MemPool *pool, TupleRow *tuple_row_mem, int row_size, FieldLocation *fields, int num_tuples, int max_added_tuples, int slots_per_tuple, int row_start_indx)
Status GetCodegen(LlvmCodeGen **codegen, bool initialize=true)
void CopyBoundaryField(FieldLocation *data, MemPool *pool)
Metadata for a single partition inside an Hdfs table.
Definition: descriptors.h:177
static const std::string LZO_INDEX_SUFFIX
Suffix for lzo index files.
Tuple * tuple_
Current tuple pointer into tuple_mem_.
Definition: hdfs-scanner.h:170
int64_t byte_buffer_read_size_
Actual bytes received from last file read.
const DiskIoMgr::ScanRange * scan_range()
#define VLOG_FILE
Definition: logging.h:58
int tuple_idx() const
bool abort_on_error() const
Definition: runtime-state.h:99
bool ok() const
Definition: status.h:172
RuntimeProfile::Counter * decompress_timer_
Time spent decompressing bytes.
Definition: hdfs-scanner.h:208
static const std::string SCANNER_THREAD_TOTAL_WALLCLOCK_TIME
Definition: scan-node.h:131
Status InitializeWriteTuplesFn(HdfsPartitionDescriptor *partition, THdfsFileFormat::type type, const std::string &scanner_name)
Definition: hdfs-scanner.cc:87
HdfsPartitionDescriptor * partition_descriptor()
int num_materialized_partition_keys() const
Returns number of materialized partition key slots.
virtual Status FillByteBuffer(bool *eosr, int num_bytes=0)
ScannerContext::Stream * stream_
The first stream for context_.
Definition: hdfs-scanner.h:150
uint8_t * Allocate(int size)
Definition: mem-pool.h:92
const TupleDescriptor * tuple_desc()
Tuple * next_tuple(Tuple *t) const
Definition: hdfs-scanner.h:363
virtual Status Prepare(ScannerContext *context)
One-time initialisation of state that is constant across scan ranges.
Definition: hdfs-scanner.cc:71
RuntimeProfile * runtime_profile()
Definition: exec-node.h:161
RuntimeProfile::Counter * materialize_tuple_timer() const
Definition: scan-node.h:104