#include <hdfs-avro-scanner.h>

Inheritance diagram for impala::HdfsAvroScanner:

Collaboration diagram for impala::HdfsAvroScanner:

Classes
struct	AvroFileHeader

struct	SchemaElement

struct	ScopedAvroSchemaT
	Wrapper for avro_schema_t's that handles decrementing the ref count. More...

Public Member Functions
	HdfsAvroScanner (HdfsScanNode *scan_node, RuntimeState *state)

virtual Status	Prepare (ScannerContext *context)
	One-time initialisation of state that is constant across scan ranges. More...

virtual void	Close ()

virtual Status	ProcessSplit ()

Static Public Member Functions
static llvm::Function *	Codegen (HdfsScanNode *, const std::vector< ExprContext * > &conjunct_ctxs)
	Codegen parsing records, writing tuples and evaluating predicates. More...

static Status	IssueInitialRanges (HdfsScanNode *scan_node, const std::vector< HdfsFileDesc * > &files)
	Issue the initial ranges for all sequence container files. More...

Static Public Attributes
static const uint8_t	AVRO_VERSION_HEADER [4] = {'O', 'b', 'j', 1}

static const int	FILE_BLOCK_SIZE = 4096

Protected Types
typedef int(*	WriteTuplesFn )(HdfsScanner *, MemPool *, TupleRow *, int, FieldLocation *, int, int, int, int)

Protected Member Functions
virtual FileHeader *	AllocateFileHeader ()
	Implementation of BaseSequenceScanner super class methods. More...

virtual Status	ReadFileHeader ()
	TODO: check that file schema matches metadata schema. More...

virtual Status	InitNewRange ()
	Reset internal state for a new scan range. More...

virtual Status	ProcessRange ()

virtual THdfsFileFormat::type	file_format () const
	Returns type of scanner: e.g. rcfile, seqfile. More...

Status	ReadSync ()

Status	SkipToSync (const uint8_t *sync, int sync_size)

bool	finished ()

Status	InitializeWriteTuplesFn (HdfsPartitionDescriptor *partition, THdfsFileFormat::type type, const std::string &scanner_name)

void	StartNewRowBatch ()
	Set batch_ to a new row batch and update tuple_mem_ accordingly. More...

int	GetMemory (MemPool **pool, Tuple **tuple_mem, TupleRow **tuple_row_mem)

Status	CommitRows (int num_rows)

void	AddFinalRowBatch ()

void	AttachPool (MemPool *pool, bool commit_batch)

bool IR_ALWAYS_INLINE	EvalConjuncts (TupleRow *row)

int	WriteEmptyTuples (RowBatch *row_batch, int num_tuples)

int	WriteEmptyTuples (ScannerContext context, TupleRow tuple_row, int num_tuples)
	Write empty tuples and commit them to the context object. More...

int	WriteAlignedTuples (MemPool *pool, TupleRow *tuple_row_mem, int row_size, FieldLocation *fields, int num_tuples, int max_added_tuples, int slots_per_tuple, int row_start_indx)

Status	UpdateDecompressor (const THdfsCompression::type &compression)

Status	UpdateDecompressor (const std::string &codec)

bool	ReportTupleParseError (FieldLocation fields, uint8_t errors, int row_idx)

virtual void	LogRowParseError (int row_idx, std::stringstream *)

bool	WriteCompleteTuple (MemPool pool, FieldLocation fields, Tuple tuple, TupleRow tuple_row, Tuple template_tuple, uint8_t error_fields, uint8_t *error_in_row)

void	ReportColumnParseError (const SlotDescriptor desc, const char data, int len)

void	InitTuple (Tuple *template_tuple, Tuple *tuple)

Tuple *	next_tuple (Tuple *t) const

TupleRow *	next_row (TupleRow *r) const

ExprContext *	GetConjunctCtx (int idx) const

Static Protected Member Functions
static llvm::Function *	CodegenWriteCompleteTuple (HdfsScanNode *, LlvmCodeGen *, const std::vector< ExprContext * > &conjunct_ctxs)

static llvm::Function *	CodegenWriteAlignedTuples (HdfsScanNode *, LlvmCodeGen *, llvm::Function *write_tuple_fn)

Protected Attributes
FileHeader *	header_
	File header for this scan range. This is not owned by the parent scan node. More...

bool	only_parsing_header_
	If true, this scanner object is only for processing the header. More...

HdfsScanNode *	scan_node_
	The scan node that started this scanner. More...

RuntimeState *	state_
	RuntimeState for error reporting. More...

ScannerContext *	context_
	Context for this scanner. More...

ScannerContext::Stream *	stream_
	The first stream for context_. More...

std::vector< ExprContext * >	conjunct_ctxs_

Tuple *	template_tuple_

int	tuple_byte_size_
	Fixed size of each tuple, in bytes. More...

Tuple *	tuple_
	Current tuple pointer into tuple_mem_. More...

RowBatch *	batch_

uint8_t *	tuple_mem_
	The tuple memory of batch_. More...

int	num_errors_in_file_
	Number of errors in the current file. More...

boost::scoped_ptr< TextConverter >	text_converter_
	Helper class for converting text to other types. More...

int32_t	num_null_bytes_
	Number of null bytes in the tuple. More...

Status	parse_status_

boost::scoped_ptr< Codec >	decompressor_
	Decompressor class to use, if any. More...

THdfsCompression::type	decompression_type_
	The most recently used decompression type. More...

boost::scoped_ptr< MemPool >	data_buffer_pool_

RuntimeProfile::Counter *	decompress_timer_
	Time spent decompressing bytes. More...

WriteTuplesFn	write_tuples_fn_
	Jitted write tuples function pointer. Null if codegen is disabled. More...

Static Protected Attributes
static const int	SYNC_HASH_SIZE = 16
	Size of the sync hash field. More...

static const int	HEADER_SIZE = 1024

static const int	SYNC_MARKER = -1
	Sync indicator. More...

Private Types
typedef int(*	DecodeAvroDataFn )(HdfsAvroScanner *, int, MemPool *, uint8_t **, Tuple *, TupleRow *)

Private Member Functions
Status	ParseMetadata ()
	Utility function for decoding and parsing file header metadata. More...

Status	ResolveSchemas (const avro_schema_t &table_root, const avro_schema_t &file_root)

Status	VerifyTypesMatch (SlotDescriptor slot_desc, avro_obj_t schema)

int	DecodeAvroData (int max_tuples, MemPool *pool, uint8_t **data, Tuple *tuple, TupleRow *tuple_row)

void	MaterializeTuple (MemPool *pool, uint8_t **data, Tuple *tuple)
	Materializes a single tuple from serialized record data. More...

void	ReadAvroBoolean (PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroInt32 (PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroInt64 (PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroFloat (PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroDouble (PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroVarchar (PrimitiveType type, int max_len, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroChar (PrimitiveType type, int max_len, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroString (PrimitiveType type, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

void	ReadAvroDecimal (int slot_byte_size, uint8_t **data, bool write_slot, void *slot, MemPool *pool)

bool	ReadUnionType (int null_union_position, uint8_t **data)

Static Private Member Functions
static SchemaElement	ConvertSchema (const avro_schema_t &schema)
	Utility function that maps the Avro library's type representation to our own. More...

static llvm::Function *	CodegenDecodeAvroData (RuntimeState *state, llvm::Function *materialize_tuple_fn, const std::vector< ExprContext * > &conjunct_ctxs)

static llvm::Function *	CodegenMaterializeTuple (HdfsScanNode *node, LlvmCodeGen *codegen)

Private Attributes
AvroFileHeader *	avro_header_

DecodeAvroDataFn	codegend_decode_avro_data_
	The codegen'd version of DecodeAvroData() if available, NULL otherwise. More...

Static Private Attributes
static const std::string	AVRO_SCHEMA_KEY
	Metadata keys. More...

static const std::string	AVRO_CODEC_KEY

static const std::string	AVRO_NULL_CODEC
	Supported codecs, as they appear in the metadata. More...

static const std::string	AVRO_SNAPPY_CODEC

static const std::string	AVRO_DEFLATE_CODEC

static const char *	LLVM_CLASS_NAME = "class.impala::HdfsAvroScanner"

Detailed Description

Definition at line 80 of file hdfs-avro-scanner.h.

Member Typedef Documentation

typedef int(* impala::HdfsAvroScanner::DecodeAvroDataFn)(HdfsAvroScanner *, int, MemPool *, uint8_t **, Tuple *, TupleRow *)

private

Definition at line 171 of file hdfs-avro-scanner.h.

typedef int(* impala::HdfsScanner::WriteTuplesFn)(HdfsScanner *, MemPool *, TupleRow *, int, FieldLocation *, int, int, int, int)

protectedinherited

Matching typedef for WriteAlignedTuples for codegen. Refer to comments for that function.

Definition at line 212 of file hdfs-scanner.h.

Constructor & Destructor Documentation

HdfsAvroScanner::HdfsAvroScanner	(	HdfsScanNode *	scan_node,
		RuntimeState *	state
	)

Definition at line 77 of file hdfs-avro-scanner.cc.

Member Function Documentation

void HdfsScanner::AddFinalRowBatch ( )

protectedinherited

Attach all remaining resources from context_ to batch_ and send batch_ to the scan node. This must be called after all rows have been committed and no further resources are needed from context_ (in practice this will happen in each scanner subclass's Close() implementation).

Definition at line 145 of file hdfs-scanner.cc.

References impala::HdfsScanNode::AddMaterializedRowBatch(), impala::HdfsScanner::batch_, impala::HdfsScanner::context_, impala::ScannerContext::ReleaseCompletedResources(), and impala::HdfsScanner::scan_node_.

Referenced by impala::HdfsTextScanner::Close(), impala::BaseSequenceScanner::Close(), and impala::HdfsParquetScanner::Close().

BaseSequenceScanner::FileHeader * HdfsAvroScanner::AllocateFileHeader ( )

protectedvirtual

Implementation of BaseSequenceScanner super class methods.

Implements impala::BaseSequenceScanner.

Definition at line 93 of file hdfs-avro-scanner.cc.

References impala::HdfsAvroScanner::AvroFileHeader::template_tuple, and impala::HdfsScanner::template_tuple_.

void impala::HdfsScanner::AttachPool	(	MemPool *	pool,
		bool	commit_batch
	)

inlineprotectedinherited

Release all memory in 'pool' to batch_. If commit_batch is true, the row batch will be committed. commit_batch should be true if the attached pool is expected to be non-trivial (i.e. a decompression buffer) to minimize scanner mem usage.

Definition at line 256 of file hdfs-scanner.h.

References impala::MemPool::AcquireData(), impala::HdfsScanner::batch_, impala::HdfsScanner::CommitRows(), and impala::RowBatch::tuple_data_pool().

Referenced by impala::HdfsTextScanner::Close(), impala::BaseSequenceScanner::Close(), impala::HdfsParquetScanner::Close(), impala::HdfsTextScanner::FillByteBufferGzip(), ProcessRange(), impala::HdfsSequenceScanner::ReadCompressedBlock(), impala::HdfsParquetScanner::BaseColumnReader::ReadDataPage(), and impala::HdfsRCFileScanner::ResetRowGroup().

void BaseSequenceScanner::Close ( )

virtualinherited

Release all resources the scanner has allocated. This is the last chance for the scanner to attach any resources to the ScannerContext object.

Reimplemented from impala::HdfsScanner.

Definition at line 82 of file base-sequence-scanner.cc.

References impala::HdfsScanner::AddFinalRowBatch(), impala::HdfsScanner::AttachPool(), impala::ScannerContext::Stream::bytes_left(), impala::HdfsScanner::Close(), impala::BaseSequenceScanner::FileHeader::compression_type, impala::HdfsScanner::data_buffer_pool_, impala::HdfsScanner::decompressor_, impala::BaseSequenceScanner::file_format(), impala::BaseSequenceScanner::header_, impala::BaseSequenceScanner::num_syncs_, impala::BaseSequenceScanner::only_parsing_header_, impala::HdfsScanNode::RangeComplete(), impala::HdfsScanner::scan_node_, impala::HdfsScanner::stream_, impala::BaseSequenceScanner::total_block_size_, and VLOG_FILE.

Function * HdfsAvroScanner::Codegen	(	HdfsScanNode *	,
		const std::vector< ExprContext * > &	conjunct_ctxs
	)

static

Codegen parsing records, writing tuples and evaluating predicates.

Definition at line 83 of file hdfs-avro-scanner.cc.

References impala::RuntimeState::codegen_enabled(), CodegenDecodeAvroData(), CodegenMaterializeTuple(), impala::RuntimeState::GetCodegen(), impala::Status::ok(), and impala::HdfsScanNode::runtime_state().

Referenced by impala::HdfsScanNode::Prepare().

Function * HdfsAvroScanner::CodegenDecodeAvroData	(	RuntimeState *	state,
		llvm::Function *	materialize_tuple_fn,
		const std::vector< ExprContext * > &	conjunct_ctxs
	)

staticprivate

Produces a version of DecodeAvroData that uses codegen'd instead of interpreted functions.

Definition at line 885 of file hdfs-avro-scanner.cc.

References impala::LlvmCodeGen::codegen_timer(), impala::ExecNode::CodegenEvalConjuncts(), impala::RuntimeState::GetCodegen(), impala::LlvmCodeGen::GetFunction(), impala::Status::ok(), impala::LlvmCodeGen::OptimizeFunctionWithExprs(), impala::LlvmCodeGen::ReplaceCallSites(), and SCOPED_TIMER.

Referenced by Codegen().

Function * HdfsAvroScanner::CodegenMaterializeTuple	(	HdfsScanNode *	node,
		LlvmCodeGen *	codegen
	)

staticprivate

Codegens a version of MaterializeTuple() that reads records based on the table schema. TODO: Codegen a function for each unique file schema.

Definition at line 691 of file hdfs-avro-scanner.cc.

Referenced by Codegen().

Function * HdfsScanner::CodegenWriteAlignedTuples	(	HdfsScanNode *	,
		LlvmCodeGen *	,
		llvm::Function *	write_tuple_fn
	)

staticprotectedinherited

Codegen function to replace WriteAlignedTuples. WriteAlignedTuples is cross compiled to IR. This function loads the precompiled IR function, modifies it and returns the resulting function.

Definition at line 495 of file hdfs-scanner.cc.

References impala::LlvmCodeGen::codegen_timer(), impala::LlvmCodeGen::FinalizeFunction(), impala::LlvmCodeGen::GetFunction(), impala::LlvmCodeGen::ReplaceCallSites(), and SCOPED_TIMER.

Referenced by impala::HdfsTextScanner::Codegen(), and impala::HdfsSequenceScanner::Codegen().

Function * HdfsScanner::CodegenWriteCompleteTuple	(	HdfsScanNode *	,
		LlvmCodeGen *	,
		const std::vector< ExprContext * > &	conjunct_ctxs
	)

staticprotectedinherited

Codegen function to replace WriteCompleteTuple. Should behave identically to WriteCompleteTuple.

Definition at line 296 of file hdfs-scanner.cc.

Referenced by impala::HdfsTextScanner::Codegen(), and impala::HdfsSequenceScanner::Codegen().

Status HdfsScanner::CommitRows ( int num_rows )

protectedinherited

Commit num_rows to the current row batch. If this completes, the row batch is enqueued with the scan node and StartNewRowBatch() is called. Returns Status::OK if the query is not cancelled and hasn't exceeded any mem limits. Scanner can call this with 0 rows to flush any pending resources (attached pools and io buffers) to minimize memory consumption.

Definition at line 124 of file hdfs-scanner.cc.

References impala::HdfsScanNode::AddMaterializedRowBatch(), impala::RowBatch::AtCapacity(), impala::HdfsScanner::batch_, impala::TupleDescriptor::byte_size(), impala::Status::CANCELLED, impala::ScannerContext::cancelled(), impala::RowBatch::capacity(), impala::RuntimeState::CheckQueryState(), impala::RowBatch::CommitRows(), impala::HdfsScanner::conjunct_ctxs_, impala::HdfsScanner::context_, impala::ExprContext::FreeLocalAllocations(), impala::ScannerContext::num_completed_io_buffers(), impala::RowBatch::num_rows(), impala::Status::OK, impala::ScannerContext::ReleaseCompletedResources(), RETURN_IF_ERROR, impala::HdfsScanner::scan_node_, impala::HdfsScanner::StartNewRowBatch(), impala::HdfsScanner::state_, impala::HdfsScanNode::tuple_desc(), and impala::HdfsScanner::tuple_mem_.

Referenced by impala::HdfsParquetScanner::AssembleRows(), impala::HdfsScanner::AttachPool(), impala::HdfsTextScanner::FinishScanRange(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsParquetScanner::ProcessFooter(), impala::HdfsTextScanner::ProcessRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), impala::HdfsRCFileScanner::ProcessRange(), and impala::HdfsParquetScanner::ProcessSplit().

HdfsAvroScanner::SchemaElement HdfsAvroScanner::ConvertSchema ( const avro_schema_t & schema )

staticprivate

Utility function that maps the Avro library's type representation to our own.

Definition at line 383 of file hdfs-avro-scanner.cc.

References impala::HdfsAvroScanner::ScopedAvroSchemaT::get(), impala::HdfsAvroScanner::SchemaElement::null_union_position, and impala::HdfsAvroScanner::SchemaElement::schema.

Referenced by CodegenMaterializeTuple(), and ResolveSchemas().

int HdfsAvroScanner::DecodeAvroData	(	int	max_tuples,
		MemPool *	pool,
		uint8_t **	data,
		Tuple *	tuple,
		TupleRow *	tuple_row
	)

private

Decodes records and copies the data into tuples. Returns the number of tuples to be committed.

max_tuples: the maximum number of tuples to write
data: serialized record data. Is advanced as records are read.
pool: memory pool to allocate string data from
tuple: tuple pointer to copy objects to
tuple_row: tuple row of written tuples

Definition at line 23 of file hdfs-avro-scanner-ir.cc.

References impala::HdfsScanner::EvalConjuncts(), impala::HdfsScanner::InitTuple(), MaterializeTuple(), impala::HdfsScanner::next_row(), impala::HdfsScanner::next_tuple(), impala::HdfsScanner::scan_node_, impala::TupleRow::SetTuple(), impala::HdfsScanner::template_tuple_, and impala::HdfsScanNode::tuple_idx().

Referenced by ProcessRange().

bool IR_ALWAYS_INLINE impala::HdfsScanner::EvalConjuncts ( TupleRow * row )

inlineprotectedinherited

Convenience function for evaluating conjuncts using this scanner's ExprContexts. This must always be inlined so we can correctly replace the call to ExecNode::EvalConjuncts() during codegen.

Definition at line 266 of file hdfs-scanner.h.

References impala::HdfsScanner::conjunct_ctxs_, and impala::ExecNode::EvalConjuncts().

Referenced by impala::HdfsParquetScanner::AssembleRows(), DecodeAvroData(), impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsScanner::WriteCompleteTuple(), impala::HdfsScanner::WriteEmptyTuples(), and impala::HdfsTextScanner::WriteFields().

virtual THdfsFileFormat::type impala::HdfsAvroScanner::file_format ( ) const

inlineprotectedvirtual

Returns type of scanner: e.g. rcfile, seqfile.

Implements impala::BaseSequenceScanner.

Definition at line 100 of file hdfs-avro-scanner.h.

bool impala::BaseSequenceScanner::finished ( )

inlineprotectedinherited

Definition at line 117 of file base-sequence-scanner.h.

References impala::BaseSequenceScanner::finished_.

Referenced by impala::HdfsSequenceScanner::ProcessBlockCompressedScanRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsRCFileScanner::ProcessRange().

ExprContext * HdfsScanner::GetConjunctCtx ( int idx ) const

protectedinherited

Simple wrapper around conjunct_ctxs_. Used in the codegen'd version of WriteCompleteTuple() because it's easier than writing IR to access conjunct_ctxs_.

Definition at line 79 of file hdfs-scanner-ir.cc.

References impala::HdfsScanner::conjunct_ctxs_, and gen_ir_descriptions::idx.

int HdfsScanner::GetMemory	(	MemPool **	pool,
		Tuple **	tuple_mem,
		TupleRow **	tuple_row_mem
	)

protectedinherited

Gets memory for outputting tuples into batch_. *pool is the mem pool that should be used for memory allocated for those tuples. *tuple_mem should be the location to output tuples, and *tuple_row_mem for outputting tuple rows. Returns the maximum number of tuples/tuple rows that can be output (before the current row batch is complete and a new one is allocated). Memory returned from this call is invalidated after calling CommitRows. Callers must call GetMemory again after calling CommitRows.

Definition at line 115 of file hdfs-scanner.cc.

References impala::RowBatch::AddRow(), impala::HdfsScanner::batch_, impala::RowBatch::capacity(), impala::RowBatch::GetRow(), impala::RowBatch::num_rows(), impala::RowBatch::tuple_data_pool(), and impala::HdfsScanner::tuple_mem_.

Referenced by impala::HdfsParquetScanner::AssembleRows(), impala::HdfsTextScanner::FinishScanRange(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsParquetScanner::ProcessFooter(), impala::HdfsTextScanner::ProcessRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsRCFileScanner::ProcessRange().

Status HdfsScanner::InitializeWriteTuplesFn	(	HdfsPartitionDescriptor *	partition,
		THdfsFileFormat::type	type,
		const std::string &	scanner_name
	)

protectedinherited

Initializes write_tuples_fn_ to the jitted function if codegen is possible.

partition - partition descriptor for this scanner/scan range
type - type for this scanner
scanner_name - debug string name for this scanner (e.g. HdfsTextScanner)

Definition at line 87 of file hdfs-scanner.cc.

References impala::HdfsPartitionDescriptor::escape_char(), impala::HdfsScanNode::GetCodegenFn(), impala::ExecNode::id(), impala::HdfsScanNode::IncNumScannersCodegenDisabled(), impala::HdfsScanNode::IncNumScannersCodegenEnabled(), impala::Status::OK, impala::HdfsScanner::scan_node_, impala::TupleDescriptor::string_slots(), impala::HdfsScanNode::tuple_desc(), and impala::HdfsScanner::write_tuples_fn_.

Referenced by impala::HdfsSequenceScanner::InitNewRange(), and impala::HdfsTextScanner::ResetScanner().

Status HdfsAvroScanner::InitNewRange ( )

protectedvirtual

Reset internal state for a new scan range.

Implements impala::HdfsScanner.

Definition at line 493 of file hdfs-avro-scanner.cc.

References avro_header_, codegend_decode_avro_data_, impala::BaseSequenceScanner::FileHeader::compression_type, impala::HdfsScanNode::GetCodegenFn(), impala::BaseSequenceScanner::header_, impala::ExecNode::id(), impala::HdfsScanNode::IncNumScannersCodegenDisabled(), impala::HdfsScanNode::IncNumScannersCodegenEnabled(), impala::BaseSequenceScanner::FileHeader::is_compressed, impala::Status::OK, impala::BaseSequenceScanner::only_parsing_header_, RETURN_IF_ERROR, impala::HdfsScanner::scan_node_, impala::HdfsAvroScanner::AvroFileHeader::template_tuple, impala::HdfsScanner::template_tuple_, impala::HdfsScanner::UpdateDecompressor(), and impala::HdfsAvroScanner::AvroFileHeader::use_codegend_decode_avro_data.

void impala::HdfsScanner::InitTuple	(	Tuple *	template_tuple,
		Tuple *	tuple
	)

inlineprotectedinherited

Initialize a tuple. TODO: only copy over non-null slots. TODO: InitTuple is called frequently, avoid the if, perhaps via templatization.

Definition at line 355 of file hdfs-scanner.h.

References impala::HdfsScanner::num_null_bytes_, and impala::HdfsScanner::tuple_byte_size_.

Referenced by impala::HdfsParquetScanner::AssembleRows(), DecodeAvroData(), impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsScanner::WriteCompleteTuple(), and impala::HdfsTextScanner::WriteFields().

Status BaseSequenceScanner::IssueInitialRanges	(	HdfsScanNode *	scan_node,
		const std::vector< HdfsFileDesc * > &	files
	)

staticinherited

Issue the initial ranges for all sequence container files.

Definition at line 40 of file base-sequence-scanner.cc.

References impala::HdfsScanNode::AddDiskIoRanges(), impala::HdfsScanNode::AllocateScanRange(), impala::BaseSequenceScanner::HEADER_SIZE, impala::Status::OK, impala::ScanRangeMetadata::partition_id, and RETURN_IF_ERROR.

Referenced by impala::HdfsScanNode::GetNext().

void HdfsScanner::LogRowParseError	(	int	row_idx,
		std::stringstream *
	)

protectedvirtualinherited

Utility function to append an error message for an invalid row. This is called from ReportTupleParseError(). row_idx is the index of the row in the current batch. Subclasses should override this function (e.g. text needs to join boundary rows). Since this is only in the error path, vtable overhead is acceptable.

Reimplemented in impala::HdfsSequenceScanner, and impala::HdfsTextScanner.

Definition at line 572 of file hdfs-scanner.cc.

Referenced by impala::HdfsScanner::ReportTupleParseError().

void HdfsAvroScanner::MaterializeTuple	(	MemPool *	pool,
		uint8_t **	data,
		Tuple *	tuple
	)

private

Materializes a single tuple from serialized record data.

Definition at line 585 of file hdfs-avro-scanner.cc.

References avro_header_, impala::ColumnType::GetByteSize(), impala::Tuple::GetSlot(), impala::INVALID_TYPE, impala::ColumnType::len, impala::SlotDescriptor::null_indicator_offset(), impala::HdfsAvroScanner::SchemaElement::null_union_position, pool, ReadAvroBoolean(), ReadAvroChar(), ReadAvroDecimal(), ReadAvroDouble(), ReadAvroFloat(), ReadAvroInt32(), ReadAvroInt64(), ReadAvroString(), ReadAvroVarchar(), ReadUnionType(), impala::HdfsAvroScanner::SchemaElement::schema, impala::HdfsAvroScanner::AvroFileHeader::schema, impala::Tuple::SetNull(), impala::HdfsAvroScanner::SchemaElement::slot_desc, impala::SlotDescriptor::tuple_offset(), impala::ColumnType::type, impala::SlotDescriptor::type(), impala::TYPE_CHAR, impala::TYPE_DECIMAL, and impala::TYPE_VARCHAR.

Referenced by DecodeAvroData().

TupleRow* impala::HdfsScanner::next_row ( TupleRow * r ) const

inlineprotectedinherited

Definition at line 368 of file hdfs-scanner.h.

References impala::HdfsScanner::batch_, and impala::RowBatch::row_byte_size().

Referenced by impala::HdfsParquetScanner::AssembleRows(), DecodeAvroData(), impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsScanner::WriteEmptyTuples(), and impala::HdfsTextScanner::WriteFields().

Tuple* impala::HdfsScanner::next_tuple ( Tuple * t ) const

inlineprotectedinherited

Definition at line 363 of file hdfs-scanner.h.

References impala::HdfsScanner::tuple_byte_size_.

Referenced by impala::HdfsParquetScanner::AssembleRows(), DecodeAvroData(), impala::HdfsRCFileScanner::ProcessRange(), and impala::HdfsTextScanner::WriteFields().

Status HdfsAvroScanner::ParseMetadata ( )

private

Utility function for decoding and parsing file header metadata.

Definition at line 125 of file hdfs-avro-scanner.cc.

References AVRO_CODEC_KEY, AVRO_DEFLATE_CODEC, avro_header_, AVRO_NULL_CODEC, impala::HdfsTableDescriptor::avro_schema(), AVRO_SCHEMA_KEY, AVRO_SNAPPY_CODEC, impala::BaseSequenceScanner::FileHeader::codec, impala::BaseSequenceScanner::FileHeader::compression_type, impala::ScannerContext::Stream::filename(), impala::HdfsAvroScanner::ScopedAvroSchemaT::get(), impala::HdfsScanNode::hdfs_table(), impala::BaseSequenceScanner::header_, impala::BaseSequenceScanner::FileHeader::is_compressed, impala::Status::OK, impala::HdfsScanner::parse_status_, impala::ScannerContext::Stream::ReadBytes(), impala::ScannerContext::Stream::ReadZLong(), ResolveSchemas(), RETURN_IF_ERROR, RETURN_IF_FALSE, impala::HdfsScanner::scan_node_, impala::HdfsAvroScanner::AvroFileHeader::schema, impala::HdfsScanner::stream_, impala::HdfsAvroScanner::AvroFileHeader::use_codegend_decode_avro_data, VLOG_FILE, and VLOG_ROW.

Referenced by ReadFileHeader().

Status BaseSequenceScanner::Prepare ( ScannerContext * context )

virtualinherited

One-time initialisation of state that is constant across scan ranges.

Reimplemented from impala::HdfsScanner.

Reimplemented in impala::HdfsRCFileScanner, and impala::HdfsSequenceScanner.

Definition at line 74 of file base-sequence-scanner.cc.

References ADD_COUNTER, impala::BaseSequenceScanner::bytes_skipped_counter_, impala::Status::OK, impala::HdfsScanner::Prepare(), impala::BaseSequenceScanner::ReadPastSize(), RETURN_IF_ERROR, impala::ExecNode::runtime_profile(), impala::HdfsScanner::scan_node_, impala::ScannerContext::Stream::set_read_past_size_cb(), and impala::HdfsScanner::stream_.

Referenced by impala::HdfsSequenceScanner::Prepare(), and impala::HdfsRCFileScanner::Prepare().

Status HdfsAvroScanner::ProcessRange ( )

protectedvirtual

Process the current range until the end is reached or an error occurs. Note this might be called multiple times if we skip over bad data. This function should read from the underlying ScannerContext, materializing tuples to the context. When this function is called, it is guaranteed to be at the start of a data block (i.e. right after the sync marker).

Implements impala::BaseSequenceScanner.

Definition at line 517 of file hdfs-avro-scanner.cc.

Status BaseSequenceScanner::ProcessSplit ( )

virtualinherited

Process an entire split, reading bytes from the context's streams. Context is initialized with the split data (e.g. template tuple, partition descriptor, etc). This function should only return on error or end of scan range.

Implements impala::HdfsScanner.

Definition at line 100 of file base-sequence-scanner.cc.

void HdfsAvroScanner::ReadAvroBoolean	(	PrimitiveType	type,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

The following are cross-compiled functions for parsing a serialized Avro primitive type and writing it to a slot. They can also be used for skipping a field without writing it to a slot by setting 'write_slot' to false.

data: Serialized record data. Is advanced past the read field. The following arguments are used only if 'write_slot' is true:
slot: The tuple slot to write the parsed field into.
type: The type of the slot. (This is necessary because there is not a 1:1 mapping between Avro types and Impala's primitive types.)
pool: MemPool for string data.

Definition at line 50 of file hdfs-avro-scanner-ir.cc.

References impala::TYPE_BOOLEAN.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroChar	(	PrimitiveType	type,
		int	max_len,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 131 of file hdfs-avro-scanner-ir.cc.

References impala::MemPool::Allocate(), impala::ColumnType::CreateCharType(), impala::ColumnType::IsVarLen(), impala::StringValue::len, impala::StringValue::PadWithSpaces(), impala::StringValue::ptr, impala::ReadWriteUtil::ReadZLong(), and impala::TYPE_CHAR.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroDecimal	(	int	slot_byte_size,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Same as the above functions, except takes the size of the decimal slot (i.e. 4, 8, or 16) instead of the type (which should be TYPE_DECIMAL). The slot size is passed explicitly, rather than passing a ColumnType, so we can easily pass in a constant in the codegen'd MaterializeTuple() function. If 'write_slot' is false, 'slot_byte_size' is ignored.

Definition at line 164 of file hdfs-avro-scanner-ir.cc.

References impala::BitUtil::ByteSwap(), and impala::ReadWriteUtil::ReadZLong().

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroDouble	(	PrimitiveType	type,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 108 of file hdfs-avro-scanner-ir.cc.

References impala::TYPE_DOUBLE.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroFloat	(	PrimitiveType	type,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 93 of file hdfs-avro-scanner-ir.cc.

References impala::TYPE_DOUBLE, and impala::TYPE_FLOAT.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroInt32	(	PrimitiveType	type,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 59 of file hdfs-avro-scanner-ir.cc.

References impala::ReadWriteUtil::ReadZInt(), impala::TYPE_BIGINT, impala::TYPE_DOUBLE, impala::TYPE_FLOAT, and impala::TYPE_INT.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroInt64	(	PrimitiveType	type,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 77 of file hdfs-avro-scanner-ir.cc.

References impala::ReadWriteUtil::ReadZLong(), impala::TYPE_BIGINT, impala::TYPE_DOUBLE, and impala::TYPE_FLOAT.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroString	(	PrimitiveType	type,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 152 of file hdfs-avro-scanner-ir.cc.

References impala::StringValue::len, impala::StringValue::ptr, impala::ReadWriteUtil::ReadZLong(), and impala::TYPE_STRING.

Referenced by MaterializeTuple().

void HdfsAvroScanner::ReadAvroVarchar	(	PrimitiveType	type,
		int	max_len,
		uint8_t **	data,
		bool	write_slot,
		void *	slot,
		MemPool *	pool
	)

private

Definition at line 117 of file hdfs-avro-scanner-ir.cc.

References impala::StringValue::len, impala::StringValue::ptr, impala::ReadWriteUtil::ReadZLong(), and impala::TYPE_VARCHAR.

Referenced by MaterializeTuple().

Status HdfsAvroScanner::ReadFileHeader ( )

protectedvirtual

TODO: check that file schema matches metadata schema.

Implements impala::BaseSequenceScanner.

Definition at line 99 of file hdfs-avro-scanner.cc.

References avro_header_, AVRO_VERSION_HEADER, impala::BaseSequenceScanner::header_, impala::BaseSequenceScanner::FileHeader::header_size, impala::ReadWriteUtil::HexDump(), impala::Status::OK, impala::HdfsScanner::parse_status_, ParseMetadata(), impala::ScannerContext::Stream::ReadBytes(), RETURN_IF_ERROR, RETURN_IF_FALSE, impala::HdfsScanner::stream_, impala::BaseSequenceScanner::FileHeader::sync, impala::BaseSequenceScanner::SYNC_HASH_SIZE, and impala::ScannerContext::Stream::total_bytes_returned().

Status BaseSequenceScanner::ReadSync ( )

protectedinherited

Read and validate sync marker against header_->sync. Returns non-ok if the sync marker did not match. Scanners should always use this function to read sync markers, otherwise finished() might not be updated correctly. If finished() returns true after calling this function, scanners must not process any more records.

Definition at line 170 of file base-sequence-scanner.cc.

References impala::BaseSequenceScanner::block_start_, impala::ScannerContext::Stream::eof(), impala::ScannerContext::Stream::eosr(), impala::ScannerContext::Stream::file_offset(), impala::BaseSequenceScanner::finished_, impala::ScannerContext::Stream::GetBytes(), impala::hash, impala::BaseSequenceScanner::header_, impala::ReadWriteUtil::HexDump(), impala::BaseSequenceScanner::num_syncs_, impala::Status::OK, impala::HdfsScanner::parse_status_, RETURN_IF_FALSE, impala::HdfsScanner::stream_, impala::BaseSequenceScanner::FileHeader::sync, impala::BaseSequenceScanner::SYNC_HASH_SIZE, and impala::BaseSequenceScanner::total_block_size_.

Referenced by impala::HdfsSequenceScanner::ProcessBlockCompressedScanRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsRCFileScanner::ProcessRange().

bool HdfsAvroScanner::ReadUnionType	(	int	null_union_position,
		uint8_t **	data
	)

private

Reads and advances 'data' past the union branch index and returns true if the corresponding element is non-null. 'null_union_position' must be 0 or 1.

Definition at line 39 of file hdfs-avro-scanner-ir.cc.

Referenced by MaterializeTuple().

void HdfsScanner::ReportColumnParseError	(	const SlotDescriptor *	desc,
		const char *	data,
		int	len
	)

protectedinherited

Report parse error for column @ desc. If abort_on_error is true, sets parse_status_ to the error message.

Definition at line 577 of file hdfs-scanner.cc.

References impala::RuntimeState::abort_on_error(), impala::SlotDescriptor::col_pos(), impala::RuntimeState::LogError(), impala::RuntimeState::LogHasSpace(), impala::HdfsScanNode::num_partition_keys(), impala::Status::ok(), impala::HdfsScanner::parse_status_, impala::HdfsScanner::scan_node_, impala::HdfsScanner::state_, and impala::SlotDescriptor::type().

Referenced by impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsScanner::ReportTupleParseError(), and impala::HdfsTextScanner::WritePartialTuple().

bool HdfsScanner::ReportTupleParseError	(	FieldLocation *	fields,
		uint8_t *	errors,
		int	row_idx
	)

protectedinherited

Utility function to report parse errors for each field. If errors[i] is nonzero, fields[i] had a parse error. row_idx is the index of the row in the current batch that had the parse error. Returns false if parsing should be aborted. In this case parse_status_ is set to the error. This is called from WriteAlignedTuples.

Definition at line 546 of file hdfs-scanner.cc.

References impala::RuntimeState::abort_on_error(), impala::ScannerContext::Stream::filename(), impala::RuntimeState::LogError(), impala::RuntimeState::LogHasSpace(), impala::HdfsScanner::LogRowParseError(), impala::HdfsScanNode::materialized_slots(), impala::HdfsScanner::num_errors_in_file_, impala::Status::ok(), impala::HdfsScanner::parse_status_, impala::HdfsScanner::ReportColumnParseError(), impala::RuntimeState::ReportFileErrors(), impala::HdfsScanner::scan_node_, impala::HdfsScanner::state_, and impala::HdfsScanner::stream_.

Referenced by impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsScanner::WriteAlignedTuples().

Status HdfsAvroScanner::ResolveSchemas	(	const avro_schema_t &	table_root,
		const avro_schema_t &	file_root
	)

private

Populates avro_header_->schema with the result of resolving the table's schema with the file's schema. Default values are written to avro_header_->template_tuple (which is initialized to template_tuple_ if necessary).

Definition at line 220 of file hdfs-avro-scanner.cc.

References avro_header_, impala::SlotDescriptor::col_pos(), ConvertSchema(), impala::HdfsAvroScanner::ScopedAvroSchemaT::get(), impala::HdfsScanNode::GetMaterializedSlotIdx(), impala::HdfsScanNode::hdfs_table(), impala::HdfsScanNode::InitEmptyTemplateTuple(), impala::RuntimeState::LogError(), impala::HdfsScanNode::materialized_slots(), impala::ExecNode::mem_tracker(), impala::SlotDescriptor::null_indicator_offset(), impala::TableDescriptor::num_cols(), impala::HdfsScanNode::num_partition_keys(), impala::Status::OK, pool, RETURN_IF_ERROR, impala::HdfsScanner::scan_node_, impala::HdfsAvroScanner::SchemaElement::schema, impala::HdfsAvroScanner::AvroFileHeader::schema, impala::Tuple::SetNull(), impala::HdfsScanNode::SKIP_COLUMN, impala::HdfsAvroScanner::SchemaElement::slot_desc, impala::HdfsScanner::state_, impala::HdfsAvroScanner::AvroFileHeader::template_tuple, impala::HdfsScanner::template_tuple_, impala::HdfsScanNode::TransferToScanNodePool(), VerifyTypesMatch(), and impala::RawValue::Write().

Referenced by ParseMetadata().

Status BaseSequenceScanner::SkipToSync	(	const uint8_t *	sync,
		int	sync_size
	)

protectedinherited

Utility function to advance past the next sync marker, reading bytes from stream_. If no sync is found in the scan range, returns Status::OK and sets finished_ to true. It is safe to call this function past eosr.

sync: sync marker to search for (does not include 0xFFFFFFFF prefix)
sync_size: number of bytes for sync

Definition at line 212 of file base-sequence-scanner.cc.

Referenced by impala::BaseSequenceScanner::ProcessSplit().

void HdfsScanner::StartNewRowBatch ( )

protectedinherited

Set batch_ to a new row batch and update tuple_mem_ accordingly.

Definition at line 108 of file hdfs-scanner.cc.

References impala::MemPool::Allocate(), impala::HdfsScanner::batch_, impala::RuntimeState::batch_size(), impala::ExecNode::mem_tracker(), impala::ExecNode::row_desc(), impala::HdfsScanner::scan_node_, impala::HdfsScanner::state_, impala::HdfsScanner::tuple_byte_size_, impala::RowBatch::tuple_data_pool(), and impala::HdfsScanner::tuple_mem_.

Referenced by impala::HdfsScanner::CommitRows(), and impala::HdfsScanner::Prepare().

Status HdfsScanner::UpdateDecompressor ( const THdfsCompression::type & compression )

protectedinherited

Update the decompressor_ object given a compression type or codec name. Depending on the old compression type and the new one, it may close the old decompressor and/or create a new one of different type.

Definition at line 513 of file hdfs-scanner.cc.

References impala::Codec::CreateDecompressor(), impala::HdfsScanner::data_buffer_pool_, impala::HdfsScanner::decompression_type_, impala::HdfsScanner::decompressor_, impala::Status::OK, RETURN_IF_ERROR, impala::HdfsScanner::scan_node_, impala::TupleDescriptor::string_slots(), and impala::HdfsScanNode::tuple_desc().

Referenced by InitNewRange(), impala::HdfsSequenceScanner::InitNewRange(), and impala::HdfsTextScanner::ProcessSplit().

Status impala::HdfsScanner::UpdateDecompressor ( const std::string & codec )

protectedinherited

Status HdfsAvroScanner::VerifyTypesMatch	(	SlotDescriptor *	slot_desc,
		avro_obj_t *	schema
	)

private

Returns Status::OK iff a value with the given schema can be used to populate slot_desc. 'schema' can be either an avro_schema_t or an avro_datum_t.

Definition at line 426 of file hdfs-avro-scanner.cc.

References impala::TableDescriptor::col_names(), impala::SlotDescriptor::col_pos(), impala::ScannerContext::Stream::filename(), impala::HdfsScanNode::hdfs_table(), impala::ColumnType::IsStringType(), impala::Status::OK, impala::ColumnType::precision, impala::ColumnType::scale, impala::HdfsScanner::scan_node_, impala::HdfsScanner::stream_, impala::ColumnType::type, impala::SlotDescriptor::type(), impala::TYPE_BIGINT, impala::TYPE_BOOLEAN, impala::TYPE_DECIMAL, impala::TYPE_DOUBLE, impala::TYPE_FLOAT, and impala::TYPE_INT.

Referenced by ResolveSchemas().

int HdfsScanner::WriteAlignedTuples	(	MemPool *	pool,
		TupleRow *	tuple_row_mem,
		int	row_size,
		FieldLocation *	fields,
		int	num_tuples,
		int	max_added_tuples,
		int	slots_per_tuple,
		int	row_start_indx
	)

protectedinherited

Processes batches of fields and writes them out to tuple_row_mem.

'pool' mempool to allocate from for auxiliary tuple memory
'tuple_row_mem' preallocated tuple_row memory this function must use.
'fields' must start at the beginning of a tuple.
'num_tuples' number of tuples to process
'max_added_tuples' the maximum number of tuples that should be added to the batch.
'row_start_indx' is the number of rows that have already been processed as part of WritePartialTuple. Returns the number of tuples added to the row batch. This can be less than num_tuples/tuples_till_limit because of failed conjuncts. Returns -1 if parsing should be aborted due to parse errors.

Definition at line 33 of file hdfs-scanner-ir.cc.

References impala::HdfsScanner::ReportTupleParseError(), impala::HdfsScanner::template_tuple_, impala::HdfsScanner::tuple_, impala::HdfsScanner::tuple_byte_size_, UNLIKELY, and impala::HdfsScanner::WriteCompleteTuple().

Referenced by impala::HdfsSequenceScanner::ProcessDecompressedBlock(), and impala::HdfsTextScanner::WriteFields().

bool HdfsScanner::WriteCompleteTuple	(	MemPool *	pool,
		FieldLocation *	fields,
		Tuple *	tuple,
		TupleRow *	tuple_row,
		Tuple *	template_tuple,
		uint8_t *	error_fields,
		uint8_t *	error_in_row
	)

protectedinherited

Writes out all slots for 'tuple' from 'fields'. 'fields' must be aligned to the start of the tuple (e.g. fields[0] maps to slots[0]). After writing the tuple, it will be evaluated against the conjuncts.

error_fields is an out array. error_fields[i] will be set to true if the ith field had a parse error
error_in_row is an out bool. It is set to true if any field had parse errors. Returns whether the resulting tuplerow passed the conjuncts. The parsing of the fields and evaluating against conjuncts is combined in this function. This is done so it can be possible to evaluate conjuncts as slots are materialized (on partial tuples). This function is replaced by a codegen'd function at runtime. This is the reason that the out error parameters are typed uint8_t instead of bool. We need to be able to match this function's signature identically for the codegen'd function. Bools as out parameters can get converted to bytes by the compiler and rather than implicitly depending on that to happen, we will explicitly type them to bytes. TODO: revisit this

Definition at line 217 of file hdfs-scanner.cc.

References impala::HdfsScanner::EvalConjuncts(), impala::HdfsScanner::InitTuple(), impala::FieldLocation::len, impala::HdfsScanNode::materialized_slots(), impala::HdfsScanner::scan_node_, impala::TupleRow::SetTuple(), impala::HdfsScanner::text_converter_, impala::HdfsScanNode::tuple_idx(), and UNLIKELY.

Referenced by impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsScanner::WriteAlignedTuples().

int HdfsScanner::WriteEmptyTuples	(	RowBatch *	row_batch,
		int	num_tuples
	)

protectedinherited

Utility method to write out tuples when there are no materialized fields (e.g. select count(*) or only partition keys). num_tuples - Total number of tuples to write out. Returns the number of tuples added to the row batch.

Definition at line 157 of file hdfs-scanner.cc.

References impala::RowBatch::AddRow(), impala::RowBatch::AddRows(), impala::RowBatch::AtCapacity(), impala::RowBatch::capacity(), impala::RowBatch::CommitLastRow(), impala::RowBatch::CommitRows(), impala::HdfsScanner::EvalConjuncts(), impala::RowBatch::GetRow(), impala::RowBatch::INVALID_ROW_INDEX, impala::RowBatch::num_rows(), impala::HdfsScanner::scan_node_, impala::TupleRow::SetTuple(), impala::HdfsScanner::template_tuple_, and impala::HdfsScanNode::tuple_idx().

Referenced by impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsParquetScanner::ProcessFooter(), impala::HdfsTextScanner::ProcessRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsRCFileScanner::ProcessRange().

int HdfsScanner::WriteEmptyTuples	(	ScannerContext *	context,
		TupleRow *	tuple_row,
		int	num_tuples
	)

protectedinherited

Write empty tuples and commit them to the context object.

Definition at line 195 of file hdfs-scanner.cc.

References impala::HdfsScanner::EvalConjuncts(), impala::HdfsScanner::next_row(), impala::HdfsScanner::scan_node_, impala::TupleRow::SetTuple(), impala::HdfsScanner::template_tuple_, and impala::HdfsScanNode::tuple_idx().

Member Data Documentation

const string HdfsAvroScanner::AVRO_CODEC_KEY

staticprivate

Definition at line 164 of file hdfs-avro-scanner.h.

Referenced by ParseMetadata().

const string HdfsAvroScanner::AVRO_DEFLATE_CODEC

staticprivate

Definition at line 169 of file hdfs-avro-scanner.h.

Referenced by ParseMetadata().

AvroFileHeader* impala::HdfsAvroScanner::avro_header_

private

Definition at line 160 of file hdfs-avro-scanner.h.

Referenced by InitNewRange(), MaterializeTuple(), ParseMetadata(), ReadFileHeader(), and ResolveSchemas().

const string HdfsAvroScanner::AVRO_NULL_CODEC

staticprivate

Supported codecs, as they appear in the metadata.

Definition at line 167 of file hdfs-avro-scanner.h.

Referenced by ParseMetadata().

const string HdfsAvroScanner::AVRO_SCHEMA_KEY

staticprivate

Metadata keys.

Definition at line 163 of file hdfs-avro-scanner.h.

Referenced by ParseMetadata().

const string HdfsAvroScanner::AVRO_SNAPPY_CODEC

staticprivate

Definition at line 168 of file hdfs-avro-scanner.h.

Referenced by ParseMetadata().

const uint8_t HdfsAvroScanner::AVRO_VERSION_HEADER = {'O', 'b', 'j', 1}

static

The four byte Avro version header present at the beginning of every Avro file: {'O', 'b', 'j', 1}

Definition at line 84 of file hdfs-avro-scanner.h.

Referenced by ReadFileHeader().

RowBatch* impala::HdfsScanner::batch_

protectedinherited

The current row batch being populated. Creating new row batches, attaching context resources, and handing off to the scan node is handled by this class in CommitRows(), but AttachPool() must be called by scanner subclasses to attach any memory allocated by that subclass. All row batches created by this class are transferred to the scan node (i.e., all batches are ultimately owned by the scan node).

Definition at line 177 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::AddFinalRowBatch(), impala::HdfsScanner::AttachPool(), impala::HdfsScanner::CommitRows(), impala::HdfsScanner::GetMemory(), impala::HdfsScanner::next_row(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsParquetScanner::ProcessSplit(), impala::HdfsScanner::StartNewRowBatch(), impala::HdfsTextScanner::WriteFields(), and impala::HdfsScanner::~HdfsScanner().

DecodeAvroDataFn impala::HdfsAvroScanner::codegend_decode_avro_data_

private

The codegen'd version of DecodeAvroData() if available, NULL otherwise.

Definition at line 175 of file hdfs-avro-scanner.h.

Referenced by InitNewRange(), and ProcessRange().

std::vector<ExprContext*> impala::HdfsScanner::conjunct_ctxs_

protectedinherited

ExprContext for each conjunct. Each scanner has its own ExprContexts so the conjuncts can be safely evaluated in parallel.

Definition at line 154 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::Close(), impala::HdfsScanner::CommitRows(), impala::HdfsScanner::EvalConjuncts(), impala::HdfsScanner::GetConjunctCtx(), and impala::HdfsScanner::Prepare().

ScannerContext* impala::HdfsScanner::context_

protectedinherited

Context for this scanner.

Definition at line 147 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::AddFinalRowBatch(), impala::HdfsParquetScanner::AssembleRows(), impala::HdfsScanner::CommitRows(), impala::HdfsTextScanner::FillByteBufferCompressedFile(), impala::HdfsTextScanner::FillByteBufferGzip(), impala::HdfsParquetScanner::InitColumns(), impala::HdfsTextScanner::InitNewRange(), impala::HdfsSequenceScanner::InitNewRange(), impala::HdfsScanner::Prepare(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsParquetScanner::ProcessFooter(), impala::HdfsTextScanner::ProcessRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsParquetScanner::ProcessSplit(), and impala::HdfsTextScanner::ResetScanner().

boost::scoped_ptr<MemPool> impala::HdfsScanner::data_buffer_pool_

protectedinherited

Pool to allocate per data block memory. This should be used with the decompressor and any other per data block allocations.

Definition at line 205 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::Close(), impala::BaseSequenceScanner::Close(), impala::HdfsTextScanner::FillByteBufferGzip(), ProcessRange(), impala::HdfsSequenceScanner::ReadCompressedBlock(), impala::HdfsRCFileScanner::ReadRowGroup(), impala::HdfsRCFileScanner::ResetRowGroup(), impala::HdfsScanner::UpdateDecompressor(), and impala::HdfsTextScanner::WritePartialTuple().

RuntimeProfile::Counter* impala::HdfsScanner::decompress_timer_

protectedinherited

Time spent decompressing bytes.

Definition at line 208 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::FillByteBufferCompressedFile(), impala::HdfsTextScanner::FillByteBufferGzip(), impala::HdfsSequenceScanner::GetRecord(), impala::HdfsScanner::Prepare(), ProcessRange(), impala::HdfsRCFileScanner::ReadColumnBuffers(), impala::HdfsSequenceScanner::ReadCompressedBlock(), impala::HdfsParquetScanner::BaseColumnReader::ReadDataPage(), and impala::HdfsRCFileScanner::ReadKeyBuffers().

THdfsCompression::type impala::HdfsScanner::decompression_type_

protectedinherited

The most recently used decompression type.

Definition at line 201 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::FillByteBuffer(), impala::HdfsTextScanner::FillByteBufferCompressedFile(), and impala::HdfsScanner::UpdateDecompressor().

boost::scoped_ptr<Codec> impala::HdfsScanner::decompressor_

protectedinherited

Decompressor class to use, if any.

Definition at line 198 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::Close(), impala::BaseSequenceScanner::Close(), impala::HdfsScanner::Close(), impala::HdfsTextScanner::FillByteBuffer(), impala::HdfsTextScanner::FillByteBufferCompressedFile(), impala::HdfsTextScanner::FillByteBufferGzip(), impala::HdfsTextScanner::FinishScanRange(), impala::HdfsSequenceScanner::GetRecord(), impala::HdfsRCFileScanner::InitNewRange(), ProcessRange(), impala::HdfsRCFileScanner::ReadColumnBuffers(), impala::HdfsSequenceScanner::ReadCompressedBlock(), impala::HdfsRCFileScanner::ReadKeyBuffers(), and impala::HdfsScanner::UpdateDecompressor().

const int impala::HdfsScanner::FILE_BLOCK_SIZE = 4096

staticinherited

Assumed size of an OS file block. Used mostly when reading file format headers, etc. This probably ought to be a derived number from the environment.

Definition at line 95 of file hdfs-scanner.h.

FileHeader* impala::BaseSequenceScanner::header_

protectedinherited

File header for this scan range. This is not owned by the parent scan node.

Definition at line 127 of file base-sequence-scanner.h.

Referenced by impala::BaseSequenceScanner::Close(), impala::HdfsSequenceScanner::GetRecord(), InitNewRange(), impala::HdfsSequenceScanner::InitNewRange(), impala::HdfsRCFileScanner::InitNewRange(), ParseMetadata(), impala::HdfsSequenceScanner::ProcessBlockCompressedScanRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), impala::BaseSequenceScanner::ProcessSplit(), impala::HdfsRCFileScanner::ReadColumnBuffers(), ReadFileHeader(), impala::HdfsSequenceScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadKeyBuffers(), impala::HdfsRCFileScanner::ReadNumColumnsMetadata(), and impala::BaseSequenceScanner::ReadSync().

const int BaseSequenceScanner::HEADER_SIZE = 1024

staticprotectedinherited

Estimate of header size in bytes. This is initial number of bytes to issue per file. If the estimate is too low, more bytes will be read as necessary.

Definition at line 121 of file base-sequence-scanner.h.

Referenced by impala::BaseSequenceScanner::IssueInitialRanges().

const char * HdfsAvroScanner::LLVM_CLASS_NAME = "class.impala::HdfsAvroScanner"

staticprivate

Definition at line 258 of file hdfs-avro-scanner.h.

Referenced by CodegenMaterializeTuple().

int impala::HdfsScanner::num_errors_in_file_

protectedinherited

Number of errors in the current file.

Definition at line 183 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::ReportTupleParseError().

int32_t impala::HdfsScanner::num_null_bytes_

protectedinherited

Number of null bytes in the tuple.

Definition at line 189 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::InitTuple().

bool impala::BaseSequenceScanner::only_parsing_header_

protectedinherited

If true, this scanner object is only for processing the header.

Definition at line 130 of file base-sequence-scanner.h.

Referenced by impala::BaseSequenceScanner::Close(), impala::BaseSequenceScanner::CloseFileRanges(), InitNewRange(), impala::HdfsSequenceScanner::InitNewRange(), impala::HdfsRCFileScanner::InitNewRange(), and impala::BaseSequenceScanner::ProcessSplit().

Status impala::HdfsScanner::parse_status_

protectedinherited

Contains current parse status to minimize the number of Status objects returned. This significantly minimizes the cross compile dependencies for llvm since status objects inline a bunch of string functions. Also, status objects aren't extremely cheap to create and destroy.

Definition at line 195 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::FillByteBufferGzip(), impala::HdfsSequenceScanner::GetRecord(), ParseMetadata(), impala::HdfsSequenceScanner::ProcessBlockCompressedScanRange(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsTextScanner::ProcessRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), impala::HdfsRCFileScanner::ProcessRange(), impala::BaseSequenceScanner::ProcessSplit(), impala::HdfsSequenceScanner::ReadBlockHeader(), impala::HdfsRCFileScanner::ReadColumnBuffers(), impala::HdfsSequenceScanner::ReadCompressedBlock(), ReadFileHeader(), impala::HdfsSequenceScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadKeyBuffers(), impala::HdfsRCFileScanner::ReadNumColumnsMetadata(), impala::HdfsRCFileScanner::ReadRowGroupHeader(), impala::BaseSequenceScanner::ReadSync(), impala::HdfsScanner::ReportColumnParseError(), impala::HdfsScanner::ReportTupleParseError(), impala::BaseSequenceScanner::SkipToSync(), and impala::HdfsTextScanner::WriteFields().

HdfsScanNode* impala::HdfsScanner::scan_node_

protectedinherited

The scan node that started this scanner.

Definition at line 141 of file hdfs-scanner.h.

RuntimeState* impala::HdfsScanner::state_

protectedinherited

RuntimeState for error reporting.

Definition at line 144 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::Close(), impala::HdfsScanner::CommitRows(), impala::HdfsTextScanner::FillByteBufferGzip(), impala::HdfsTextScanner::FinishScanRange(), impala::HdfsTextScanner::Prepare(), impala::HdfsScanner::Prepare(), impala::HdfsSequenceScanner::Prepare(), impala::HdfsSequenceScanner::ProcessBlockCompressedScanRange(), impala::HdfsRCFileScanner::ProcessRange(), impala::BaseSequenceScanner::ProcessSplit(), impala::HdfsSequenceScanner::ReadCompressedBlock(), impala::BaseSequenceScanner::ReadPastSize(), impala::HdfsRCFileScanner::ReadRowGroup(), impala::HdfsScanner::ReportColumnParseError(), impala::HdfsScanner::ReportTupleParseError(), ResolveSchemas(), impala::HdfsScanner::StartNewRowBatch(), impala::HdfsParquetScanner::ValidateColumn(), and impala::HdfsTextScanner::WriteFields().

ScannerContext::Stream* impala::HdfsScanner::stream_

protectedinherited

The first stream for context_.

Definition at line 150 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::Close(), impala::BaseSequenceScanner::Close(), impala::HdfsParquetScanner::CreateColumnReaders(), impala::HdfsRCFileScanner::DebugString(), impala::HdfsTextScanner::FillByteBuffer(), impala::HdfsTextScanner::FillByteBufferCompressedFile(), impala::HdfsTextScanner::FillByteBufferGzip(), impala::HdfsTextScanner::FindFirstTuple(), impala::HdfsTextScanner::FinishScanRange(), impala::HdfsSequenceScanner::GetRecord(), impala::HdfsTextScanner::InitNewRange(), impala::HdfsRCFileScanner::InitNewRange(), impala::HdfsRCFileScanner::NextField(), ParseMetadata(), impala::BaseSequenceScanner::Prepare(), impala::HdfsScanner::Prepare(), impala::HdfsSequenceScanner::ProcessBlockCompressedScanRange(), impala::HdfsParquetScanner::ProcessFooter(), impala::HdfsTextScanner::ProcessRange(), ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsTextScanner::ProcessSplit(), impala::BaseSequenceScanner::ProcessSplit(), impala::HdfsParquetScanner::ProcessSplit(), impala::HdfsSequenceScanner::ReadBlockHeader(), impala::HdfsRCFileScanner::ReadColumnBuffers(), impala::HdfsSequenceScanner::ReadCompressedBlock(), ReadFileHeader(), impala::HdfsSequenceScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadKeyBuffers(), impala::HdfsRCFileScanner::ReadNumColumnsMetadata(), impala::HdfsRCFileScanner::ReadRowGroupHeader(), impala::BaseSequenceScanner::ReadSync(), impala::HdfsScanner::ReportTupleParseError(), impala::BaseSequenceScanner::SkipToSync(), impala::HdfsParquetScanner::ValidateFileMetadata(), VerifyTypesMatch(), and impala::HdfsTextScanner::WriteFields().

const int impala::BaseSequenceScanner::SYNC_HASH_SIZE = 16

staticprotectedinherited

Size of the sync hash field.

Definition at line 49 of file base-sequence-scanner.h.

Referenced by impala::BaseSequenceScanner::ProcessSplit(), ReadFileHeader(), impala::HdfsSequenceScanner::ReadFileHeader(), impala::HdfsRCFileScanner::ReadFileHeader(), and impala::BaseSequenceScanner::ReadSync().

const int BaseSequenceScanner::SYNC_MARKER = -1

staticprotectedinherited

Sync indicator.

Definition at line 124 of file base-sequence-scanner.h.

Referenced by impala::HdfsSequenceScanner::ProcessRange(), and impala::HdfsRCFileScanner::ProcessRange().

Tuple* impala::HdfsScanner::template_tuple_

protectedinherited

A partially materialized tuple with only partition key slots set. The non-partition key slots are set to NULL. The template tuple must be copied into tuple_ before any of the other slots are materialized. Pointer is NULL if there are no partition key slots. This template tuple is computed once for each file and valid for the duration of that file. It is owned by the HDFS scan node.

Definition at line 164 of file hdfs-scanner.h.

Referenced by AllocateFileHeader(), impala::HdfsParquetScanner::AssembleRows(), impala::HdfsParquetScanner::CreateColumnReaders(), DecodeAvroData(), InitNewRange(), impala::HdfsScanner::Prepare(), impala::HdfsSequenceScanner::ProcessRange(), impala::HdfsRCFileScanner::ProcessRange(), ResolveSchemas(), impala::HdfsScanner::WriteAlignedTuples(), impala::HdfsScanner::WriteEmptyTuples(), and impala::HdfsTextScanner::WriteFields().

boost::scoped_ptr<TextConverter> impala::HdfsScanner::text_converter_

protectedinherited

Helper class for converting text to other types.

Definition at line 186 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::InitNewRange(), impala::HdfsSequenceScanner::InitNewRange(), impala::HdfsRCFileScanner::Prepare(), impala::HdfsRCFileScanner::ProcessRange(), impala::HdfsScanner::WriteCompleteTuple(), and impala::HdfsTextScanner::WritePartialTuple().

Tuple* impala::HdfsScanner::tuple_

protectedinherited

Current tuple pointer into tuple_mem_.

Definition at line 170 of file hdfs-scanner.h.

Referenced by impala::HdfsTextScanner::FinishScanRange(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), impala::HdfsTextScanner::ProcessRange(), impala::HdfsSequenceScanner::ProcessRange(), impala::HdfsScanner::WriteAlignedTuples(), and impala::HdfsTextScanner::WriteFields().

int impala::HdfsScanner::tuple_byte_size_

protectedinherited

Fixed size of each tuple, in bytes.

Definition at line 167 of file hdfs-scanner.h.

Referenced by impala::HdfsParquetScanner::AssembleRows(), impala::HdfsScanner::InitTuple(), impala::HdfsScanner::next_tuple(), impala::HdfsScanner::StartNewRowBatch(), and impala::HdfsScanner::WriteAlignedTuples().

uint8_t* impala::HdfsScanner::tuple_mem_

protectedinherited

The tuple memory of batch_.

Definition at line 180 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::CommitRows(), impala::HdfsScanner::GetMemory(), and impala::HdfsScanner::StartNewRowBatch().

WriteTuplesFn impala::HdfsScanner::write_tuples_fn_

protectedinherited

Jitted write tuples function pointer. Null if codegen is disabled.

Definition at line 215 of file hdfs-scanner.h.

Referenced by impala::HdfsScanner::InitializeWriteTuplesFn(), impala::HdfsSequenceScanner::ProcessDecompressedBlock(), and impala::HdfsTextScanner::WriteFields().

The documentation for this class was generated from the following files:

Classes

Public Member Functions

Static Public Member Functions

Static Public Attributes

Protected Types

Protected Member Functions

Static Protected Member Functions

Protected Attributes

Static Protected Attributes

Private Types

Private Member Functions

Static Private Member Functions

Private Attributes

Static Private Attributes

Detailed Description

Member Typedef Documentation

Constructor & Destructor Documentation

Member Function Documentation

Member Data Documentation