doc/html/partitioned-aggregation-node_8h_source.html

 // Copyright 2012 Cloudera Inc.

 //

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 //

 // http://www.apache.org/licenses/LICENSE-2.0

 //

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.


 #ifndef IMPALA_EXEC_PARTITIONED_AGGREGATION_NODE_H

 #define IMPALA_EXEC_PARTITIONED_AGGREGATION_NODE_H


 #include <functional>

 #include <boost/scoped_ptr.hpp>


 #include "exec/exec-node.h"

 #include "exec/hash-table.h"

 #include "runtime/buffered-block-mgr.h"

 #include "runtime/buffered-tuple-stream.h"

 #include "runtime/descriptors.h"  // for TupleId

 #include "runtime/mem-pool.h"

 #include "runtime/string-value.h"


 // Forward declaration of llvm::Function. In this header it is only used as a
 // pointer return type (see the Codegen*() methods of PartitionedAggregationNode).
 namespace llvm {

   class Function;

 }


 namespace impala {


 // Forward declarations. The types that are used in this header are only used via
 // pointers (AggFnEvaluator, RowBatch, RuntimeState, Tuple, TupleDescriptor,
 // SlotDescriptor).
 // NOTE(review): LlvmCodeGen and StringValue are not referenced by any declaration
 // visible in this header, and "runtime/string-value.h" is already included above, so
 // these two declarations look redundant - confirm before removing.
 class AggFnEvaluator;

 class LlvmCodeGen;

 class RowBatch;

 class RuntimeState;

 struct StringValue;

 class Tuple;

 class TupleDescriptor;

 class SlotDescriptor;


 // Exec node that performs aggregation over hash partitions: rows are partitioned
 // into hash_partitions_, partitions can be spilled (see spilled_partitions_ and
 // Partition::Spill()), and spilled data can be repartitioned (see num_repartitions_).
 // NOTE(review): only the declarations are visible here; the algorithm itself lives in
 // partitioned-aggregation-node.cc / partitioned-aggregation-node-ir.cc.

 class PartitionedAggregationNode : public ExecNode {

  public:

   // Constructs the node from 'tnode' and 'descs'; 'pool' is an ObjectPool.
   // NOTE(review): presumably these are forwarded to the ExecNode base - confirm.
   PartitionedAggregationNode(ObjectPool* pool,

       const TPlanNode& tnode, const DescriptorTbl& descs);


   // Virtual lifecycle methods of the node.
   // NOTE(review): presumably these override the ExecNode interface declared in
   // exec/exec-node.h - confirm against the base class.
   virtual Status Init(const TPlanNode& tnode);

   virtual Status Prepare(RuntimeState* state);

   virtual Status Open(RuntimeState* state);

   virtual Status GetNext(RuntimeState* state, RowBatch* row_batch, bool* eos);

   virtual Status Reset(RuntimeState* state);

   virtual void Close(RuntimeState* state);


   // Class name string; the value is defined outside this header.
   // NOTE(review): presumably the name of this class as it appears in LLVM IR - confirm.
   static const char* LLVM_CLASS_NAME;


  protected:

   // Frees local allocations from aggregate_evaluators_ and agg_fn_ctxs.
   virtual Status QueryMaintenance(RuntimeState* state);


   // Debug-string hook taking an indentation level and an output stream.
   virtual void DebugString(int indentation_level, std::stringstream* out) const;


  private:

   // Defined further down in this class.
   struct Partition;


   // Number of initial partitions to create. Must be a power of 2.
   static const int PARTITION_FANOUT = 16;


   // Note that 1 << NUM_PARTITIONING_BITS == PARTITION_FANOUT.
   // NOTE(review): presumably the number of hash bits used to pick a partition; the two
   // constants must be kept in sync - confirm.
   static const int NUM_PARTITIONING_BITS = 4;


   // NOTE(review): presumably the maximum value allowed for Partition::level - confirm.
   static const int MAX_PARTITION_DEPTH = 16;


   // Tuple into which Update()/Merge()/Serialize() results are stored.
   TupleId intermediate_tuple_id_;

   TupleDescriptor* intermediate_tuple_desc_;


   // Row with the intermediate tuple as its only tuple.
   boost::scoped_ptr<RowDescriptor> intermediate_row_desc_;


   // NOTE(review): presumably the id and descriptor of the tuple returned by
   // GetNext() - confirm.
   TupleId output_tuple_id_;

   TupleDescriptor* output_tuple_desc_;


   // Constant for the lifetime of the node.
   // NOTE(review): presumably true if results must be finalized before being
   // returned - confirm.
   const bool needs_finalize_;


   // Contains any evaluators that require the serialize step. When set, one extra
   // buffer is counted in MinRequiredBuffers().
   bool needs_serialize_;


   std::vector<AggFnEvaluator*> aggregate_evaluators_;


   // Function contexts and their backing MemPool. Each Partition holds a clone of
   // these (see Partition::agg_fn_ctxs / Partition::agg_fn_pool).
   std::vector<impala_udf::FunctionContext*> agg_fn_ctxs_;

   boost::scoped_ptr<MemPool> agg_fn_pool_;


   // Exprs used to evaluate input rows.
   std::vector<ExprContext*> probe_expr_ctxs_;


   // NOTE(review): presumably the exprs evaluated over rows already in the hash
   // table - confirm.
   std::vector<ExprContext*> build_expr_ctxs_;


   // NOTE(review): presumably true if any grouping expr is of variable length - confirm.
   bool contains_var_len_grouping_exprs_;


   RuntimeState* state_;

   BufferedBlockMgr::Client* block_mgr_client_;


   // If true, the partitions in hash_partitions_ are using small buffers.
   bool using_small_buffers_;


   // NOTE(review): presumably the single result tuple used when there is no grouping
   // (see ProcessBatchNoGrouping()), plus a flag recording whether it has already been
   // returned - confirm.
   Tuple* singleton_output_tuple_;

   bool singleton_output_tuple_returned_;


   boost::scoped_ptr<MemPool> mem_pool_;


   boost::scoped_ptr<HashTableCtx> ht_ctx_;


   // NOTE(review): presumably the partition currently being returned by GetNext() and
   // the position within its hash table - confirm. HashTable::Iterator is an STL-like
   // iterator interface.
   Partition* output_partition_;

   HashTable::Iterator output_iterator_;


   // Jitted ProcessRowBatch function pointer. Null if codegen is disabled.
   typedef Status (*ProcessRowBatchFn)(

       PartitionedAggregationNode*, RowBatch*, HashTableCtx*);

   ProcessRowBatchFn process_row_batch_fn_;


   // Time spent processing the child rows.
   RuntimeProfile::Counter* build_timer_;


   // Total time spent resizing hash tables.
   RuntimeProfile::Counter* ht_resize_timer_;


   // Time spent returning the aggregated rows.
   RuntimeProfile::Counter* get_results_timer_;


   // Total number of hash buckets across all partitions.
   RuntimeProfile::Counter* num_hash_buckets_;


   // Total number of partitions created.
   RuntimeProfile::Counter* partitions_created_;


   // Level of max partition (i.e. number of repartitioning steps).
   RuntimeProfile::HighWaterMarkCounter* max_partition_level_;


   // Number of rows that have been repartitioned.
   RuntimeProfile::Counter* num_row_repartitioned_;


   // Number of partitions that have been repartitioned.
   RuntimeProfile::Counter* num_repartitions_;


   // Number of partitions that have been spilled.
   RuntimeProfile::Counter* num_spilled_partitions_;


   // NOTE(review): presumably the size of the largest partition as a percentage of
   // its input - confirm.
   RuntimeProfile::HighWaterMarkCounter* largest_partition_percent_;


   // State of a single partition: an optional hash table, cloned function contexts and
   // two tuple streams.
   struct Partition {

     // Records 'parent' and 'level'; the partition starts out not closed. The
     // scoped_ptr members are default-constructed (empty), so is_spilled() returns
     // true until hash_tbl is set.
     Partition(PartitionedAggregationNode* parent, int level)

       : parent(parent), is_closed(false), level(level) {}


     // NOTE(review): presumably sets up aggregated_row_stream and
     // unaggregated_row_stream - confirm.
     Status InitStreams();


     // Initializes the hash table. Returns false on OOM.
     bool InitHashTable();


     // NOTE(review): presumably releases the partition's resources; 'finalize_rows'
     // presumably controls whether remaining rows are finalized first - confirm.
     void Close(bool finalize_rows);


     // NOTE(review): presumably spills this partition; the meaning of the optional
     // 'tuple' argument is not visible from this declaration - confirm.
     Status Spill(Tuple* tuple = NULL);


     // A partition counts as spilled exactly when it has no hash table.
     bool is_spilled() const { return hash_tbl.get() == NULL; }


     PartitionedAggregationNode* parent;


     // If true, this partition is closed and there is nothing left to do.
     bool is_closed;


     // Fixed at construction.
     // NOTE(review): presumably the repartitioning depth of this partition - confirm.
     const int level;


     // NULL while the partition is spilled (see is_spilled()).
     boost::scoped_ptr<HashTable> hash_tbl;


     // Clone of parent's agg_fn_ctxs_ and backing MemPool.
     std::vector<impala_udf::FunctionContext*> agg_fn_ctxs;

     boost::scoped_ptr<MemPool> agg_fn_pool;


     // NOTE(review): presumably the stream holding rows that are already
     // aggregated - confirm.
     boost::scoped_ptr<BufferedTupleStream> aggregated_row_stream;


     // Unaggregated rows that are spilled.
     boost::scoped_ptr<BufferedTupleStream> unaggregated_row_stream;

   };


   // Current partitions we are partitioning into.
   std::vector<Partition*> hash_partitions_;


   // All partitions that have been spilled and need further processing.
   std::list<Partition*> spilled_partitions_;


   // NOTE(review): presumably partitions that are fully aggregated and waiting to be
   // returned - confirm.
   std::list<Partition*> aggregated_partitions_;


   // NOTE(review): presumably only used when needs_serialize_ is set - confirm.
   boost::scoped_ptr<BufferedTupleStream> serialize_stream_;


   // Returns a Tuple*.
   // NOTE(review): presumably the new intermediate tuple, allocated from either 'pool'
   // or 'stream' - confirm which and how failure is reported.
   Tuple* ConstructIntermediateTuple(

       const std::vector<impala_udf::FunctionContext*>& agg_fn_ctxs,

       MemPool* pool, BufferedTupleStream* stream);


   // NOTE(review): presumably applies 'row' to 'tuple' using the aggregate functions,
   // with 'is_merge' selecting Merge() instead of Update() - confirm.
   void UpdateTuple(impala_udf::FunctionContext** agg_fn_ctxs, Tuple* tuple, TupleRow* row,

                    bool is_merge = false);


   // NOTE(review): presumably converts the intermediate 'tuple' into the output tuple,
   // allocating from 'pool' if needed - confirm.
   Tuple* GetOutputTuple(const std::vector<impala_udf::FunctionContext*>& agg_fn_ctxs,

                         Tuple* tuple, MemPool* pool);


   // Has the same parameter list as ProcessBatch(); 'ht_ctx' defaults to NULL here.
   Status ProcessBatchNoGrouping(RowBatch* batch, HashTableCtx* ht_ctx = NULL);


   // Processes 'batch'. Templated on AGGREGATED_ROWS.
   // NOTE(review): presumably AGGREGATED_ROWS indicates that the input rows are
   // already aggregated, and ProcessBatch_false()/ProcessBatch_true() below are the
   // non-template entry points for the two instantiations - confirm.

   template<bool AGGREGATED_ROWS>

   Status IR_ALWAYS_INLINE ProcessBatch(RowBatch* batch, HashTableCtx* ht_ctx);


   // Reads all the rows from input_stream and processes them by calling
   // ProcessBatch().
   template<bool AGGREGATED_ROWS>

   Status ProcessStream(BufferedTupleStream* input_stream);


   // NOTE(review): presumably (re)creates hash_partitions_ at the given 'level' -
   // confirm.
   Status CreateHashPartitions(int level);


   // Returns an int64_t; does not modify the node (const).
   // NOTE(review): unit of the returned value is not visible here - confirm.
   int64_t LargestSpilledPartition() const;


   Status NextPartition();


   // Both arguments are optional and default to NULL.
   // NOTE(review): which partition is spilled when 'curr_partition' is NULL is not
   // visible from this declaration - confirm.
   Status SpillPartition(Partition* curr_partition = NULL,

       Tuple* curr_intermediate_tuple = NULL);


   // NOTE(review): presumably moves the partitions in hash_partitions_ into
   // spilled_partitions_ / aggregated_partitions_ - confirm.
   Status MoveHashPartitions(int64_t input_rows);


   // Calls finalize on all tuples starting at 'it'.
   void CleanupHashTbl(const std::vector<impala_udf::FunctionContext*>& fn_ctxs,

       HashTable::Iterator it);


   // NOTE(review): presumably the codegen'd update for a single slot; likely returns
   // NULL on failure like CodegenUpdateTuple() - confirm.
   llvm::Function* CodegenUpdateSlot(AggFnEvaluator* evaluator, SlotDescriptor* slot_desc);


   // Codegen UpdateTuple(). Returns NULL if codegen is unsuccessful.
   llvm::Function* CodegenUpdateTuple();


   // NOTE(review): presumably the codegen'd ProcessBatch(); likely returns NULL on
   // failure like CodegenUpdateTuple() - confirm.
   llvm::Function* CodegenProcessBatch();


   // Non-template functions with the same parameters as ProcessBatch().
   Status ProcessBatch_false(RowBatch* batch, HashTableCtx* ht_ctx);

   Status ProcessBatch_true(RowBatch* batch, HashTableCtx* ht_ctx);


   // Returns 2 * PARTITION_FANOUT + 1, plus one more when needs_serialize_ is set.
   // NOTE(review): presumably two buffers per partition (one for each stream in
   // Partition) - confirm what the remaining buffers are for.
   int MinRequiredBuffers() const {

     return 2 * PARTITION_FANOUT + 1 + (needs_serialize_ ? 1 : 0);

   }

 };


 }


 #endif

impala::HashTable::Iterator
stl-like iterator interface.
Definition: hash-table.h:450

impala::PartitionedAggregationNode::output_partition_
Partition * output_partition_
Definition: partitioned-aggregation-node.h:198

impala::TupleDescriptor
Definition: descriptors.h:298

impala::DescriptorTbl
Definition: descriptors.h:338

impala::BufferedTupleStream
The underlying memory management is done by the BufferedBlockMgr.
Definition: buffered-tuple-stream.h:109

impala::PartitionedAggregationNode::MoveHashPartitions
Status MoveHashPartitions(int64_t input_rows)
Definition: partitioned-aggregation-node.cc:922

impala::PartitionedAggregationNode::MinRequiredBuffers
int MinRequiredBuffers() const
Definition: partitioned-aggregation-node.h:418

impala::PartitionedAggregationNode::spilled_partitions_
std::list< Partition * > spilled_partitions_
All partitions that have been spilled and need further processing.
Definition: partitioned-aggregation-node.h:296

impala::PartitionedAggregationNode::Partition::is_closed
bool is_closed
If true, this partition is closed and there is nothing left to do.
Definition: partitioned-aggregation-node.h:266

hash-table.h

impala::PartitionedAggregationNode::needs_serialize_
bool needs_serialize_
Contains any evaluators that require the serialize step.
Definition: partitioned-aggregation-node.h:151

impala::PartitionedAggregationNode::partitions_created_
RuntimeProfile::Counter * partitions_created_
Total number of partitions created.
Definition: partitioned-aggregation-node.h:219

impala::PartitionedAggregationNode::ProcessBatch_false
Status ProcessBatch_false(RowBatch *batch, HashTableCtx *ht_ctx)
Definition: partitioned-aggregation-node-ir.cc:155

impala::PartitionedAggregationNode::output_iterator_
HashTable::Iterator output_iterator_
Definition: partitioned-aggregation-node.h:199

impala::PartitionedAggregationNode::ConstructIntermediateTuple
Tuple * ConstructIntermediateTuple(const std::vector< impala_udf::FunctionContext * > &agg_fn_ctxs, MemPool *pool, BufferedTupleStream *stream)
Definition: partitioned-aggregation-node.cc:591

impala::PartitionedAggregationNode::LLVM_CLASS_NAME
static const char * LLVM_CLASS_NAME
Definition: partitioned-aggregation-node.h:103

impala::PartitionedAggregationNode::GetOutputTuple
Tuple * GetOutputTuple(const std::vector< impala_udf::FunctionContext * > &agg_fn_ctxs, Tuple *tuple, MemPool *pool)
Definition: partitioned-aggregation-node.cc:688

impala::PartitionedAggregationNode::num_hash_buckets_
RuntimeProfile::Counter * num_hash_buckets_
Total number of hash buckets across all partitions.
Definition: partitioned-aggregation-node.h:216

impala::PartitionedAggregationNode::Partition::unaggregated_row_stream
boost::scoped_ptr< BufferedTupleStream > unaggregated_row_stream
Unaggregated rows that are spilled.
Definition: partitioned-aggregation-node.h:289

impala::PartitionedAggregationNode::Partition::agg_fn_pool
boost::scoped_ptr< MemPool > agg_fn_pool
Definition: partitioned-aggregation-node.h:280

impala::PartitionedAggregationNode::Partition::Close
void Close(bool finalize_rows)
Definition: partitioned-aggregation-node.cc:570

impala::PartitionedAggregationNode::state_
RuntimeState * state_
Definition: partitioned-aggregation-node.h:175

impala::PartitionedAggregationNode::singleton_output_tuple_returned_
bool singleton_output_tuple_returned_
Definition: partitioned-aggregation-node.h:185

impala::AggFnEvaluator
Definition: agg-fn-evaluator.h:62

impala::Tuple
A tuple with 0 materialised slots is represented as NULL.
Definition: tuple.h:48

buffered-tuple-stream.h

impala::PartitionedAggregationNode::intermediate_row_desc_
boost::scoped_ptr< RowDescriptor > intermediate_row_desc_
Row with the intermediate tuple as its only tuple.
Definition: partitioned-aggregation-node.h:137

impala::PartitionedAggregationNode::Partition
Definition: partitioned-aggregation-node.h:237

impala::PartitionedAggregationNode::num_spilled_partitions_
RuntimeProfile::Counter * num_spilled_partitions_
Number of partitions that have been spilled.
Definition: partitioned-aggregation-node.h:231

impala::PartitionedAggregationNode::serialize_stream_
boost::scoped_ptr< BufferedTupleStream > serialize_stream_
Definition: partitioned-aggregation-node.h:306

impala::PartitionedAggregationNode::CleanupHashTbl
void CleanupHashTbl(const std::vector< impala_udf::FunctionContext * > &fn_ctxs, HashTable::Iterator it)
Calls finalize on all tuples starting at 'it'.
Definition: partitioned-aggregation-node.cc:352

impala::PartitionedAggregationNode::Partition::is_spilled
bool is_spilled() const
Definition: partitioned-aggregation-node.h:261

impala::BufferedBlockMgr::Client
Definition: buffered-block-mgr.cc:45

impala::PartitionedAggregationNode::agg_fn_ctxs_
std::vector< impala_udf::FunctionContext * > agg_fn_ctxs_
Definition: partitioned-aggregation-node.h:161

impala::PartitionedAggregationNode::PARTITION_FANOUT
static const int PARTITION_FANOUT
Number of initial partitions to create. Must be a power of 2.
Definition: partitioned-aggregation-node.h:115

impala::PartitionedAggregationNode::Partition::Spill
Status Spill(Tuple *tuple=NULL)
Definition: partitioned-aggregation-node.cc:467

impala::ObjectPool
Definition: object-pool.h:30

impala::PartitionedAggregationNode::CreateHashPartitions
Status CreateHashPartitions(int level)
Definition: partitioned-aggregation-node.cc:729

impala::PartitionedAggregationNode::contains_var_len_grouping_exprs_
bool contains_var_len_grouping_exprs_
Definition: partitioned-aggregation-node.h:173

impala::SlotDescriptor
Definition: descriptors.h:75

impala::PartitionedAggregationNode::NextPartition
Status NextPartition()
Definition: partitioned-aggregation-node.cc:771

impala::PartitionedAggregationNode::output_tuple_id_
TupleId output_tuple_id_
Definition: partitioned-aggregation-node.h:141

impala::PartitionedAggregationNode::block_mgr_client_
BufferedBlockMgr::Client * block_mgr_client_
Definition: partitioned-aggregation-node.h:176

impala::TupleRow
Definition: tuple-row.h:28

impala::PartitionedAggregationNode::singleton_output_tuple_
Tuple * singleton_output_tuple_
Definition: partitioned-aggregation-node.h:184

impala::TupleId
int TupleId
Definition: global-types.h:23

impala::RuntimeProfile::Counter
Definition: runtime-profile.h:85

impala::PartitionedAggregationNode::largest_partition_percent_
RuntimeProfile::HighWaterMarkCounter * largest_partition_percent_
Definition: partitioned-aggregation-node.h:235

IR_ALWAYS_INLINE
#define IR_ALWAYS_INLINE
Definition: impala-ir.h:31

impala::PartitionedAggregationNode::num_row_repartitioned_
RuntimeProfile::Counter * num_row_repartitioned_
Number of rows that have been repartitioned.
Definition: partitioned-aggregation-node.h:225

impala::PartitionedAggregationNode::Init
virtual Status Init(const TPlanNode &tnode)
Definition: partitioned-aggregation-node.cc:82

impala::PartitionedAggregationNode::SpillPartition
Status SpillPartition(Partition *curr_partition=NULL, Tuple *curr_intermediate_tuple=NULL)
Definition: partitioned-aggregation-node.cc:872

impala_udf::FunctionContext
Definition: udf.h:47

impala::PartitionedAggregationNode::Partition::InitHashTable
bool InitHashTable()
Initializes the hash table. Returns false on OOM.
Definition: partitioned-aggregation-node.cc:454

impala::Status
Definition: status.h:81

impala::ExecNode::DebugString
std::string DebugString() const
Returns a string representation in DFS order of the plan rooted at this.
Definition: exec-node.cc:345

impala::PartitionedAggregationNode::LargestSpilledPartition
int64_t LargestSpilledPartition() const
Definition: partitioned-aggregation-node.cc:758

impala::PartitionedAggregationNode::ProcessBatch_true
Status ProcessBatch_true(RowBatch *batch, HashTableCtx *ht_ctx)
Definition: partitioned-aggregation-node-ir.cc:160

impala::MemPool
Definition: mem-pool.h:77

impala::RuntimeState
Definition: runtime-state.h:69

impala::PartitionedAggregationNode::mem_pool_
boost::scoped_ptr< MemPool > mem_pool_
Definition: partitioned-aggregation-node.h:189

impala::PartitionedAggregationNode::NUM_PARTITIONING_BITS
static const int NUM_PARTITIONING_BITS
Definition: partitioned-aggregation-node.h:121

impala::PartitionedAggregationNode::intermediate_tuple_id_
TupleId intermediate_tuple_id_
Tuple into which Update()/Merge()/Serialize() results are stored.
Definition: partitioned-aggregation-node.h:133

impala::PartitionedAggregationNode::Partition::hash_tbl
boost::scoped_ptr< HashTable > hash_tbl
Definition: partitioned-aggregation-node.h:276

pool
ObjectPool pool
Definition: expr-benchmark.cc:89

exec-node.h

impala::PartitionedAggregationNode::MAX_PARTITION_DEPTH
static const int MAX_PARTITION_DEPTH
Definition: partitioned-aggregation-node.h:130

impala::PartitionedAggregationNode::Close
virtual void Close(RuntimeState *state)
Definition: partitioned-aggregation-node.cc:380

impala::PartitionedAggregationNode::max_partition_level_
RuntimeProfile::HighWaterMarkCounter * max_partition_level_
Level of max partition (i.e. number of repartitioning steps).
Definition: partitioned-aggregation-node.h:222

impala::PartitionedAggregationNode::using_small_buffers_
bool using_small_buffers_
If true, the partitions in hash_partitions_ are using small buffers.
Definition: partitioned-aggregation-node.h:179

impala::PartitionedAggregationNode::Reset
virtual Status Reset(RuntimeState *state)
Definition: partitioned-aggregation-node.cc:375

impala::PartitionedAggregationNode::Partition::parent
PartitionedAggregationNode * parent
Definition: partitioned-aggregation-node.h:263

impala::PartitionedAggregationNode::hash_partitions_
std::vector< Partition * > hash_partitions_
Current partitions we are partitioning into.
Definition: partitioned-aggregation-node.h:293

impala::PartitionedAggregationNode::build_timer_
RuntimeProfile::Counter * build_timer_
Time spent processing the child rows.
Definition: partitioned-aggregation-node.h:207

impala::RowBatch
Definition: row-batch.h:66

impala::PartitionedAggregationNode::CodegenUpdateTuple
llvm::Function * CodegenUpdateTuple()
Codegen UpdateTuple(). Returns NULL if codegen is unsuccessful.
Definition: partitioned-aggregation-node.cc:1241

impala::PartitionedAggregationNode::aggregated_partitions_
std::list< Partition * > aggregated_partitions_
Definition: partitioned-aggregation-node.h:301

impala::PartitionedAggregationNode::intermediate_tuple_desc_
TupleDescriptor * intermediate_tuple_desc_
Definition: partitioned-aggregation-node.h:134

impala::PartitionedAggregationNode::CodegenProcessBatch
llvm::Function * CodegenProcessBatch()
Definition: partitioned-aggregation-node.cc:1356

impala::PartitionedAggregationNode::num_repartitions_
RuntimeProfile::Counter * num_repartitions_
Number of partitions that have been repartitioned.
Definition: partitioned-aggregation-node.h:228

impala::PartitionedAggregationNode::needs_finalize_
const bool needs_finalize_
Definition: partitioned-aggregation-node.h:148

impala::PartitionedAggregationNode::Partition::Partition
Partition(PartitionedAggregationNode *parent, int level)
Definition: partitioned-aggregation-node.h:238

impala::PartitionedAggregationNode::output_tuple_desc_
TupleDescriptor * output_tuple_desc_
Definition: partitioned-aggregation-node.h:142

impala::RuntimeProfile::HighWaterMarkCounter
Definition: runtime-profile.h:125

impala::PartitionedAggregationNode::Partition::agg_fn_ctxs
std::vector< impala_udf::FunctionContext * > agg_fn_ctxs
Clone of parent's agg_fn_ctxs_ and backing MemPool.
Definition: partitioned-aggregation-node.h:279

impala::PartitionedAggregationNode::process_row_batch_fn_
ProcessRowBatchFn process_row_batch_fn_
Jitted ProcessRowBatch function pointer. Null if codegen is disabled.
Definition: partitioned-aggregation-node.h:204

impala::PartitionedAggregationNode::Partition::aggregated_row_stream
boost::scoped_ptr< BufferedTupleStream > aggregated_row_stream
Definition: partitioned-aggregation-node.h:286

impala::PartitionedAggregationNode::get_results_timer_
RuntimeProfile::Counter * get_results_timer_
Time spent returning the aggregated rows.
Definition: partitioned-aggregation-node.h:213

mem-pool.h

impala::PartitionedAggregationNode::QueryMaintenance
virtual Status QueryMaintenance(RuntimeState *state)
Frees local allocations from aggregate_evaluators_ and agg_fn_ctxs.
Definition: partitioned-aggregation-node.cc:968

impala::ExecNode
Definition: exec-node.h:46

impala::PartitionedAggregationNode::ProcessBatchNoGrouping
Status ProcessBatchNoGrouping(RowBatch *batch, HashTableCtx *ht_ctx=NULL)
Definition: partitioned-aggregation-node-ir.cc:24

impala::PartitionedAggregationNode::Prepare
virtual Status Prepare(RuntimeState *state)
Definition: partitioned-aggregation-node.cc:95

impala::PartitionedAggregationNode::agg_fn_pool_
boost::scoped_ptr< MemPool > agg_fn_pool_
Definition: partitioned-aggregation-node.h:162

impala::PartitionedAggregationNode::Partition::level
const int level
Definition: partitioned-aggregation-node.h:271

impala::PartitionedAggregationNode::aggregate_evaluators_
std::vector< AggFnEvaluator * > aggregate_evaluators_
Definition: partitioned-aggregation-node.h:153

buffered-block-mgr.h

impala::PartitionedAggregationNode::probe_expr_ctxs_
std::vector< ExprContext * > probe_expr_ctxs_
Exprs used to evaluate input rows.
Definition: partitioned-aggregation-node.h:165

impala::PartitionedAggregationNode::build_expr_ctxs_
std::vector< ExprContext * > build_expr_ctxs_
Definition: partitioned-aggregation-node.h:169

impala::PartitionedAggregationNode::ProcessStream
Status ProcessStream(BufferedTupleStream *input_stream)
Reads all the rows from input_stream and processes them by calling ProcessBatch().
Definition: partitioned-aggregation-node.cc:849

impala::PartitionedAggregationNode::Partition::InitStreams
Status InitStreams()
Definition: partitioned-aggregation-node.cc:429

impala::PartitionedAggregationNode::ProcessRowBatchFn
Status(* ProcessRowBatchFn)(PartitionedAggregationNode *, RowBatch *, HashTableCtx *)
Definition: partitioned-aggregation-node.h:201

descriptors.h

string-value.h

impala::PartitionedAggregationNode::UpdateTuple
void UpdateTuple(impala_udf::FunctionContext **agg_fn_ctxs, Tuple *tuple, TupleRow *row, bool is_merge=false)
Definition: partitioned-aggregation-node.cc:676

impala::HashTableCtx
Definition: hash-table.h:104

impala::PartitionedAggregationNode::Open
virtual Status Open(RuntimeState *state)
Definition: partitioned-aggregation-node.cc:209

impala::PartitionedAggregationNode::GetNext
virtual Status GetNext(RuntimeState *state, RowBatch *row_batch, bool *eos)
Definition: partitioned-aggregation-node.cc:271

impala::PartitionedAggregationNode::ht_ctx_
boost::scoped_ptr< HashTableCtx > ht_ctx_
Definition: partitioned-aggregation-node.h:194

impala::PartitionedAggregationNode::CodegenUpdateSlot
llvm::Function * CodegenUpdateSlot(AggFnEvaluator *evaluator, SlotDescriptor *slot_desc)
Definition: partitioned-aggregation-node.cc:1055

impala::PartitionedAggregationNode
Definition: partitioned-aggregation-node.h:91

impala::PartitionedAggregationNode::ht_resize_timer_
RuntimeProfile::Counter * ht_resize_timer_
Total time spent resizing hash tables.
Definition: partitioned-aggregation-node.h:210

impala::PartitionedAggregationNode::ProcessBatch
Status IR_ALWAYS_INLINE ProcessBatch(RowBatch *batch, HashTableCtx *ht_ctx)

impala::PartitionedAggregationNode::PartitionedAggregationNode
PartitionedAggregationNode(ObjectPool *pool, const TPlanNode &tnode, const DescriptorTbl &descs)
Definition: partitioned-aggregation-node.cc:56