doc/html/AggregationNode_8java_source.html

 // Copyright 2012 Cloudera Inc.

 //

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 //

 // http://www.apache.org/licenses/LICENSE-2.0

 //

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.


 package com.cloudera.impala.planner;


 import java.util.ArrayList;

 import java.util.List;

 import java.util.Set;


 import org.slf4j.Logger;

 import org.slf4j.LoggerFactory;


 import com.cloudera.impala.analysis.AggregateInfo;

 import com.cloudera.impala.analysis.Analyzer;

 import com.cloudera.impala.analysis.Expr;

 import com.cloudera.impala.analysis.FunctionCallExpr;

 import com.cloudera.impala.analysis.SlotId;

 import com.cloudera.impala.common.InternalException;

 import com.cloudera.impala.thrift.TAggregationNode;

 import com.cloudera.impala.thrift.TExplainLevel;

 import com.cloudera.impala.thrift.TExpr;

 import com.cloudera.impala.thrift.TPlanNode;

 import com.cloudera.impala.thrift.TPlanNodeType;

 import com.cloudera.impala.thrift.TQueryOptions;

 import com.google.common.base.Objects;

 import com.google.common.base.Preconditions;

 import com.google.common.collect.Lists;

 import com.google.common.collect.Sets;


 /**
  * Plan node that aggregates the output of its input node according to an
  * AggregateInfo (grouping exprs plus aggregate function calls).
  *
  * The node is created in the "needs finalize" state; callers may clear that via
  * unsetNeedsFinalize() and may switch the produced tuple to the intermediate tuple
  * via setIntermediateTuple().
  */
 public class AggregationNode extends PlanNode {
   private static final Logger LOG = LoggerFactory.getLogger(AggregationNode.class);

   // Default per-host memory requirement used if no valid stats are available.
   // TODO: Come up with a more useful heuristic.
   private static final long DEFAULT_PER_HOST_MEM = 128L * 1024L * 1024L;

   // Conservative minimum size of hash table for low-cardinality aggregations.
   private static final long MIN_HASH_TBL_MEM = 10L * 1024L * 1024L;

   // Describes the grouping exprs, aggregate exprs and output/intermediate tuples.
   private final AggregateInfo aggInfo_;

   // Set to true if this aggregation node needs to run the Finalize step. This
   // node is the root node of a distributed aggregation.
   private boolean needsFinalize_;

   /**
    * Creates an aggregation node on top of 'input'. The node initially produces the
    * output tuple of 'aggInfo', inherits the nullable tuple ids of its input and is
    * marked as needing finalize.
    */
   public AggregationNode(PlanNodeId id, PlanNode input, AggregateInfo aggInfo) {
     super(id, aggInfo.getOutputTupleId().asList(), "AGGREGATE");
     aggInfo_ = aggInfo;
     children_.add(input);
     nullableTupleIds_.addAll(input.getNullableTupleIds());
     needsFinalize_ = true;
   }

   /**
    * Copy-style constructor: delegates to the PlanNode (id, src, label) constructor
    * and shares the AggregateInfo and finalize flag of 'src'.
    */
   private AggregationNode(PlanNodeId id, AggregationNode src) {
     super(id, src, "AGGREGATE");
     aggInfo_ = src.aggInfo_;
     needsFinalize_ = src.needsFinalize_;
   }

   public AggregateInfo getAggInfo() { return aggInfo_; }

   /**
    * Unsets this node as requiring finalize. Only valid to call this if it is
    * currently marked as needing finalize; otherwise the state check fails.
    */
   public void unsetNeedsFinalize() {
     Preconditions.checkState(needsFinalize_);
     needsFinalize_ = false;
   }

   /**
    * Makes this node produce the intermediate tuple instead of the output tuple.
    * Requires that the node currently produces the output tuple of its AggregateInfo.
    */
   public void setIntermediateTuple() {
     Preconditions.checkState(!tupleIds_.isEmpty());
     Preconditions.checkState(tupleIds_.get(0).equals(aggInfo_.getOutputTupleId()));
     tupleIds_.clear();
     tupleIds_.add(aggInfo_.getIntermediateTupleId());
   }

   @Override
   public boolean isBlockingNode() { return true; }

   /**
    * Assigns conjuncts to this node (where applicable), computes the memory layout of
    * the output and intermediate tuples, computes stats, and substitutes the
    * aggregate info with the combined smap of the children.
    */
   @Override
   public void init(Analyzer analyzer) throws InternalException {
     // Assign predicates to the top-most agg in the single-node plan that can evaluate
     // them, as follows: For non-distinct aggs place them in the 1st phase agg node. For
     // distinct aggs place them in the 2nd phase agg node. The conjuncts are
     // transferred to the proper place in the multi-node plan via transferConjuncts().
     if (tupleIds_.get(0).equals(aggInfo_.getResultTupleId()) && !aggInfo_.isMerge()) {
       // Ignore predicates bound to a group-by slot because those
       // are already evaluated below this agg node (e.g., in a scan).
       // The first getGroupingExprs().size() slots of the output tuple are collected
       // here as the group-by slots.
       Set<SlotId> groupBySlots = Sets.newHashSet();
       int numGroupingExprs = aggInfo_.getGroupingExprs().size();
       for (int i = 0; i < numGroupingExprs; ++i) {
         groupBySlots.add(aggInfo_.getOutputTupleDesc().getSlots().get(i).getId());
       }
       ArrayList<Expr> bindingPredicates =
           analyzer.getBoundPredicates(tupleIds_.get(0), groupBySlots, true);
       conjuncts_.addAll(bindingPredicates);

       // also add remaining unassigned conjuncts_
       assignConjuncts(analyzer);

       analyzer.createEquivConjuncts(tupleIds_.get(0), conjuncts_, groupBySlots);
     }
     // Compute the mem layout for both tuples here for simplicity.
     aggInfo_.getOutputTupleDesc().computeMemLayout();
     aggInfo_.getIntermediateTupleDesc().computeMemLayout();

     // do this at the end so it can take all conjuncts into account
     computeStats(analyzer);

     // don't call createDefaultSMap(), it would point our conjuncts (= Having clause)
     // to our input; our conjuncts don't get substituted because they already
     // refer to our output
     outputSmap_ = getCombinedChildSmap();
     aggInfo_.substitute(outputSmap_, analyzer);
     // assert consistent aggregate expr and slot materialization
     aggInfo_.checkConsistency();
   }

   /**
    * Estimates the output cardinality as the number of distinct values of the
    * grouping exprs (1 if there are none), scaled by the selectivity of the conjuncts
    * and capped by the cardinality of the input when that is known.
    */
   @Override
   public void computeStats(Analyzer analyzer) {
     super.computeStats(analyzer);
     // This is prone to overflow, because we keep multiplying cardinalities,
     // even if the grouping exprs are functionally dependent (example:
     // group by the primary key of a table plus a number of other columns from that
     // same table)
     // TODO: try to recognize functional dependencies
     // TODO: as a shortcut, instead of recognizing functional dependencies,
     // limit the contribution of a single table to the number of rows
     // of that table (so that when we're grouping by the primary key col plus
     // some others, the estimate doesn't overshoot dramatically)
     // cardinality: product of # of distinct values produced by grouping exprs

     // Any non-grouping aggregation has at least one distinct value
     cardinality_ = aggInfo_.getGroupingExprs().isEmpty()
         ? 1 : Expr.getNumDistinctValues(aggInfo_.getGroupingExprs());
     LOG.trace("Agg: cardinality={}", cardinality_);

     // take HAVING predicate into account
     if (cardinality_ > 0) {
       // Evaluate the selectivity once and reuse it for both the estimate and the log.
       double selectivity = computeSelectivity();
       cardinality_ = Math.round((double) cardinality_ * selectivity);
       LOG.trace("sel={}", selectivity);
     }

     // if we ended up with an overflow, the estimate is certain to be wrong
     // NOTE(review): Math.round() saturates at Long.MAX_VALUE instead of wrapping, so
     // this check presumably only catches negative distinct-value estimates - confirm.
     if (cardinality_ < 0) cardinality_ = -1;

     // Sanity check the cardinality_ based on the input cardinality_.
     long inputCardinality = getChild(0).getCardinality();
     if (inputCardinality != -1) {
       if (cardinality_ == -1) {
         // A worst-case cardinality_ is better than an unknown cardinality_.
         cardinality_ = inputCardinality;
       } else {
         // An AggregationNode cannot increase the cardinality_.
         cardinality_ = Math.min(inputCardinality, cardinality_);
       }
     }
     LOG.trace("stats Agg: cardinality={}", cardinality_);
   }

   @Override
   protected String debugString() {
     return Objects.toStringHelper(this)
         .add("aggInfo", aggInfo_.debugString())
         .addValue(super.debugString())
         .toString();
   }

   /**
    * Fills in the aggregation-specific part of 'msg': the materialized aggregate
    * functions, the intermediate and output tuple ids, the finalize flag and, if
    * present, the grouping exprs.
    */
   @Override
   protected void toThrift(TPlanNode msg) {
     msg.node_type = TPlanNodeType.AGGREGATION_NODE;

     List<TExpr> aggregateFunctions = Lists.newArrayList();
     // only serialize agg exprs that are being materialized
     for (FunctionCallExpr e: aggInfo_.getMaterializedAggregateExprs()) {
       aggregateFunctions.add(e.treeToThrift());
     }
     aggInfo_.checkConsistency();
     msg.agg_node = new TAggregationNode(
         aggregateFunctions,
         aggInfo_.getIntermediateTupleId().asInt(),
         aggInfo_.getOutputTupleId().asInt(), needsFinalize_);
     List<Expr> groupingExprs = aggInfo_.getGroupingExprs();
     if (groupingExprs != null) {
       msg.agg_node.setGrouping_exprs(Expr.treesToThrift(groupingExprs));
     }
   }

   /**
    * Returns "FINALIZE" if this node needs to run the Finalize step, null otherwise.
    */
   @Override
   protected String getDisplayLabelDetail() {
     return needsFinalize_ ? "FINALIZE" : null;
   }

   /**
    * Builds the explain text for this node: a header line with the display label
    * (plus the label detail in brackets, if any) and, from STANDARD detail level
    * upwards, one line each for the aggregate exprs, grouping exprs and conjuncts
    * that are non-empty.
    */
   @Override
   protected String getNodeExplainString(String prefix, String detailPrefix,
       TExplainLevel detailLevel) {
     StringBuilder output = new StringBuilder();
     String nameDetail = getDisplayLabelDetail();
     output.append(prefix).append(getDisplayLabel());
     if (nameDetail != null) output.append(" [").append(nameDetail).append("]");
     output.append("\n");

     if (detailLevel.ordinal() >= TExplainLevel.STANDARD.ordinal()) {
       if (aggInfo_.getAggregateExprs() != null &&
           aggInfo_.getAggregateExprs().size() > 0) {
         output.append(detailPrefix).append("output: ")
             .append(getExplainString(aggInfo_.getAggregateExprs())).append("\n");
       }
       // TODO: is this the best way to display this. It currently would
       // have DISTINCT_PC(DISTINCT_PC(col)) for the merge phase but not
       // very obvious what that means if you don't already know.

       // TODO: group by can be very long. Break it into multiple lines
       if (!aggInfo_.getGroupingExprs().isEmpty()) {
         output.append(detailPrefix).append("group by: ")
             .append(getExplainString(aggInfo_.getGroupingExprs())).append("\n");
       }
       if (!conjuncts_.isEmpty()) {
         output.append(detailPrefix).append("having: ")
             .append(getExplainString(conjuncts_)).append("\n");
       }
     }
     return output.toString();
   }

   /**
    * Estimates the per-host memory cost of this node's hash table. Falls back to
    * DEFAULT_PER_HOST_MEM when the fragment reports an unknown (-1) number of
    * distinct grouping values, and never estimates less than MIN_HASH_TBL_MEM.
    */
   @Override
   public void computeCosts(TQueryOptions queryOptions) {
     Preconditions.checkNotNull(fragment_,
         "PlanNode must be placed into a fragment before calling this method.");
     perHostMemCost_ = 0;
     long perHostCardinality = fragment_.getNumDistinctValues(aggInfo_.getGroupingExprs());
     if (perHostCardinality == -1) {
       perHostMemCost_ = DEFAULT_PER_HOST_MEM;
       return;
     }

     // Per-host cardinality cannot be greater than the total output cardinality.
     if (cardinality_ != -1) {
       perHostCardinality = Math.min(perHostCardinality, cardinality_);
     }
     // Promote to double before multiplying: long * float would otherwise be
     // evaluated in single precision and round large cardinalities.
     double hashTblMem = (double) perHostCardinality * avgRowSize_
         * PlannerContext.HASH_TBL_SPACE_OVERHEAD;
     perHostMemCost_ = (long) Math.max(hashTblMem, MIN_HASH_TBL_MEM);
   }
 }

com.cloudera.impala.analysis.AggregateInfo.debugString
String debugString()
Definition: AggregateInfo.java:645

com.cloudera.impala.planner.AggregationNode.init
void init(Analyzer analyzer)
Definition: AggregationNode.java:105

com.cloudera.impala.planner.PlanNode.perHostMemCost_
long perHostMemCost_
Definition: PlanNode.java:114

com.cloudera.impala.planner.AggregationNode.needsFinalize_
boolean needsFinalize_
Definition: AggregationNode.java:59

com.cloudera.impala.planner.PlanNode.getCardinality
long getCardinality()
Definition: PlanNode.java:172

com.cloudera.impala.analysis.FunctionCallExpr
Definition: FunctionCallExpr.java:37

com.cloudera.impala.planner.PlanNode.assignConjuncts
void assignConjuncts(Analyzer analyzer)
Definition: PlanNode.java:401

com.cloudera.impala.analysis.AggregateInfoBase.getGroupingExprs
ArrayList< Expr > getGroupingExprs()
Definition: AggregateInfoBase.java:156

com.cloudera.impala.planner.PlanNode.tupleIds_
ArrayList< TupleId > tupleIds_
Definition: PlanNode.java:74

com.cloudera.impala.planner.PlanNode
Definition: PlanNode.java:59

com.cloudera.impala.planner.AggregationNode.AggregationNode
AggregationNode(PlanNodeId id, AggregationNode src)
Definition: AggregationNode.java:75

com.cloudera.impala.planner.PlanNode.getCombinedChildSmap
ExprSubstitutionMap getCombinedChildSmap()
Definition: PlanNode.java:410

com.cloudera.impala.planner.PlanNode.getExplainString
String getExplainString()
Definition: PlanNode.java:219

com.cloudera.impala.planner.AggregationNode.computeStats
void computeStats(Analyzer analyzer)
Definition: AggregationNode.java:143

com.cloudera.impala.planner.AggregationNode.DEFAULT_PER_HOST_MEM
static final long DEFAULT_PER_HOST_MEM
Definition: AggregationNode.java:50

com.cloudera.impala.analysis.AggregateInfoBase.getOutputTupleId
TupleId getOutputTupleId()
Definition: AggregateInfoBase.java:161

com.cloudera.impala.planner.PlanNode.cardinality_
long cardinality_
Definition: PlanNode.java:103

com.cloudera.impala.planner.AggregationNode.LOG
static final Logger LOG
Definition: AggregationNode.java:46

com.cloudera.impala.analysis.AggregateInfoBase.getAggregateExprs
ArrayList< FunctionCallExpr > getAggregateExprs()
Definition: AggregateInfoBase.java:157

com.cloudera.impala.planner.AggregationNode.debugString
String debugString()
Definition: AggregationNode.java:181

com.cloudera.impala.planner.AggregationNode.getNodeExplainString
String getNodeExplainString(String prefix, String detailPrefix, TExplainLevel detailLevel)
Definition: AggregationNode.java:215

com.cloudera.impala.planner.AggregationNode.toThrift
void toThrift(TPlanNode msg)
Definition: AggregationNode.java:189

com.cloudera.impala.analysis.Expr
Definition: Expr.java:48

com.cloudera.impala.planner.PlanNode.fragment_
PlanFragment fragment_
Definition: PlanNode.java:90

com.cloudera.impala.analysis.AggregateInfo.getMaterializedAggregateExprs
ArrayList< FunctionCallExpr > getMaterializedAggregateExprs()
Definition: AggregateInfo.java:252

impala::SlotId
int SlotId
Definition: global-types.h:24

com.cloudera.impala.planner.AggregationNode.computeCosts
void computeCosts(TQueryOptions queryOptions)
Definition: AggregationNode.java:247

com.cloudera.impala.planner.PlanNode.getId
PlanNodeId getId()
Definition: PlanNode.java:164

com.cloudera.impala.analysis.AggregateInfo
Definition: AggregateInfo.java:66

com.cloudera.impala.planner.PlanNode.conjuncts_
List< Expr > conjuncts_
Definition: PlanNode.java:86

com.cloudera.impala.planner.AggregationNode.setIntermediateTuple
void setIntermediateTuple()
Definition: AggregationNode.java:94

com.cloudera.impala.common.InternalException
Definition: InternalException.java:21

com.cloudera.impala.planner.PlanNode.avgRowSize_
float avgRowSize_
Definition: PlanNode.java:110

com.cloudera.impala.planner.PlanNodeId
Definition: PlanNodeId.java:20

com.cloudera.impala.analysis.Analyzer
Definition: Analyzer.java:105

com.cloudera.impala.planner.AggregationNode.AggregationNode
AggregationNode(PlanNodeId id, PlanNode input, AggregateInfo aggInfo)
Definition: AggregationNode.java:64

com.cloudera.impala.planner.PlanNode.computeMemLayout
void computeMemLayout(Analyzer analyzer)
Definition: PlanNode.java:475

com.cloudera.impala.planner.AggregationNode.getAggInfo
AggregateInfo getAggInfo()
Definition: AggregationNode.java:81

com.cloudera.impala.planner.AggregationNode
Definition: AggregationNode.java:45

com.cloudera.impala.planner.PlannerContext.HASH_TBL_SPACE_OVERHEAD
static final double HASH_TBL_SPACE_OVERHEAD
Definition: PlannerContext.java:31

com.cloudera.impala.planner.AggregationNode.unsetNeedsFinalize
void unsetNeedsFinalize()
Definition: AggregationNode.java:85

com.cloudera.impala.planner.AggregationNode.aggInfo_
final AggregateInfo aggInfo_
Definition: AggregationNode.java:55

com.cloudera.impala.planner.AggregationNode.MIN_HASH_TBL_MEM
static final long MIN_HASH_TBL_MEM
Definition: AggregationNode.java:53

com.cloudera.impala.planner.PlanNode.computeSelectivity
double computeSelectivity()
Definition: PlanNode.java:484

com.cloudera.impala.planner.AggregationNode.isBlockingNode
boolean isBlockingNode()
Definition: AggregationNode.java:102

com.cloudera.impala.planner.AggregationNode.getDisplayLabelDetail
String getDisplayLabelDetail()
Definition: AggregationNode.java:209

com.cloudera.impala.analysis.AggregateInfoBase.getIntermediateTupleId
TupleId getIntermediateTupleId()
Definition: AggregateInfoBase.java:160

com.cloudera.impala.planner.PlanNode.getDisplayLabel
final String getDisplayLabel()
Definition: PlanNode.java:225

com.cloudera.impala.planner.PlanNode.outputSmap_
ExprSubstitutionMap outputSmap_
Definition: PlanNode.java:93