Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
ColumnStats.java
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 package com.cloudera.impala.catalog;
16 
17 import java.util.Set;
18 
19 import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
20 import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
21 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
22 import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
23 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
24 import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
25 import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
26 import org.slf4j.Logger;
27 import org.slf4j.LoggerFactory;
28 
31 import com.cloudera.impala.thrift.TColumnStats;
32 import com.google.common.base.Objects;
33 import com.google.common.base.Preconditions;
34 import com.google.common.collect.Sets;
35 
39 public class ColumnStats {
40  private final static Logger LOG = LoggerFactory.getLogger(ColumnStats.class);
41 
42  // Set of the currently supported column stats column types.
43  private final static Set<PrimitiveType> SUPPORTED_COL_TYPES = Sets.newHashSet(
49 
50  // in bytes: excludes serialization overhead
51  private double avgSize_;
52  // in bytes; includes serialization overhead.
53  private double avgSerializedSize_;
54  private long maxSize_; // in bytes
55  private long numDistinctValues_;
56  private long numNulls_;
57 
/**
 * Creates the stats for a column of the given type. All stats start out with the
 * values assigned by initColStats().
 */
public ColumnStats(Type colType) {
  this.initColStats(colType);
}
61 
67  private void initColStats(Type colType) {
68  avgSize_ = -1;
69  avgSerializedSize_ = -1;
70  maxSize_ = -1;
71  numDistinctValues_ = -1;
72  numNulls_ = -1;
73  if (colType.isFixedLengthType()) {
74  avgSerializedSize_ = colType.getSlotSize();
75  avgSize_ = colType.getSlotSize();
76  maxSize_ = colType.getSlotSize();
77  }
78  }
79 
84  public static ColumnStats fromExpr(Expr expr) {
85  Preconditions.checkNotNull(expr);
86  Preconditions.checkState(expr.getType().isValid());
87  ColumnStats stats = new ColumnStats(expr.getType());
88  stats.setNumDistinctValues(expr.getNumDistinctValues());
89  SlotRef slotRef = expr.unwrapSlotRef(false);
90  if (slotRef == null) return stats;
91  ColumnStats slotStats = slotRef.getDesc().getStats();
92  if (slotStats == null) return stats;
93  stats.numNulls_ = slotStats.getNumNulls();
94  stats.avgSerializedSize_ = slotStats.getAvgSerializedSize();
95  stats.avgSize_ = slotStats.getAvgSize();
96  stats.maxSize_ = slotStats.getMaxSize();
97  return stats;
98  }
99 
107  public ColumnStats add(ColumnStats other) {
108  if (numDistinctValues_ == -1 || other.numDistinctValues_ == -1) {
109  numDistinctValues_ = -1;
110  } else {
111  numDistinctValues_ += other.numDistinctValues_;
112  }
113  if (numNulls_ == -1 || other.numNulls_ == -1) {
114  numNulls_ = -1;
115  } else {
116  numNulls_ += other.numNulls_;
117  }
118  return this;
119  }
120 
121  public void setAvgSerializedSize(float avgSize) { this.avgSerializedSize_ = avgSize; }
122  public void setMaxSize(long maxSize) { this.maxSize_ = maxSize; }
123  public long getNumDistinctValues() { return numDistinctValues_; }
124  public void setNumDistinctValues(long numDistinctValues) {
125  this.numDistinctValues_ = numDistinctValues;
126  }
127  public void setNumNulls(long numNulls) { this.numNulls_ = numNulls; }
128  public double getAvgSerializedSize() { return avgSerializedSize_; }
129  public double getAvgSize() { return avgSize_; }
130  public long getMaxSize() { return maxSize_; }
131  public boolean hasNulls() { return numNulls_ > 0; }
132  public long getNumNulls() { return numNulls_; }
133  public boolean hasAvgSerializedSize() { return avgSerializedSize_ >= 0; }
134  public boolean hasMaxSize() { return maxSize_ >= 0; }
135  public boolean hasNumDistinctValues() { return numDistinctValues_ >= 0; }
136  public boolean hasStats() { return numNulls_ != -1 || numDistinctValues_ != -1; }
137 
145  public boolean update(Type colType, ColumnStatisticsData statsData) {
146  Preconditions.checkState(isSupportedColType(colType));
147  initColStats(colType);
148  boolean isCompatible = false;
149  switch (colType.getPrimitiveType()) {
150  case BOOLEAN:
151  isCompatible = statsData.isSetBooleanStats();
152  if (isCompatible) {
153  BooleanColumnStatsData boolStats = statsData.getBooleanStats();
154  numNulls_ = boolStats.getNumNulls();
155  numDistinctValues_ = (numNulls_ > 0) ? 3 : 2;
156  }
157  break;
158  case TINYINT:
159  case SMALLINT:
160  case INT:
161  case BIGINT:
162  case TIMESTAMP: // Hive and Impala use LongColumnStatsData for timestamps.
163  isCompatible = statsData.isSetLongStats();
164  if (isCompatible) {
165  LongColumnStatsData longStats = statsData.getLongStats();
166  numDistinctValues_ = longStats.getNumDVs();
167  numNulls_ = longStats.getNumNulls();
168  }
169  break;
170  case FLOAT:
171  case DOUBLE:
172  isCompatible = statsData.isSetDoubleStats();
173  if (isCompatible) {
174  DoubleColumnStatsData doubleStats = statsData.getDoubleStats();
175  numDistinctValues_ = doubleStats.getNumDVs();
176  numNulls_ = doubleStats.getNumNulls();
177  }
178  break;
179  case CHAR:
180  case VARCHAR:
181  case STRING:
182  isCompatible = statsData.isSetStringStats();
183  if (isCompatible) {
184  StringColumnStatsData stringStats = statsData.getStringStats();
185  numDistinctValues_ = stringStats.getNumDVs();
186  numNulls_ = stringStats.getNumNulls();
187  maxSize_ = stringStats.getMaxColLen();
188  avgSize_ = Double.valueOf(stringStats.getAvgColLen()).floatValue();
189  avgSerializedSize_ = avgSize_ + PrimitiveType.STRING.getSlotSize();
190  }
191  break;
192  case BINARY:
193  isCompatible = statsData.isSetStringStats();
194  if (isCompatible) {
195  BinaryColumnStatsData binaryStats = statsData.getBinaryStats();
196  numNulls_ = binaryStats.getNumNulls();
197  maxSize_ = binaryStats.getMaxColLen();
198  avgSize_ = Double.valueOf(binaryStats.getAvgColLen()).floatValue();
199  avgSerializedSize_ = avgSize_ + PrimitiveType.BINARY.getSlotSize();
200  }
201  break;
202  case DECIMAL:
203  isCompatible = statsData.isSetDecimalStats();
204  if (isCompatible) {
205  DecimalColumnStatsData decimalStats = statsData.getDecimalStats();
206  numNulls_ = decimalStats.getNumNulls();
207  numDistinctValues_ = decimalStats.getNumDVs();
208  }
209  break;
210  default:
211  Preconditions.checkState(false,
212  "Unexpected column type: " + colType.toString());
213  break;
214  }
215  return isCompatible;
216  }
217 
221  public static boolean isSupportedColType(Type colType) {
222  if (!colType.isScalarType()) return false;
223  ScalarType scalarType = (ScalarType) colType;
224  return SUPPORTED_COL_TYPES.contains(scalarType.getPrimitiveType());
225  }
226 
227  public void update(Type colType, TColumnStats stats) {
228  initColStats(colType);
229  avgSize_ = Double.valueOf(stats.getAvg_size()).floatValue();
230  if (colType.getPrimitiveType() == PrimitiveType.STRING ||
231  colType.getPrimitiveType() == PrimitiveType.BINARY) {
232  avgSerializedSize_ = colType.getSlotSize() + avgSize_;
233  }
234  maxSize_ = stats.getMax_size();
235  numDistinctValues_ = stats.getNum_distinct_values();
236  numNulls_ = stats.getNum_nulls();
237  }
238 
239  public TColumnStats toThrift() {
240  TColumnStats colStats = new TColumnStats();
241  colStats.setAvg_size(avgSize_);
242  colStats.setMax_size(maxSize_);
243  colStats.setNum_distinct_values(numDistinctValues_);
244  colStats.setNum_nulls(numNulls_);
245  return colStats;
246  }
247 
248  @Override
249  public String toString() {
250  return Objects.toStringHelper(this.getClass())
251  .add("avgSerializedSize_", avgSerializedSize_)
252  .add("maxSize_", maxSize_)
253  .add("numDistinct_", numDistinctValues_)
254  .add("numNulls_", numNulls_)
255  .toString();
256  }
257 }
static final Set< PrimitiveType > SUPPORTED_COL_TYPES
static ColumnStats fromExpr(Expr expr)
void setNumDistinctValues(long numDistinctValues)
ColumnStats add(ColumnStats other)
static boolean isSupportedColType(Type colType)
PrimitiveType getPrimitiveType()
Definition: Type.java:188
void update(Type colType, TColumnStats stats)
boolean update(Type colType, ColumnStatisticsData statsData)