15 package com.cloudera.impala.analysis;
17 import java.util.Iterator;
18 import java.util.List;
20 import org.apache.hadoop.hive.metastore.api.FieldSchema;
21 import org.apache.log4j.Logger;
32 import com.cloudera.impala.thrift.TComputeStatsParams;
33 import com.cloudera.impala.thrift.TPartitionStats;
34 import com.cloudera.impala.thrift.TTableName;
35 import com.google.common.base.Joiner;
36 import com.google.common.base.Preconditions;
37 import com.google.common.collect.Lists;
53 private static final Logger
LOG = Logger.getLogger(ComputeStatsStmt.class);
56 "'%s' because its column definitions do not match those in the Avro schema.";
58 "column definitions, e.g., using the result of 'SHOW CREATE TABLE'";
112 this(tableName,
false, null);
123 Preconditions.checkState(tableName != null && !tableName.isEmpty());
124 Preconditions.checkState(isIncremental || partSpec == null);
125 this.tableName_ = tableName;
127 this.isIncremental_ = isIncremental;
128 this.partitionSpec_ = partSpec;
130 partitionSpec_.setTableName(tableName);
140 List<String> groupByCols) {
141 for (
int i = 0; i < table.getNumClusteringCols(); ++i) {
142 String colRefSql = ToSqlUtils.getIdentSql(table.getColumns().
get(i).getName());
143 groupByCols.add(colRefSql);
146 selectList.add(colRefSql);
151 List<String> columnStatsSelectList = Lists.newArrayList();
158 for (
int i = startColIdx; i < table_.getColumns().size(); ++i) {
159 Column c = table_.getColumns().
get(i);
160 Type type = c.getType();
164 if (!type.
isValid() || !type.isSupported()
165 || c.
getType().isComplexType()) {
170 String colRefSql = ToSqlUtils.getIdentSql(c.getName());
171 columnStatsSelectList.add(ndvUda +
"(" + colRefSql +
") AS " + colRefSql);
175 columnStatsSelectList.add(
"COUNT(IF(" + colRefSql +
" IS NULL, 1, NULL))");
182 columnStatsSelectList.add(
"CAST(-1 as BIGINT)");
187 columnStatsSelectList.add(
"MAX(length(" + colRefSql +
"))");
188 columnStatsSelectList.add(
"AVG(length(" + colRefSql +
"))");
193 Integer typeSize = type.getPrimitiveType().getSlotSize();
194 columnStatsSelectList.add(typeSize.toString());
195 columnStatsSelectList.add(
"CAST(" + typeSize.toString() +
" as DOUBLE)");
200 columnStatsSelectList.add(
"COUNT(" + colRefSql +
")");
203 return columnStatsSelectList;
244 String sqlTableName = table_.getTableName().
toSql();
247 "COMPUTE STATS not supported for view %s", sqlTableName));
261 HdfsTable hdfsTable = null;
262 if (
table_ instanceof HdfsTable) {
263 hdfsTable = (HdfsTable)
table_;
267 "Can't compute PARTITION stats on an unpartitioned table: %s",
270 partitionSpec_.setPartitionShouldExist();
271 partitionSpec_.analyze(analyzer);
276 if (!kv.isStatic()) {
286 List<String> filterPreds = Lists.newArrayList();
292 boolean tableIsMissingColStats =
false;
297 boolean allColumnsMissingStats =
true;
298 String exampleColumnMissingStats = null;
301 if (!col.getStats().hasStats()) {
302 if (!tableIsMissingColStats) {
303 tableIsMissingColStats =
true;
304 exampleColumnMissingStats = col.getName();
307 allColumnsMissingStats =
false;
311 if (tableIsMissingColStats && !allColumnsMissingStats) {
312 analyzer.addWarning(
"Column " + exampleColumnMissingStats +
313 " does not have statistics, recomputing stats for the whole table");
317 if (p.isDefaultPartition())
continue;
318 TPartitionStats partStats = p.getPartitionStats();
319 if (!p.hasIncrementalStats() || tableIsMissingColStats) {
320 if (partStats == null) LOG.trace(p.toString() +
" does not have stats");
321 if (!tableIsMissingColStats) filterPreds.add(p.getConjunctSql());
322 List<String> partValues = Lists.newArrayList();
323 for (
LiteralExpr partValue: p.getPartitionValues()) {
324 partValues.add(PartitionKeyValue.getPartitionKeyValueString(partValue,
327 expectedPartitions_.add(partValues);
329 LOG.trace(p.toString() +
" does have statistics");
330 validPartStats_.add(partStats);
334 expectedPartitions_.clear();
339 List<String> partitionConjuncts = Lists.newArrayList();
341 partitionConjuncts.add(kv.toPredicateSql());
343 filterPreds.add(
"(" + Joiner.on(
" AND ").join(partitionConjuncts) +
")");
345 hdfsTable.getPartition(partitionSpec_.getPartitionSpecKeyValues());
347 if (p.isDefaultPartition())
continue;
348 if (p == targetPartition)
continue;
349 TPartitionStats partStats = p.getPartitionStats();
350 if (partStats != null) validPartStats_.add(partStats);
354 if (filterPreds.size() == 0 && validPartStats_.size() != 0) {
355 LOG.info(
"No partitions selected for incremental stats update");
356 analyzer.addWarning(
"No partitions selected for incremental stats update");
365 "Too many partitions selected, doing full recomputation of incremental stats");
367 validPartStats_.clear();
370 List<String> groupByCols = Lists.newArrayList();
371 List<String> partitionColsSelectList = Lists.newArrayList();
373 if (hdfsTable != null) {
379 StringBuilder tableStatsQueryBuilder =
new StringBuilder(
"SELECT ");
380 List<String> tableStatsSelectList = Lists.newArrayList();
381 tableStatsSelectList.add(
"COUNT(*)");
383 tableStatsSelectList.addAll(partitionColsSelectList);
384 tableStatsQueryBuilder.append(Joiner.on(
", ").join(tableStatsSelectList));
385 tableStatsQueryBuilder.append(
" FROM " + sqlTableName);
390 if (
isIncremental_) columnStatsSelectList.addAll(partitionColsSelectList);
392 StringBuilder columnStatsQueryBuilder =
new StringBuilder(
"SELECT ");
393 columnStatsQueryBuilder.append(Joiner.on(
", ").join(columnStatsSelectList));
394 columnStatsQueryBuilder.append(
" FROM " + sqlTableName);
402 if (filterPreds.size() > 0 &&
404 String filterClause =
" WHERE " + Joiner.on(
" OR ").join(filterPreds);
405 columnStatsQueryBuilder.append(filterClause);
406 tableStatsQueryBuilder.append(filterClause);
409 if (groupByCols.size() > 0) {
410 String groupBy =
" GROUP BY " + Joiner.on(
", ").join(groupByCols);
412 tableStatsQueryBuilder.append(groupBy);
418 if (columnStatsSelectList.isEmpty()) {
420 LOG.info(
"No supported column types in table " + table_.getTableName() +
421 ", no column statistics will be gathered.");
438 Preconditions.checkState(table.isAvroTable());
439 org.apache.hadoop.hive.metastore.api.Table msTable = table.getMetaStoreTable();
441 Iterator<FieldSchema> colDefs = msTable.getSd().getCols().iterator();
445 Iterator<Column> avroSchemaCols = table.getColumns().iterator();
448 for (
int i = 0; i < table.getNumClusteringCols(); ++i) {
449 if (avroSchemaCols.hasNext()) avroSchemaCols.next();
452 while (colDefs.hasNext() || avroSchemaCols.hasNext()) {
453 if (colDefs.hasNext() && avroSchemaCols.hasNext()) {
454 FieldSchema colDef = colDefs.next();
455 Column avroSchemaCol = avroSchemaCols.next();
458 if (!colDef.getName().equalsIgnoreCase(avroSchemaCol.getName())) {
461 "\nDefinition of column '%s' of type '%s' does not match " +
462 "the Avro-schema column '%s' of type '%s' at position '%s'.\n" +
464 table.getName(), colDef.getName(), colDef.getType(),
465 avroSchemaCol.getName(), avroSchemaCol.getType(), pos));
475 if (colDefs.hasNext() && !avroSchemaCols.hasNext()) {
476 FieldSchema colDef = colDefs.next();
479 "\nMissing Avro-schema column corresponding to column " +
480 "definition '%s' of type '%s' at position '%s'.\n" +
482 table.getName(), colDef.getName(), colDef.getType(), pos));
484 if (!colDefs.hasNext() && avroSchemaCols.hasNext()) {
485 Column avroSchemaCol = avroSchemaCols.next();
488 "\nMissing column definition corresponding to Avro-schema " +
489 "column '%s' of type '%s' at position '%s'.\n" +
491 table.getName(), avroSchemaCol.getName(), avroSchemaCol.
getType(), pos));
503 return "COMPUTE STATS " + tableName_.toSql();
505 return "COMPUTE INCREMENTAL STATS " + tableName_.toSql() +
511 TComputeStatsParams params =
new TComputeStatsParams();
517 params.setCol_stats_queryIsSet(
false);
525 params.setNum_partition_cols(((
HdfsTable)
table_).getNumClusteringCols());
void checkIncompleteAvroSchema(HdfsTable table)
final List< TPartitionStats > validPartStats_
String tableStatsQueryStr_
String columnStatsQueryStr_
static final int MAX_INCREMENTAL_PARTITIONS
TComputeStatsParams toThrift()
List< HdfsPartition > getPartitions()
List< PartitionKeyValue > getPartitionSpecKeyValues()
PartitionSpec partitionSpec_
final List< List< String > > expectedPartitions_
List< Column > getNonClusteringColumns()
static String AVRO_SCHEMA_MSG_PREFIX
void addPartitionCols(HdfsTable table, List< String > selectList, List< String > groupByCols)
List< String > getBaseColumnStatsQuerySelectList(Analyzer analyzer)
String getColStatsQuery()
final TableName tableName_
void analyze(Analyzer analyzer)
ComputeStatsStmt(TableName tableName)
String getTblStatsQuery()
boolean expectAllPartitions_
ComputeStatsStmt(TableName tableName, boolean isIncremental, PartitionSpec partSpec)
static final boolean COUNT_NULLS
int getNumClusteringCols()
static String AVRO_SCHEMA_MSG_SUFFIX