Impala
Impala is the open source, native analytic database for Apache Hadoop.
HBaseTable.java
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 package com.cloudera.impala.catalog;
16 
17 import java.io.IOException;
18 import java.util.ArrayList;
19 import java.util.Collections;
20 import java.util.List;
21 import java.util.Map;
22 import java.util.Set;
23 
24 import org.apache.hadoop.conf.Configuration;
25 import org.apache.hadoop.fs.FileSystem;
26 import org.apache.hadoop.fs.Path;
27 import org.apache.hadoop.hbase.Cell;
28 import org.apache.hadoop.hbase.HBaseConfiguration;
29 import org.apache.hadoop.hbase.HColumnDescriptor;
30 import org.apache.hadoop.hbase.HConstants;
31 import org.apache.hadoop.hbase.HRegionInfo;
32 import org.apache.hadoop.hbase.HRegionLocation;
33 import org.apache.hadoop.hbase.HTableDescriptor;
34 import org.apache.hadoop.hbase.KeyValue;
35 import org.apache.hadoop.hbase.client.HTable;
36 import org.apache.hadoop.hbase.client.Result;
37 import org.apache.hadoop.hbase.client.ResultScanner;
38 import org.apache.hadoop.hbase.client.Scan;
39 import org.apache.hadoop.hbase.io.compress.Compression;
40 import org.apache.hadoop.hbase.util.Bytes;
41 import org.apache.hadoop.hive.hbase.HBaseSerDe;
42 import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
43 import org.apache.hadoop.hive.metastore.api.FieldSchema;
44 import org.apache.hadoop.hive.metastore.api.MetaException;
45 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
46 import org.apache.hadoop.hive.serde2.SerDeException;
47 import org.apache.log4j.Logger;
48 
49 import com.cloudera.impala.common.Pair;
50 import com.cloudera.impala.thrift.TCatalogObjectType;
51 import com.cloudera.impala.thrift.TColumn;
52 import com.cloudera.impala.thrift.THBaseTable;
53 import com.cloudera.impala.thrift.TResultSet;
54 import com.cloudera.impala.thrift.TResultSetMetadata;
55 import com.cloudera.impala.thrift.TTable;
56 import com.cloudera.impala.thrift.TTableDescriptor;
57 import com.cloudera.impala.thrift.TTableType;
58 import com.cloudera.impala.util.StatsHelper;
59 import com.cloudera.impala.util.TResultRowBuilder;
60 import com.google.common.base.Preconditions;
61 import com.google.common.collect.Lists;
62 
76 public class HBaseTable extends Table {
77  // Maximum deviation from the average to stop querying more regions
78  // to estimate the row count
79  private static final double DELTA_FROM_AVERAGE = 0.15;
80 
81  private static final Logger LOG = Logger.getLogger(HBaseTable.class);
82 
83  // Copied from Hive's HBaseStorageHandler.java.
84  public static final String DEFAULT_PREFIX = "default.";
85 
86  // Number of rows fetched during the row count estimation per region
87  public static final int ROW_COUNT_ESTIMATE_BATCH_SIZE = 10;
88 
89  // Minimum number of regions that are checked to estimate the row count
90  private static final int MIN_NUM_REGIONS_TO_CHECK = 5;
91 
92  // Column referring to HBase row key.
93  // Hive (including metastore) currently doesn't support composite HBase keys.
94  protected HBaseColumn rowKey_;
95 
96  // Name of table in HBase.
97  // 'this.name' is the alias of the HBase table in Hive.
98  protected String hbaseTableName_;
99 
100  // Input format class for HBase tables read by Hive.
101  private static final String HBASE_INPUT_FORMAT =
102  "org.apache.hadoop.hive.hbase.HiveHBaseTableInputFormat";
103 
104  // Serialization class for HBase tables set in the corresponding Metastore table.
105  private static final String HBASE_SERIALIZATION_LIB =
106  "org.apache.hadoop.hive.hbase.HBaseSerDe";
107 
108  // Storage handler class for HBase tables read by Hive.
109  private static final String HBASE_STORAGE_HANDLER =
110  "org.apache.hadoop.hive.hbase.HBaseStorageHandler";
111 
112  // Column family of HBase row key
113  private static final String ROW_KEY_COLUMN_FAMILY = ":key";
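  // (Added note: this is the same string as HBaseSerDe.HBASE_KEY_COL, the ":key"
  // marker Hive uses in 'hbase.columns.mapping'; parseColumnMapping() below relies
  // on the two values being identical when it singles out the row key column.)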
114 
115  // Keep the conf around
116  private final static Configuration hbaseConf_ = HBaseConfiguration.create();
117 
118  private HTable hTable_ = null;
119  // Cached column families. Used primarily for speeding up row stats estimation
120  // (see CDH-19292).
121  private HColumnDescriptor[] columnFamilies_ = null;
122 
123  protected HBaseTable(TableId id, org.apache.hadoop.hive.metastore.api.Table msTbl,
124  Db db, String name, String owner) {
125  super(id, msTbl, db, name, owner);
126  }
127 
128  // Parse the column description string into the column families and column
129  // qualifiers. This is a copy of HBaseSerDe.parseColumnMapping and
130  // parseColumnStorageTypes with the parts we don't use removed. The Hive functions
131  // are not public.
132  // tableDefaultStorageIsBinary - true if the table defaults to binary encoding
133  // columnsMappingSpec - input string describing the table's column mapping
134  // fieldSchemas - input field schemas from the metastore table
135  // columnFamilies/columnQualifiers/colIsBinaryEncoded - out parameters that will be
136  // filled with the column family, column qualifier and encoding for each column.
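  // Illustrative example (not from the original source): with
  // "hbase.columns.mapping" = ":key,cf:a#b,cf:b" and three Hive columns, the
  // first column maps to the row key, the second to family 'cf' / qualifier 'a'
  // with binary encoding requested via "#b", and the third to 'cf:b' using the
  // table-level default encoding.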
137  private void parseColumnMapping(boolean tableDefaultStorageIsBinary,
138  String columnsMappingSpec, List<FieldSchema> fieldSchemas,
139  List<String> columnFamilies, List<String> columnQualifiers,
140  List<Boolean> colIsBinaryEncoded) throws SerDeException {
141  if (columnsMappingSpec == null) {
142  throw new SerDeException(
143  "Error: hbase.columns.mapping missing for this HBase table.");
144  }
145 
146  if (columnsMappingSpec.equals("") ||
147  columnsMappingSpec.equals(HBaseSerDe.HBASE_KEY_COL)) {
148  throw new SerDeException("Error: hbase.columns.mapping specifies only "
149  + "the HBase table row key. A valid Hive-HBase table must specify at "
150  + "least one additional column.");
151  }
152 
153  int rowKeyIndex = -1;
154  String[] columnSpecs = columnsMappingSpec.split(",");
155  // If there was an implicit key column mapping, the number of columns (fieldSchemas)
156  // will be one more than the number of column mapping specs.
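  // For example, three Hive columns with a mapping of just "cf:a,cf:b" leave the
  // key column implicit, so fsStartIdxOffset below evaluates to 1.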
157  int fsStartIdxOffset = fieldSchemas.size() - columnSpecs.length;
158  if (fsStartIdxOffset != 0 && fsStartIdxOffset != 1) {
159  // This should never happen - Hive blocks creating a mismatched table and both Hive
160  // and Impala currently block all column-level DDL on HBase tables.
161  throw new SerDeException(String.format("Number of entries in " +
162  "'hbase.columns.mapping' does not match the number of columns in the " +
163  "table: %d != %d (counting the key if implicit)",
164  columnSpecs.length, fieldSchemas.size()));
165  }
166 
167  for (int i = 0; i < columnSpecs.length; ++i) {
168  String mappingSpec = columnSpecs[i];
169  String[] mapInfo = mappingSpec.split("#");
170  // Trim column info so that serdeproperties with new lines still parse correctly.
171  String colInfo = mapInfo[0].trim();
172 
173  int idxFirst = colInfo.indexOf(":");
174  int idxLast = colInfo.lastIndexOf(":");
175 
176  if (idxFirst < 0 || !(idxFirst == idxLast)) {
177  throw new SerDeException("Error: the HBase columns mapping contains a "
178  + "badly formed column family, column qualifier specification.");
179  }
180 
181  if (colInfo.equals(HBaseSerDe.HBASE_KEY_COL)) {
182  Preconditions.checkState(fsStartIdxOffset == 0);
183  rowKeyIndex = i;
184  columnFamilies.add(colInfo);
185  columnQualifiers.add(null);
186  } else {
187  String[] parts = colInfo.split(":");
188  Preconditions.checkState(parts.length > 0 && parts.length <= 2);
189  columnFamilies.add(parts[0]);
190  if (parts.length == 2) {
191  columnQualifiers.add(parts[1]);
192  } else {
193  columnQualifiers.add(null);
194  }
195  }
196 
197  // Set column binary encoding
198  FieldSchema fieldSchema = fieldSchemas.get(i + fsStartIdxOffset);
199  boolean supportsBinaryEncoding = supportsBinaryEncoding(fieldSchema);
200  if (mapInfo.length == 1) {
201  // There is no column level storage specification. Use the table storage spec.
202  colIsBinaryEncoded.add(
203  new Boolean(tableDefaultStorageIsBinary && supportsBinaryEncoding));
204  } else if (mapInfo.length == 2) {
205  // There is a storage specification for the column
206  String storageOption = mapInfo[1];
207 
208  if (!(storageOption.equals("-") || "string".startsWith(storageOption) || "binary"
209  .startsWith(storageOption))) {
210  throw new SerDeException("Error: A column storage specification is one of"
211  + " the following: '-', a prefix of 'string', or a prefix of 'binary'. "
212  + storageOption + " is not a valid storage option specification for "
213  + fieldSchema.getName());
214  }
215 
216  boolean isBinaryEncoded = false;
217  if ("-".equals(storageOption)) {
218  isBinaryEncoded = tableDefaultStorageIsBinary;
219  } else if ("binary".startsWith(storageOption)) {
220  isBinaryEncoded = true;
221  }
222  if (isBinaryEncoded && !supportsBinaryEncoding) {
223  // Use string encoding and log a warning if the column spec is binary but the
224  // column type does not support it.
225  // TODO: Hive/HBase does not raise an exception, but should we?
226  LOG.warn("Column storage specification for column " + fieldSchema.getName()
227  + " is binary" + " but the column type " + fieldSchema.getType() +
228  " does not support binary encoding. Fallback to string format.");
229  isBinaryEncoded = false;
230  }
231  colIsBinaryEncoded.add(isBinaryEncoded);
232  } else {
233  // error in storage specification
234  throw new SerDeException("Error: " + HBaseSerDe.HBASE_COLUMNS_MAPPING
235  + " storage specification " + mappingSpec + " is not valid for column: "
236  + fieldSchema.getName());
237  }
238  }
239 
240  if (rowKeyIndex == -1) {
241  columnFamilies.add(0, HBaseSerDe.HBASE_KEY_COL);
242  columnQualifiers.add(0, null);
243  colIsBinaryEncoded.add(0,
244  supportsBinaryEncoding(fieldSchemas.get(0)) && tableDefaultStorageIsBinary);
245  }
246  }
247 
248  private boolean supportsBinaryEncoding(FieldSchema fs) {
249  try {
250  Type colType = parseColumnType(fs);
251  // Only boolean, integer and floating point types can use binary storage.
252  return colType.isBoolean() || colType.isIntegerType()
253  || colType.isFloatingPointType();
254  } catch (TableLoadingException e) {
255  return false;
256  }
257  }
258 
259  @Override
266  public void load(Table oldValue, HiveMetaStoreClient client,
267  org.apache.hadoop.hive.metastore.api.Table msTbl) throws TableLoadingException {
268  Preconditions.checkNotNull(getMetaStoreTable());
269  try {
270  hbaseTableName_ = getHBaseTableName(getMetaStoreTable());
271  hTable_ = new HTable(hbaseConf_, hbaseTableName_);
272  columnFamilies_ = null;
273  Map<String, String> serdeParams =
274  getMetaStoreTable().getSd().getSerdeInfo().getParameters();
275  String hbaseColumnsMapping = serdeParams.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
276  if (hbaseColumnsMapping == null) {
277  throw new MetaException("No hbase.columns.mapping defined in Serde.");
278  }
279 
280  String hbaseTableDefaultStorageType = getMetaStoreTable().getParameters().get(
281  HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE);
282  boolean tableDefaultStorageIsBinary = false;
283  if (hbaseTableDefaultStorageType != null &&
284  !hbaseTableDefaultStorageType.isEmpty()) {
285  if (hbaseTableDefaultStorageType.equalsIgnoreCase("binary")) {
286  tableDefaultStorageIsBinary = true;
287  } else if (!hbaseTableDefaultStorageType.equalsIgnoreCase("string")) {
288  throw new SerDeException("Error: " +
289  HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE +
290  " parameter must be specified as" +
291  " 'string' or 'binary'; '" + hbaseTableDefaultStorageType +
292  "' is not a valid specification for this table/serde property.");
293  }
294  }
295 
296  // Parse HBase column-mapping string.
297  List<FieldSchema> fieldSchemas = getMetaStoreTable().getSd().getCols();
298  List<String> hbaseColumnFamilies = new ArrayList<String>();
299  List<String> hbaseColumnQualifiers = new ArrayList<String>();
300  List<Boolean> hbaseColumnBinaryEncodings = new ArrayList<Boolean>();
301  parseColumnMapping(tableDefaultStorageIsBinary, hbaseColumnsMapping, fieldSchemas,
302  hbaseColumnFamilies, hbaseColumnQualifiers, hbaseColumnBinaryEncodings);
303  Preconditions.checkState(
304  hbaseColumnFamilies.size() == hbaseColumnQualifiers.size());
305  Preconditions.checkState(fieldSchemas.size() == hbaseColumnFamilies.size());
306 
307  // Populate tmp cols in the order they appear in the Hive metastore.
308  // We will reorder the cols below.
309  List<HBaseColumn> tmpCols = Lists.newArrayList();
310  // Store the key column separately.
311  // TODO: Change this to an ArrayList once we support composite row keys.
312  HBaseColumn keyCol = null;
313  for (int i = 0; i < fieldSchemas.size(); ++i) {
314  FieldSchema s = fieldSchemas.get(i);
315  Type t = Type.INVALID;
316  try {
317  t = parseColumnType(s);
318  } catch (TableLoadingException e) {
319  // Ignore hbase types we don't support yet. We can load the metadata
320  // but won't be able to select from it.
321  }
322  HBaseColumn col = new HBaseColumn(s.getName(), hbaseColumnFamilies.get(i),
323  hbaseColumnQualifiers.get(i), hbaseColumnBinaryEncodings.get(i),
324  t, s.getComment(), -1);
325  if (col.getColumnFamily().equals(ROW_KEY_COLUMN_FAMILY)) {
326  // Store the row key column separately from the rest
327  keyCol = col;
328  } else {
329  tmpCols.add(col);
330  }
331  }
332  Preconditions.checkState(keyCol != null);
333 
334  // The backend assumes that the row key column is always first and
335  // that the remaining HBase columns are ordered by columnFamily,columnQualifier,
336  // so the final position depends on the other mapped HBase columns.
337  // Sort columns and update positions.
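  // For example, columns mapped to cf2:x, cf1:b and cf1:a end up at positions
  // 1..3 in the order cf1:a, cf1:b, cf2:x, after the row key at position 0.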
338  Collections.sort(tmpCols);
339  clearColumns();
340 
341  keyCol.setPosition(0);
342  addColumn(keyCol);
343  // Update the positions of the remaining columns
344  for (int i = 0; i < tmpCols.size(); ++i) {
345  HBaseColumn col = tmpCols.get(i);
346  col.setPosition(i + 1);
347  addColumn(col);
348  }
349 
350  // Set table stats.
351  numRows_ = getRowCount(super.getMetaStoreTable().getParameters());
352 
353  // since we don't support composite hbase rowkeys yet, all hbase tables have a
354  // single clustering col
355  numClusteringCols_ = 1;
356  loadAllColumnStats(client);
357  } catch (Exception e) {
358  throw new TableLoadingException("Failed to load metadata for HBase table: " +
359  name_, e);
360  }
361  }
362 
363  @Override
364  protected void loadFromThrift(TTable table) throws TableLoadingException {
365  super.loadFromThrift(table);
366  try {
367  hbaseTableName_ = getHBaseTableName(getMetaStoreTable());
368  hTable_ = new HTable(hbaseConf_, hbaseTableName_);
369  columnFamilies_ = null;
370  } catch (Exception e) {
371  throw new TableLoadingException("Failed to load metadata for HBase table from " +
372  "thrift table: " + name_, e);
373  }
374  }
375 
376  // This method is completely copied from Hive's HBaseStorageHandler.java.
377  private String getHBaseTableName(org.apache.hadoop.hive.metastore.api.Table tbl) {
378  // Give preference to TBLPROPERTIES over SERDEPROPERTIES
379  // (really we should only use TBLPROPERTIES, so this is just
380  // for backwards compatibility with the original specs).
381  String tableName = tbl.getParameters().get(HBaseSerDe.HBASE_TABLE_NAME);
382  if (tableName == null) {
383  tableName = tbl.getSd().getSerdeInfo().getParameters().get(
384  HBaseSerDe.HBASE_TABLE_NAME);
385  }
386  if (tableName == null) {
387  tableName = tbl.getDbName() + "." + tbl.getTableName();
388  if (tableName.startsWith(DEFAULT_PREFIX)) {
389  tableName = tableName.substring(DEFAULT_PREFIX.length());
390  }
391  }
392  return tableName;
393  }
394 
399  private Pair<Long, Long> getEstimatedRowStatsForRegion(HRegionLocation location,
400  boolean isCompressed) throws IOException {
401  HRegionInfo info = location.getRegionInfo();
402 
403  Scan s = new Scan(info.getStartKey());
404  // Get a small sample of rows
405  s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
406  // Try and get every version so the row's size can be used to estimate.
407  s.setMaxVersions(Short.MAX_VALUE);
408  // Don't cache the blocks as we don't think these are
409  // necessarily important blocks.
410  s.setCacheBlocks(false);
411  // Try and get deletes too so their size can be counted.
412  s.setRaw(false);
413  ResultScanner rs = hTable_.getScanner(s);
414 
415  long currentRowSize = 0;
416  long currentRowCount = 0;
417  try {
418  // Get the ROW_COUNT_ESTIMATE_BATCH_SIZE fetched rows
419  // for a representative sample
420  for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; ++i) {
421  Result r = rs.next();
422  if (r == null)
423  break;
424  // Check for empty rows, see IMPALA-1451
425  if (r.isEmpty())
426  continue;
427  ++currentRowCount;
428  // To estimate the number of rows we simply use the amount of bytes
429  // returned from the underlying buffer. Since HBase internally works
430  // with these structures as well this gives us ok estimates.
431  Cell[] cells = r.rawCells();
432  for (Cell c : cells) {
433  if (c instanceof KeyValue) {
434  currentRowSize += KeyValue.getKeyValueDataStructureSize(c.getRowLength(),
435  c.getFamilyLength(), c.getQualifierLength(), c.getValueLength(),
436  c.getTagsLength());
437  } else {
438  throw new IllegalStateException("Celltype " + c.getClass().getName() +
439  " not supported.");
440  }
441  }
442  }
443  } finally {
444  rs.close();
445  }
446 
447  // If there are no rows then no need to estimate.
448  if (currentRowCount == 0)
449  return new Pair<Long, Long>(0L, 0L);
450  // Get the size on hdfs
451  long currentHdfsSize = getHdfsSize(info);
452  // estimate the number of rows.
453  double bytesPerRow = currentRowSize / (double) currentRowCount;
454  // Compression factor two is only a best effort guess
455  long estimatedRowCount =
456  (long) ((isCompressed ? 2 : 1) * (currentHdfsSize / bytesPerRow));
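  // Worked example: a 10-row sample totaling 2,000 bytes gives bytesPerRow = 200;
  // for a 1 GB region with compressed column families the estimate is
  // 2 * (1,073,741,824 / 200), roughly 10.7M rows.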
457  return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
458  }
459 
488  public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey,
489  byte[] endRowKey) {
490  Preconditions.checkNotNull(startRowKey);
491  Preconditions.checkNotNull(endRowKey);
492 
493  boolean isCompressed = false;
494  long rowCount = 0;
495  long rowSize = 0;
496 
497  try {
498  // Check to see if things are compressed.
499  // If they are we'll estimate a compression factor.
500  if (columnFamilies_ == null) {
501  columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
502  }
503  Preconditions.checkNotNull(columnFamilies_);
504  for (HColumnDescriptor desc : columnFamilies_) {
505  isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
506  }
507 
508  // Fetch all regions for the key range
509  List<HRegionLocation> locations =
510  getRegionsInRange(hTable_, startRowKey, endRowKey);
511  Collections.shuffle(locations);
512  // The following variables track the number and size of 'rows' in
513  // HBase and allow incremental calculation of the average and standard
514  // deviation.
515  StatsHelper<Long> statsCount = new StatsHelper<Long>();
516  StatsHelper<Long> statsSize = new StatsHelper<Long>();
517 
518  // Collects stats samples from at least MIN_NUM_REGIONS_TO_CHECK
519  // and at most all regions until the delta is small enough.
520  while ((statsSize.count() < MIN_NUM_REGIONS_TO_CHECK ||
521  statsSize.stddev() > statsSize.mean() * DELTA_FROM_AVERAGE) &&
522  statsSize.count() < locations.size()) {
523  Pair<Long, Long> tmp = getEstimatedRowStatsForRegion(
524  locations.get((int) statsCount.count()), isCompressed);
525  statsCount.addSample(tmp.first);
526  statsSize.addSample(tmp.second);
527  }
528 
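  // Project the sampled mean bytes-per-row across the table's total HDFS
  // footprint (getHdfsSize(null)) to arrive at the overall row count estimate.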
529  rowCount = (long) (getHdfsSize(null) / statsSize.mean());
530  rowSize = (long) statsSize.mean();
531  } catch (IOException ioe) {
532  // Print the stack trace, but we'll ignore it
533  // as this is just an estimate.
534  // TODO: Put this into the per query log.
535  LOG.error("Error computing HBase row count estimate", ioe);
536  return new Pair<Long, Long>(-1l, -1l);
537  }
538  return new Pair<Long, Long>(rowCount, rowSize);
539  }
540 
545  public long getHdfsSize(HRegionInfo info) throws IOException {
546  Path tableDir = HTableDescriptor.getTableDir(
547  getRootDir(hbaseConf_), Bytes.toBytes(hbaseTableName_));
548  FileSystem fs = tableDir.getFileSystem(hbaseConf_);
549  if (info != null) {
550  Path regionDir = tableDir.suffix("/" + info.getEncodedName());
551  return fs.getContentSummary(regionDir).getLength();
552  } else {
553  return fs.getContentSummary(tableDir).getLength();
554  }
555  }
556 
562  public static Path getRootDir(final Configuration c) throws IOException {
563  Path p = new Path(c.get(HConstants.HBASE_DIR));
564  FileSystem fs = p.getFileSystem(c);
565  return p.makeQualified(fs);
566  }
567 
571  @Override
572  public ArrayList<Column> getColumnsInHiveOrder() {
573  return getColumns();
574  }
575 
576  @Override
577  public TTableDescriptor toThriftDescriptor(Set<Long> referencedPartitions) {
578  TTableDescriptor tableDescriptor =
579  new TTableDescriptor(id_.asInt(), TTableType.HBASE_TABLE, getColumns().size(),
580  numClusteringCols_, hbaseTableName_, db_.getName());
581  tableDescriptor.setHbaseTable(getTHBaseTable());
582  tableDescriptor.setColNames(getColumnNames());
583  return tableDescriptor;
584  }
585 
586  public String getHBaseTableName() {
587  return hbaseTableName_;
588  }
589 
590  public HTable getHTable() {
591  return hTable_;
592  }
593 
594  public static Configuration getHBaseConf() {
595  return hbaseConf_;
596  }
597 
598  @Override
599  public int getNumNodes() {
600  // TODO: implement
601  return 100;
602  }
603 
604  @Override
605  public TCatalogObjectType getCatalogObjectType() {
606  return TCatalogObjectType.TABLE;
607  }
608 
609  @Override
610  public TTable toThrift() {
611  TTable table = super.toThrift();
612  table.setTable_type(TTableType.HBASE_TABLE);
613  table.setHbase_table(getTHBaseTable());
614  return table;
615  }
616 
617  private THBaseTable getTHBaseTable() {
618  THBaseTable tHbaseTable = new THBaseTable();
619  tHbaseTable.setTableName(hbaseTableName_);
620  for (Column c : getColumns()) {
621  HBaseColumn hbaseCol = (HBaseColumn) c;
622  tHbaseTable.addToFamilies(hbaseCol.getColumnFamily());
623  if (hbaseCol.getColumnQualifier() != null) {
624  tHbaseTable.addToQualifiers(hbaseCol.getColumnQualifier());
625  } else {
626  tHbaseTable.addToQualifiers("");
627  }
628  tHbaseTable.addToBinary_encoded(hbaseCol.isBinaryEncoded());
629  }
630  return tHbaseTable;
631  }
632 
650  public static List<HRegionLocation> getRegionsInRange(HTable hbaseTbl,
651  final byte[] startKey, final byte[] endKey) throws IOException {
652  final boolean endKeyIsEndOfTable = Bytes.equals(endKey, HConstants.EMPTY_END_ROW);
653  if ((Bytes.compareTo(startKey, endKey) > 0) && !endKeyIsEndOfTable) {
654  throw new IllegalArgumentException("Invalid range: " +
655  Bytes.toStringBinary(startKey) + " > " + Bytes.toStringBinary(endKey));
656  }
657  final List<HRegionLocation> regionList = new ArrayList<HRegionLocation>();
658  byte[] currentKey = startKey;
659  // Make sure only one thread is accessing the hbaseTbl.
660  synchronized (hbaseTbl) {
661  do {
662  // always reload region location info.
663  HRegionLocation regionLocation = hbaseTbl.getRegionLocation(currentKey, true);
664  regionList.add(regionLocation);
665  currentKey = regionLocation.getRegionInfo().getEndKey();
666  } while (!Bytes.equals(currentKey, HConstants.EMPTY_END_ROW) &&
667  (endKeyIsEndOfTable || Bytes.compareTo(currentKey, endKey) < 0));
668  }
669  return regionList;
670  }
671 
675  @Override
676  public String getStorageHandlerClassName() {
677  return HBASE_STORAGE_HANDLER;
678  }
679 
685  public TResultSet getTableStats() {
686  TResultSet result = new TResultSet();
687  TResultSetMetadata resultSchema = new TResultSetMetadata();
688  result.setSchema(resultSchema);
689  resultSchema.addToColumns(
690  new TColumn("Region Location", Type.STRING.toThrift()));
691  resultSchema.addToColumns(new TColumn("Start RowKey",
692  Type.STRING.toThrift()));
693  resultSchema.addToColumns(new TColumn("Est. #Rows", Type.BIGINT.toThrift()));
694  resultSchema.addToColumns(new TColumn("Size", Type.STRING.toThrift()));
695 
696  // TODO: Consider fancier stats maintenance techniques for speeding up this process.
697  // Currently, we list all regions and perform a mini-scan of each of them to
698  // estimate the number of rows, the data size, etc., which is rather expensive.
699  try {
700  long totalNumRows = 0;
701  long totalHdfsSize = 0;
702  List<HRegionLocation> regions = HBaseTable.getRegionsInRange(hTable_,
703  HConstants.EMPTY_END_ROW, HConstants.EMPTY_START_ROW);
704  for (HRegionLocation region : regions) {
705  TResultRowBuilder rowBuilder = new TResultRowBuilder();
706  HRegionInfo regionInfo = region.getRegionInfo();
707  Pair<Long, Long> estRowStats = getEstimatedRowStatsForRegion(region, false);
708 
709  long numRows = estRowStats.first.longValue();
710  long hdfsSize = getHdfsSize(regionInfo);
711  totalNumRows += numRows;
712  totalHdfsSize += hdfsSize;
713 
714  // Add the region location, start rowkey, number of rows and raw Hdfs size.
715  rowBuilder.add(String.valueOf(region.getHostname()))
716  .add(Bytes.toString(regionInfo.getStartKey())).add(numRows)
717  .addBytes(hdfsSize);
718  result.addToRows(rowBuilder.get());
719  }
720 
721  // Total num rows and raw Hdfs size.
722  if (regions.size() > 1) {
723  TResultRowBuilder rowBuilder = new TResultRowBuilder();
724  rowBuilder.add("Total").add("").add(totalNumRows).addBytes(totalHdfsSize);
725  result.addToRows(rowBuilder.get());
726  }
727  } catch (IOException e) {
728  throw new RuntimeException(e);
729  }
730  return result;
731  }
732 
739  public static boolean isHBaseTable(
740  org.apache.hadoop.hive.metastore.api.Table msTbl) {
741  if (msTbl.getParameters() != null &&
742  msTbl.getParameters().containsKey(HBASE_STORAGE_HANDLER)) {
743  return true;
744  }
745  StorageDescriptor sd = msTbl.getSd();
746  if (sd == null) return false;
747  if (sd.getInputFormat() != null && sd.getInputFormat().equals(HBASE_INPUT_FORMAT)) {
748  return true;
749  } else if (sd.getSerdeInfo() != null &&
750  sd.getSerdeInfo().getSerializationLib() != null &&
751  sd.getSerdeInfo().getSerializationLib().equals(HBASE_SERIALIZATION_LIB)) {
752  return true;
753  }
754  return false;
755  }
756 }
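A minimal usage sketch (not part of the original file): it assumes an HBaseTable instance, called 'table' below, that was already loaded through the catalog path shown above, and asks for the estimated row stats over the table's full key range.

// Hypothetical example; 'table' is assumed to be a fully loaded HBaseTable.
import org.apache.hadoop.hbase.HConstants;

import com.cloudera.impala.catalog.HBaseTable;
import com.cloudera.impala.common.Pair;

class HBaseRowEstimateExample {
  static void printEstimate(HBaseTable table) {
    // Empty start and end rows cover the entire table.
    Pair<Long, Long> stats = table.getEstimatedRowStats(
        HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
    // first = estimated row count, second = estimated bytes per row;
    // both come back as -1 if the estimate failed (see getEstimatedRowStats).
    System.out.println("~" + stats.first + " rows, ~" + stats.second
        + " bytes/row for " + table.getHBaseTableName());
  }
}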