Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
hash-table.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_HASH_TABLE_H
17 #define IMPALA_EXEC_HASH_TABLE_H
18 
19 #include <vector>
20 #include <boost/cstdint.hpp>
21 #include <boost/scoped_ptr.hpp>
22 #include "codegen/impala-ir.h"
23 #include "common/logging.h"
27 #include "runtime/mem-tracker.h"
28 #include "runtime/tuple-row.h"
29 #include "util/bitmap.h"
30 #include "util/hash-util.h"
31 
32 namespace llvm {
33  class Function;
34 }
35 
36 namespace impala {
37 
38 class Expr;
39 class ExprContext;
40 class LlvmCodeGen;
41 class MemTracker;
42 class RowDescriptor;
43 class RuntimeState;
44 class Tuple;
45 class TupleRow;
46 class HashTable;
47 
53 //
58 //
63 //
79 //
86 //
101 
105  public:
115  HashTableCtx(const std::vector<ExprContext*>& build_expr_ctxs,
116  const std::vector<ExprContext*>& probe_expr_ctxs, bool stores_nulls,
117  bool finds_nulls, int32_t initial_seed, int max_levels,
118  int num_build_tuples);
119 
121  void Close();
122 
123  void set_level(int level);
124  int level() const { return level_; }
125  uint32_t seed(int level) { return seeds_.at(level); }
126 
127  TupleRow* row() const { return row_; }
128 
134  void* last_expr_value(int expr_idx) const {
136  }
137 
139  bool last_expr_value_null(int expr_idx) const {
140  return expr_value_null_bits_[expr_idx];
141  }
142 
150 
152 
156  llvm::Function* CodegenEvalRow(RuntimeState* state, bool build_row);
157 
160  llvm::Function* CodegenEquals(RuntimeState* state);
161 
167  llvm::Function* CodegenHashCurrentRow(RuntimeState* state, bool use_murmur);
168 
169  static const char* LLVM_CLASS_NAME;
170 
171  private:
172  friend class HashTable;
173 
178  DCHECK_LT(level_, seeds_.size());
179  if (var_result_begin_ == -1) {
186  } else {
188  }
189  }
190 
192  uint32_t inline Hash(const void* input, int len, int32_t hash) {
195  if (level_ == 0) return HashUtil::Hash(input, len, hash);
196  return HashUtil::MurmurHash2_64(input, len, hash);
197  }
198 
204  return EvalRow(row, build_expr_ctxs_);
205  }
206 
210  return EvalRow(row, probe_expr_ctxs_);
211  }
212 
215  uint32_t HashVariableLenRow();
216 
220  bool EvalRow(TupleRow* row, const std::vector<ExprContext*>& ctxs);
221 
225  bool IR_NO_INLINE Equals(TupleRow* build_row);
226 
227  const std::vector<ExprContext*>& build_expr_ctxs_;
228  const std::vector<ExprContext*>& probe_expr_ctxs_;
229 
234  const bool stores_nulls_;
235  const bool finds_nulls_;
236 
239  int level_;
240 
242  std::vector<uint32_t> seeds_;
243 
246  std::vector<int> expr_values_buffer_offsets_;
247 
252 
256 
260 
264 
267 
269  uint32_t GetHashSeed() const;
270 };
271 
281 class HashTable {
282  private:
283 
285  union HtData {
288  };
289 
291  struct DuplicateNode {
297  bool matched;
298 
299  DuplicateNode* next; // Chain to next duplicate node, NULL when end of list.
301  };
302 
303  struct Bucket {
305  bool filled;
306 
311  bool matched;
312 
316 
319  uint32_t hash;
320 
322  union {
325  } bucketData;
326  };
327 
328  public:
329  class Iterator;
330 
343  int num_build_tuples, BufferedTupleStream* tuple_stream,
344  int64_t max_num_buckets, int64_t initial_num_buckets = 1024);
345 
348  HashTable(MemPool* pool, bool quadratic_probing, int num_buckets);
349 
351  bool Init();
352 
354  void Close();
355 
364  bool IR_ALWAYS_INLINE Insert(HashTableCtx* ht_ctx,
365  const BufferedTupleStream::RowIdx& idx, TupleRow* row, uint32_t hash);
366 
369  bool IR_ALWAYS_INLINE Insert(HashTableCtx* ht_ctx, Tuple* tuple, uint32_t hash);
370 
377  Iterator IR_ALWAYS_INLINE Find(HashTableCtx* ht_ctx, uint32_t hash);
378 
380  int64_t size() const {
382  }
383 
385  int64_t EmptyBuckets() const { return num_buckets_ - num_filled_buckets_; }
386 
388  int64_t num_buckets() const { return num_buckets_; }
389 
391  double load_factor() const {
392  return static_cast<double>(num_filled_buckets_) / num_buckets_;
393  }
394 
398  static int64_t EstimateNumBuckets(int64_t num_rows) {
400  return BitUtil::NextPowerOfTwo(3 * num_rows / 2);
401  }
402  static int64_t EstimateSize(int64_t num_rows) {
403  int64_t num_buckets = EstimateNumBuckets(num_rows);
404  return num_buckets * sizeof(Bucket);
405  }
406 
409  int64_t CurrentMemSize() const;
410 
416  bool CheckAndResize(uint64_t buckets_to_fill, HashTableCtx* ht_ctx);
417 
419  int64_t byte_size() const { return total_data_page_size_; }
420 
423  Iterator Begin(HashTableCtx* ht_ctx);
424 
429 
431  bool HasMatches() const { return has_matches_; }
432 
434  Iterator End() { return Iterator(); }
435 
440  std::string DebugString(bool skip_empty, bool show_match,
441  const RowDescriptor* build_desc);
442 
444  void DebugStringTuple(std::stringstream& ss, HtData& htdata, const RowDescriptor* desc);
445 
447  std::string PrintStats() const;
448 
450  class Iterator {
451  private:
453  static const int64_t BUCKET_NOT_FOUND = -1;
454 
455  public:
456 
457  Iterator() : table_(NULL), row_(NULL), bucket_idx_(BUCKET_NOT_FOUND), node_(NULL) { }
458 
460  void IR_ALWAYS_INLINE Next();
461 
468 
471  void NextUnmatched();
472 
476  TupleRow* GetRow() const;
477  Tuple* GetTuple() const;
478 
482  void SetMatched();
483 
486  bool IsMatched() const;
487 
489  void SetAtEnd();
490 
492  bool AtEnd() const { return bucket_idx_ == BUCKET_NOT_FOUND; }
493 
494  private:
495  friend class HashTable;
496 
497  Iterator(HashTable* table, TupleRow* row, int bucket_idx, DuplicateNode* node,
498  uint32_t hash)
499  : table_(table),
500  row_(row),
501  bucket_idx_(bucket_idx),
502  node_(node) {
503  }
504 
507 
510  int64_t bucket_idx_;
511 
514  };
515 
516  private:
517  friend class Iterator;
518  friend class HashTableTest;
519 
534  //
536  int64_t IR_ALWAYS_INLINE Probe(Bucket* buckets, int64_t num_buckets,
537  HashTableCtx* ht_ctx, uint32_t hash, bool* found);
538 
542 
547  void NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node);
548 
550  bool ResizeBuckets(int64_t num_buckets, HashTableCtx* ht_ctx);
551 
555 
566 
569  void IR_ALWAYS_INLINE PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash);
570 
572  TupleRow* GetRow(HtData& htdata, TupleRow* row) const;
573 
576  TupleRow* GetRow(Bucket* bucket, TupleRow* row) const;
577 
579  bool GrowNodeArray();
580 
583  static const double MAX_FILL_FACTOR;
584 
586 
589 
594 
597 
602  const bool stores_tuples_;
603 
605  const bool quadratic_probing_;
606 
608  std::vector<BufferedBlockMgr::Block*> data_pages_;
609 
612 
615 
618 
621 
622  const int64_t max_num_buckets_;
623 
627 
629  int64_t num_buckets_;
630 
633 
637 
641 
646 
650  int64_t num_probes_;
651 
654 
657  int64_t travel_length_;
658 
662 
664  int64_t num_resizes_;
665 };
666 
667 }
668 
669 #endif
void set_level(int level)
uint32_t Hash(const void *input, int len, int32_t hash)
Wrapper function for calling correct HashUtil function in non-codegen'd case.
Definition: hash-table.h:192
stl-like iterator interface.
Definition: hash-table.h:450
#define IR_NO_INLINE
Definition: impala-ir.h:30
The underlying memory management is done by the BufferedBlockMgr.
RuntimeState * state_
Definition: hash-table.h:585
bool AtEnd() const
Returns true if this iterator is at the end, i.e. GetRow() cannot be called.
Definition: hash-table.h:492
std::string PrintStats() const
Update and print some statistics that can be used for performance debugging.
Definition: hash-table.cc:424
Iterator End()
Return end marker.
Definition: hash-table.h:434
bool IR_NO_INLINE Equals(TupleRow *build_row)
Definition: hash-table.cc:171
bool GrowNodeArray()
Grow the node array. Returns false on OOM.
Definition: hash-table.cc:345
int results_buffer_size() const
Definition: hash-table.h:151
static int64_t EstimateNumBuckets(int64_t num_rows)
Definition: hash-table.h:398
int64_t travel_length_
Definition: hash-table.h:657
void DebugStringTuple(std::stringstream &ss, HtData &htdata, const RowDescriptor *desc)
Print the content of a bucket or node.
Definition: hash-table.cc:373
bool EvalRow(TupleRow *row, const std::vector< ExprContext * > &ctxs)
Definition: hash-table.cc:124
void SetAtEnd()
Resets everything but the pointer to the hash table.
int64_t byte_size() const
Returns the number of bytes allocated to the hash table.
Definition: hash-table.h:419
llvm::Function * CodegenHashCurrentRow(RuntimeState *state, bool use_murmur)
Definition: hash-table.cc:647
uint8_t * expr_values_buffer_
Definition: hash-table.h:259
A tuple with 0 materialised slots is represented as NULL.
Definition: tuple.h:48
const StringSearch UrlParser::hash_search & hash
Definition: url-parser.cc:41
BufferedBlockMgr::Client * block_mgr_client_
Client to allocate data pages with.
Definition: hash-table.h:588
static const double MAX_FILL_FACTOR
Definition: hash-table.h:583
DuplicateNode *IR_ALWAYS_INLINE AppendNextNode(Bucket *bucket)
const std::vector< ExprContext * > & build_expr_ctxs_
Definition: hash-table.h:227
HtData *IR_ALWAYS_INLINE InsertInternal(HashTableCtx *ht_ctx, uint32_t hash)
DuplicateNode * duplicates
Definition: hash-table.h:324
static const int64_t BUCKET_NOT_FOUND
Bucket index value when probe is not successful.
Definition: hash-table.h:453
bool Init()
Allocates the initial bucket structure. Returns false if OOM.
Definition: hash-table.cc:245
void * last_expr_value(int expr_idx) const
Definition: hash-table.h:134
Iterator Begin(HashTableCtx *ht_ctx)
static const char * LLVM_CLASS_NAME
Definition: hash-table.h:169
union impala::HashTable::Bucket::@6 bucketData
Either the data for this bucket or the linked list of duplicates.
bool CheckAndResize(uint64_t buckets_to_fill, HashTableCtx *ht_ctx)
Definition: hash-table.cc:282
int64_t EmptyBuckets() const
Returns the number of empty buckets.
Definition: hash-table.h:385
std::vector< BufferedBlockMgr::Block * > data_pages_
Data pages for all nodes. These are always pinned.
Definition: hash-table.h:608
Either the row in the tuple stream or a pointer to the single tuple of this row.
Definition: hash-table.h:285
bool HasMatches() const
Return true if there was a least one match.
Definition: hash-table.h:431
const bool quadratic_probing_
Quadratic probing enabled (as opposed to linear).
Definition: hash-table.h:605
const std::vector< ExprContext * > & probe_expr_ctxs_
Definition: hash-table.h:228
void IR_ALWAYS_INLINE NextDuplicate()
int64_t num_duplicate_nodes_
Number of duplicate nodes.
Definition: hash-table.h:620
Linked list of entries used for duplicates.
Definition: hash-table.h:291
#define IR_ALWAYS_INLINE
Definition: impala-ir.h:31
int64_t num_filled_buckets_
Number of non-empty buckets. Used to determine when to resize.
Definition: hash-table.h:632
const bool stores_nulls_
Definition: hash-table.h:234
bool filled
Whether this bucket contains a vaild entry, or it is empty.
Definition: hash-table.h:305
uint32_t GetHashSeed() const
Cross-compiled functions to access member variables used in CodegenHashCurrentRow().
std::vector< int > expr_values_buffer_offsets_
Definition: hash-table.h:246
const bool stores_tuples_
Definition: hash-table.h:602
static uint32_t Hash(const void *data, int32_t bytes, uint32_t seed)
Definition: hash-util.h:135
int level() const
Definition: hash-table.h:124
ObjectPool pool
llvm::Function * CodegenEquals(RuntimeState *state)
Definition: hash-table.cc:820
BufferedTupleStream::RowIdx idx
Definition: hash-table.h:286
void IR_ALWAYS_INLINE Next()
Iterates to the next element. It should be called only if !AtEnd().
int64_t num_resizes_
How many times this table has resized so far.
Definition: hash-table.h:664
llvm::Function * CodegenEvalRow(RuntimeState *state, bool build_row)
Definition: hash-table.cc:519
bool ResizeBuckets(int64_t num_buckets, HashTableCtx *ht_ctx)
Resize the hash table to 'num_buckets'. Returns false on OOM.
Definition: hash-table.cc:293
int64_t CurrentMemSize() const
Definition: hash-table.cc:278
uint32_t seed(int level)
Definition: hash-table.h:125
TupleRow * GetRow(HtData &htdata, TupleRow *row) const
Return the TupleRow pointed by 'htdata'.
int64_t num_probes_
Definition: hash-table.h:650
int64_t num_buckets_with_duplicates_
Definition: hash-table.h:636
DuplicateNode *IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx)
int64_t num_hash_collisions_
Definition: hash-table.h:661
void IR_ALWAYS_INLINE PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash)
int64_t num_buckets_
Total number of buckets (filled and empty).
Definition: hash-table.h:629
HashTableCtx(const std::vector< ExprContext * > &build_expr_ctxs, const std::vector< ExprContext * > &probe_expr_ctxs, bool stores_nulls, bool finds_nulls, int32_t initial_seed, int max_levels, int num_build_tuples)
Definition: hash-table.cc:83
int node_remaining_current_page_
Number of nodes left in the current page.
Definition: hash-table.h:617
uint8_t * expr_value_null_bits_
Definition: hash-table.h:263
MemPool * data_page_pool_
Only used for tests to allocate data pages instead of the block mgr.
Definition: hash-table.h:596
uint32_t IR_NO_INLINE HashCurrentRow()
Definition: hash-table.h:177
Iterator FirstUnmatched(HashTableCtx *ctx)
std::vector< uint32_t > seeds_
The seeds to use for hashing. Indexed by the level.
Definition: hash-table.h:242
friend class Iterator
Definition: hash-table.h:517
TupleRow * row_
Scratch buffer to generate rows on the fly.
Definition: hash-table.h:266
static int64_t NextPowerOfTwo(int64_t v)
Definition: bit-util.h:50
int64_t num_buckets() const
Returns the number of buckets.
Definition: hash-table.h:388
DuplicateNode * next_node_
Next duplicate node to insert. Vaild when node_remaining_current_page_ > 0.
Definition: hash-table.h:614
bool IR_NO_INLINE EvalProbeRow(TupleRow *row)
Definition: hash-table.h:209
int64_t total_data_page_size_
Byte size of all buffers in data_pages_.
Definition: hash-table.h:611
Iterator IR_ALWAYS_INLINE Find(HashTableCtx *ht_ctx, uint32_t hash)
const bool finds_nulls_
Definition: hash-table.h:235
HashTable(RuntimeState *state, BufferedBlockMgr::Client *client, int num_build_tuples, BufferedTupleStream *tuple_stream, int64_t max_num_buckets, int64_t initial_num_buckets=1024)
Definition: hash-table.cc:192
uint32_t HashVariableLenRow()
Definition: hash-table.cc:146
void NextFilledBucket(int64_t *bucket_idx, DuplicateNode **node)
bool last_expr_value_null(int expr_idx) const
Returns if the expr at 'expr_idx' evaluated to NULL for the last row.
Definition: hash-table.h:139
static int64_t EstimateSize(int64_t num_rows)
Definition: hash-table.h:402
int64_t IR_ALWAYS_INLINE Probe(Bucket *buckets, int64_t num_buckets, HashTableCtx *ht_ctx, uint32_t hash, bool *found)
There are wrappers of this function that perform the Find and Insert logic.
double load_factor() const
Returns the load factor (the number of non-empty buckets)
Definition: hash-table.h:391
void Close()
Call to cleanup any resources.
Definition: hash-table.cc:112
BufferedTupleStream * tuple_stream_
Definition: hash-table.h:593
void Close()
Call to cleanup any resources. Must be called once.
Definition: hash-table.cc:257
const int64_t max_num_buckets_
Definition: hash-table.h:622
bool IR_ALWAYS_INLINE EvalAndHashBuild(TupleRow *row, uint32_t *hash)
DuplicateNode * node_
Pointer to the current duplicate node.
Definition: hash-table.h:513
Iterator(HashTable *table, TupleRow *row, int bucket_idx, DuplicateNode *node, uint32_t hash)
Definition: hash-table.h:497
bool IR_NO_INLINE EvalBuildRow(TupleRow *row)
Definition: hash-table.h:203
bool IR_ALWAYS_INLINE Insert(HashTableCtx *ht_ctx, const BufferedTupleStream::RowIdx &idx, TupleRow *row, uint32_t hash)
int64_t size() const
Returns number of elements inserted in the hash table.
Definition: hash-table.h:380
static uint64_t MurmurHash2_64(const void *input, int len, uint64_t seed)
Murmur2 hash implementation returning 64-bit hashes.
Definition: hash-util.h:64
std::string DebugString(bool skip_empty, bool show_match, const RowDescriptor *build_desc)
Definition: hash-table.cc:387
TupleRow * row() const
Definition: hash-table.h:127
bool IR_ALWAYS_INLINE EvalAndHashProbe(TupleRow *row, uint32_t *hash)
int64_t num_failed_probes_
Number of probes that failed and had to fall back to linear probing without cap.
Definition: hash-table.h:653
Bucket * buckets_
Definition: hash-table.h:626