Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
old-hash-table.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_OLD_HASH_TABLE_H
17 #define IMPALA_EXEC_OLD_HASH_TABLE_H
18 
19 #include <vector>
20 #include <boost/cstdint.hpp>
21 #include "codegen/impala-ir.h"
22 #include "common/logging.h"
23 #include "runtime/mem-pool.h"
24 #include "util/bitmap.h"
25 #include "util/hash-util.h"
26 #include "util/runtime-profile.h"
27 
28 namespace llvm {
29  class Function;
30 }
31 
32 namespace impala {
33 
34 class Expr;
35 class ExprContext;
36 class LlvmCodeGen;
37 class MemTracker;
38 class RowDescriptor;
39 class RuntimeState;
40 class Tuple;
41 class TupleRow;
42 
46 
55 //
60 //
62 //
76 //
// NOTE(review): this is a scraped Doxygen source listing, not the original
// header.  Jumps in the embedded line numbers (e.g. 90 -> 104) mark where the
// original comment lines were dropped, and several *code* lines (function
// signatures, member declarations) are missing as well -- each gap is flagged
// inline below; confirm against the original old-hash-table.h before editing.
// Per the member index at the bottom of this page: "The hash table does not
// support removes. The hash table is not thread safe."
84 class OldHashTable {
85  private:
86  struct Node;
87 
88  public:
89  class Iterator;
90 
// Constructor.  'build_expr_ctxs'/'probe_expr_ctxs' are evaluated against
// build/probe rows; 'stores_nulls'/'finds_nulls' control NULL handling;
// 'initial_seed' seeds the hash; 'stores_tuples' switches the payload from
// TupleRow* to Tuple* (default false); 'num_buckets' is the initial bucket
// count (default 1024).
104  OldHashTable(RuntimeState* state, const std::vector<ExprContext*>& build_expr_ctxs,
105  const std::vector<ExprContext*>& probe_expr_ctxs, int num_build_tuples,
106  bool stores_nulls, bool finds_nulls, int32_t initial_seed,
107  MemTracker* mem_tracker, bool stores_tuples = false, int64_t num_buckets = 1024);
108 
// Call to cleanup any resources. Must be called once (brief from member index).
110  void Close();
111 
// NOTE(review): body of 'bool IR_ALWAYS_INLINE Insert(TupleRow* row)' (see
// member index); the signature line and the opening of the resize/occupancy
// block closed by the '}' on listing line 129 are missing from this scrape.
// Returns false once the memory limit has been exceeded; returns true without
// inserting when the row has a NULL key and NULLs are not stored.
121  if (UNLIKELY(mem_limit_exceeded_)) return false;
122  bool has_null = EvalBuildRow(row);
123  if (!stores_nulls_ && has_null) return true;
124 
128  if (UNLIKELY(mem_limit_exceeded_)) return false;
129  }
130  return InsertImpl(row);
131  }
132 
// NOTE(review): body of 'bool IR_ALWAYS_INLINE Insert(Tuple* tuple)' (see
// member index); same missing-signature caveat as above.  The Tuple* is
// reinterpreted as a single-tuple TupleRow for expr evaluation.
134  if (UNLIKELY(mem_limit_exceeded_)) return false;
135  bool has_null = EvalBuildRow(reinterpret_cast<TupleRow*>(&tuple));
136  if (!stores_nulls_ && has_null) return true;
137 
141  if (UNLIKELY(mem_limit_exceeded_)) return false;
142  }
143  return InsertImpl(tuple);
144  }
145 
// Evaluate the build/probe exprs over 'row' and compute its hash into '*hash'.
// The bool return presumably mirrors EvalRow's NULL signal -- TODO confirm.
151  bool IR_ALWAYS_INLINE EvalAndHashBuild(TupleRow* row, uint32_t* hash);
152  bool IR_ALWAYS_INLINE EvalAndHashProbe(TupleRow* row, uint32_t* hash);
153 
// NOTE(review): listing lines 154-165 are missing here (at least one member
// plus its comments was dropped by the scrape).
164 
// Returns number of elements in the hash table (brief from member index).
166  int64_t size() const { return num_nodes_; }
167 
// Returns the number of buckets.
169  int64_t num_buckets() const { return buckets_.size(); }
170 
// Returns the fraction of buckets that are non-empty.
172  float load_factor() const {
173  return num_filled_buckets_ / static_cast<float>(buckets_.size());
174  }
175 
// Estimated bytes for 'num_rows': assumes two buckets per row plus one chain
// Node per row.
178  static int64_t EstimateSize(int64_t num_rows) {
180  int64_t num_buckets = num_rows * 2;
181  return num_buckets * sizeof(Bucket) + num_rows * sizeof(Node);
182  }
183 
// Returns the number of bytes allocated to the hash table: all Node pages
// (used + remaining on the current page) plus the bucket vector's capacity.
185  int64_t byte_size() const {
186  int64_t nodes_mem = (num_nodes_ + node_remaining_current_page_) * sizeof(Node);
187  return nodes_mem + sizeof(Bucket) * buckets_.capacity();
188  }
189 
190  bool mem_limit_exceeded() const { return mem_limit_exceeded_; }
191 
// Returns the result of the expr at 'expr_idx' evaluated over the last row.
// NOTE(review): the body (listing line 198) is missing from this scrape --
// presumably indexes expr_values_buffer_ via expr_values_buffer_offsets_;
// confirm against the original header.
197  void* last_expr_value(int expr_idx) const {
199  }
200 
// Returns if the expr at 'expr_idx' evaluated to NULL for the last row
// (brief from member index).
202  bool last_expr_value_null(int expr_idx) const {
203  return expr_value_null_bits_[expr_idx];
204  }
205 
// Presumably publishes bitmap filters built from this table (util/bitmap.h is
// included above) -- no documentation survives in this listing; TODO confirm.
212  void AddBitmapFilters();
213 
// "stl-like iterator interface" (member index): iterator over all rows.
216  Iterator Begin();
217 
// NOTE(review): listing lines 218-222 are missing; the member index declares
// 'Iterator IR_ALWAYS_INLINE Find(TupleRow* probe_row)', which likely lived
// in this gap -- confirm.
221 
// Returns end marker.
223  Iterator End() { return Iterator(); }
224 
// Codegen entry points: each returns a generated llvm::Function replacing the
// corresponding interpreted path (see member index).
228  llvm::Function* CodegenEvalTupleRow(RuntimeState* state, bool build_row);
229 
232  llvm::Function* CodegenHashCurrentRow(RuntimeState* state);
233 
236  llvm::Function* CodegenEquals(RuntimeState* state);
237 
// Class name as registered with the codegen module -- TODO confirm.
238  static const char* LLVM_CLASS_NAME;
239 
// Debug dump; 'build_desc' is needed to print row contents -- TODO confirm.
243  std::string DebugString(bool skip_empty, bool show_match,
244  const RowDescriptor* build_desc);
245 
// stl-like iterator interface (brief from member index).
247  class Iterator {
248  public:
249  Iterator() : table_(NULL), bucket_idx_(-1), node_(NULL) {
250  }
251 
// Advances the iterator; 'check_match' presumably filters on the per-node
// matched flag -- TODO confirm.
255  template<bool check_match>
256  void IR_ALWAYS_INLINE Next();
257 
// Presumably advances to the next node whose matched flag is false -- confirm.
261  bool NextUnmatched();
262 
// NOTE(review): body of the row accessor (likely 'TupleRow* GetRow() const');
// the signature line (listing lines 263-265) is missing from this scrape.
// Valid only when the table stores rows, not tuples (per the DCHECK).
266  DCHECK(!AtEnd());
267  DCHECK(!table_->stores_tuples_);
268  return reinterpret_cast<TupleRow*>(node_->data);
269  }
270 
// NOTE(review): body of the tuple accessor (likely 'Tuple* GetTuple() const');
// signature line (listing line 271) missing.  Valid only when stores_tuples_.
272  DCHECK(!AtEnd());
273  DCHECK(table_->stores_tuples_);
274  return reinterpret_cast<Tuple*>(node_->data);
275  }
276 
// Sets the matched flag on the current node.
277  void set_matched(bool v) {
278  DCHECK(!AtEnd());
279  node_->matched = v;
280  }
281 
282  bool matched() const {
283  DCHECK(!AtEnd());
284  return node_->matched;
285  }
286 
// Resets to the default (end) state.
287  void reset() {
288  bucket_idx_ = -1;
289  node_ = NULL;
290  }
291 
// Returns true if this iterator is at the end, i.e. GetRow() cannot be called
// (brief from member index).
293  bool AtEnd() const { return node_ == NULL; }
294  bool operator!=(const Iterator& rhs) { return !(*this == rhs); }
295 
296  bool operator==(const Iterator& rhs) {
297  return bucket_idx_ == rhs.bucket_idx_ && node_ == rhs.node_;
298  }
299 
300  private:
301  friend class OldHashTable;
302 
303  Iterator(OldHashTable* table, int bucket_idx, Node* node, uint32_t hash) :
304  table_(table),
305  bucket_idx_(bucket_idx),
306  node_(node),
307  scan_hash_(hash) {
308  }
309 
// NOTE(review): the 'OldHashTable* table_;' and 'Node* node_;' member
// declarations (listing lines ~310-312 and ~315-318) are missing from this
// scrape; both are initialized by the constructors above, and the member
// index documents node_ as "Current node (within current bucket)".
311 
// Current bucket idx (brief from member index).
313  int64_t bucket_idx_;
314 
317 
// Cached hash value for the row passed to Find() (brief from member index).
319  uint32_t scan_hash_;
320  };
321 
322  private:
323  friend class Iterator;
324  friend class OldHashTableTest;
325 
// Chain node for rows/tuples hashing to the same bucket.
// NOTE(review): listing lines 329-333 (comments for 'matched') are missing.
328  struct Node {
// Whether this node's row has been matched (set via Iterator::set_matched).
334  bool matched;
335 
336  uint32_t hash; // Cache of the hash for data_
337  Node* next; // Chain to next node for collisions
338  void* data; // Either the Tuple* or TupleRow*
339  };
340 
// NOTE(review): the 'Node* node;' member declaration (listing line 342) is
// missing from this scrape; the default constructor initializes it to NULL.
341  struct Bucket {
343  Bucket() : node(NULL) { }
344  };
345 
// Presumably returns the next non-empty bucket at/after '*bucket_idx',
// updating the index -- no documentation survives here; TODO confirm.
348  Bucket* NextBucket(int64_t* bucket_idx);
349 
// Resize the hash table to 'num_buckets' (brief from member index).
351  void ResizeBuckets(int64_t num_buckets);
352 
// Insert row into the hash table (brief from member index); 'data' is the
// Tuple* or TupleRow* depending on stores_tuples_.
354  bool IR_ALWAYS_INLINE InsertImpl(void* data);
355 
// Prepends 'node' to 'bucket's chain -- presumed from name; confirm.
358  void AddToBucket(Bucket* bucket, Node* node);
359 
// Moves 'node' between bucket chains ('previous_node' is its predecessor in
// the source chain) -- presumably used during resize; TODO confirm.
362  void MoveNode(Bucket* from_bucket, Bucket* to_bucket, Node* node,
363  Node* previous_node);
364 
// Evaluates 'ctxs' over 'row'; the return value is used as a has-null signal
// by the Insert paths above ('bool has_null = EvalBuildRow(row)').
368  bool EvalRow(TupleRow* row, const std::vector<ExprContext*>& ctxs);
369 
// NOTE(review): body of 'bool IR_NO_INLINE EvalBuildRow(TupleRow* row)' (see
// member index); the signature line is missing from this scrape.
375  return EvalRow(row, build_expr_ctxs_);
376  }
377 
// NOTE(review): body of 'bool IR_NO_INLINE EvalProbeRow(TupleRow* row)'
// (see member index); the signature line is missing from this scrape.
381  return EvalRow(row, probe_expr_ctxs_);
382  }
383 
// NOTE(review): body of 'uint32_t IR_NO_INLINE HashCurrentRow()' (see member
// index); the signature and the fixed-length fast path (listing lines
// 389-391, taken when there are no variable-length results) are missing --
// only the variable-length branch survives.  var_result_begin_ is documented
// as "byte offset into expr_values_buffer_ that begins the variable length
// results"; -1 evidently means none.
388  if (var_result_begin_ == -1) {
392  } else {
393  return HashVariableLenRow();
394  }
395  }
396 
// Returns the row stored in 'node'.  When storing tuples, the address of the
// stored Tuple* is reinterpreted as a single-tuple TupleRow -- confirm.
397  TupleRow* GetRow(Node* node) const {
398  if (stores_tuples_) {
399  return reinterpret_cast<TupleRow*>(&node->data);
400  } else {
401  return reinterpret_cast<TupleRow*>(node->data);
402  }
403  }
404 
// Hash path for rows with variable-length values -- presumed from name.
407  uint32_t HashVariableLenRow();
408 
// Presumably compares 'build_row' against the last evaluated expr values --
// no documentation survives here; TODO confirm.
412  bool Equals(TupleRow* build_row);
413 
// Grow the node array (brief from member index).
415  void GrowNodeArray();
416 
// Handles hitting the memory limit for an allocation of 'allocation_size'
// bytes -- presumably sets mem_limit_exceeded_; TODO confirm.
420  void MemLimitExceeded(int64_t allocation_size);
421 
// Occupancy fraction that triggers a resize -- presumed from name; the value
// is defined outside this header.
424  static const float MAX_BUCKET_OCCUPANCY_FRACTION;
425 
// NOTE(review): 'RuntimeState* state_;' (listing line ~426, per member index)
// is missing from this scrape.
427 
428  const std::vector<ExprContext*>& build_expr_ctxs_;
429  const std::vector<ExprContext*>& probe_expr_ctxs_;
430 
// Number of Tuple* in the build tuple row (brief from member index).
432  const int num_build_tuples_;
433 
437  const bool stores_nulls_;
438  const bool finds_nulls_;
439  const bool stores_tuples_;
440 
441  const int32_t initial_seed_;
442 
// NOTE(review): several private member declarations are missing in the gaps
// below (listing lines 443-446, 452-466, 472-476, 479-491).  The member index
// names, among others: MemTracker* mem_tracker_; bool mem_limit_exceeded_;
// int num_data_pages_ ("Number of data pages for nodes"); Node* next_node_
// ("Next node to insert"); int node_remaining_current_page_; int64_t
// num_filled_buckets_; int64_t num_buckets_till_resize_; uint8_t*
// expr_values_buffer_; uint8_t* expr_value_null_bits_; int var_result_begin_;
// int results_buffer_size_ ("byte size of 'expr_values_buffer_'").
445 
// number of nodes stored (i.e. size of hash table) (brief from member index).
447  int64_t num_nodes_;
448 
// MemPool used to allocate data pages (brief from member index).
450  boost::scoped_ptr<MemPool> mem_pool_;
451 
454 
457 
460 
462 
466 
467  std::vector<Bucket> buckets_;
468 
// equal to buckets_.size() but more efficient than the size function
// (brief from member index).
470  int64_t num_buckets_;
471 
474 
// Byte offset into expr_values_buffer_ for each expr's result -- confirm.
477  std::vector<int> expr_values_buffer_offsets_;
478 
481 
484 
488 
492 };
493 
494 }
495 
496 #endif
STL-like iterator interface.
#define IR_NO_INLINE
Definition: impala-ir.h:30
std::vector< Bucket > buckets_
OldHashTable(RuntimeState *state, const std::vector< ExprContext * > &build_expr_ctxs, const std::vector< ExprContext * > &probe_expr_ctxs, int num_build_tuples, bool stores_nulls, bool finds_nulls, int32_t initial_seed, MemTracker *mem_tracker, bool stores_tuples=false, int64_t num_buckets=1024)
bool operator!=(const Iterator &rhs)
uint32_t IR_NO_INLINE HashCurrentRow()
int64_t num_filled_buckets_
Number of non-empty buckets. Used to determine when to grow and rehash.
void * last_expr_value(int expr_idx) const
uint32_t HashVariableLenRow()
Node * node_
Current node (within current bucket); NULL when the iterator is at the end.
llvm::Function * CodegenHashCurrentRow(RuntimeState *state)
A tuple with 0 materialised slots is represented as NULL.
Definition: tuple.h:48
const StringSearch UrlParser::hash_search & hash
Definition: url-parser.cc:41
bool AtEnd() const
Returns true if this iterator is at the end, i.e. GetRow() cannot be called.
std::vector< int > expr_values_buffer_offsets_
TupleRow * GetRow(Node *node) const
uint8_t * expr_values_buffer_
uint32_t scan_hash_
Cached hash value for the row passed to Find()
bool IR_NO_INLINE EvalProbeRow(TupleRow *row)
int64_t size() const
Returns number of elements in the hash table.
Bucket * NextBucket(int64_t *bucket_idx)
#define IR_ALWAYS_INLINE
Definition: impala-ir.h:31
const int32_t initial_seed_
bool IR_NO_INLINE EvalBuildRow(TupleRow *row)
bool IR_ALWAYS_INLINE EvalAndHashProbe(TupleRow *row, uint32_t *hash)
llvm::Function * CodegenEvalTupleRow(RuntimeState *state, bool build_row)
static const float MAX_BUCKET_OCCUPANCY_FRACTION
int node_remaining_current_page_
Number of nodes left in the current page.
bool last_expr_value_null(int expr_idx) const
Returns if the expr at 'expr_idx' evaluated to NULL for the last row.
MemTracker * mem_tracker_
Node * next_node_
Next node to insert.
bool IR_ALWAYS_INLINE InsertImpl(void *data)
Insert row into the hash table.
bool mem_limit_exceeded() const
const std::vector< ExprContext * > & probe_expr_ctxs_
std::string DebugString(bool skip_empty, bool show_match, const RowDescriptor *build_desc)
The hash table does not support removes. The hash table is not thread safe.
static uint32_t Hash(const void *data, int32_t bytes, uint32_t seed)
Definition: hash-util.h:135
int var_result_begin_
byte offset into expr_values_buffer_ that begins the variable length results
void GrowNodeArray()
Grow the node array.
void MemLimitExceeded(int64_t allocation_size)
boost::scoped_ptr< MemPool > mem_pool_
MemPool used to allocate data pages.
bool IR_ALWAYS_INLINE Insert(TupleRow *row)
This class is thread-safe.
Definition: mem-tracker.h:61
bool operator==(const Iterator &rhs)
static const char * LLVM_CLASS_NAME
int64_t num_buckets_
equal to buckets_.size() but more efficient than the size function
int num_data_pages_
Number of data pages for nodes.
Iterator End()
Returns end marker.
void AddToBucket(Bucket *bucket, Node *node)
int64_t byte_size() const
Returns the number of bytes allocated to the hash table.
void IR_ALWAYS_INLINE Next()
void Close()
Call to cleanup any resources. Must be called once.
uint8_t * expr_value_null_bits_
const std::vector< ExprContext * > & build_expr_ctxs_
void ResizeBuckets(int64_t num_buckets)
Resize the hash table to 'num_buckets'.
Iterator IR_ALWAYS_INLINE Find(TupleRow *probe_row)
RuntimeState * state_
#define UNLIKELY(expr)
Definition: compiler-util.h:33
bool IR_ALWAYS_INLINE EvalAndHashBuild(TupleRow *row, uint32_t *hash)
bool Equals(TupleRow *build_row)
void MoveNode(Bucket *from_bucket, Bucket *to_bucket, Node *node, Node *previous_node)
bool EvalRow(TupleRow *row, const std::vector< ExprContext * > &ctxs)
int64_t num_nodes_
number of nodes stored (i.e. size of hash table)
llvm::Function * CodegenEquals(RuntimeState *state)
int64_t num_buckets() const
Returns the number of buckets.
bool IR_ALWAYS_INLINE Insert(Tuple *tuple)
const int num_build_tuples_
Number of Tuple* in the build tuple row.
float load_factor() const
Returns the load factor (the fraction of buckets that are non-empty).
int64_t bucket_idx_
Current bucket idx.
int64_t num_buckets_till_resize_
The number of filled buckets to trigger a resize. This is cached for efficiency.
static int64_t EstimateSize(int64_t num_rows)
int results_buffer_size_
byte size of 'expr_values_buffer_'
Iterator(OldHashTable *table, int bucket_idx, Node *node, uint32_t hash)