Impala
Impala is the open source, native analytic database for Apache Hadoop.
hash-table.inline.h
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef IMPALA_EXEC_HASH_TABLE_INLINE_H
#define IMPALA_EXEC_HASH_TABLE_INLINE_H

#include "exec/hash-table.h"

namespace impala {

inline bool HashTableCtx::EvalAndHashBuild(TupleRow* row, uint32_t* hash) {
  bool has_null = EvalBuildRow(row);
  if (!stores_nulls_ && has_null) return false;
  *hash = HashCurrentRow();
  return true;
}

inline bool HashTableCtx::EvalAndHashProbe(TupleRow* row, uint32_t* hash) {
  bool has_null = EvalProbeRow(row);
  if ((!stores_nulls_ || !finds_nulls_) && has_null) return false;
  *hash = HashCurrentRow();
  return true;
}

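// Typical caller pattern for the build side (an illustrative sketch; 'ht_ctx',
// 'hash_tbl', 'build_row' and 'row_idx' are example names only):
//
//   uint32_t hash;
//   if (ht_ctx->EvalAndHashBuild(build_row, &hash)) {
//     hash_tbl->Insert(ht_ctx, row_idx, build_row, hash);
//   }
//   // A false return means the row has a NULL key that this table does not store,
//   // so it can never match and is skipped.
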
inline int64_t HashTable::Probe(Bucket* buckets, int64_t num_buckets,
    HashTableCtx* ht_ctx, uint32_t hash, bool* found) {
  DCHECK_NOTNULL(buckets);
  DCHECK_GT(num_buckets, 0);
  *found = false;
  int64_t bucket_idx = hash & (num_buckets - 1);

  // With linear probing, 'step' counts the total number of probe steps, both for
  // statistics and to bound the total travel length so the loop terminates. With
  // quadratic probing it also determines the length of the next jump.
  int64_t step = 0;
  do {
    Bucket* bucket = &buckets[bucket_idx];
    if (!bucket->filled) return bucket_idx;
    if (hash == bucket->hash) {
      if (ht_ctx != NULL && ht_ctx->Equals(GetRow(bucket, ht_ctx->row_))) {
        *found = true;
        return bucket_idx;
      }
      // Row equality failed, or was not performed. This is a hash collision. Continue
      // searching.
      ++num_hash_collisions_;
    }
    // Move to the next bucket.
    ++step;
    ++travel_length_;
    if (quadratic_probing_) {
      // The i-th probe location is idx = (hash + (i * (i + 1)) / 2) mod num_buckets.
      // This gives num_buckets unique idxs (between 0 and N-1) when num_buckets is a
      // power of 2.
      bucket_idx = (bucket_idx + step) & (num_buckets - 1);
    } else {
      bucket_idx = (bucket_idx + 1) & (num_buckets - 1);
    }
  } while (LIKELY(step < num_buckets));
  DCHECK_EQ(num_filled_buckets_, num_buckets) << "Probing of a non-full table "
      << "failed: " << quadratic_probing_ << " " << hash;
  return Iterator::BUCKET_NOT_FOUND;
}

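// Worked example of the probe sequence (illustrative): with quadratic probing and
// num_buckets = 8, a hash that maps to bucket 0 visits buckets in the order
//   0, 1, 3, 6, 2, 7, 5, 4
// i.e. offsets of 0, 1, 3, 6, 10, 15, 21, 28 (the triangular numbers i*(i+1)/2)
// taken mod 8. Because num_buckets is always a power of two (the code masks with
// num_buckets - 1), these offsets are all distinct mod num_buckets, so the loop above
// inspects every bucket at most once before giving up. With linear probing the same
// hash would simply visit 0, 1, 2, ..., 7.
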
inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx,
    uint32_t hash) {
  ++num_probes_;
  bool found = false;
  int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, &found);
  DCHECK_NE(bucket_idx, Iterator::BUCKET_NOT_FOUND);
  if (found) {
    // We need to insert a duplicate node; note that this may fail to allocate memory.
    DuplicateNode* new_node = InsertDuplicateNode(bucket_idx);
    if (UNLIKELY(new_node == NULL)) return NULL;
    return &new_node->htdata;
  } else {
    PrepareBucketForInsert(bucket_idx, hash);
    return &buckets_[bucket_idx].bucketData.htdata;
  }
}

inline bool HashTable::Insert(HashTableCtx* ht_ctx,
    const BufferedTupleStream::RowIdx& idx, TupleRow* row, uint32_t hash) {
  if (stores_tuples_) return Insert(ht_ctx, row->GetTuple(0), hash);
  HtData* htdata = InsertInternal(ht_ctx, hash);
  // If the insert succeeded, update the contents of the newly inserted entry with 'idx'.
  if (LIKELY(htdata != NULL)) {
    htdata->idx = idx;
    return true;
  }
  return false;
}

inline bool HashTable::Insert(HashTableCtx* ht_ctx, Tuple* tuple, uint32_t hash) {
  DCHECK(stores_tuples_);
  HtData* htdata = InsertInternal(ht_ctx, hash);
  // If the insert succeeded, update the contents of the newly inserted entry with 'tuple'.
  if (LIKELY(htdata != NULL)) {
    htdata->tuple = tuple;
    return true;
  }
  return false;
}

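// Note on the two Insert() overloads above: the table has two storage modes. When
// stores_tuples_ is true, each entry's HtData holds a Tuple* directly (a build row
// with a single tuple); otherwise it holds a BufferedTupleStream::RowIdx locating the
// row in tuple_stream_. GetRow() further down undoes this: it either reinterprets the
// address of the stored Tuple* as a single-tuple TupleRow, or materializes the row
// from the stream into the caller-provided scratch row.
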
inline HashTable::Iterator HashTable::Find(HashTableCtx* ht_ctx, uint32_t hash) {
  ++num_probes_;
  bool found = false;
  int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, &found);
  if (found) {
    return Iterator(this, ht_ctx->row(), bucket_idx,
        buckets_[bucket_idx].bucketData.duplicates, hash);
  }
  return End();
}

inline HashTable::Iterator HashTable::Begin(HashTableCtx* ctx) {
  int64_t bucket_idx = Iterator::BUCKET_NOT_FOUND;
  DuplicateNode* node = NULL;
  NextFilledBucket(&bucket_idx, &node);
  return Iterator(this, ctx->row(), bucket_idx, node, 0);
}

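// Illustrative full-table scan using the iterator interface (variable names are only
// for the example):
//
//   HashTable::Iterator it = hash_tbl->Begin(ht_ctx);
//   while (!it.AtEnd()) {
//     TupleRow* row = it.GetRow();
//     // ... process 'row' ...
//     it.Next();
//   }
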
inline HashTable::Iterator HashTable::FirstUnmatched(HashTableCtx* ctx) {
  int64_t bucket_idx = Iterator::BUCKET_NOT_FOUND;
  DuplicateNode* node = NULL;
  NextFilledBucket(&bucket_idx, &node);
  Iterator it(this, ctx->row(), bucket_idx, node, 0);
  // Empty table: there is no bucket to inspect.
  if (bucket_idx == Iterator::BUCKET_NOT_FOUND) return it;
  // Check whether the bucket, or its first duplicate node, is matched. If it is not
  // matched, then return. Otherwise, move to the first unmatched entry (node or bucket).
  Bucket* bucket = &buckets_[bucket_idx];
  if ((!bucket->hasDuplicates && bucket->matched) ||
      (bucket->hasDuplicates && node->matched)) {
    it.NextUnmatched();
  }
  return it;
}

inline void HashTable::NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node) {
  ++*bucket_idx;
  for (; *bucket_idx < num_buckets_; ++*bucket_idx) {
    if (buckets_[*bucket_idx].filled) {
      *node = buckets_[*bucket_idx].bucketData.duplicates;
      return;
    }
  }
  // Reached the end of the hash table.
  *bucket_idx = Iterator::BUCKET_NOT_FOUND;
  *node = NULL;
}

inline void HashTable::PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash) {
  DCHECK_GE(bucket_idx, 0);
  DCHECK_LT(bucket_idx, num_buckets_);
  Bucket* bucket = &buckets_[bucket_idx];
  DCHECK(!bucket->filled);
  ++num_filled_buckets_;
  bucket->filled = true;
  bucket->matched = false;
  bucket->hasDuplicates = false;
  bucket->hash = hash;
}

inline HashTable::DuplicateNode* HashTable::AppendNextNode(Bucket* bucket) {
  DCHECK_GT(node_remaining_current_page_, 0);
  bucket->bucketData.duplicates = next_node_;
  ++num_duplicate_nodes_;
  --node_remaining_current_page_;
  return next_node_++;
}

inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(int64_t bucket_idx) {
  DCHECK_GE(bucket_idx, 0);
  DCHECK_LT(bucket_idx, num_buckets_);
  Bucket* bucket = &buckets_[bucket_idx];
  DCHECK(bucket->filled);
  // Allocate one duplicate node for the new data and one for the preexisting data,
  // if needed.
  while (node_remaining_current_page_ < 1 + !bucket->hasDuplicates) {
    if (UNLIKELY(!GrowNodeArray())) return NULL;
  }
  if (!bucket->hasDuplicates) {
    // This is the first duplicate in this bucket. It means that we need to convert
    // the current entry in the bucket to a node and link it from the bucket.
    next_node_->htdata = bucket->bucketData.htdata;
    DCHECK(!bucket->matched);
    next_node_->matched = false;
    next_node_->next = NULL;
    AppendNextNode(bucket);
    bucket->hasDuplicates = true;
    ++num_buckets_with_duplicates_;
  }
  // Link the new node at the front of the duplicate list.
  next_node_->next = bucket->bucketData.duplicates;
  next_node_->matched = false;
  return AppendNextNode(bucket);
}

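// Summary of duplicate handling in the two functions above: the first time a key
// collides with an equal key, the bucket's existing HtData is copied into a
// DuplicateNode, the bucket is flagged with hasDuplicates, and bucketData switches
// from holding a single entry to holding the head of a singly linked list of
// DuplicateNodes. Each further duplicate is then pushed onto the front of that list.
// Nodes are carved out of a page; node_remaining_current_page_ tracks how many are
// left, and GrowNodeArray() allocates a new page on demand (returning false on OOM,
// which surfaces here as a NULL return).
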
inline TupleRow* HashTable::GetRow(HtData& htdata, TupleRow* row) const {
  if (stores_tuples_) {
    return reinterpret_cast<TupleRow*>(&htdata.tuple);
  } else {
    tuple_stream_->GetTupleRow(htdata.idx, row);
    return row;
  }
}

inline TupleRow* HashTable::GetRow(Bucket* bucket, TupleRow* row) const {
  DCHECK_NOTNULL(bucket);
  if (UNLIKELY(bucket->hasDuplicates)) {
    DuplicateNode* duplicate = bucket->bucketData.duplicates;
    DCHECK_NOTNULL(duplicate);
    return GetRow(duplicate->htdata, row);
  } else {
    return GetRow(bucket->bucketData.htdata, row);
  }
}

inline TupleRow* HashTable::Iterator::GetRow() const {
  DCHECK(!AtEnd());
  DCHECK_NOTNULL(table_);
  DCHECK_NOTNULL(row_);
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  if (UNLIKELY(bucket->hasDuplicates)) {
    DCHECK_NOTNULL(node_);
    return table_->GetRow(node_->htdata, row_);
  } else {
    return table_->GetRow(bucket->bucketData.htdata, row_);
  }
}

inline Tuple* HashTable::Iterator::GetTuple() const {
  DCHECK(!AtEnd());
  DCHECK(table_->stores_tuples_);
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  // TODO: To avoid the hasDuplicates check, store the HtData* in the Iterator.
  if (UNLIKELY(bucket->hasDuplicates)) {
    DCHECK_NOTNULL(node_);
    return node_->htdata.tuple;
  } else {
    return bucket->bucketData.htdata.tuple;
  }
}

inline void HashTable::Iterator::SetMatched() {
  DCHECK(!AtEnd());
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  if (bucket->hasDuplicates) {
    node_->matched = true;
  } else {
    bucket->matched = true;
  }
  // Used for disabling spilling of hash tables in right and full-outer joins with
  // matches. See IMPALA-1488.
  table_->has_matches_ = true;
}

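// The matched flag set above lives on the bucket, or on the individual duplicate node
// when the bucket has duplicates. FirstUnmatched()/NextUnmatched() then visit exactly
// the entries whose flag is still false, e.g. so that right and full-outer joins can
// emit build rows that never matched a probe row (see the IMPALA-1488 note above).
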
inline bool HashTable::Iterator::IsMatched() const {
  DCHECK(!AtEnd());
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  if (bucket->hasDuplicates) {
    return node_->matched;
  }
  return bucket->matched;
}

inline void HashTable::Iterator::SetAtEnd() {
  bucket_idx_ = BUCKET_NOT_FOUND;
  node_ = NULL;
}

inline void HashTable::Iterator::Next() {
  DCHECK(!AtEnd());
  if (table_->buckets_[bucket_idx_].hasDuplicates && node_->next != NULL) {
    node_ = node_->next;
  } else {
    table_->NextFilledBucket(&bucket_idx_, &node_);
  }
}

inline void HashTable::Iterator::NextDuplicate() {
  DCHECK(!AtEnd());
  if (table_->buckets_[bucket_idx_].hasDuplicates && node_->next != NULL) {
    node_ = node_->next;
  } else {
    bucket_idx_ = BUCKET_NOT_FOUND;
    node_ = NULL;
  }
}

inline void HashTable::Iterator::NextUnmatched() {
  DCHECK(!AtEnd());
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  // Check if there is any remaining unmatched duplicate node in the current bucket.
  if (bucket->hasDuplicates) {
    while (node_->next != NULL) {
      node_ = node_->next;
      if (!node_->matched) return;
    }
  }
  // Move to the next filled bucket and return if that bucket is unmatched, or advance
  // to its first unmatched duplicate node.
  table_->NextFilledBucket(&bucket_idx_, &node_);
  while (bucket_idx_ != Iterator::BUCKET_NOT_FOUND) {
    bucket = &table_->buckets_[bucket_idx_];
    if (!bucket->hasDuplicates) {
      if (!bucket->matched) return;
    } else {
      while (node_->matched && node_->next != NULL) {
        node_ = node_->next;
      }
      if (!node_->matched) return;
    }
    table_->NextFilledBucket(&bucket_idx_, &node_);
  }
}

inline void HashTableCtx::set_level(int level) {
  DCHECK_GE(level, 0);
  DCHECK_LT(level, seeds_.size());
  level_ = level;
}

}  // namespace impala

#endif  // IMPALA_EXEC_HASH_TABLE_INLINE_H