Impala
Impala is the open source, native analytic database for Apache Hadoop.
hash-table.inline.h
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef IMPALA_EXEC_HASH_TABLE_INLINE_H
#define IMPALA_EXEC_HASH_TABLE_INLINE_H

#include "exec/hash-table.h"

namespace impala {

inline bool HashTableCtx::EvalAndHashBuild(TupleRow* row, uint32_t* hash) {
  bool has_null = EvalBuildRow(row);
  if (!stores_nulls_ && has_null) return false;
  *hash = HashCurrentRow();
  return true;
}

inline bool HashTableCtx::EvalAndHashProbe(TupleRow* row, uint32_t* hash) {
  bool has_null = EvalProbeRow(row);
  if ((!stores_nulls_ || !finds_nulls_) && has_null) return false;
  *hash = HashCurrentRow();
  return true;
}

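// Typical caller pattern for the build side (an illustrative sketch; 'ht_ctx',
// 'hash_tbl', 'build_row' and 'row_idx' are example names only):
//
//   uint32_t hash;
//   if (ht_ctx->EvalAndHashBuild(build_row, &hash)) {
//     hash_tbl->Insert(ht_ctx, row_idx, build_row, hash);
//   }
//   // A false return means the row has a NULL key that this table does not store,
//   // so it can never match and is skipped.
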
inline int64_t HashTable::Probe(Bucket* buckets, int64_t num_buckets,
    HashTableCtx* ht_ctx, uint32_t hash, bool* found) {
  DCHECK_NOTNULL(buckets);
  DCHECK_GT(num_buckets, 0);
  *found = false;
  int64_t bucket_idx = hash & (num_buckets - 1);

  // With linear probing, 'step' counts the total number of probe steps, both for
  // statistics and to bound the total travel length so the loop terminates. With
  // quadratic probing it also determines the length of the next jump.
  int64_t step = 0;
  do {
    Bucket* bucket = &buckets[bucket_idx];
    if (!bucket->filled) return bucket_idx;
    if (hash == bucket->hash) {
      if (ht_ctx != NULL && ht_ctx->Equals(GetRow(bucket, ht_ctx->row_))) {
        *found = true;
        return bucket_idx;
      }
      // Row equality failed, or was not performed. This is a hash collision. Continue
      // searching.
      ++num_hash_collisions_;
    }
    // Move to the next bucket.
    ++step;
    ++travel_length_;
    if (quadratic_probing_) {
      // The i-th probe location is idx = (hash + (i * (i + 1)) / 2) mod num_buckets.
      // This gives num_buckets unique idxs (between 0 and N-1) when num_buckets is a
      // power of 2.
      bucket_idx = (bucket_idx + step) & (num_buckets - 1);
    } else {
      bucket_idx = (bucket_idx + 1) & (num_buckets - 1);
    }
  } while (LIKELY(step < num_buckets));
  DCHECK_EQ(num_filled_buckets_, num_buckets) << "Probing of a non-full table "
      << "failed: " << quadratic_probing_ << " " << hash;
  return Iterator::BUCKET_NOT_FOUND;
}

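// Worked example of the probe sequence (illustrative): with quadratic probing and
// num_buckets = 8, a hash that maps to bucket 0 visits buckets in the order
//   0, 1, 3, 6, 2, 7, 5, 4
// i.e. offsets of 0, 1, 3, 6, 10, 15, 21, 28 (the triangular numbers i*(i+1)/2)
// taken mod 8. Because num_buckets is always a power of two (the code masks with
// num_buckets - 1), these offsets are all distinct mod num_buckets, so the loop above
// inspects every bucket at most once before giving up. With linear probing the same
// hash would simply visit 0, 1, 2, ..., 7.
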
inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx,
    uint32_t hash) {
  ++num_probes_;
  bool found = false;
  int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, &found);
  DCHECK_NE(bucket_idx, Iterator::BUCKET_NOT_FOUND);
  if (found) {
    // We need to insert a duplicate node; note that this may fail to allocate memory.
    DuplicateNode* new_node = InsertDuplicateNode(bucket_idx);
    if (UNLIKELY(new_node == NULL)) return NULL;
    return &new_node->htdata;
  } else {
    PrepareBucketForInsert(bucket_idx, hash);
    return &buckets_[bucket_idx].bucketData.htdata;
  }
}

inline bool HashTable::Insert(HashTableCtx* ht_ctx,
    const BufferedTupleStream::RowIdx& idx, TupleRow* row, uint32_t hash) {
  if (stores_tuples_) return Insert(ht_ctx, row->GetTuple(0), hash);
  HtData* htdata = InsertInternal(ht_ctx, hash);
  // If the insert succeeded, update the contents of the newly inserted entry with 'idx'.
  if (LIKELY(htdata != NULL)) {
    htdata->idx = idx;
    return true;
  }
  return false;
}

inline bool HashTable::Insert(HashTableCtx* ht_ctx, Tuple* tuple, uint32_t hash) {
  DCHECK(stores_tuples_);
  HtData* htdata = InsertInternal(ht_ctx, hash);
  // If the insert succeeded, update the contents of the newly inserted entry with 'tuple'.
  if (LIKELY(htdata != NULL)) {
    htdata->tuple = tuple;
    return true;
  }
  return false;
}

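// Note on the two Insert() overloads above: the table has two storage modes. When
// stores_tuples_ is true, each entry's HtData holds a Tuple* directly (a build row
// with a single tuple); otherwise it holds a BufferedTupleStream::RowIdx locating the
// row in tuple_stream_. GetRow() further down undoes this: it either reinterprets the
// address of the stored Tuple* as a single-tuple TupleRow, or materializes the row
// from the stream into the caller-provided scratch row.
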
inline HashTable::Iterator HashTable::Find(HashTableCtx* ht_ctx, uint32_t hash) {
  ++num_probes_;
  bool found = false;
  int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, &found);
  if (found) {
    return Iterator(this, ht_ctx->row(), bucket_idx,
        buckets_[bucket_idx].bucketData.duplicates, hash);
  }
  return End();
}

inline HashTable::Iterator HashTable::Begin(HashTableCtx* ctx) {
  int64_t bucket_idx = Iterator::BUCKET_NOT_FOUND;
  DuplicateNode* node = NULL;
  NextFilledBucket(&bucket_idx, &node);
  return Iterator(this, ctx->row(), bucket_idx, node, 0);
}

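// Illustrative full-table scan using the iterator interface (variable names are only
// for the example):
//
//   HashTable::Iterator it = hash_tbl->Begin(ht_ctx);
//   while (!it.AtEnd()) {
//     TupleRow* row = it.GetRow();
//     // ... process 'row' ...
//     it.Next();
//   }
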
inline HashTable::Iterator HashTable::FirstUnmatched(HashTableCtx* ctx) {
  int64_t bucket_idx = Iterator::BUCKET_NOT_FOUND;
  DuplicateNode* node = NULL;
  NextFilledBucket(&bucket_idx, &node);
  Iterator it(this, ctx->row(), bucket_idx, node, 0);
  // Empty table: there is no bucket to inspect.
  if (bucket_idx == Iterator::BUCKET_NOT_FOUND) return it;
  // Check whether the bucket, or its first duplicate node, is matched. If it is not
  // matched, then return. Otherwise, move to the first unmatched entry (node or bucket).
  Bucket* bucket = &buckets_[bucket_idx];
  if ((!bucket->hasDuplicates && bucket->matched) ||
      (bucket->hasDuplicates && node->matched)) {
    it.NextUnmatched();
  }
  return it;
}

inline void HashTable::NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node) {
  ++*bucket_idx;
  for (; *bucket_idx < num_buckets_; ++*bucket_idx) {
    if (buckets_[*bucket_idx].filled) {
      *node = buckets_[*bucket_idx].bucketData.duplicates;
      return;
    }
  }
  // Reached the end of the hash table.
  *bucket_idx = Iterator::BUCKET_NOT_FOUND;
  *node = NULL;
}

inline void HashTable::PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash) {
  DCHECK_GE(bucket_idx, 0);
  DCHECK_LT(bucket_idx, num_buckets_);
  Bucket* bucket = &buckets_[bucket_idx];
  DCHECK(!bucket->filled);
  ++num_filled_buckets_;
  bucket->filled = true;
  bucket->matched = false;
  bucket->hasDuplicates = false;
  bucket->hash = hash;
}

inline HashTable::DuplicateNode* HashTable::AppendNextNode(Bucket* bucket) {
  DCHECK_GT(node_remaining_current_page_, 0);
  bucket->bucketData.duplicates = next_node_;
  ++num_duplicate_nodes_;
  --node_remaining_current_page_;
  return next_node_++;
}

inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(int64_t bucket_idx) {
  DCHECK_GE(bucket_idx, 0);
  DCHECK_LT(bucket_idx, num_buckets_);
  Bucket* bucket = &buckets_[bucket_idx];
  DCHECK(bucket->filled);
  // Allocate one duplicate node for the new data and one for the preexisting data,
  // if needed.
  while (node_remaining_current_page_ < 1 + !bucket->hasDuplicates) {
    if (UNLIKELY(!GrowNodeArray())) return NULL;
  }
  if (!bucket->hasDuplicates) {
    // This is the first duplicate in this bucket. It means that we need to convert
    // the current entry in the bucket to a node and link it from the bucket.
    next_node_->htdata = bucket->bucketData.htdata;
    DCHECK(!bucket->matched);
    next_node_->matched = false;
    next_node_->next = NULL;
    AppendNextNode(bucket);
    bucket->hasDuplicates = true;
    ++num_buckets_with_duplicates_;
  }
  // Link the new node at the front of the duplicate list.
  next_node_->next = bucket->bucketData.duplicates;
  next_node_->matched = false;
  return AppendNextNode(bucket);
}

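// Summary of duplicate handling in the two functions above: the first time a key
// collides with an equal key, the bucket's existing HtData is copied into a
// DuplicateNode, the bucket is flagged with hasDuplicates, and bucketData switches
// from holding a single entry to holding the head of a singly linked list of
// DuplicateNodes. Each further duplicate is then pushed onto the front of that list.
// Nodes are carved out of a page; node_remaining_current_page_ tracks how many are
// left, and GrowNodeArray() allocates a new page on demand (returning false on OOM,
// which surfaces here as a NULL return).
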
inline TupleRow* HashTable::GetRow(HtData& htdata, TupleRow* row) const {
  if (stores_tuples_) {
    return reinterpret_cast<TupleRow*>(&htdata.tuple);
  } else {
    tuple_stream_->GetTupleRow(htdata.idx, row);
    return row;
  }
}

inline TupleRow* HashTable::GetRow(Bucket* bucket, TupleRow* row) const {
  DCHECK_NOTNULL(bucket);
  if (UNLIKELY(bucket->hasDuplicates)) {
    DuplicateNode* duplicate = bucket->bucketData.duplicates;
    DCHECK_NOTNULL(duplicate);
    return GetRow(duplicate->htdata, row);
  } else {
    return GetRow(bucket->bucketData.htdata, row);
  }
}

inline TupleRow* HashTable::Iterator::GetRow() const {
  DCHECK(!AtEnd());
  DCHECK_NOTNULL(table_);
  DCHECK_NOTNULL(row_);
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  if (UNLIKELY(bucket->hasDuplicates)) {
    DCHECK_NOTNULL(node_);
    return table_->GetRow(node_->htdata, row_);
  } else {
    return table_->GetRow(bucket->bucketData.htdata, row_);
  }
}

inline Tuple* HashTable::Iterator::GetTuple() const {
  DCHECK(!AtEnd());
  DCHECK(table_->stores_tuples_);
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  // TODO: To avoid the hasDuplicates check, store the HtData* in the Iterator.
  if (UNLIKELY(bucket->hasDuplicates)) {
    DCHECK_NOTNULL(node_);
    return node_->htdata.tuple;
  } else {
    return bucket->bucketData.htdata.tuple;
  }
}

inline void HashTable::Iterator::SetMatched() {
  DCHECK(!AtEnd());
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  if (bucket->hasDuplicates) {
    node_->matched = true;
  } else {
    bucket->matched = true;
  }
  // Used for disabling spilling of hash tables in right and full-outer joins with
  // matches. See IMPALA-1488.
  table_->has_matches_ = true;
}

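// The matched flag set above lives on the bucket, or on the individual duplicate node
// when the bucket has duplicates. FirstUnmatched()/NextUnmatched() then visit exactly
// the entries whose flag is still false, e.g. so that right and full-outer joins can
// emit build rows that never matched a probe row (see the IMPALA-1488 note above).
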
inline bool HashTable::Iterator::IsMatched() const {
  DCHECK(!AtEnd());
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  if (bucket->hasDuplicates) {
    return node_->matched;
  }
  return bucket->matched;
}

inline void HashTable::Iterator::SetAtEnd() {
  bucket_idx_ = BUCKET_NOT_FOUND;
  node_ = NULL;
}

inline void HashTable::Iterator::Next() {
  DCHECK(!AtEnd());
  if (table_->buckets_[bucket_idx_].hasDuplicates && node_->next != NULL) {
    node_ = node_->next;
  } else {
    table_->NextFilledBucket(&bucket_idx_, &node_);
  }
}

inline void HashTable::Iterator::NextDuplicate() {
  DCHECK(!AtEnd());
  if (table_->buckets_[bucket_idx_].hasDuplicates && node_->next != NULL) {
    node_ = node_->next;
  } else {
    bucket_idx_ = BUCKET_NOT_FOUND;
    node_ = NULL;
  }
}

inline void HashTable::Iterator::NextUnmatched() {
  DCHECK(!AtEnd());
  Bucket* bucket = &table_->buckets_[bucket_idx_];
  // Check if there is any remaining unmatched duplicate node in the current bucket.
  if (bucket->hasDuplicates) {
    while (node_->next != NULL) {
      node_ = node_->next;
      if (!node_->matched) return;
    }
  }
  // Move to the next filled bucket and return if that bucket is unmatched, or advance
  // to its first unmatched duplicate node.
  table_->NextFilledBucket(&bucket_idx_, &node_);
  while (bucket_idx_ != Iterator::BUCKET_NOT_FOUND) {
    bucket = &table_->buckets_[bucket_idx_];
    if (!bucket->hasDuplicates) {
      if (!bucket->matched) return;
    } else {
      while (node_->matched && node_->next != NULL) {
        node_ = node_->next;
      }
      if (!node_->matched) return;
    }
    table_->NextFilledBucket(&bucket_idx_, &node_);
  }
}

inline void HashTableCtx::set_level(int level) {
  DCHECK_GE(level, 0);
  DCHECK_LT(level, seeds_.size());
  level_ = level;
}

}  // namespace impala

#endif  // IMPALA_EXEC_HASH_TABLE_INLINE_H