16 #ifndef IMPALA_EXEC_HASH_TABLE_H
17 #define IMPALA_EXEC_HASH_TABLE_H
20 #include <boost/cstdint.hpp>
21 #include <boost/scoped_ptr.hpp>
115 HashTableCtx(
const std::vector<ExprContext*>& build_expr_ctxs,
116 const std::vector<ExprContext*>& probe_expr_ctxs,
bool stores_nulls,
117 bool finds_nulls, int32_t initial_seed,
int max_levels,
118 int num_build_tuples);
192 uint32_t
inline Hash(
const void* input,
int len, int32_t
hash) {
344 int64_t max_num_buckets, int64_t initial_num_buckets = 1024);
404 return num_buckets *
sizeof(
Bucket);
440 std::string
DebugString(
bool skip_empty,
bool show_match,
void set_level(int level)
uint32_t Hash(const void *input, int len, int32_t hash)
Wrapper function for calling correct HashUtil function in non-codegen'd case.
stl-like iterator interface.
The underlying memory management is done by the BufferedBlockMgr.
bool AtEnd() const
Returns true if this iterator is at the end, i.e. GetRow() cannot be called.
std::string PrintStats() const
Update and print some statistics that can be used for performance debugging.
Iterator End()
Return end marker.
bool IR_NO_INLINE Equals(TupleRow *build_row)
bool GrowNodeArray()
Grow the node array. Returns false on OOM.
int results_buffer_size() const
static int64_t EstimateNumBuckets(int64_t num_rows)
void DebugStringTuple(std::stringstream &ss, HtData &htdata, const RowDescriptor *desc)
Print the content of a bucket or node.
bool EvalRow(TupleRow *row, const std::vector< ExprContext * > &ctxs)
void SetAtEnd()
Resets everything but the pointer to the hash table.
int64_t byte_size() const
Returns the number of bytes allocated to the hash table.
llvm::Function * CodegenHashCurrentRow(RuntimeState *state, bool use_murmur)
uint8_t * expr_values_buffer_
A tuple with 0 materialised slots is represented as NULL.
const StringSearch UrlParser::hash_search & hash
BufferedBlockMgr::Client * block_mgr_client_
Client to allocate data pages with.
static const double MAX_FILL_FACTOR
DuplicateNode *IR_ALWAYS_INLINE AppendNextNode(Bucket *bucket)
const std::vector< ExprContext * > & build_expr_ctxs_
HtData *IR_ALWAYS_INLINE InsertInternal(HashTableCtx *ht_ctx, uint32_t hash)
DuplicateNode * duplicates
static const int64_t BUCKET_NOT_FOUND
Bucket index value when probe is not successful.
bool Init()
Allocates the initial bucket structure. Returns false if OOM.
void * last_expr_value(int expr_idx) const
Iterator Begin(HashTableCtx *ht_ctx)
static const char * LLVM_CLASS_NAME
union impala::HashTable::Bucket::@6 bucketData
Either the data for this bucket or the linked list of duplicates.
bool CheckAndResize(uint64_t buckets_to_fill, HashTableCtx *ht_ctx)
int64_t EmptyBuckets() const
Returns the number of empty buckets.
std::vector< BufferedBlockMgr::Block * > data_pages_
Data pages for all nodes. These are always pinned.
Either the row in the tuple stream or a pointer to the single tuple of this row.
bool HasMatches() const
Return true if there was a least one match.
const bool quadratic_probing_
Quadratic probing enabled (as opposed to linear).
const std::vector< ExprContext * > & probe_expr_ctxs_
void IR_ALWAYS_INLINE NextDuplicate()
int64_t num_duplicate_nodes_
Number of duplicate nodes.
Linked list of entries used for duplicates.
int64_t num_filled_buckets_
Number of non-empty buckets. Used to determine when to resize.
bool filled
Whether this bucket contains a vaild entry, or it is empty.
uint32_t GetHashSeed() const
Cross-compiled functions to access member variables used in CodegenHashCurrentRow().
std::vector< int > expr_values_buffer_offsets_
const bool stores_tuples_
static uint32_t Hash(const void *data, int32_t bytes, uint32_t seed)
llvm::Function * CodegenEquals(RuntimeState *state)
BufferedTupleStream::RowIdx idx
void IR_ALWAYS_INLINE Next()
Iterates to the next element. It should be called only if !AtEnd().
int64_t num_resizes_
How many times this table has resized so far.
llvm::Function * CodegenEvalRow(RuntimeState *state, bool build_row)
bool ResizeBuckets(int64_t num_buckets, HashTableCtx *ht_ctx)
Resize the hash table to 'num_buckets'. Returns false on OOM.
int64_t CurrentMemSize() const
TupleRow * GetRow(HtData &htdata, TupleRow *row) const
Return the TupleRow pointed by 'htdata'.
int64_t num_buckets_with_duplicates_
DuplicateNode *IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx)
int64_t num_hash_collisions_
void IR_ALWAYS_INLINE PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash)
int64_t num_buckets_
Total number of buckets (filled and empty).
HashTableCtx(const std::vector< ExprContext * > &build_expr_ctxs, const std::vector< ExprContext * > &probe_expr_ctxs, bool stores_nulls, bool finds_nulls, int32_t initial_seed, int max_levels, int num_build_tuples)
int node_remaining_current_page_
Number of nodes left in the current page.
uint8_t * expr_value_null_bits_
MemPool * data_page_pool_
Only used for tests to allocate data pages instead of the block mgr.
uint32_t IR_NO_INLINE HashCurrentRow()
Iterator FirstUnmatched(HashTableCtx *ctx)
std::vector< uint32_t > seeds_
The seeds to use for hashing. Indexed by the level.
TupleRow * row_
Scratch buffer to generate rows on the fly.
static int64_t NextPowerOfTwo(int64_t v)
int64_t num_buckets() const
Returns the number of buckets.
DuplicateNode * next_node_
Next duplicate node to insert. Vaild when node_remaining_current_page_ > 0.
bool IR_NO_INLINE EvalProbeRow(TupleRow *row)
int64_t total_data_page_size_
Byte size of all buffers in data_pages_.
Iterator IR_ALWAYS_INLINE Find(HashTableCtx *ht_ctx, uint32_t hash)
HashTable(RuntimeState *state, BufferedBlockMgr::Client *client, int num_build_tuples, BufferedTupleStream *tuple_stream, int64_t max_num_buckets, int64_t initial_num_buckets=1024)
uint32_t HashVariableLenRow()
void NextFilledBucket(int64_t *bucket_idx, DuplicateNode **node)
bool last_expr_value_null(int expr_idx) const
Returns if the expr at 'expr_idx' evaluated to NULL for the last row.
static int64_t EstimateSize(int64_t num_rows)
int64_t IR_ALWAYS_INLINE Probe(Bucket *buckets, int64_t num_buckets, HashTableCtx *ht_ctx, uint32_t hash, bool *found)
There are wrappers of this function that perform the Find and Insert logic.
double load_factor() const
Returns the load factor (the number of non-empty buckets)
void Close()
Call to cleanup any resources.
BufferedTupleStream * tuple_stream_
void Close()
Call to cleanup any resources. Must be called once.
const int64_t max_num_buckets_
bool IR_ALWAYS_INLINE EvalAndHashBuild(TupleRow *row, uint32_t *hash)
DuplicateNode * node_
Pointer to the current duplicate node.
TupleRow * GetRow() const
Iterator(HashTable *table, TupleRow *row, int bucket_idx, DuplicateNode *node, uint32_t hash)
bool IR_NO_INLINE EvalBuildRow(TupleRow *row)
bool IR_ALWAYS_INLINE Insert(HashTableCtx *ht_ctx, const BufferedTupleStream::RowIdx &idx, TupleRow *row, uint32_t hash)
int64_t size() const
Returns number of elements inserted in the hash table.
static uint64_t MurmurHash2_64(const void *input, int len, uint64_t seed)
Murmur2 hash implementation returning 64-bit hashes.
std::string DebugString(bool skip_empty, bool show_match, const RowDescriptor *build_desc)
bool IR_ALWAYS_INLINE EvalAndHashProbe(TupleRow *row, uint32_t *hash)
int64_t num_failed_probes_
Number of probes that failed and had to fall back to linear probing without cap.