15 #ifndef IMPALA_UTIL_DICT_ENCODING_H
16 #define IMPALA_UTIL_DICT_ENCODING_H
20 #include <boost/foreach.hpp>
21 #include <boost/scoped_ptr.hpp>
22 #include <boost/unordered_map.hpp>
56 virtual void WriteDict(uint8_t* buffer) = 0;
82 int WriteData(uint8_t* buffer,
int buffer_len);
113 int Put(
const T& value);
154 inline uint32_t
Hash(
const T& value)
const;
169 void SetData(uint8_t* buffer,
int buffer_len) {
170 DCHECK_GT(buffer_len, 0);
171 uint8_t bit_width = *buffer;
172 DCHECK_GE(bit_width, 0);
195 DictDecoder(uint8_t* dict_buffer,
int dict_len,
int fixed_len_size);
210 NodeIndex* bucket = &buckets_[
Hash(value) & (HASH_TABLE_SIZE - 1)];
213 while (i != Node::INVALID_INDEX) {
214 const Node* n = &nodes_[i];
217 buffered_indices_.push_back(i);
224 if (
UNLIKELY(i >= Node::INVALID_INDEX))
return -1;
225 buffered_indices_.push_back(i);
226 return AddToTable(value, bucket);
241 DCHECK_GT(encoded_value_size_, 0);
243 nodes_.push_back(
Node(value, *bucket));
244 *bucket = nodes_.size() - 1;
245 dict_encoded_size_ += encoded_value_size_;
246 return encoded_value_size_;
252 char* ptr_copy =
reinterpret_cast<char*
>(pool_->Allocate(value.
len));
253 memcpy(ptr_copy, value.
ptr, value.
len);
256 nodes_.push_back(
Node(sv, *bucket));
257 *bucket = nodes_.size() - 1;
259 dict_encoded_size_ += bytes_added;
265 DCHECK(data_decoder_.get() != NULL);
267 bool result = data_decoder_->Get(&index);
268 if (!result)
return false;
269 if (index >= dict_.size())
return false;
270 *value = dict_[index];
276 DCHECK(data_decoder_.get() != NULL);
278 bool result = data_decoder_->Get(&index);
279 if (!result)
return false;
280 if (index >= dict_.size())
return false;
283 uint8_t* addr =
reinterpret_cast<uint8_t*
>(&dict_[0]);
284 addr = addr + index *
sizeof(*value);
285 memcpy(value, addr,
sizeof(*value));
291 BOOST_FOREACH(
const Node& node, nodes_) {
304 if (!encoder.
Put(index))
return -1;
307 return 1 + encoder.
len();
312 int fixed_len_size) {
313 uint8_t* end = dict_buffer + dict_len;
314 while (dict_buffer < end) {
318 dict_.push_back(value);
uint16_t NodeIndex
Dictates an upper bound on the capacity of the hash table.
boost::scoped_ptr< RleDecoder > data_decoder_
uint32_t Hash(const T &value) const
Hash function for mapping a value to a bucket.
int encoded_value_size_
Size of each encoded dictionary value. -1 for variable-length types.
virtual ~DictDecoderBase()
MemPool * pool_
Pool to store StringValue data. Not owned.
int WriteData(uint8_t *buffer, int buffer_len)
static int ByteSize(const T &v)
Returns the byte size of 'v'.
virtual int num_entries() const
The number of entries in the dictionary.
std::vector< NodeIndex > buckets_
static int Encode(uint8_t *buffer, int fixed_len_size, const T &t)
T value
The dictionary value.
std::vector< Node > nodes_
int bit_width() const
The minimum bit width required to encode the currently buffered indices.
virtual ~DictEncoderBase()
virtual int num_entries() const =0
The number of entries in the dictionary.
virtual void WriteDict(uint8_t *buffer)
static uint32_t Hash(const void *data, int32_t bytes, uint32_t seed)
int dict_encoded_size_
The number of bytes needed to encode the dictionary.
static uint64_t Hash(const IntVal &v)
int EstimatedDataEncodedSize()
static int Decode(uint8_t *buffer, int fixed_len_size, T *v)
int AddToTable(const T &value, NodeIndex *bucket)
void ClearIndices()
Clears all the indices (but leaves the dictionary).
Node in the chained hash table.
DictDecoder(uint8_t *dict_buffer, int dict_len, int fixed_len_size)
virtual void WriteDict(uint8_t *buffer)=0
virtual int num_entries() const =0
DictEncoder(MemPool *pool, int encoded_value_size)
Node(const T &v, const NodeIndex &n)
void SetData(uint8_t *buffer, int buffer_len)
The rle encoded indices into the dictionary.
static int MaxBufferSize(int bit_width, int num_values)
Returns the maximum byte size it could take to encode 'num_values'.
std::vector< int > buffered_indices_
Indices that have not yet be written out by WriteData().
DictEncoderBase(MemPool *pool)
virtual int num_entries() const
Decoder class for RLE encoded data.
static int Log2(uint64_t x)
NodeIndex next
Index into nodes_ for the next Node in the chain. INVALID_INDEX indicates end.