Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
cache-hash-test.cc
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <iostream>
4 
5 #include <glog/logging.h>
6 #include <vector>
7 
8 #include "cache-hash-table.h"
9 #include "cache-hash-table.inline.h"
10 #include "standard-hash-table.h"
11 #include "standard-hash-table.inline.h"
12 #include "tuple-types.h"
13 #include "runtime/mem-pool.h"
14 #include "util/cpu-info.h"
15 #include "util/debug-util.h"
16 #include "util/pretty-printer.h"
17 #include "util/hash-util.h"
18 #include "util/runtime-profile.h"
19 #include "util/stopwatch.h"
20 
21 using namespace impala;
22 
23 // Very basic hash aggregation prototype and test
24 // TODO: Generalize beyond hash aggregation, beyond hashing on the one column, etc.
25 
26 CacheHashTable::CacheHashTable() {
27  num_content_allocated_ = 0;
28 }
29 
30 void CacheHashTable::BucketSizeDistribution() {
31  std::vector<int> bucket_size;
32  for (int i = 0; i < BUCKETS; ++i) {
33  int size = buckets_[i].count;
34  if (size >= bucket_size.size()) {
35  // grow bucket_size to fit this size
36  bucket_size.resize(size + 1, 0);
37  }
38  ++bucket_size[size];
39  }
40 
41  std::stringstream distr;
42  for (int i = 0; i < bucket_size.size(); ++i) {
43  distr << i << ": " << bucket_size[i] << "\n";
44  }
45  LOG(INFO) << "Bucket Size Distribution\n" << distr.str();
46 }
47 
48 
49 // Update ht, which is doing a COUNT(*) GROUP BY id,
50 // by having it process the new tuple probe.
51 // Templatized on the type of hash table so we can reuse code without virtual calls.
52 template<typename T>
53 inline void Process(T* ht, const ProbeTuple* probe) {
54  BuildTuple *existing = ht->Find(probe);
55  if (existing != NULL) {
56  ++existing->count;
57  } else {
58  BuildTuple build;
59  build.id = probe->id;
60  build.count = 1;
61  ht->Insert(&build);
62  }
63 }
64 
65 // Test ht by aggregating input, which is an array of num_tuples ProbeTuples
66 // Templatized on the type of hash table so we can reuse code without virtual calls.
67 template<typename T>
68 uint64_t Test(T* ht, const ProbeTuple* input, uint64_t num_tuples)
69 {
70  StopWatch time;
71  time.Start();
72  for (int i = 0; i < num_tuples; ++i) {
73  Process<T>(ht, &input[i]);
74  }
75  time.Stop();
76  return time.Ticks();
77 }
78 
79 int main(int argc, char **argv) {
80  google::InitGoogleLogging(argv[0]);
81  CpuInfo::Init();
82 
83  srand(time(NULL));
84 
85  const int NUM_TUPLES = 100000000; //10^8
86  const int NUM_BUILD_TUPLES = 4 * CacheHashTable::MaxBuildTuples() / 10;
87 
88  CacheHashTable cache_ht;
89  StandardHashTable std_ht;
90 
91  ProbeTuple* input = GenTuples(NUM_TUPLES, NUM_BUILD_TUPLES);
92  uint64_t cache_time = Test<CacheHashTable>(&cache_ht, input, NUM_TUPLES);
93  LOG(ERROR) << "Cache-aware time: "
94  << PrettyPrinter::Print(cache_time, TUnit::CPU_TICKS);
95  uint64_t std_time = Test<StandardHashTable>(&std_ht, input, NUM_TUPLES);
96 
97  LOG(ERROR) << "Bucket-chained time: "
98  << PrettyPrinter::Print(std_time, TUnit::CPU_TICKS);
99  return 0;
100 }
void Process(T *ht, const ProbeTuple *probe)
static std::string Print(bool value, TUnit::type ignored, bool verbose=false)
int main(int argc, char **argv)
const int NUM_TUPLES
uint64_t Test(T *ht, const ProbeTuple *input, uint64_t num_tuples)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75