Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
cache-hash-test.cc
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <iostream>
4 
5 #include <glog/logging.h>
6 #include <vector>
7 
8 #include "cache-hash-table.h"
9 #include "cache-hash-table.inline.h"
10 #include "standard-hash-table.h"
11 #include "standard-hash-table.inline.h"
12 #include "tuple-types.h"
13 #include "runtime/mem-pool.h"
14 #include "util/cpu-info.h"
15 #include "util/debug-util.h"
16 #include "util/hash-util.h"
17 #include "util/runtime-profile.h"
18 #include "util/stopwatch.h"
19 
20 using namespace impala;
21 
22 // Very basic hash aggregation prototype and test
23 // TODO: Generalize beyond hash aggregation, beyond hashing on the one column, etc.
24 
25 CacheHashTable::CacheHashTable() {
26  num_content_allocated_ = 0;
27 }
28 
29 void CacheHashTable::BucketSizeDistribution() {
30  std::vector<int> bucket_size;
31  for (int i = 0; i < BUCKETS; ++i) {
32  int size = buckets_[i].count;
33  if (size >= bucket_size.size()) {
34  // grow bucket_size to fit this size
35  bucket_size.resize(size + 1, 0);
36  }
37  ++bucket_size[size];
38  }
39 
40  std::stringstream distr;
41  for (int i = 0; i < bucket_size.size(); ++i) {
42  distr << i << ": " << bucket_size[i] << "\n";
43  }
44  LOG(INFO) << "Bucket Size Distribution\n" << distr.str();
45 }
46 
47 
48 // Update ht, which is doing a COUNT(*) GROUP BY id,
49 // by having it process the new tuple probe.
50 // Templatized on the type of hash table so we can reuse code without virtual calls.
51 template<typename T>
52 inline void Process(T* ht, const ProbeTuple* probe) {
53  BuildTuple *existing = ht->Find(probe);
54  if (existing != NULL) {
55  ++existing->count;
56  } else {
57  BuildTuple build;
58  build.id = probe->id;
59  build.count = 1;
60  ht->Insert(&build);
61  }
62 }
63 
64 // Test ht by aggregating input, which is an array of num_tuples ProbeTuples
65 // Templatized on the type of hash table so we can reuse code without virtual calls.
66 template<typename T>
67 uint64_t Test(T* ht, const ProbeTuple* input, uint64_t num_tuples)
68 {
69  StopWatch time;
70  time.Start();
71  for (int i = 0; i < num_tuples; ++i) {
72  Process<T>(ht, &input[i]);
73  }
74  time.Stop();
75  return time.Ticks();
76 }
77 
78 int main(int argc, char **argv) {
79  google::InitGoogleLogging(argv[0]);
80  CpuInfo::Init();
81 
82  srand(time(NULL));
83 
84  const int NUM_TUPLES = 100000000; //10^8
85  const int NUM_BUILD_TUPLES = 4 * CacheHashTable::MaxBuildTuples() / 10;
86 
87  CacheHashTable cache_ht;
88  StandardHashTable std_ht;
89 
90  ProbeTuple* input = GenTuples(NUM_TUPLES, NUM_BUILD_TUPLES);
91  uint64_t cache_time = Test<CacheHashTable>(&cache_ht, input, NUM_TUPLES);
92  LOG(ERROR) << "Cache-aware time: "
93  << PrettyPrinter::Print(cache_time, TUnit::CPU_TICKS);
94  uint64_t std_time = Test<StandardHashTable>(&std_ht, input, NUM_TUPLES);
95 
96  LOG(ERROR) << "Bucket-chained time: "
97  << PrettyPrinter::Print(std_time, TUnit::CPU_TICKS);
98  return 0;
99 }
void Process(T *ht, const ProbeTuple *probe)
static std::string Print(bool value, TUnit::type ignored, bool verbose=false)
int main(int argc, char **argv)
const int NUM_TUPLES
uint64_t Test(T *ht, const ProbeTuple *input, uint64_t num_tuples)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75