Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
cache-hash-test.cc
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <iostream>
4 
5 #include <vector>
6 
7 #include "cache-hash-table.h"
8 #include "cache-hash-table.inline.h"
9 #include "standard-hash-table.h"
10 #include "standard-hash-table.inline.h"
11 #include "tuple-types.h"
12 #include "runtime/mem-pool.h"
13 #include "util/cpu-info.h"
14 #include "util/debug-util.h"
15 #include "util/pretty-printer.h"
16 #include "util/hash-util.h"
17 #include "util/runtime-profile.h"
18 #include "util/stopwatch.h"
19 
20 using namespace impala;
21 
22 // Very basic hash aggregation prototype and test
23 // TODO: Generalize beyond hash aggregation, beyond hashing on the one column, etc.
24 
25 CacheHashTable::CacheHashTable() {
26  num_content_allocated_ = 0;
27 }
28 
29 void CacheHashTable::BucketSizeDistribution() {
30  std::vector<int> bucket_size;
31  for (int i = 0; i < BUCKETS; ++i) {
32  int size = buckets_[i].count;
33  if (size >= bucket_size.size()) {
34  // grow bucket_size to fit this size
35  bucket_size.resize(size + 1, 0);
36  }
37  ++bucket_size[size];
38  }
39 
40  std::stringstream distr;
41  for (int i = 0; i < bucket_size.size(); ++i) {
42  distr << i << ": " << bucket_size[i] << "\n";
43  }
44  LOG(INFO) << "Bucket Size Distribution\n" << distr.str();
45 }
46 
47 
48 // Update ht, which is doing a COUNT(*) GROUP BY id,
49 // by having it process the new tuple probe.
50 // Templatized on the type of hash table so we can reuse code without virtual calls.
51 template<typename T>
52 inline void Process(T* ht, const ProbeTuple* probe) {
53  BuildTuple *existing = ht->Find(probe);
54  if (existing != NULL) {
55  ++existing->count;
56  } else {
57  BuildTuple build;
58  build.id = probe->id;
59  build.count = 1;
60  ht->Insert(&build);
61  }
62 }
63 
64 // Test ht by aggregating input, which is an array of num_tuples ProbeTuples
65 // Templatized on the type of hash table so we can reuse code without virtual calls.
66 template<typename T>
67 uint64_t Test(T* ht, const ProbeTuple* input, uint64_t num_tuples) {
68  StopWatch time;
69  time.Start();
70  for (int i = 0; i < num_tuples; ++i) {
71  Process<T>(ht, &input[i]);
72  }
73  time.Stop();
74  return time.ElapsedTime();
75 }
76 
77 int main(int argc, char **argv) {
78  google::InitGoogleLogging(argv[0]);
79  CpuInfo::Init();
80 
81  srand(time(NULL));
82 
83  const int NUM_TUPLES = 100000000; //10^8
84  const int NUM_BUILD_TUPLES = 4 * CacheHashTable::MaxBuildTuples() / 10;
85 
86  CacheHashTable cache_ht;
87  StandardHashTable std_ht;
88 
89  ProbeTuple* input = GenTuples(NUM_TUPLES, NUM_BUILD_TUPLES);
90  uint64_t cache_time = Test<CacheHashTable>(&cache_ht, input, NUM_TUPLES);
91  LOG(ERROR) << "Cache-aware time: "
92  << PrettyPrinter::Print(cache_time, TUnit::CPU_TICKS);
93  uint64_t std_time = Test<StandardHashTable>(&std_ht, input, NUM_TUPLES);
94 
95  LOG(ERROR) << "Bucket-chained time: "
96  << PrettyPrinter::Print(std_time, TUnit::CPU_TICKS);
97  return 0;
98 }
uint64_t ElapsedTime() const
Returns time in cpu ticks.
Definition: stopwatch.h:50
void Process(T *ht, const ProbeTuple *probe)
static std::string Print(bool value, TUnit::type ignored, bool verbose=false)
int main(int argc, char **argv)
const int NUM_TUPLES
uint64_t Test(T *ht, const ProbeTuple *input, uint64_t num_tuples)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75