Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
string-benchmark.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <algorithm>
16 #include <stdlib.h>
17 #include <stdio.h>
18 #include <iostream>
19 #include "runtime/string-value.h"
20 #include "util/benchmark.h"
21 #include "util/cpu-info.h"
22 #include "util/hash-util.h"
23 
24 #include "common/names.h"
25 
26 using namespace impala;
27 
28 // Benchmark for testing internal representation of strings. This is prototype
29 // code and should eventually be merged into StringValue
30 
31 // In this case there are 10x as many short strings are long strings.
32 // Results:
33 // Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
34 // String Test: Function Rate Comparison
35 // ----------------------------------------------------------------------
36 // Normal Sequential 31.54 1X
37 // Compact Sequential 30.44 0.9651X
38 // Normal Random 16.09 0.51X
39 // Compact Random 23.13 0.7331X
40 
41 // This is a string representation for compact strings. If the string is
42 // sufficiently short (less than STORAGE_SIZE - 1 bytes), it will be stored
43 // inline. Otherwise, it will be stored as the len/ptr. This only supports
44 // strings up to 2^15 in length (only 2 bytes are used to store the len) and
45 // relies on the fact that the upper 16 bits of addresses are unused on x64.
46 template <int STORAGE_SIZE>
48  private:
49  union {
50  char bytes_[STORAGE_SIZE];
51  struct {
52  // The upper bit is used to encode whether this is stored inline or not.
53  // If the bit is set, it is inlined.
54  long is_inline_ : 1;
55  long inline_len_ : 7;
56  char inline_data_[STORAGE_SIZE - 1];
57  };
58  struct {
59  long dummy_ : 1; // Lines up with is_inline_
60  long indirect_len_ : 15;
61  long indirect_ptr_ : 48;
62  // Rest of STORAGE_SIZE is unused. This is a minimum of 8 bytes.
63  // TODO: we could try to adapt this to support longer lengths if STORAGE_SIZE is
64  // greater than 8.
65  };
66  };
67 
68  public:
69  CompactStringValue(const char* str) {
70  long len = strlen(str);
71  if (len < STORAGE_SIZE) {
72  memcpy(inline_data_, str, len);
73  inline_len_ = len;
74  is_inline_ = true;
75  } else {
76  indirect_ptr_ = reinterpret_cast<long>(str);
77  indirect_len_ = len;
78  is_inline_ = false;
79  }
80  }
81 
82  const char* ptr() const {
83  if (is_inline_) return inline_data_;
84  return reinterpret_cast<const char*>(indirect_ptr_);
85  }
86 
87  int len() const {
88  if (is_inline_) return inline_len_;
89  return indirect_len_;
90  }
91 };
92 
93 struct TestData {
94  vector<StringValue> normal_strings;
95  vector<CompactStringValue<8> > compact_strings;
96  vector<int> random_order;
97  uint32_t hash_normal;
98  uint32_t hash_compact;
99  vector<string> string_data;
100 };
101 
102 void TestNormalStringsSequential(int batch_size, void* d) {
103  TestData* data = reinterpret_cast<TestData*>(d);
104  for (int i = 0; i < batch_size; ++i) {
105  data->hash_normal = 0;
106  for (int j = 0; j < data->normal_strings.size(); ++j) {
107  const StringValue& str = data->normal_strings[j];
108  data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
109  }
110  }
111 }
112 
113 void TestCompactStringsSequential(int batch_size, void* d) {
114  TestData* data = reinterpret_cast<TestData*>(d);
115  for (int i = 0; i < batch_size; ++i) {
116  data->hash_compact = 0;
117  for (int j = 0; j < data->compact_strings.size(); ++j) {
118  const CompactStringValue<8>& str = data->compact_strings[j];
119  data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
120  }
121  }
122 }
123 
124 void TestNormalStringsRandom(int batch_size, void* d) {
125  TestData* data = reinterpret_cast<TestData*>(d);
126  for (int i = 0; i < batch_size; ++i) {
127  data->hash_normal = 0;
128  for (int j = 0; j < data->normal_strings.size(); ++j) {
129  const StringValue& str = data->normal_strings[data->random_order[j]];
130  data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
131  }
132  }
133 }
134 
135 void TestCompactStringsRandom(int batch_size, void* d) {
136  TestData* data = reinterpret_cast<TestData*>(d);
137  for (int i = 0; i < batch_size; ++i) {
138  data->hash_compact = 0;
139  for (int j = 0; j < data->compact_strings.size(); ++j) {
140  const CompactStringValue<8>& str = data->compact_strings[data->random_order[j]];
141  data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
142  }
143  }
144 }
145 
146 void AddTestString(TestData* data, const char* s) {
147  data->compact_strings.push_back(s);
148  data->normal_strings.push_back(StringValue(const_cast<char*>(s), strlen(s)));
149 }
150 
151 // This creates more short strings than long strings.
152 void InitTestData(TestData* data, int num_small_strings, int num_large_strings) {
153  for (int i = 0; i < num_small_strings; ++i) {
154  data->string_data.push_back("small");
155  }
156  for (int i = 0; i < num_large_strings; ++i) {
157  // We don't need this to be too large as to minimize the crc time. It just
158  // needs to be large enough to trigger then indirect string path.
159  data->string_data.push_back("large-large-large");
160  }
161 
162  for (int i = 0; i < num_small_strings; ++i) {
163  AddTestString(data, data->string_data[i].c_str());
164  }
165 
166  for (int i = 0; i < num_large_strings; ++i) {
167  AddTestString(data, data->string_data[i + num_large_strings].c_str());
168  }
169 
170  data->random_order.resize(data->normal_strings.size());
171  for (int i = 0; i < data->random_order.size(); ++i) {
172  data->random_order[i] = i;
173  }
174  random_shuffle(data->random_order.begin(), data->random_order.end());
175 }
176 
177 int main(int argc, char **argv) {
178  CpuInfo::Init();
179  cout << Benchmark::GetMachineInfo() << endl;
180 
181  TestData data;
182  InitTestData(&data, 10000, 100);
183 
184  Benchmark suite("String Test");
185  suite.AddBenchmark("Normal Sequential", TestNormalStringsSequential, &data);
186  suite.AddBenchmark("Compact Sequential", TestCompactStringsSequential, &data);
187  suite.AddBenchmark("Normal Random", TestNormalStringsRandom, &data);
188  suite.AddBenchmark("Compact Random", TestCompactStringsRandom, &data);
189  cout << suite.Measure();
190 
191  if (data.hash_normal != data.hash_compact) {
192  cout << "Uh oh - this is broken." << endl;
193  return 1;
194  }
195 
196  return 0;
197 }
int main(int argc, char **argv)
int AddBenchmark(const std::string &name, BenchmarkFunction fn, void *args, int baseline_idx=0)
Definition: benchmark.cc:70
vector< string > string_data
void InitTestData(TestData *data, int num_small_strings, int num_large_strings)
const char * ptr() const
static std::string GetMachineInfo()
Output machine/build configuration as a string.
Definition: benchmark.cc:124
vector< CompactStringValue< 8 > > compact_strings
void TestCompactStringsSequential(int batch_size, void *d)
void TestCompactStringsRandom(int batch_size, void *d)
std::string Measure()
Runs all the benchmarks and returns the result in a formatted string.
Definition: benchmark.cc:83
uint32_t hash_normal
vector< StringValue > normal_strings
void TestNormalStringsRandom(int batch_size, void *d)
uint32_t hash_compact
static uint32_t CrcHash(const void *data, int32_t bytes, uint32_t hash)
Definition: hash-util.h:37
void TestNormalStringsSequential(int batch_size, void *d)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75
vector< int > random_order
CompactStringValue(const char *str)
void AddTestString(TestData *data, const char *s)