Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
multiint-benchmark.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <iomanip>
16 #include <iostream>
17 #include <sstream>
18 
19 #include "util/benchmark.h"
20 #include "util/cpu-info.h"
21 
23 #include "common/names.h"
24 
25 // Benchmark to measure operations on different implementation of multi (i.e. > 8)
26 // byte integers.
27 // Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
28 // Add: Function Rate (iters/ms) Comparison
29 // ----------------------------------------------------------------------
30 // int128_CPP 1.87e+04 1X
31 // int128_Boost 2842 0.1519X
32 // int96_CPP 1289 0.06891X
33 // int64 2.381e+04 1.273X
34 // int128_Base1B 1.08e+04 0.5775X
35 // doubles 2.559e+04 1.368X
36 //
37 // Multiply: Function Rate (iters/ms) Comparison
38 // ----------------------------------------------------------------------
39 // int128_CPP 1.263e+04 1X
40 // int128_Boost 4418 0.3497X
41 // int96_CPP 1220 0.09656X
42 // int64 2.913e+04 2.306X
43 // doubles 2.282e+04 1.806X
44 //
45 // Divide: Function Rate (iters/ms) Comparison
46 // ----------------------------------------------------------------------
47 // int128_CPP 1696 1X
48 // int128_Boost 1112 0.6558X
49 // int96_CPP 561.7 0.3313X
50 // int64 2788 1.644X
51 // double 4565 2.692X
52 
53 
54 using std::max;
55 using std::numeric_limits;
56 using namespace impala;
57 
58 // Multi byte ints encoded using a big base.
59 // The decimal value is:
60 // data[0] + data[1] * BASE^1 + data[2] * BASE^2 + ...
61 // This is not a full implementation of the functionality required.
62 template<uint32_t BASE>
63 struct BaseInt128 {
64  uint32_t data[4]; // least significant first.
65 
67  memset(data, 0, sizeof(data));
68  int idx = 0;
69  while (v > 0) {
70  data[idx++] = v % BASE;
71  v /= BASE;
72  }
73  }
74 
76  uint64_t r0 = data[0] + rhs.data[0];
77  uint64_t r1 = data[1] + rhs.data[1];
78  uint64_t r2 = data[2] + rhs.data[2];
79  uint64_t r3 = data[3] + rhs.data[3];
80 
81  this->data[0] = static_cast<uint32_t>(r0);
82  if (r0 > numeric_limits<uint32_t>::max()) ++r1;
83  this->data[1] = static_cast<uint32_t>(r1);
84  if (r1 > numeric_limits<uint32_t>::max()) ++r2;
85  this->data[2] = static_cast<uint32_t>(r2);
86  if (r2 > numeric_limits<uint32_t>::max()) ++r3;
87  this->data[3] = static_cast<uint32_t>(r3);
88  return *this;
89  }
90 
91  string Print() const {
92  stringstream ss;
93  bool print_padded = false;
94  for (int i = 3; i >= 0; --i) {
95  if (data[i] == 0 && !print_padded) continue;
96  if (print_padded) {
97  ss << setw(9) << data[i];
98  } else {
99  ss << data[i];
100  }
101  }
102  ss << data[0];
103  return ss.str();
104  }
105 };
106 
107 // Prototype for int96 by doing the arithmetic as casts to int128. Minimal
108 // implementation to run benchmark.
109 struct int96 {
110  public:
111  int96() {
112  }
113 
114  int96(__int128_t t) {
115  memcpy(data, reinterpret_cast<uint8_t*>(&t), 12);
116  }
117 
118  __int128_t to_int128() const {
119  __int128_t r;
120  // Fill in upper bits and bit shift down to sign extend
121  memcpy(reinterpret_cast<uint8_t*>(&r) + 4, data, 12);
122  r >>= 4;
123  return r;
124  }
125 
126  int96& operator+=(const int96& v) {
127  __int128_t x = to_int128();
128  __int128_t y = v.to_int128();
129  __int128_t r = x + y;
130  *this = int96(r);
131  return *this;
132  }
133 
134  int96& operator*=(const int96& v) {
135  __int128_t x = to_int128();
136  __int128_t y = v.to_int128();
137  __int128_t r = x * y;
138  *this = int96(r);
139  return *this;
140  }
141 
142  int96 operator/(const int96& v) const {
143  __int128_t x = to_int128();
144  __int128_t y = v.to_int128();
145  __int128_t r = x / y;
146  return int96(r);
147  }
148 
149  int96 operator-() const {
150  return int96(-to_int128());
151  }
152 
153  private:
154  // First (little endian) 12 bytes of int128_t
155  uint8_t data[12];
156 };
157 
159 
160 struct TestData {
161  // multi precision ints implemented with the boost library.
162  vector<boost::multiprecision::int128_t> boost_add_ints;
163  vector<boost::multiprecision::int128_t> boost_mult_ints;
165 
166  // multi precision ints as defined by the c++ extension
167  vector<__int128_t> cpp_add_ints;
168  vector<__int128_t> cpp_mult_ints;
169  __int128_t cpp_result;
170 
171  vector<int96> cpp96_add_ints;
172  vector<int96> cpp96_mult_ints;
174 
175  vector<int64_t> int64_ints;
176  int64_t int64_result;
177 
178  vector<double> doubles;
180 
181  vector<Base1BInt128> base1b_ints;
183 };
184 
185 // Initialize test data. 1/4 will be negative. 1/2 will require more than 8 bytes.
186 void InitTestData(TestData* data, int n) {
187  for (int i = 0; i < n; ++i) {
188  data->boost_add_ints.push_back(boost::multiprecision::int128_t(i + 1));
189  data->boost_mult_ints.push_back(boost::multiprecision::int128_t(i + 1));
190  data->cpp_add_ints.push_back(__int128_t(i + 1));
191  data->cpp_mult_ints.push_back(__int128_t(i + 1));
192  data->cpp96_add_ints.push_back(__int128_t(i + 1));
193  data->cpp96_mult_ints.push_back(__int128_t(i + 1));
194 
195  if (i % 2 == 0) {
196  data->boost_add_ints[i] *= numeric_limits<int64_t>::max();
197  data->cpp_add_ints[i] *= numeric_limits<int64_t>::max();
198  data->cpp96_add_ints[i] *= numeric_limits<int64_t>::max();
199  }
200  if (i % 4 == 0) {
201  data->boost_add_ints[i] = -data->boost_add_ints[i];
202  data->cpp_add_ints[i] = -data->cpp_add_ints[i];
203  data->cpp96_add_ints[i] = -data->cpp96_add_ints[i];
204 
205  data->boost_mult_ints[i] = -data->boost_mult_ints[i];
206  data->cpp_mult_ints[i] = -data->cpp_mult_ints[i];
207  data->cpp96_mult_ints[i] = -data->cpp96_mult_ints[i];
208  }
209 
210  data->int64_ints.push_back(i + 1);
211  data->base1b_ints.push_back(i + 1);
212  data->doubles.push_back(i + 1);
213  }
214 }
215 
216 #define TEST_ADD(NAME, RESULT, VALS)\
217  void NAME(int batch_size, void* d) {\
218  TestData* data = reinterpret_cast<TestData*>(d);\
219  for (int i = 0; i < batch_size; ++i) {\
220  data->RESULT = 0;\
221  for (int j = 0; j < data->VALS.size(); ++j) {\
222  data->RESULT += data->VALS[j];\
223  }\
224  }\
225  }
226 
227 #define TEST_MULTIPLY(NAME, RESULT, VALS)\
228  void NAME(int batch_size, void* d) {\
229  TestData* data = reinterpret_cast<TestData*>(d);\
230  for (int i = 0; i < batch_size; ++i) {\
231  data->RESULT = 1;\
232  for (int j = 0; j < data->VALS.size(); ++j) {\
233  data->RESULT *= data->VALS[j];\
234  }\
235  }\
236  }
237 
238 #define TEST_DIVIDE(NAME, RESULT, VALS)\
239  void NAME(int batch_size, void* d) {\
240  TestData* data = reinterpret_cast<TestData*>(d);\
241  for (int i = 0; i < batch_size; ++i) {\
242  data->RESULT = 0;\
243  for (int j = 0; j < data->VALS.size() - 1; ++j) {\
244  data->RESULT += data->VALS[j + 1] / data->VALS[j];\
245  }\
246  }\
247  }
248 
249 TEST_ADD(TestBoostAdd, boost_result, boost_add_ints);
250 TEST_ADD(TestCppAdd, cpp_result, cpp_add_ints);
251 TEST_ADD(TestCpp96Add, cpp96_result, cpp96_add_ints);
252 TEST_ADD(TestBaseBillionAdd, base1b_result, base1b_ints);
253 TEST_ADD(TestInt64Add, int64_result, int64_ints);
254 TEST_ADD(TestDoubleAdd, double_result, doubles);
255 
256 TEST_MULTIPLY(TestBoostMultiply, boost_result, boost_mult_ints);
257 TEST_MULTIPLY(TestCppMultiply, cpp_result, cpp_mult_ints);
258 TEST_MULTIPLY(TestCpp96Multiply, cpp96_result, cpp96_mult_ints);
259 TEST_MULTIPLY(TestInt64Multiply, int64_result, int64_ints);
260 TEST_MULTIPLY(TestDoubleMultiply, double_result, doubles);
261 
262 TEST_DIVIDE(TestBoostDivide, boost_result, boost_mult_ints);
263 TEST_DIVIDE(TestCppDivide, cpp_result, cpp_mult_ints);
264 TEST_DIVIDE(TestCpp96Divide, cpp96_result, cpp96_mult_ints);
265 TEST_DIVIDE(TestInt64Divide, int64_result, int64_ints);
266 TEST_DIVIDE(TestDoubleDivide, double_result, doubles);
267 
268 int main(int argc, char** argv) {
269  CpuInfo::Init();
270  cout << Benchmark::GetMachineInfo() << endl;
271 
272  TestData data;
273  InitTestData(&data, 38); // 38! doesn't overflow int128.
274 
275  Benchmark add_suite("Add");
276  add_suite.AddBenchmark("int128_CPP", TestCppAdd, &data);
277  add_suite.AddBenchmark("int128_Boost", TestBoostAdd, &data);
278  add_suite.AddBenchmark("int96_CPP", TestCpp96Add, &data);
279  add_suite.AddBenchmark("int64", TestInt64Add, &data);
280  add_suite.AddBenchmark("int128_Base1B", TestBaseBillionAdd, &data);
281  add_suite.AddBenchmark("doubles", TestDoubleAdd, &data);
282  cout << add_suite.Measure() << endl;
283 
284  Benchmark multiply_suite("Multiply");
285  multiply_suite.AddBenchmark("int128_CPP", TestCppMultiply, &data);
286  multiply_suite.AddBenchmark("int128_Boost", TestBoostMultiply, &data);
287  multiply_suite.AddBenchmark("int96_CPP", TestCpp96Multiply, &data);
288  multiply_suite.AddBenchmark("int64", TestInt64Multiply, &data);
289  multiply_suite.AddBenchmark("doubles", TestDoubleMultiply, &data);
290  cout << multiply_suite.Measure() << endl;
291 
292  Benchmark divide_suite("Divide");
293  divide_suite.AddBenchmark("int128_CPP", TestCppDivide, &data);
294  divide_suite.AddBenchmark("int128_Boost", TestBoostDivide, &data);
295  divide_suite.AddBenchmark("int96_CPP", TestCpp96Divide, &data);
296  divide_suite.AddBenchmark("int64", TestInt64Divide, &data);
297  divide_suite.AddBenchmark("double", TestDoubleDivide, &data);
298  cout << divide_suite.Measure() << endl;
299 
300  return 0;
301 }
BaseInt128(uint64_t v=0)
int AddBenchmark(const std::string &name, BenchmarkFunction fn, void *args, int baseline_idx=0)
Definition: benchmark.cc:70
Base1BInt128 base1b_result
__int128_t cpp_result
#define TEST_DIVIDE(NAME, RESULT, VALS)
vector< int64_t > int64_ints
vector< Base1BInt128 > base1b_ints
static std::string GetMachineInfo()
Output machine/build configuration as a string.
Definition: benchmark.cc:124
__int128_t to_int128() const
vector< __int128_t > cpp_add_ints
std::string Measure()
Runs all the benchmarks and returns the result in a formatted string.
Definition: benchmark.cc:83
int96(__int128_t t)
double double_result
vector< int96 > cpp96_add_ints
int64_t int64_result
vector< double > doubles
int96 operator-() const
boost::multiprecision::int128_t boost_result
uint32_t data[4]
#define TEST_MULTIPLY(NAME, RESULT, VALS)
int main(int argc, char **argv)
string Print() const
void InitTestData(TestData *data, int n)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75
BaseInt128 & operator+=(const BaseInt128 &rhs)
vector< __int128_t > cpp_mult_ints
vector< boost::multiprecision::int128_t > boost_add_ints
vector< boost::multiprecision::int128_t > boost_mult_ints
vector< int96 > cpp96_mult_ints
int96 & operator*=(const int96 &v)
int96 & operator+=(const int96 &v)
int96 operator/(const int96 &v) const
BaseInt128< 1000000000 > Base1BInt128
#define TEST_ADD(NAME, RESULT, VALS)
__int128_t int128_t
We use the c++ int128_t type. This is stored using 16 bytes and very performant.