Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
string-search-benchmark.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <iostream>
18 #include "util/benchmark.h"
19 #include "util/cpu-info.h"
20 #include "runtime/string-search.h"
22 
23 #include "common/names.h"
24 
25 using namespace impala;
26 
27 // Benchmark tests for string search. This is probably a science of its own
28 // but we'll run some simple tests. (We can't use libc strstr because our
29 // strings are not null-terminated and also, it's not that fast).
30 // Results:
31 // String Search: Function Rate Comparison
32 // ----------------------------------------------------------------------
33 // Python 81.93 1X
34 // LibC 262.2 3.201X
35 // Null Terminated SSE 212.4 2.592X
36 // Non-null Terminated SSE 139.6 1.704X
37 
38 struct TestData {
39  vector<StringValue> needles;
40  vector<StringValue> haystacks;
41  int matches;
42  vector<string> strings; // Memory store for allocated strings
43 };
44 
45 void TestLibc(int batch_size, void* d) {
46  TestData* data = reinterpret_cast<TestData*>(d);
47  for (int i = 0; i < batch_size; ++i) {
48  data->matches = 0;
49  for (int n = 0; n < data->needles.size(); ++n) {
50  char* needle = data->needles[n].ptr;
51  for (int iters = 0; iters < 10; ++iters) {
52  for (int h = 0; h < data->haystacks.size(); ++h) {
53  if (strstr(data->haystacks[h].ptr, needle) != NULL) {
54  ++data->matches;
55  }
56  }
57  }
58  }
59  }
60 }
61 
62 void TestPython(int batch_size, void* d) {
63  TestData* data = reinterpret_cast<TestData*>(d);
64  for (int i = 0; i < batch_size; ++i) {
65  data->matches = 0;
66  for (int n = 0; n < data->needles.size(); ++n) {
67  StringSearch needle(&(data->needles[n]));
68  for (int iters = 0; iters < 10; ++iters) {
69  for (int h = 0; h < data->haystacks.size(); ++h) {
70  if (needle.Search(&(data->haystacks[h])) != -1) {
71  ++data->matches;
72  }
73  }
74  }
75  }
76  }
77 }
78 
79 void TestImpalaNullTerminated(int batch_size, void* d) {
80  TestData* data = reinterpret_cast<TestData*>(d);
81  for (int i = 0; i < batch_size; ++i) {
82  data->matches = 0;
83  for (int n = 0; n < data->needles.size(); ++n) {
84  // Exercise the null-terminated path.
85  StringSearchSSE needle(data->needles[n].ptr);
86  for (int iters = 0; iters < 10; ++iters) {
87  for (int h = 0; h < data->haystacks.size(); ++h) {
88  if (needle.Search(data->haystacks[h]) != -1) {
89  ++data->matches;
90  }
91  }
92  }
93  }
94  }
95 }
96 
97 void TestImpalaNonNullTerminated(int batch_size, void* d) {
98  TestData* data = reinterpret_cast<TestData*>(d);
99  for (int i = 0; i < batch_size; ++i) {
100  data->matches = 0;
101  for (int n = 0; n < data->needles.size(); ++n) {
102  // Exercise the non null-terminated path.
103  StringSearchSSE needle(&(data->needles[n]));
104  for (int iters = 0; iters < 10; ++iters) {
105  for (int h = 0; h < data->haystacks.size(); ++h) {
106  if (needle.Search(data->haystacks[h]) != -1) {
107  ++data->matches;
108  }
109  }
110  }
111  }
112  }
113 }
114 
115 // This should be tailored to represent the data we expect.
116 // Some algos are better suited for longer vs shorter needles, finding the string vs.
117 // not finding the string, etc.
118 // For now, pick data that mimicks the grep data set.
119 void InitTestData(TestData* data) {
120  vector<string> needles;
121  vector<string> haystacks;
122 
123  needles.push_back("xyz");
124 
125  // From a random password generator:
126  // https://www.grc.com/passwords.htm
127  haystacks.push_back("d1h2POju5AG3zCxiBNKBxdzLdW7VfPkgvHDRLK2o78pGu4eywZ5mmmsV1LscqIH");
128  haystacks.push_back("Xv7rmDx3ksECRPg94mPLI1gQjRIviZNINZwbDsvcyNAf7q4pzGQ6JHUqYHxFk0x");
129  haystacks.push_back("cI8f2V9cYjmE70ruUeST9Arty7B8iKLiF4DaXhY4nF7OmXZOKRBVPd5cmjxiDRw");
130  haystacks.push_back("3cb7QCWJ7QzNSwaXQQIByo7is1b3YPTx9u0ZmRV0IJEKJE2z2oMaKgzeA1ny7hx");
131  haystacks.push_back("teRFhdOc5HrGvXUo1yGZSUc46hCVLjVzGaK37vfGAsuCpiji2R4YQ7Y4FN2O3lu");
132  haystacks.push_back("xecRm4HYLTfhQgPQHTIhWQKNGJcAUYCziDJQlbAitVHC9ClbbcEkVrXsFSpVrx9");
133  haystacks.push_back("M1k4su3sxpcBl8SQgnF6LOD1fNjTrQQW84vLofF39IbY4gTVpsHIlon8mbJ74D7");
134  haystacks.push_back("dMROjEop61WoHNSegkiw7o1uIM5xgccf9SRhIEOnq9CoM7e0KatpjAmGLlANggK");
135  haystacks.push_back("OIPM3iTT40sSkCTfDOjK4BSHZvV0xazTXarwYhyQEu5aGn7XeCABezJhbO1X8rr");
136  haystacks.push_back("0YyLBoITUS7Mxfo3o44j4eEdWqwx6Um5EkUzIo7c0ETYfnkcaHsYcpT5Xn3coyE");
137  haystacks.push_back("LzOFXp8ekVkeBy6Br9rRefzDDA30SLkvV3gGochoKGUkS0oj9fANBfOSBeyNypm");
138  haystacks.push_back("uYDMZ4SNIh6XwBQcGCS3AQ5h5c2iyaPS70r4emPheC3hQhM84zwsMovByLNecTW");
139  haystacks.push_back("EFY8GtkUkXRJQr3ZjANKhUFXTmnA2S4V6hTKeeskQhlUA65lTCO0PpyZxL8kyVE");
140  haystacks.push_back("fgU8K01JtTn3UldvO9jAywDEN4HW5PwCFEJ4pwSwHfec6HsBN5rzzhQ2Tn4QDxE");
141  haystacks.push_back("TMLE6UzW3CUUogfRFjZooP1eCZn05bI6NsWL67fOmwYhZND1nuAASG0ZhGFBdZH");
142 
143  // Contains the key in the beginning/middle/end
144  haystacks.push_back("of9VxyzzWca0GBs8CrFsDIvi8yWCkRTRRL7X4TlMhPfRAWPOZfRVwybJoYQhslJ");
145  haystacks.push_back("iEO7zz7wWm4LmvumisOwPRUzv9xyzhI7omKDlLYM7Vao9pggACVgDJBhAzDkcJp");
146  haystacks.push_back("tQJzS0SiXRElwq1QEBhy0gGGii9xAcQbTIrt4QcGMViyOx4lfZ73zZ5XHxyzk1V");
147  haystacks.push_back("tQJzS0SiXRElwq1QEBhy0gGGii9xAcQbTIrt4QcGMViyOx4lfZ73zZ5XHyzaxyz");
148 
149  for (int i = 0; i < needles.size(); ++i) {
150  StringValue v(const_cast<char*>(needles[i].c_str()), needles[i].size());
151  data->needles.push_back(v);
152  data->strings.push_back(needles[i]);
153  }
154  for (int i = 0; i < haystacks.size(); ++i) {
155  StringValue v(const_cast<char*>(haystacks[i].c_str()), haystacks[i].size());
156  data->haystacks.push_back(v);
157  data->strings.push_back(haystacks[i]);
158  }
159 }
160 
161 int main(int argc, char **argv) {
162  CpuInfo::Init();
163  cout << Benchmark::GetMachineInfo() << endl;
164 
165  TestData data;
166  InitTestData(&data);
167 
168  Benchmark suite("String Search");
169  suite.AddBenchmark("Python", TestPython, &data);
170  suite.AddBenchmark("LibC", TestLibc, &data);
171  suite.AddBenchmark("Null Terminated SSE", TestImpalaNullTerminated, &data);
172  suite.AddBenchmark("Non-null Terminated SSE", TestImpalaNonNullTerminated, &data);
173  cout << suite.Measure();
174 
175  return 0;
176 }
int AddBenchmark(const std::string &name, BenchmarkFunction fn, void *args, int baseline_idx=0)
Definition: benchmark.cc:70
static std::string GetMachineInfo()
Output machine/build configuration as a string.
Definition: benchmark.cc:124
int Search(const StringValue *str) const
void TestImpalaNullTerminated(int batch_size, void *d)
vector< StringValue > needles
vector< StringValue > haystacks
std::string Measure()
Runs all the benchmarks and returns the result in a formatted string.
Definition: benchmark.cc:83
int Search(const StringValue &haystack) const
void TestImpalaNonNullTerminated(int batch_size, void *d)
void TestPython(int batch_size, void *d)
int main(int argc, char **argv)
void TestLibc(int batch_size, void *d)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75
vector< string > strings
void InitTestData(TestData *data)