Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
uda-test.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <iostream>
16 #include <gtest/gtest.h>
17 
18 #include "common/logging.h"
19 #include "udf/uda-test-harness.h"
20 #include "testutil/test-udas.h"
21 
22 #include "common/names.h"
23 
24 using std::min;
25 using namespace impala;
26 using namespace impala_udf;
27 
28 
29 //-------------------------------- Count ------------------------------------
30 // Example of implementing Count(int_col).
31 // The input type is: int
32 // The intermediate type is bigint
33 // the return type is bigint
34 void CountInit(FunctionContext* context, BigIntVal* val) {
35  val->is_null = false;
36  val->val = 0;
37 }
38 
39 void CountUpdate(FunctionContext* context, const IntVal& input, BigIntVal* val) {
40  // BigIntVal is the same ptr as what was passed to CountInit
41  if (input.is_null) return;
42  ++val->val;
43 }
44 
45 void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) {
46  dst->val += src.val;
47 }
48 
50  return val;
51 }
52 
53 //-------------------------------- Count(...) ------------------------------------
54 // Example of implementing Count(...)
55 // The input type is: multiple ints
56 // The intermediate type is bigint
57 // the return type is bigint
58 void Count2Update(FunctionContext* context, const IntVal& input1, const IntVal& input2,
59  BigIntVal* val) {
60  val->val += (!input1.is_null + !input2.is_null);
61 }
62 void Count3Update(FunctionContext* context, const IntVal& input1, const IntVal& input2,
63  const IntVal& input3, BigIntVal* val) {
64  val->val += (!input1.is_null + !input2.is_null + !input3.is_null);
65 }
66 void Count4Update(FunctionContext* context, const IntVal& input1, const IntVal& input2,
67  const IntVal& input3, const IntVal& input4, BigIntVal* val) {
68  val->val += (!input1.is_null + !input2.is_null + !input3.is_null + !input4.is_null);
69 }
70 
71 //-------------------------------- Min(String) ------------------------------------
72 // Example of implementing MIN for strings.
73 // The input type is: STRING
74 // The intermediate type is BufferVal
75 // the return type is STRING
76 // This is a little more sophisticated since the result buffers are reused (it grows
77 // to the longest result string).
78 struct MinState {
79  uint8_t* value;
80  int len;
82 
83  void Set(FunctionContext* context, const StringVal& val) {
84  if (buffer_len < val.len) {
85  context->Free(value);
86  value = context->Allocate(val.len);
87  buffer_len = val.len;
88  }
89  memcpy(value, val.ptr, val.len);
90  len = val.len;
91  }
92 };
93 
94 // Initialize the MinState scratch space
95 void MinInit(FunctionContext* context, BufferVal* val) {
96  MinState* state = reinterpret_cast<MinState*>(*val);
97  state->value = NULL;
98  state->buffer_len = 0;
99 }
100 
101 // Update the min value, comparing with the current value in MinState
102 void MinUpdate(FunctionContext* context, const StringVal& input, BufferVal* val) {
103  if (input.is_null) return;
104  MinState* state = reinterpret_cast<MinState*>(*val);
105  if (state->value == NULL) {
106  state->Set(context, input);
107  return;
108  }
109  int cmp = memcmp(input.ptr, state->value, ::min(input.len, state->len));
110  if (cmp < 0 || (cmp == 0 && input.len < state->len)) {
111  state->Set(context, input);
112  }
113 }
114 
115 // Serialize the state into the min string
116 const BufferVal MinSerialize(FunctionContext* context, const BufferVal& intermediate) {
117  MinState* state = reinterpret_cast<MinState*>(intermediate);
118  if (state->value == NULL) return intermediate;
119  // Hack to persist the intermediate state's value without leaking.
120  // TODO: revisit BufferVal and design a better way to do this
121  StringVal copy_buffer(context, state->len);
122  memcpy(copy_buffer.ptr, state->value, state->len);
123  context->Free(state->value);
124  state->value = copy_buffer.ptr;
125  return intermediate;
126 }
127 
128 // Merge is the same as Update since the serialized format is the raw input format
129 void MinMerge(FunctionContext* context, const BufferVal& src, BufferVal* dst) {
130  const MinState* src_state = reinterpret_cast<const MinState*>(src);
131  if (src_state->value == NULL) return;
132  MinUpdate(context, StringVal(src_state->value, src_state->len), dst);
133 }
134 
135 // Finalize also just returns the string so is the same as MinSerialize.
137  const MinState* state = reinterpret_cast<const MinState*>(val);
138  if (state->value == NULL) return StringVal::null();
139  StringVal result = StringVal(context, state->len);
140  memcpy(result.ptr, state->value, state->len);
141  context->Free(state->value);
142  return result;
143 }
144 
145 //----------------------------- Bits after Xor ------------------------------------
146 // Example of a UDA that xors all the input bits and then returns the number of
147 // resulting bits that are set. This illustrates where the result and intermediate
148 // are the same type, but a transformation is still needed in Finalize()
149 // The input type is: double
150 // The intermediate type is bigint
151 // the return type is bigint
152 void XorInit(FunctionContext* context, BigIntVal* val) {
153  val->is_null = false;
154  val->val = 0;
155 }
156 
157 void XorUpdate(FunctionContext* context, const double* input, BigIntVal* val) {
158  // BigIntVal is the same ptr as what was passed to CountInit
159  if (input == NULL) return;
160  val->val |= *reinterpret_cast<const int64_t*>(input);
161 }
162 
163 void XorMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) {
164  dst->val |= src.val;
165 }
166 
168  int64_t set_bits = 0;
169  // Do popcnt on val
170  // set_bits = popcnt(val.val);
171  return BigIntVal(set_bits);
172 }
173 
174 //--------------------------- HLL(Distinct Estimate) ---------------------------------
175 // Example of implementing distinct estimate. As an example, we will compress the
176 // intermediate buffer.
177 // Note: this is not the actual algorithm but a sketch of how it would be implemented
178 // with the UDA interface.
179 // The input type is: bigint
180 // The intermediate type is string (fixed at 256 bytes)
181 // the return type is bigint
183  // Since this is known, this will be allocated to 256 bytes.
184  assert(val->len == 256);
185  memset(val->ptr, 0, 256);
186 }
187 
189  const int64_t* input, StringVal* val) {
190  if (input == NULL) return;
191  for (int i = 0; i < 256; ++i) {
192  int hash = 0;
193  // Hash(input) with the ith hash function
194  // hash = Hash(*input, i);
195  val->ptr[i] = hash;
196  }
197 }
198 
200  const StringVal& intermediate) {
201  int compressed_size = 0;
202  uint8_t* result = NULL; // SnappyCompress(intermediate.ptr, intermediate.len);
203  return StringVal(result, compressed_size);
204 }
205 
206 void DistinctEstimateMerge(FunctionContext* context, const StringVal& src, StringVal* dst) {
207  uint8_t* src_uncompressed = NULL; // SnappyUncompress(src.ptr, src.len);
208  for (int i = 0; i < 256; ++i) {
209  dst->ptr[i] ^= src_uncompressed[i];
210  }
211 }
212 
214  int64_t set_bits = 0;
215  // Do popcnt on val
216  // set_bits = popcnt(val.val);
217  return BigIntVal(set_bits);
218 }
219 
220 TEST(CountTest, Basic) {
223  vector<IntVal> no_nulls;
224  no_nulls.resize(1000);
225 
226  EXPECT_TRUE(test.Execute(no_nulls, BigIntVal(no_nulls.size()))) << test.GetErrorMsg();
227  EXPECT_FALSE(test.Execute(no_nulls, BigIntVal(100))) << test.GetErrorMsg();
228 }
229 
230 TEST(CountMultiArgTest, Basic) {
231  int num = 1000;
232  vector<IntVal> no_nulls;
233  no_nulls.resize(num);
234 
237  EXPECT_TRUE(test2.Execute(no_nulls, no_nulls, BigIntVal(2 * num)));
238  EXPECT_FALSE(test2.Execute(no_nulls, no_nulls, BigIntVal(100)));
239 
242  EXPECT_TRUE(test3.Execute(no_nulls, no_nulls, no_nulls, BigIntVal(3 * num)));
243 
246  EXPECT_TRUE(test4.Execute(no_nulls, no_nulls, no_nulls, no_nulls, BigIntVal(4 * num)));
247 }
248 
249 bool FuzzyCompare(const BigIntVal& r1, const BigIntVal& r2) {
250  if (r1.is_null && r2.is_null) return true;
251  if (r1.is_null || r2.is_null) return false;
252  return abs(r1.val - r2.val) <= 1;
253 }
254 
255 TEST(CountTest, FuzzyEquals) {
258  vector<IntVal> no_nulls;
259  no_nulls.resize(1000);
260 
261  EXPECT_TRUE(test.Execute(no_nulls, BigIntVal(1000))) << test.GetErrorMsg();
262  EXPECT_FALSE(test.Execute(no_nulls, BigIntVal(999))) << test.GetErrorMsg();
263 
265  EXPECT_TRUE(test.Execute(no_nulls, BigIntVal(1000))) << test.GetErrorMsg();
266  EXPECT_TRUE(test.Execute(no_nulls, BigIntVal(999))) << test.GetErrorMsg();
267  EXPECT_FALSE(test.Execute(no_nulls, BigIntVal(998))) << test.GetErrorMsg();
268 }
269 
270 TEST(MinTest, Basic) {
273  test.SetIntermediateSize(sizeof(MinState));
274 
275  vector<StringVal> values;
276  values.push_back(StringVal("BBB"));
277  EXPECT_TRUE(test.Execute(values, StringVal("BBB"))) << test.GetErrorMsg();
278 
279  values.push_back(StringVal("AA"));
280  EXPECT_TRUE(test.Execute(values, StringVal("AA"))) << test.GetErrorMsg();
281 
282  values.push_back(StringVal("CCC"));
283  EXPECT_TRUE(test.Execute(values, StringVal("AA"))) << test.GetErrorMsg();
284 
285  values.push_back(StringVal("ABCDEF"));
286  values.push_back(StringVal("AABCDEF"));
287  values.push_back(StringVal("A"));
288  EXPECT_TRUE(test.Execute(values, StringVal("A"))) << test.GetErrorMsg();
289 
290  values.clear();
291  values.push_back(StringVal::null());
292  EXPECT_TRUE(test.Execute(values, StringVal::null())) << test.GetErrorMsg();
293 
294  values.push_back(StringVal("ZZZ"));
295  EXPECT_TRUE(test.Execute(values, StringVal("ZZZ"))) << test.GetErrorMsg();
296 }
297 
298 TEST(MemTest, Basic) {
301  ::MemTestFinalize);
302  vector<BigIntVal> input;
303  for (int i = 0; i < 10; ++i) {
304  input.push_back(10);
305  }
306  EXPECT_TRUE(test.Execute(input, BigIntVal(100))) << test.GetErrorMsg();
307 
310  EXPECT_FALSE(test_leak.Execute(input, BigIntVal(100))) << test.GetErrorMsg();
311 }
312 
313 int main(int argc, char** argv) {
315  ::testing::InitGoogleTest(&argc, argv);
316  return RUN_ALL_TESTS();
317 }
BigIntVal MemTest(FunctionContext *context, const BigIntVal &bytes)
Definition: test-udfs.cc:260
void SetResultComparator(ResultComparator fn)
const BigIntVal MemTestSerialize(FunctionContext *context, const BigIntVal &total)
Definition: test-udas.cc:83
void XorMerge(FunctionContext *context, const BigIntVal &src, BigIntVal *dst)
Definition: uda-test.cc:163
BigIntVal DistinctEstimateFinalize(FunctionContext *context, const StringVal &val)
Definition: uda-test.cc:213
BigIntVal MemTestFinalize(FunctionContext *context, const BigIntVal &total)
Definition: test-udas.cc:89
void SetIntermediateSize(int byte_size)
This must be called if the INTERMEDIATE is TYPE_FIXED_BUFFER.
bool Execute(const std::vector< INPUT1 > &values1, const std::vector< INPUT2 > &values2, const RESULT &expected, UdaExecutionMode mode=ALL)
Runs the UDA in all the modes, validating the result is 'expected' each time.
void DistinctEstimateMerge(FunctionContext *context, const StringVal &src, StringVal *dst)
Definition: uda-test.cc:206
BigIntVal CountFinalize(FunctionContext *context, const BigIntVal &val)
Definition: uda-test.cc:49
void CountUpdate(FunctionContext *context, const IntVal &input, BigIntVal *val)
Definition: uda-test.cc:39
void MinInit(FunctionContext *context, BufferVal *val)
Definition: uda-test.cc:95
int128_t abs(const int128_t &x)
void DistinctEstimateInit(FunctionContext *context, StringVal *val)
Definition: uda-test.cc:182
int main(int argc, char **argv)
Definition: uda-test.cc:313
void MemTestInit(FunctionContext *, BigIntVal *total)
Definition: test-udas.cc:63
const StringSearch UrlParser::hash_search & hash
Definition: url-parser.cc:41
TEST(AtomicTest, Basic)
Definition: atomic-test.cc:28
BigIntVal XorFinalize(FunctionContext *context, const BigIntVal &val)
Definition: uda-test.cc:167
StringVal DistinctEstimatSerialize(FunctionContext *context, const StringVal &intermediate)
Definition: uda-test.cc:199
uint8_t * BufferVal
Definition: udf.h:600
uint8_t * ptr
Definition: udf.h:523
StringVal MinFinalize(FunctionContext *context, const BufferVal &val)
Definition: uda-test.cc:136
void MemTestMerge(FunctionContext *context, const BigIntVal &src, BigIntVal *dst)
Definition: test-udas.cc:73
bool is_null
Definition: udf.h:359
int len
Definition: uda-test.cc:80
bool FuzzyCompare(const BigIntVal &r1, const BigIntVal &r2)
Definition: uda-test.cc:249
void InitGoogleLoggingSafe(const char *arg)
Definition: logging.cc:55
void Set(FunctionContext *context, const StringVal &val)
Definition: uda-test.cc:83
void Free(uint8_t *buffer)
Frees a buffer returned from Allocate() or Reallocate()
Definition: udf.cc:291
const BufferVal MinSerialize(FunctionContext *context, const BufferVal &intermediate)
Definition: uda-test.cc:116
void Count3Update(FunctionContext *context, const IntVal &input1, const IntVal &input2, const IntVal &input3, BigIntVal *val)
Definition: uda-test.cc:62
void CountInit(FunctionContext *context, BigIntVal *val)
This is an example of the COUNT aggregate function.
Definition: uda-test.cc:34
void MinUpdate(FunctionContext *context, const StringVal &input, BufferVal *val)
Definition: uda-test.cc:102
void XorUpdate(FunctionContext *context, const double *input, BigIntVal *val)
Definition: uda-test.cc:157
uint8_t * Allocate(int byte_size)
Definition: udf.cc:262
bool Execute(const std::vector< INPUT > &values, const RESULT &expected, UdaExecutionMode mode=ALL)
Runs the UDA in all the modes, validating the result is 'expected' each time.
void XorInit(FunctionContext *context, BigIntVal *val)
Definition: uda-test.cc:152
void CountMerge(FunctionContext *context, const BigIntVal &src, BigIntVal *dst)
Definition: uda-test.cc:45
void Count4Update(FunctionContext *context, const IntVal &input1, const IntVal &input2, const IntVal &input3, const IntVal &input4, BigIntVal *val)
Definition: uda-test.cc:66
void DistinctEstimatUpdate(FunctionContext *context, const int64_t *input, StringVal *val)
Definition: uda-test.cc:188
uint8_t * value
Definition: uda-test.cc:79
const std::string & GetErrorMsg() const
Returns the failure string if any.
void MemTestUpdate(FunctionContext *context, const BigIntVal &bytes, BigIntVal *total)
Definition: test-udas.cc:67
void Count2Update(FunctionContext *context, const IntVal &input1, const IntVal &input2, BigIntVal *val)
Definition: uda-test.cc:58
void MinMerge(FunctionContext *context, const BufferVal &src, BufferVal *dst)
Definition: uda-test.cc:129
int buffer_len
Definition: uda-test.cc:81