Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
aggregate-functions-test.cc
Go to the documentation of this file.
1 // Copyright 2014 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <iostream>
16 #include <gtest/gtest.h>
17 #include <boost/algorithm/string.hpp>
18 #include <boost/lexical_cast.hpp>
19 #include <boost/foreach.hpp>
20 #include <boost/accumulators/accumulators.hpp>
21 #include <boost/accumulators/statistics/stats.hpp>
22 #include <boost/accumulators/statistics/variance.hpp>
23 
24 #include "common/logging.h"
27 #include "udf/udf.h"
28 #include "udf/uda-test-harness.h"
29 #include "util/decimal-util.h"
30 
31 #include "common/names.h"
32 
33 namespace tag = boost::accumulators::tag;
34 using boost::accumulators::accumulator_set;
35 using boost::accumulators::stats;
36 using boost::accumulators::variance;
37 using boost::algorithm::is_any_of;
38 using boost::algorithm::trim;
39 using namespace impala;
40 using namespace impala_udf;
41 
42 template <int RANGE_START, int RANGE_END>
43 bool CheckAppxMedian(const IntVal& actual, const IntVal& expected) {
44  return actual.val >= RANGE_START && actual.val <= RANGE_END;
45 }
46 
// NOTE(review): the first line of this signature is missing from this listing.
// The cross-reference index at the end of the page gives the full signature as
//   bool CheckHistogramDistribution(const StringVal& actual,
//                                   const StringVal& max_expected_stdev)
//
// Result comparator for histogram output. 'actual' is read as comma-separated
// integer text; the function measures how evenly spaced those integers are and
// returns true when the standard deviation of the gaps between consecutive values
// is strictly less than the number spelled out in 'max_expected_stdev'.
48  const StringVal& max_expected_stdev) {
// Copy the (ptr, len) payload into a std::string so it can be tokenized.
49  string result(reinterpret_cast<char*>(actual.ptr), actual.len);
50  vector<string> str_vals;
51  split(str_vals, result, is_any_of(","));
52 
// Accumulates the deltas between neighbouring histogram values.
53  accumulator_set<int, stats<tag::variance> > acc;
// -1 doubles as the "no previous value yet" marker. As a consequence a parsed
// value of -1 also suppresses the delta for the value that follows it.
54  int prev_val = -1;
55  BOOST_FOREACH(string& s, str_vals) {
// Tokens are trimmed before parsing, so whitespace around each number is dropped.
56  trim(s);
// NOTE(review): presumably boost::lexical_cast (brought into scope elsewhere), which
// throws on text that is not a valid int rather than returning a sentinel - confirm.
57  int val = lexical_cast<int>(s);
58  if (prev_val != -1) acc(val - prev_val);
59  prev_val = val;
60  }
61  double actual_stdev = sqrt(variance(acc));
// The threshold arrives as text inside a StringVal and is parsed as a double.
62  string expected_str(reinterpret_cast<char*>(max_expected_stdev.ptr),
63  max_expected_stdev.len);
64  return actual_stdev < lexical_cast<double>(expected_str);
65 }
66 
67 // TODO: Add other datatypes
// Exercises the reservoir-sample based aggregates on IntVal input: one harness is
// finalized with HistogramFinalize, the other with AppxMedianFinalize.
// NOTE(review): the lines that declare 'test_histogram' and 'test_median' (original
// lines 69 and 75) are missing from this listing; only their constructor arguments
// are visible below. Presumably both are UdaTestHarness instances - confirm against
// udf/uda-test-harness.h.
68 TEST(HistogramTest, TestInt) {
70  AggregateFunctions::ReservoirSampleInit<IntVal>,
71  AggregateFunctions::ReservoirSampleUpdate<IntVal>,
72  AggregateFunctions::ReservoirSampleMerge<IntVal>,
73  AggregateFunctions::ReservoirSampleSerialize<IntVal>,
74  AggregateFunctions::HistogramFinalize<IntVal>);
76  AggregateFunctions::ReservoirSampleInit<IntVal>,
77  AggregateFunctions::ReservoirSampleUpdate<IntVal>,
78  AggregateFunctions::ReservoirSampleMerge<IntVal>,
79  AggregateFunctions::ReservoirSampleSerialize<IntVal>,
80  AggregateFunctions::AppxMedianFinalize<IntVal>);
// INPUT_SIZE works out to 100,000 values, i.e. 1000 inputs per histogram bucket.
81  const int NUM_BUCKETS = 100;
82  const int INPUT_SIZE = NUM_BUCKETS * 1000;
83 
84  // All input values are 1, result should be constant.
85  {
86  vector<IntVal> input(INPUT_SIZE, 1);
// The expected histogram text is the value "1" repeated once per bucket, ", "-separated.
87  char expected[] = "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
88  "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
89  "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
90  "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1";
91  EXPECT_TRUE(test_histogram.Execute(input, StringVal(&expected[0])))
92  << test_histogram.GetErrorMsg();
93  }
94 
95  // Now check input values ranging from 0 to INPUT_SIZE - 1 (99,999). Each bucket should
96  // have about 1000 values, i.e. bucket i should approximately contain values
97  // [1000*i, 1000*(i+1)). We check the distribution of the deltas between histogram values is not too large.
98  // TODO: Add more deterministic test cases
99  {
100  vector<IntVal> input;
101  for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(i);
// NOTE(review): original line 102 is missing from this listing. Since the "expected"
// argument below carries a maximum standard deviation rather than a histogram string,
// presumably that line installs CheckHistogramDistribution as the result comparator
// on test_histogram - confirm.
103  StringVal max_expected_stdev = StringVal("100.0");
104  EXPECT_TRUE(test_histogram.Execute(input, max_expected_stdev))
105  << test_histogram.GetErrorMsg();
106 
// The median check only requires the result to land in [45000, 55000]; the
// default-constructed IntVal passed as "expected" is ignored by CheckAppxMedian.
107  test_median.SetResultComparator(CheckAppxMedian<45000,55000>);
108  EXPECT_TRUE(test_median.Execute(input, IntVal()))
109  << test_median.GetErrorMsg();
110  }
111 }
112 
// Exercises the reservoir-sample based histogram aggregate on DecimalVal input.
// NOTE(review): the line that declares 'test' (original line 114) is missing from
// this listing; only its constructor arguments are visible below. Presumably it is a
// UdaTestHarness instance - confirm against udf/uda-test-harness.h.
113 TEST(HistogramTest, TestDecimal) {
115  AggregateFunctions::ReservoirSampleInit<DecimalVal>,
116  AggregateFunctions::ReservoirSampleUpdate<DecimalVal>,
117  AggregateFunctions::ReservoirSampleMerge<DecimalVal>,
118  AggregateFunctions::ReservoirSampleSerialize<DecimalVal>,
119  AggregateFunctions::HistogramFinalize<DecimalVal>);
// INPUT_SIZE works out to 100,000 values, i.e. 1000 inputs per histogram bucket.
120  const int NUM_BUCKETS = 100;
121  const int INPUT_SIZE = NUM_BUCKETS * 1000;
122 
123  // All input values are the same ('val'), result should be constant.
124  {
125  vector<DecimalVal> input;
// NOTE(review): the definition of 'val' (original line 126) is missing from this
// listing. The page's cross-reference index mentions MAX_UNSCALED_DECIMAL, so
// presumably 'val' is initialized from it - confirm.
127  stringstream ss;
128  for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(DecimalVal(val));
// Build the expected histogram text: 'val' repeated NUM_BUCKETS times, joined by ", ".
129  for (int i = 0; i < NUM_BUCKETS; ++i) {
130  ss << val;
131  if (i < NUM_BUCKETS - 1) ss << ", ";
132  }
// The StringVal points into the temporary returned by ss.str(); that temporary
// lives until the end of the enclosing full-expression, so it outlives Execute().
133  EXPECT_TRUE(test.Execute(input, StringVal(ss.str().c_str()))) << test.GetErrorMsg();
134  }
135 
// Distinct inputs 0 .. INPUT_SIZE - 1: the "expected" argument is a maximum
// standard deviation instead of a literal histogram string.
136  {
137  vector<DecimalVal> input;
138  for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(DecimalVal(i));
// NOTE(review): original line 139 is missing from this listing; presumably it
// installs CheckHistogramDistribution as the result comparator on 'test' - confirm.
140  StringVal max_expected_stdev = StringVal("100.0");
141  EXPECT_TRUE(test.Execute(input, max_expected_stdev)) << test.GetErrorMsg();
142  }
143 }
144 
// Exercises the reservoir-sample based histogram aggregate on StringVal input.
// NOTE(review): the line that declares 'test' (original line 146) is missing from
// this listing; only its constructor arguments are visible below. Presumably it is a
// UdaTestHarness instance - confirm against udf/uda-test-harness.h.
145 TEST(HistogramTest, TestString) {
147  AggregateFunctions::ReservoirSampleInit<StringVal>,
148  AggregateFunctions::ReservoirSampleUpdate<StringVal>,
149  AggregateFunctions::ReservoirSampleMerge<StringVal>,
150  AggregateFunctions::ReservoirSampleSerialize<StringVal>,
151  AggregateFunctions::HistogramFinalize<StringVal>);
// INPUT_SIZE works out to 100,000 values, i.e. 1000 inputs per histogram bucket.
152  const int NUM_BUCKETS = 100;
153  const int INPUT_SIZE = NUM_BUCKETS * 1000;
154 
155  // All input values are x, result should be constant.
156  vector<StringVal> input;
157  for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(StringVal("x"));
// The expected histogram text is the value "x" repeated once per bucket, ", "-separated.
158  char expected[] = "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
159  "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
160  "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
161  "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x";
162  EXPECT_TRUE(test.Execute(input, StringVal(&expected[0]))) << test.GetErrorMsg();
163 }
164 
// Test binary entry point: hands the command line to googletest and runs every
// registered test, returning the googletest result as the process exit code.
165 int main(int argc, char** argv) {
// NOTE(review): original lines 166-167 are missing from this listing. The page's
// cross-reference index mentions InitGoogleLoggingSafe and InitMaxUnscaledDecimal,
// so presumably they are called here before the tests run - confirm.
168  ::testing::InitGoogleTest(&argc, argv);
169  return RUN_ALL_TESTS();
170 }
int main(int argc, char **argv)
void SetResultComparator(ResultComparator fn)
static const int NUM_BUCKETS
int32_t val
Definition: udf.h:421
TEST(AtomicTest, Basic)
Definition: atomic-test.cc:28
bool CheckAppxMedian(const IntVal &actual, const IntVal &expected)
uint8_t * ptr
Definition: udf.h:523
static int128_t MAX_UNSCALED_DECIMAL
Maximum absolute value of int128_t that we use. This is 38 digits of 9's.
Definition: decimal-util.h:32
void InitGoogleLoggingSafe(const char *arg)
Definition: logging.cc:55
static void InitMaxUnscaledDecimal()
Initializes MAX_UNSCALED_DECIMAL. Must be called once before using it.
Definition: decimal-util.cc:22
bool Execute(const std::vector< INPUT > &values, const RESULT &expected, UdaExecutionMode mode=ALL)
Runs the UDA in all the modes, validating the result is 'expected' each time.
bool CheckHistogramDistribution(const StringVal &actual, const StringVal &max_expected_stdev)
const std::string & GetErrorMsg() const
Returns the failure string if any.
__int128_t int128_t
We use the c++ int128_t type. This is stored using 16 bytes and very performant.