16 #include <gtest/gtest.h>
17 #include <boost/algorithm/string.hpp>
18 #include <boost/lexical_cast.hpp>
19 #include <boost/foreach.hpp>
20 #include <boost/accumulators/accumulators.hpp>
21 #include <boost/accumulators/statistics/stats.hpp>
22 #include <boost/accumulators/statistics/variance.hpp>
33 namespace tag = boost::accumulators::tag;
34 using boost::accumulators::accumulator_set;
35 using boost::accumulators::stats;
36 using boost::accumulators::variance;
37 using boost::algorithm::is_any_of;
38 using boost::algorithm::trim;
39 using namespace impala;
40 using namespace impala_udf;
42 template <
int RANGE_START,
int RANGE_END>
44 return actual.
val >= RANGE_START && actual.
val <= RANGE_END;
49 string result(reinterpret_cast<char*>(actual.
ptr), actual.
len);
50 vector<string> str_vals;
51 split(str_vals, result, is_any_of(
","));
53 accumulator_set<int, stats<tag::variance> > acc;
55 BOOST_FOREACH(
string& s, str_vals) {
57 int val = lexical_cast<
int>(s);
58 if (prev_val != -1) acc(val - prev_val);
61 double actual_stdev = sqrt(variance(acc));
62 string expected_str(reinterpret_cast<char*>(max_expected_stdev.
ptr),
63 max_expected_stdev.
len);
64 return actual_stdev < lexical_cast<double>(expected_str);
68 TEST(HistogramTest, TestInt) {
70 AggregateFunctions::ReservoirSampleInit<IntVal>,
71 AggregateFunctions::ReservoirSampleUpdate<IntVal>,
72 AggregateFunctions::ReservoirSampleMerge<IntVal>,
73 AggregateFunctions::ReservoirSampleSerialize<IntVal>,
74 AggregateFunctions::HistogramFinalize<IntVal>);
76 AggregateFunctions::ReservoirSampleInit<IntVal>,
77 AggregateFunctions::ReservoirSampleUpdate<IntVal>,
78 AggregateFunctions::ReservoirSampleMerge<IntVal>,
79 AggregateFunctions::ReservoirSampleSerialize<IntVal>,
80 AggregateFunctions::AppxMedianFinalize<IntVal>);
82 const int INPUT_SIZE = NUM_BUCKETS * 1000;
86 vector<IntVal> input(INPUT_SIZE, 1);
87 char expected[] =
"1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
88 "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
89 "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
90 "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1";
100 vector<IntVal> input;
101 for (
int i = 0; i < INPUT_SIZE; ++i) input.push_back(i);
104 EXPECT_TRUE(test_histogram.
Execute(input, max_expected_stdev))
113 TEST(HistogramTest, TestDecimal) {
115 AggregateFunctions::ReservoirSampleInit<DecimalVal>,
116 AggregateFunctions::ReservoirSampleUpdate<DecimalVal>,
117 AggregateFunctions::ReservoirSampleMerge<DecimalVal>,
118 AggregateFunctions::ReservoirSampleSerialize<DecimalVal>,
119 AggregateFunctions::HistogramFinalize<DecimalVal>);
121 const int INPUT_SIZE = NUM_BUCKETS * 1000;
125 vector<DecimalVal> input;
128 for (
int i = 0; i < INPUT_SIZE; ++i) input.push_back(
DecimalVal(val));
131 if (i < NUM_BUCKETS - 1) ss <<
", ";
137 vector<DecimalVal> input;
138 for (
int i = 0; i < INPUT_SIZE; ++i) input.push_back(
DecimalVal(i));
145 TEST(HistogramTest, TestString) {
147 AggregateFunctions::ReservoirSampleInit<StringVal>,
148 AggregateFunctions::ReservoirSampleUpdate<StringVal>,
149 AggregateFunctions::ReservoirSampleMerge<StringVal>,
150 AggregateFunctions::ReservoirSampleSerialize<StringVal>,
151 AggregateFunctions::HistogramFinalize<StringVal>);
153 const int INPUT_SIZE = NUM_BUCKETS * 1000;
156 vector<StringVal> input;
157 for (
int i = 0; i < INPUT_SIZE; ++i) input.push_back(
StringVal(
"x"));
158 char expected[] =
"x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
159 "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
160 "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
161 "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x";
165 int main(
int argc,
char** argv) {
168 ::testing::InitGoogleTest(&argc, argv);
169 return RUN_ALL_TESTS();
int main(int argc, char **argv)
void SetResultComparator(ResultComparator fn)
static const int NUM_BUCKETS
bool CheckAppxMedian(const IntVal &actual, const IntVal &expected)
static int128_t MAX_UNSCALED_DECIMAL
Maximum absolute value of int128_t that we use. This is 38 digits of 9's.
void InitGoogleLoggingSafe(const char *arg)
static void InitMaxUnscaledDecimal()
Initializes MAX_UNSCALED_DECIMAL. Must be called once before using it.
bool Execute(const std::vector< INPUT > &values, const RESULT &expected, UdaExecutionMode mode=ALL)
Runs the UDA in all modes, validating that the result is 'expected' each time.
bool CheckHistogramDistribution(const StringVal &actual, const StringVal &max_expected_stdev)
const std::string & GetErrorMsg() const
Returns the failure string if any.
__int128_t int128_t
We use the C++ int128_t type. It is stored in 16 bytes and is very performant.