Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
data-provider.h
Go to the documentation of this file.
1 // Copyright (c) 2012 Cloudera, Inc. All rights reserved.
2 
// Include guard: the macro tested by #ifndef must be the same one that is
// defined on the next line, otherwise repeated inclusion is not prevented.
3 #ifndef IMPALA_EXPERIMENT_DATAPROVIDER_H
4 #define IMPALA_EXPERIMENT_DATAPROVIDER_H
5 
6 #include <math.h>
7 #include <iostream>
8 #include <limits>
9 #include <boost/cstdint.hpp>
10 #include <boost/scoped_ptr.hpp>
11 #include <boost/random/uniform_int.hpp>
12 #include <boost/random/linear_congruential.hpp>
13 #include <boost/random/uniform_int.hpp>
14 #include <boost/random/uniform_real.hpp>
15 #include <boost/random/variate_generator.hpp>
16 #include <boost/generator_iterator.hpp>
17 
18 #include "runtime/mem-pool.h"
19 #include "runtime/types.h"
20 #include "runtime/string-value.h"
21 #include "util/runtime-profile.h"
22 
27 //
29 //
33 class DataProvider {
34  public:
35  struct Value {
36  union {
37  bool b;
38  int8_t int8;
39  int16_t int16;
40  int32_t int32;
41  int64_t int64;
42  float f;
43  double d;
44  };
46  };
47 
49  enum DataGen {
52  };
53 
54  class ColDesc {
55  public:
57  template<typename T>
58  static ColDesc Create(const T& min, const T& max, DataGen gen = UNIFORM_RANDOM);
59 
60  private:
61  friend class DataProvider;
62 
65  template<typename T>
66  T Generate(double d, int i) const;
67 
68 
70  this->type = type;
71  this->bytes = bytes;
72  }
73 
75  template<typename T>
76  T Generate(double d, int i, T min, T max) const {
77  switch (gen_type) {
78  case UNIFORM_RANDOM:
79  return (T)(d * (max - min) + min);
80  case SEQUENTIAL:
81  return (T)(i % (int64_t)(max - min) + min);
82  }
83  return 0;
84  }
85 
89  int bytes;
90  };
91 
95 
100  void Reset(int num_rows, int batch_size, const std::vector<ColDesc>& columns);
101 
104  void SetSeed(int seed);
105 
107  int row_size() const { return row_size_; }
108 
110  int total_rows() const { return num_rows_; }
111 
115  void* NextBatch(int* rows_returned);
116 
118  void Print(std::ostream*, char* data, int num_rows) const;
119 
120  private:
126  boost::scoped_ptr<char> data_;
128  boost::minstd_rand rand_generator_;
129  std::vector<ColDesc> cols_;
130 
132 };
133 
// ColDesc::Create specialization for bool columns: stores the requested bounds
// in Value::b, records the generation mode and returns the descriptor.
// NOTE(review): listing line 137, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
134 template<>
135 inline DataProvider::ColDesc DataProvider::ColDesc::Create<bool>(
136  const bool& min, const bool &max, DataGen gen) {
138  c.min.b = min;
139  c.max.b = max;
140  c.gen_type = gen;
141  return c;
142 }
// ColDesc::Create specialization for int8_t columns: stores the requested
// bounds in Value::int8, records the generation mode and returns the descriptor.
// NOTE(review): listing line 146, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
143 template<>
144 inline DataProvider::ColDesc DataProvider::ColDesc::Create<int8_t>(
145  const int8_t& min, const int8_t& max, DataGen gen) {
147  c.min.int8 = min;
148  c.max.int8 = max;
149  c.gen_type = gen;
150  return c;
151 }
// ColDesc::Create specialization for int16_t columns: stores the requested
// bounds in Value::int16, records the generation mode and returns the descriptor.
// NOTE(review): listing line 155, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
152 template<>
153 inline DataProvider::ColDesc DataProvider::ColDesc::Create<int16_t>(
154  const int16_t& min, const int16_t& max, DataGen gen) {
156  c.min.int16 = min;
157  c.max.int16 = max;
158  c.gen_type = gen;
159  return c;
160 }
// ColDesc::Create specialization for int32_t columns: stores the requested
// bounds in Value::int32, records the generation mode and returns the descriptor.
// NOTE(review): listing line 164, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
161 template<>
162 inline DataProvider::ColDesc DataProvider::ColDesc::Create<int32_t>(
163  const int32_t& min, const int32_t& max, DataGen gen) {
165  c.min.int32 = min;
166  c.max.int32 = max;
167  c.gen_type = gen;
168  return c;
169 }
// ColDesc::Create specialization for int64_t columns: stores the requested
// bounds in Value::int64, records the generation mode and returns the descriptor.
// NOTE(review): listing line 173, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
170 template<>
171 inline DataProvider::ColDesc DataProvider::ColDesc::Create<int64_t>(
172  const int64_t& min, const int64_t& max, DataGen gen) {
174  c.min.int64 = min;
175  c.max.int64 = max;
176  c.gen_type = gen;
177  return c;
178 }
// ColDesc::Create specialization for float columns: stores the requested
// bounds in Value::f, records the generation mode and returns the descriptor.
// NOTE(review): listing line 182, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
179 template<>
180 inline DataProvider::ColDesc DataProvider::ColDesc::Create<float>(
181  const float& min, const float& max, DataGen gen) {
183  c.min.f = min;
184  c.max.f = max;
185  c.gen_type = gen;
186  return c;
187 }
// ColDesc::Create specialization for double columns: stores the requested
// bounds in Value::d, records the generation mode and returns the descriptor.
// NOTE(review): listing line 191, where `c` is presumably declared, is missing
// from this extract - confirm against the original header.
188 template<>
189 inline DataProvider::ColDesc DataProvider::ColDesc::Create<double>(
190  const double& min, const double& max, DataGen gen) {
192  c.min.d = min;
193  c.max.d = max;
194  c.gen_type = gen;
195  return c;
196 }
// ColDesc::Create specialization for impala::StringValue columns. Builds a
// TYPE_STRING descriptor, passing 16 as the constructor's `bytes` argument,
// then records the generation mode and copies both bounds into Value::s.
197 template<> inline
198 DataProvider::ColDesc DataProvider::ColDesc::Create<impala::StringValue>(
199  const impala::StringValue& min, const impala::StringValue& max, DataGen gen) {
200  ColDesc desc(impala::TYPE_STRING, 16);
201  desc.gen_type = gen;
202  desc.min.s = min;
203  desc.max.s = max;
204  return desc;
205 }
206 
207 
// ColDesc::Generate specialization for bool columns.
//  - UNIFORM_RANDOM: maps d onto the [min.b, max.b] range and rounds.
//  - SEQUENTIAL: alternates with the row index (false for even i, true for odd i).
// Returns false if gen_type matches neither mode.
208 template<> inline bool DataProvider::ColDesc::Generate<bool>(double d, int i) const {
209  switch (gen_type) {
210  case UNIFORM_RANDOM:
 // Scale d by the width of the range *before* rounding, matching the generic
 // generator's d * (max - min) + min. The earlier form, d * max.b - min.b,
 // subtracted min from the scaled value, so a column with min == max == true
 // could yield false (e.g. d = 0.3 gives round(-0.7) + 1 == 0).
211  return (int)(round(d * (max.b - min.b))) + min.b;
212  case SEQUENTIAL:
213  return (i % 2) ? true : false;
214  }
215  return false;
216 }
// int8_t columns: forward to the range-based generator using this column's
// int8 bounds.
217 template<> inline int8_t DataProvider::ColDesc::Generate<int8_t>(double d, int i) const {
218  return this->Generate<int8_t>(d, i, this->min.int8, this->max.int8);
219 }
// int16_t columns: forward to the range-based generator using this column's
// int16 bounds.
220 template<> inline int16_t DataProvider::ColDesc::Generate<int16_t>(double d, int i) const {
221  return this->Generate<int16_t>(d, i, this->min.int16, this->max.int16);
222 }
// int32_t columns: forward to the range-based generator using this column's
// int32 bounds.
223 template<> inline int32_t DataProvider::ColDesc::Generate<int32_t>(double d, int i) const {
224  return this->Generate<int32_t>(d, i, this->min.int32, this->max.int32);
225 }
// int64_t columns: forward to the range-based generator using this column's
// int64 bounds.
226 template<> inline int64_t DataProvider::ColDesc::Generate<int64_t>(double d, int i) const {
227  return this->Generate<int64_t>(d, i, this->min.int64, this->max.int64);
228 }
// float columns: forward to the range-based generator using this column's
// float bounds.
229 template<> inline float DataProvider::ColDesc::Generate<float>(double d, int i) const {
230  return this->Generate<float>(d, i, this->min.f, this->max.f);
231 }
// double columns: forward to the range-based generator using this column's
// double bounds.
232 template<> inline double DataProvider::ColDesc::Generate<double>(double d, int i) const {
233  return this->Generate<double>(d, i, this->min.d, this->max.d);
234 }
235 
236 #endif
237 
int total_rows() const
The total number of rows that will be generated.
void SetSeed(int seed)
void Reset(int num_rows, int batch_size, const std::vector< ColDesc > &columns)
T Generate(double d, int i, T min, T max) const
Default generator - used for int and float types.
Definition: data-provider.h:76
static ColDesc Create(const T &min, const T &max, DataGen gen=UNIFORM_RANDOM)
Create a column desc with min/max range and the data gen type.
void * NextBatch(int *rows_returned)
impala::RuntimeProfile * profile_
impala::StringValue s
Definition: data-provider.h:45
See data-provider-test.cc on how to use this.
Definition: data-provider.h:33
impala::RuntimeProfile::Counter * bytes_generated_
boost::minstd_rand rand_generator_
T Generate(double d, int i) const
std::vector< ColDesc > cols_
PrimitiveType
Definition: types.h:27
ObjectPool pool
impala::PrimitiveType type
Definition: data-provider.h:86
boost::scoped_ptr< char > data_
impala::MemPool * pool_
DataProvider(impala::MemPool *pool, impala::RuntimeProfile *profile)
int row_size() const
The size of a row (tuple size)
void Print(std::ostream *, char *data, int num_rows) const
Print the row data in csv format.
ColDesc(impala::PrimitiveType type, int bytes)
Definition: data-provider.h:69
DataGen
How the data should be generated.
Definition: data-provider.h:49