Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
parquet-common.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_EXEC_PARQUET_COMMON_H
17 #define IMPALA_EXEC_PARQUET_COMMON_H
18 
19 #include "gen-cpp/Descriptors_types.h"
20 #include "gen-cpp/parquet_types.h"
21 #include "runtime/decimal-value.h"
22 #include "runtime/string-value.h"
23 #include "util/bit-util.h"
24 
26 namespace impala {
27 
28 class TimestampValue;
29 
// The four ASCII bytes 'P', 'A', 'R', '1'.
// NOTE(review): presumably the magic marker that identifies a Parquet file --
// confirm against the Parquet format spec and the reader/writer that use it.
30 const uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
// NOTE(review): presumably the format version recorded in file metadata --
// confirm at the use sites.
31 const uint32_t PARQUET_CURRENT_VERSION = 1;
32 
// Lookup table giving the Parquet physical type used for each Impala type.
// NOTE(review): presumably indexed by the Impala PrimitiveType enum value, so the
// entry order must stay in sync with that enum -- confirm against its declaration.
// The invalid and NULL slots, and the types tagged NYI, still carry an entry so
// the table has no gaps.
35 const parquet::Type::type IMPALA_TO_PARQUET_TYPES[] = {
36  parquet::Type::BOOLEAN, // Invalid
37  parquet::Type::BOOLEAN, // NULL type
38  parquet::Type::BOOLEAN,
39  parquet::Type::INT32,
40  parquet::Type::INT32,
41  parquet::Type::INT32,
42  parquet::Type::INT64,
43  parquet::Type::FLOAT,
44  parquet::Type::DOUBLE,
45  parquet::Type::INT96, // Timestamp
46  parquet::Type::BYTE_ARRAY, // String
47  parquet::Type::BYTE_ARRAY, // Date, NYI
48  parquet::Type::BYTE_ARRAY, // DateTime, NYI
49  parquet::Type::BYTE_ARRAY, // Binary NYI
50  parquet::Type::FIXED_LEN_BYTE_ARRAY, // Decimal
51  parquet::Type::BYTE_ARRAY, // VARCHAR(N)
52  parquet::Type::BYTE_ARRAY, // CHAR(N)
53 };
54 
// Mapping of Parquet codec enums to Impala enums.
// NOTE(review): presumably indexed by the parquet::CompressionCodec value, so the
// entry order must match that enum -- confirm against the generated Thrift types.
56 const THdfsCompression::type PARQUET_TO_IMPALA_CODEC[] = {
57  THdfsCompression::NONE,
58  THdfsCompression::SNAPPY,
59  THdfsCompression::GZIP,
60  THdfsCompression::LZO
61 };
62 
// Mapping of Impala codec enums to Parquet enums.
// NOTE(review): presumably indexed by the THdfsCompression value -- confirm the
// order against that enum. The mapping is many-to-one: per the entry comments,
// DEFAULT and SNAPPY_BLOCKED both map to SNAPPY, and DEFLATE maps to GZIP.
64 const parquet::CompressionCodec::type IMPALA_TO_PARQUET_CODEC[] = {
65  parquet::CompressionCodec::UNCOMPRESSED,
66  parquet::CompressionCodec::SNAPPY, // DEFAULT
67  parquet::CompressionCodec::GZIP, // GZIP
68  parquet::CompressionCodec::GZIP, // DEFLATE
69  parquet::CompressionCodec::SNAPPY,
70  parquet::CompressionCodec::SNAPPY, // SNAPPY_BLOCKED
71  parquet::CompressionCodec::LZO,
72 };
73 
81  public:
// Returns the byte size of 'v'. This generic version is simply sizeof(T) and does
// not inspect the value; types that need a different size provide explicit
// specializations of this template.
83  template<typename T>
84  static int ByteSize(const T& v) { return sizeof(T); }
85 
// Returns the number of bytes reported for a value of column type 't', or -1 for
// the variable-length types (STRING, VARCHAR, CHAR).
// DECIMAL defers to DecimalSize(t). NULL, BOOLEAN and any unrecognized type hit
// DCHECK(false) and then return -1.
88  static int ByteSize(const ColumnType& t) {
89  switch (t.type) {
90  case TYPE_STRING:
91  case TYPE_VARCHAR:
92  case TYPE_CHAR:
93  // CHAR is varlen here because we don't write the padding to the file
94  return -1;
95  case TYPE_TINYINT:
96  case TYPE_SMALLINT:
97  case TYPE_INT:
98  case TYPE_FLOAT:
99  return 4; // TINYINT/SMALLINT also report 4, like the int8_t/int16_t ByteSize specializations
100  case TYPE_BIGINT:
101  case TYPE_DOUBLE:
102  return 8;
103  case TYPE_TIMESTAMP:
104  return 12;
105  case TYPE_DECIMAL:
106  return DecimalSize(t);
107  case TYPE_NULL:
108  case TYPE_BOOLEAN: // These types are not plain encoded.
109  default:
110  DCHECK(false);
111  return -1;
112  }
113  }
114 
// Returns the minimum number of bytes needed to store a decimal with precision
// t.precision (1..38), i.e. the smallest n such that every value of that many
// decimal digits fits in an n-byte two's complement integer.
// 't' must be a TYPE_DECIMAL column type. A precision outside 1..38 hits
// DCHECK(false) and returns -1.
116  static int DecimalSize(const ColumnType& t) {
117  DCHECK(t.type == TYPE_DECIMAL);
118  // The number in each trailing comment is the max positive value that can be
119  // represented with that many bytes (max negative is -(X + 1)).
120  // TODO: use closed form for this?
121  switch (t.precision) {
122  case 1: case 2:
123  return 1; // 127
124  case 3: case 4:
125  return 2; // 32,767
126  case 5: case 6:
127  return 3; // 8,388,607
128  case 7: case 8: case 9:
129  return 4; // 2,147,483,647
130  case 10: case 11:
131  return 5; // 549,755,813,887
132  case 12: case 13: case 14:
133  return 6; // 140,737,488,355,327
134  case 15: case 16:
135  return 7; // 36,028,797,018,963,967
136  case 17: case 18:
137  return 8; // 9,223,372,036,854,775,807
138  case 19: case 20: case 21:
139  return 9; // 2,361,183,241,434,822,606,847
140  case 22: case 23:
141  return 10; // 604,462,909,807,314,587,353,087
142  case 24: case 25: case 26:
143  return 11; // 154,742,504,910,672,534,362,390,527
144  case 27: case 28:
145  return 12; // 39,614,081,257,132,168,796,771,975,167
146  case 29: case 30: case 31:
147  return 13; // 10,141,204,801,825,835,211,973,625,643,007
148  case 32: case 33:
149  return 14; // 2,596,148,429,267,413,814,265,248,164,610,047
150  case 34: case 35:
151  return 15; // 664,613,997,892,457,936,451,903,530,140,172,287
152  case 36: case 37: case 38:
153  return 16; // 170,141,183,460,469,231,731,687,303,715,884,105,727
154  default:
155  DCHECK(false);
156  break;
157  }
158  return -1;
159  }
160 
// Copies the raw bytes of 't' into 'buffer' and returns the number of bytes
// written, which is ByteSize(t). 'fixed_len_size' is not used by this generic
// version. No bounds check is done here: the caller must provide at least
// ByteSize(t) writable bytes at 'buffer'.
165  template<typename T>
166  static int Encode(uint8_t* buffer, int fixed_len_size, const T& t) {
167  memcpy(buffer, &t, ByteSize(t));
168  return ByteSize(t);
169  }
170 
// Copies ByteSize(*v) bytes from 'buffer' into '*v' and returns that byte count.
// 'fixed_len_size' is not used by this generic version. No bounds check is done
// here: the caller must guarantee that 'buffer' holds at least ByteSize(*v)
// readable bytes.
175  template<typename T>
176  static int Decode(uint8_t* buffer, int fixed_len_size, T* v) {
177  memcpy(v, buffer, ByteSize(*v));
178  return ByteSize(*v);
179  }
180 
// Declaration only: no generic definition is visible in this header.
// NOTE(review): presumably encodes 't' into 'fixed_len_size' bytes at 'buffer' and
// returns the number of bytes written, with definitions supplied by the decimal
// specializations further down -- confirm against the full header.
183  template <typename T>
184  static int EncodeToFixedLenByteArray(uint8_t* buffer, int fixed_len_size, const T& t);
185 
// Declaration only: no generic definition is visible in this header.
// NOTE(review): presumably decodes a value stored in 'fixed_len_size' bytes at
// 'buffer' into '*v' and returns the number of bytes consumed, with definitions
// supplied by the decimal specializations further down -- confirm against the
// full header.
189  template<typename T>
190  static int DecodeFromFixedLenByteArray(uint8_t* buffer, int fixed_len_size, T* v);
191 };
192 
// Explicit specializations for bool are only declared here; their definitions
// are not part of this header listing.
// NOTE(review): ByteSize(const ColumnType&) notes that BOOLEAN is not plain
// encoded, so check what these definitions actually do before relying on them.
194 template<> int ParquetPlainEncoder::ByteSize(const bool& b);
195 template<> int ParquetPlainEncoder::Encode(uint8_t*, int fixed_len_size, const bool&);
196 template<> int ParquetPlainEncoder::Decode(uint8_t*, int fixed_len_size, bool* v);
197 
// ByteSize must not be called with a Decimal4Value: it hits DCHECK(false) and
// returns -1. The value alone does not determine the encoded width;
// DecimalSize(const ColumnType&) derives it from the column's precision.
200 template<> inline int ParquetPlainEncoder::ByteSize(const Decimal4Value&) {
201  DCHECK(false);
202  return -1;
203 }
// ByteSize must not be called with a Decimal8Value: it hits DCHECK(false) and
// returns -1. The value alone does not determine the encoded width;
// DecimalSize(const ColumnType&) derives it from the column's precision.
204 template<> inline int ParquetPlainEncoder::ByteSize(const Decimal8Value&) {
205  DCHECK(false);
206  return -1;
207 }
// ByteSize must not be called with a Decimal16Value: it hits DCHECK(false) and
// returns -1. The value alone does not determine the encoded width;
// DecimalSize(const ColumnType&) derives it from the column's precision.
208 template<> inline int ParquetPlainEncoder::ByteSize(const Decimal16Value&) {
209  DCHECK(false);
210  return -1;
211 }
212 
// An int8_t is reported as sizeof(int32_t) bytes rather than 1, in line with
// ByteSize(const ColumnType&) returning 4 for TYPE_TINYINT.
214 template<>
215 inline int ParquetPlainEncoder::ByteSize(const int8_t& v) { return sizeof(int32_t); }
// An int16_t is reported as sizeof(int32_t) bytes rather than 2, in line with
// ByteSize(const ColumnType&) returning 4 for TYPE_SMALLINT.
216 template<>
217 inline int ParquetPlainEncoder::ByteSize(const int16_t& v) { return sizeof(int32_t); }
218 
// NOTE(review): this listing omits source line 220, which held the signature of
// this specialization. The body reports a 4-byte length prefix plus v.len payload
// bytes, which matches what the StringValue encoder further down writes --
// presumably this is ByteSize(const StringValue& v); confirm against the real
// header.
219 template<>
221  return sizeof(int32_t) + v.len;
222 }
223 
// NOTE(review): this listing omits source line 225, which held the signature of
// this specialization. It returns a fixed 12 bytes, the same size that
// ByteSize(const ColumnType&) reports for TYPE_TIMESTAMP -- presumably this is
// ByteSize(const TimestampValue&); confirm against the real header.
224 template<>
226  return 12;
227 }
228 
// Decodes an int8_t by taking only the first byte at 'buffer', but returns
// ByteSize(*v), i.e. sizeof(int32_t), as the number of bytes consumed.
// 'fixed_len_size' is unused.
// NOTE(review): the first byte is the low-order byte of the stored 32-bit value
// only on a little-endian host -- confirm the supported targets.
229 template<>
230 inline int ParquetPlainEncoder::Decode(uint8_t* buffer, int fixed_len_size, int8_t* v) {
231  *v = *buffer;
232  return ByteSize(*v);
233 }
// Decodes an int16_t by copying only the first sizeof(int16_t) bytes at 'buffer',
// but returns ByteSize(*v), i.e. sizeof(int32_t), as the number of bytes consumed.
// 'fixed_len_size' is unused.
// NOTE(review): the first two bytes are the low-order half of the stored 32-bit
// value only on a little-endian host -- confirm the supported targets.
234 template<>
235 inline int ParquetPlainEncoder::Decode(uint8_t* buffer, int fixed_len_size, int16_t* v) {
236  memcpy(v, buffer, sizeof(int16_t));
237  return ByteSize(*v);
238 }
239 
// NOTE(review): this listing omits source line 241 (the start of the signature);
// judging by the parameters this is presumably the Encode specialization for
// int8_t -- confirm against the real header.
// Widens the value to int32_t, copies those 4 bytes to 'buffer' in host byte
// order, and returns ByteSize(v). 'fixed_len_size' is unused.
240 template<>
242  uint8_t* buffer, int fixed_len_size, const int8_t& v) {
243  int32_t val = v;
244  memcpy(buffer, &val, sizeof(int32_t));
245  return ByteSize(v);
246 }
247 
// NOTE(review): this listing omits source line 249 (the start of the signature);
// judging by the parameters this is presumably the Encode specialization for
// int16_t -- confirm against the real header.
// Widens the value to int32_t, copies those 4 bytes to 'buffer' in host byte
// order, and returns ByteSize(v). 'fixed_len_size' is unused.
248 template<>
250  uint8_t* buffer, int fixed_len_size, const int16_t& v) {
251  int32_t val = v;
252  memcpy(buffer, &val, sizeof(int32_t));
253  return ByteSize(v);
254 }
255 
// NOTE(review): this listing omits source line 257 (the start of the signature);
// judging by the parameters this is presumably the Encode specialization for
// StringValue -- confirm against the real header.
// Writes v.len as a 4-byte length prefix in host byte order, followed by v.len
// bytes copied from v.ptr, and returns ByteSize(v). 'fixed_len_size' is unused
// and no bounds check is done on 'buffer'.
256 template<>
258  uint8_t* buffer, int fixed_len_size, const StringValue& v) {
259  memcpy(buffer, &v.len, sizeof(int32_t));
260  memcpy(buffer + sizeof(int32_t), v.ptr, v.len);
261  return ByteSize(v);
262 }
263 
// NOTE(review): this listing omits source line 265 (the start of the signature);
// judging by the parameters this is presumably the Decode specialization for
// StringValue -- confirm against the real header.
// Reads the 4-byte length prefix into v->len and points v->ptr at the bytes that
// follow it inside 'buffer'; the string data is not copied, so '*v' is only valid
// while 'buffer' is. If fixed_len_size > 0, v->len is capped at fixed_len_size.
// NOTE(review): the length read from 'buffer' is not checked here against the
// amount of data actually available -- confirm that callers validate it.
264 template<>
266  uint8_t* buffer, int fixed_len_size, StringValue* v) {
267  memcpy(&v->len, buffer, sizeof(int32_t));
268  v->ptr = reinterpret_cast<char*>(buffer) + sizeof(int32_t);
269  int bytesize = ByteSize(*v); // taken before any truncation so the full encoded size is returned
270  if (fixed_len_size > 0) v->len = std::min(v->len, fixed_len_size);
271  // we still read bytesize bytes, even if we truncate
272  return bytesize;
273 }
274 
// NOTE(review): this listing omits source line 281 (the start of the signature);
// presumably this is the EncodeToFixedLenByteArray specialization for
// Decimal4Value -- confirm against the real header.
// Delegates to DecimalUtil::EncodeToFixedLenByteArray and returns fixed_len_size
// as the number of bytes written.
280 template<>
282  uint8_t* buffer, int fixed_len_size, const Decimal4Value& v) {
283  DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);
284  return fixed_len_size;
285 }
286 
// NOTE(review): this listing omits source line 288 (the start of the signature);
// presumably this is the EncodeToFixedLenByteArray specialization for
// Decimal8Value -- confirm against the real header.
// Delegates to DecimalUtil::EncodeToFixedLenByteArray and returns fixed_len_size
// as the number of bytes written.
287 template<>
289  uint8_t* buffer, int fixed_len_size, const Decimal8Value& v) {
290  DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);
291  return fixed_len_size;
292 }
293 
// NOTE(review): this listing omits source line 295 (the start of the signature);
// presumably this is the EncodeToFixedLenByteArray specialization for
// Decimal16Value -- confirm against the real header.
// Delegates to DecimalUtil::EncodeToFixedLenByteArray and returns fixed_len_size
// as the number of bytes written.
294 template<>
296  uint8_t* buffer, int fixed_len_size, const Decimal16Value& v) {
297  DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);
298  return fixed_len_size;
299 }
300 
// NOTE(review): this listing omits source line 302 (the start of the signature);
// presumably this is the DecodeFromFixedLenByteArray specialization for
// Decimal4Value -- confirm against the real header.
// Delegates to DecimalUtil::DecodeFromFixedLenByteArray and returns fixed_len_size
// as the number of bytes consumed.
301 template<>
303  uint8_t* buffer, int fixed_len_size, Decimal4Value* v) {
304  DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);
305  return fixed_len_size;
306 }
307 
// NOTE(review): this listing omits source line 309 (the start of the signature);
// presumably this is the DecodeFromFixedLenByteArray specialization for
// Decimal8Value -- confirm against the real header.
// Delegates to DecimalUtil::DecodeFromFixedLenByteArray and returns fixed_len_size
// as the number of bytes consumed.
308 template<>
310  uint8_t* buffer, int fixed_len_size, Decimal8Value* v) {
311  DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);
312  return fixed_len_size;
313 }
314 
// NOTE(review): this listing omits source line 316 (the start of the signature);
// presumably this is the DecodeFromFixedLenByteArray specialization for
// Decimal16Value -- confirm against the real header.
// Delegates to DecimalUtil::DecodeFromFixedLenByteArray and returns fixed_len_size
// as the number of bytes consumed.
315 template<>
317  uint8_t* buffer, int fixed_len_size, Decimal16Value* v) {
318  DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);
319  return fixed_len_size;
320 }
321 
322 }
323 
324 #endif
const parquet::CompressionCodec::type IMPALA_TO_PARQUET_CODEC[]
Mapping of Impala codec enums to Parquet enums.
static int DecimalSize(const ColumnType &t)
The minimum byte size to store decimals with precision t.precision.
static int ByteSize(const ColumnType &t)
static void DecodeFromFixedLenByteArray(const uint8_t *buffer, int fixed_len_size, T *v)
Definition: decimal-util.h:87
static int DecodeFromFixedLenByteArray(uint8_t *buffer, int fixed_len_size, T *v)
int precision
Only set if type == TYPE_DECIMAL.
Definition: types.h:68
const uint8_t PARQUET_VERSION_NUMBER[4]
static void EncodeToFixedLenByteArray(uint8_t *buffer, int fixed_len_size, const T &v)
Write decimals as big endian (byte comparable) in fixed_len_size bytes.
Definition: decimal-util.h:61
const THdfsCompression::type PARQUET_TO_IMPALA_CODEC[]
Mapping of Parquet codec enums to Impala enums.
static int ByteSize(const T &v)
Returns the byte size of 'v'.
static int Encode(uint8_t *buffer, int fixed_len_size, const T &t)
PrimitiveType type
Definition: types.h:60
const parquet::Type::type IMPALA_TO_PARQUET_TYPES[]
static int Decode(uint8_t *buffer, int fixed_len_size, T *v)
static int EncodeToFixedLenByteArray(uint8_t *buffer, int fixed_len_size, const T &t)
const uint32_t PARQUET_CURRENT_VERSION