doc/html/parquet-common_8h_source.html

 // Copyright 2012 Cloudera Inc.

 //

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 //

 // http://www.apache.org/licenses/LICENSE-2.0

 //

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.


 #ifndef IMPALA_EXEC_PARQUET_COMMON_H

 #define IMPALA_EXEC_PARQUET_COMMON_H


 #include "gen-cpp/Descriptors_types.h"

 #include "gen-cpp/parquet_types.h"

 #include "runtime/decimal-value.h"

 #include "runtime/string-value.h"

 #include "util/bit-util.h"


 namespace impala {


 class TimestampValue;


 // The four bytes 'P', 'A', 'R', '1'.
 // NOTE(review): presumably the magic marker that delimits a Parquet file; the
 // "VERSION_NUMBER" name is historical -- confirm against the reader/writer code.
 const uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};

 // NOTE(review): presumably the format version stamped into file metadata by the
 // writer -- confirm at the use site.
 const uint32_t PARQUET_CURRENT_VERSION = 1;


 // Lookup table from an Impala type to the Parquet physical type used to store it.
 // NOTE(review): presumably indexed by the PrimitiveType enum value, so the entry
 // order must match that enum exactly -- confirm against types.h. The unlabelled
 // entries are presumably BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE
 // (the three INT32 rows agree with ByteSize(const ColumnType&) returning 4 for
 // TINYINT/SMALLINT/INT).
 // NOTE(review): TYPE_CHAR appears to be declared before TYPE_VARCHAR in types.h, so
 // the last two labels may be swapped; harmless because both map to BYTE_ARRAY.
 const parquet::Type::type IMPALA_TO_PARQUET_TYPES[] = {

   parquet::Type::BOOLEAN,     // Invalid

   parquet::Type::BOOLEAN,     // NULL type

   parquet::Type::BOOLEAN,

   parquet::Type::INT32,

   parquet::Type::INT32,

   parquet::Type::INT32,

   parquet::Type::INT64,

   parquet::Type::FLOAT,

   parquet::Type::DOUBLE,

   parquet::Type::INT96,       // Timestamp

   parquet::Type::BYTE_ARRAY,  // String

   parquet::Type::BYTE_ARRAY,  // Date, NYI

   parquet::Type::BYTE_ARRAY,  // DateTime, NYI

   parquet::Type::BYTE_ARRAY,  // Binary NYI

   parquet::Type::FIXED_LEN_BYTE_ARRAY, // Decimal

   parquet::Type::BYTE_ARRAY,  // VARCHAR(N)

   parquet::Type::BYTE_ARRAY,  // CHAR(N)

 };


 // Mapping of Parquet codec enums to Impala enums.
 // NOTE(review): presumably indexed by the parquet::CompressionCodec value, which
 // requires that enum to be ordered UNCOMPRESSED, SNAPPY, GZIP, LZO -- confirm
 // against the generated parquet_types.h.
 const THdfsCompression::type PARQUET_TO_IMPALA_CODEC[] = {

   THdfsCompression::NONE,

   THdfsCompression::SNAPPY,

   THdfsCompression::GZIP,

   THdfsCompression::LZO

 };


 // Mapping of Impala codec enums to Parquet enums. Several Impala codecs collapse
 // onto the same Parquet codec (see the trailing comments).
 // NOTE(review): presumably indexed by the THdfsCompression value. The table has
 // seven entries; verify that both the count and the order match the
 // THdfsCompression enum, since a mismatch would silently pick the wrong codec or
 // index past the end of the array.
 const parquet::CompressionCodec::type IMPALA_TO_PARQUET_CODEC[] = {

   parquet::CompressionCodec::UNCOMPRESSED,

   parquet::CompressionCodec::SNAPPY,  // DEFAULT

   parquet::CompressionCodec::GZIP,    // GZIP

   parquet::CompressionCodec::GZIP,    // DEFLATE

   parquet::CompressionCodec::SNAPPY,

   parquet::CompressionCodec::SNAPPY,  // SNAPPY_BLOCKED

   parquet::CompressionCodec::LZO,

 };


 // Static helpers for reading and writing values in Parquet's plain encoding. The
 // class holds no data members; every entry point is a static function (template).
 // Types whose encoded form differs from their in-memory form are handled by the
 // explicit specializations that follow the class definition.
 class ParquetPlainEncoder {
  public:
   // Returns the byte size of 'v'. The generic version is the in-memory size of T.
   template<typename T>
   static int ByteSize(const T& v) { return sizeof(T); }

   // Returns the encoded width in bytes of a value of column type 't', or -1 if the
   // width is not fixed. NULL, BOOLEAN and any type not listed below hit a DCHECK
   // and also return -1.
   static int ByteSize(const ColumnType& t) {
     switch (t.type) {
       // Everything up to INT, and FLOAT, occupies four bytes.
       case TYPE_TINYINT:
       case TYPE_SMALLINT:
       case TYPE_INT:
       case TYPE_FLOAT:
         return 4;
       case TYPE_BIGINT:
       case TYPE_DOUBLE:
         return 8;
       case TYPE_TIMESTAMP:
         return 12;
       case TYPE_DECIMAL:
         // Depends on the declared precision.
         return DecimalSize(t);
       case TYPE_STRING:
       case TYPE_VARCHAR:
       case TYPE_CHAR:
         // Variable length. CHAR is included because the padding is not written to
         // the file.
         return -1;
       case TYPE_NULL:
       case TYPE_BOOLEAN:
         // These types are not plain encoded.
       default:
         DCHECK(false);
         return -1;
     }
   }

   // The minimum byte size needed to store decimals with precision t.precision.
   // Precisions outside [1, 38] hit a DCHECK and return -1.
   static int DecimalSize(const ColumnType& t) {
     DCHECK(t.type == TYPE_DECIMAL);
     // MAX_PRECISION[i] is the largest precision that fits in (i + 1) bytes. The
     // number after each byte count is the largest positive value representable in
     // that many bytes (the most negative value is -(X + 1)).
     // TODO: use closed form for this?
     static const int MAX_PRECISION[] = {
       2,   //  1 byte:  127
       4,   //  2 bytes: 32,767
       6,   //  3 bytes: 8,388,607
       9,   //  4 bytes: 2,147,483,647
       11,  //  5 bytes: 549,755,813,887
       14,  //  6 bytes: 140,737,488,355,327
       16,  //  7 bytes: 36,028,797,018,963,967
       18,  //  8 bytes: 9,223,372,036,854,775,807
       21,  //  9 bytes: 2,361,183,241,434,822,606,847
       23,  // 10 bytes: 604,462,909,807,314,587,353,087
       26,  // 11 bytes: 154,742,504,910,672,534,362,390,527
       28,  // 12 bytes: 39,614,081,257,132,168,796,771,975,167
       31,  // 13 bytes: 10,141,204,801,825,835,211,973,625,643,007
       33,  // 14 bytes: 2,596,148,429,267,413,814,265,248,164,610,047
       35,  // 15 bytes: 664,613,997,892,457,936,451,903,530,140,172,287
       38,  // 16 bytes: 170,141,183,460,469,231,731,687,303,715,884,105,727
     };
     const int max_bytes = sizeof(MAX_PRECISION) / sizeof(MAX_PRECISION[0]);
     if (t.precision >= 1) {
       // Pick the first (smallest) byte count whose capacity covers the precision.
       for (int num_bytes = 1; num_bytes <= max_bytes; ++num_bytes) {
         if (t.precision <= MAX_PRECISION[num_bytes - 1]) return num_bytes;
       }
     }
     // Precision is below 1 or above 38.
     DCHECK(false);
     return -1;
   }

   // Copies the in-memory bytes of 't' into 'buffer' and returns the number of bytes
   // written, which is ByteSize(t). The caller must supply a buffer with at least
   // that much room. 'fixed_len_size' is not used by this generic version; only
   // specializations consume it.
   template<typename T>
   static int Encode(uint8_t* buffer, int fixed_len_size, const T& t) {
     const int num_bytes = ByteSize(t);
     memcpy(buffer, &t, num_bytes);
     return num_bytes;
   }

   // Copies ByteSize(*v) bytes from 'buffer' into '*v' and returns the number of
   // bytes consumed. 'fixed_len_size' is not used by this generic version; only
   // specializations consume it.
   template<typename T>
   static int Decode(uint8_t* buffer, int fixed_len_size, T* v) {
     const int num_bytes = ByteSize(*v);
     memcpy(v, buffer, num_bytes);
     return num_bytes;
   }

   // Declared here only; the definition is not part of this header.
   // NOTE(review): presumably writes 't' into 'fixed_len_size' bytes of 'buffer' and
   // returns the byte count -- confirm against the definition.
   template <typename T>
   static int EncodeToFixedLenByteArray(uint8_t* buffer, int fixed_len_size, const T& t);

   // Declared here only; the definition is not part of this header.
   // NOTE(review): presumably the inverse of EncodeToFixedLenByteArray -- confirm
   // against the definition.
   template<typename T>
   static int DecodeFromFixedLenByteArray(uint8_t* buffer, int fixed_len_size, T* v);
 };


 // Explicit specializations for bool are declared but not defined in this header,
 // so the generic memcpy-based templates are never instantiated for bool.
 // ByteSize(const ColumnType&) notes that BOOLEAN is not plain encoded.
 // NOTE(review): if no definition exists anywhere, any use with bool fails at link
 // time, which is presumably the intent -- confirm.
 template<> int ParquetPlainEncoder::ByteSize(const bool& b);

 template<> int ParquetPlainEncoder::Encode(uint8_t*, int fixed_len_size, const bool&);

 template<> int ParquetPlainEncoder::Decode(uint8_t*, int fixed_len_size, bool* v);


 // ByteSize must not be called on decimal values: it trips a DCHECK and returns -1.
 // The decimal Encode/Decode specializations take the encoded width from their
 // 'fixed_len_size' argument instead.
 template<> inline int ParquetPlainEncoder::ByteSize(const Decimal4Value&) {

   DCHECK(false);

   return -1;

 }

 // Same as the Decimal4Value overload: DCHECKs and returns -1.
 template<> inline int ParquetPlainEncoder::ByteSize(const Decimal8Value&) {

   DCHECK(false);

   return -1;

 }

 // Same as the Decimal4Value overload: DCHECKs and returns -1.
 template<> inline int ParquetPlainEncoder::ByteSize(const Decimal16Value&) {

   DCHECK(false);

   return -1;

 }


 // An int8_t is reported as sizeof(int32_t) bytes, not 1: the int8_t Encode
 // specialization widens the value to int32_t before writing it.
 template<>

 inline int ParquetPlainEncoder::ByteSize(const int8_t& v) { return sizeof(int32_t); }

 // An int16_t is likewise reported as sizeof(int32_t) bytes, matching the int16_t
 // Encode specialization.
 template<>

 inline int ParquetPlainEncoder::ByteSize(const int16_t& v) { return sizeof(int32_t); }


 // Encoded size of a string: sizeof(int32_t) bytes for the length prefix plus
 // v.len bytes of character data (see the StringValue Encode specialization).
 template<>

 inline int ParquetPlainEncoder::ByteSize(const StringValue& v) {

   return sizeof(int32_t) + v.len;

 }


 // Timestamps always occupy 12 bytes, the same width ByteSize(const ColumnType&)
 // returns for TYPE_TIMESTAMP. TimestampValue is only forward-declared in this
 // header, so sizeof cannot be used here.
 template<>

 inline int ParquetPlainEncoder::ByteSize(const TimestampValue& v) {

   return 12;

 }


 // Reads an int8_t from 'buffer'. Only the first byte of the buffer is read, but the
 // returned consumed size is ByteSize(*v), i.e. sizeof(int32_t), because the value
 // is stored widened to 32 bits. 'fixed_len_size' is unused.
 // NOTE(review): taking the first byte as the value assumes the low-order byte is
 // stored first (little-endian) -- confirm.
 template<>
 inline int ParquetPlainEncoder::Decode(uint8_t* buffer, int fixed_len_size, int8_t* v) {
   *v = buffer[0];
   return ByteSize(*v);
 }

 // Reads an int16_t from 'buffer'. Only the first two bytes are copied, but the
 // returned consumed size is ByteSize(*v), i.e. sizeof(int32_t), because the value
 // is stored widened to 32 bits. 'fixed_len_size' is unused.
 // NOTE(review): copying the leading two bytes assumes the low-order bytes are
 // stored first (little-endian) -- confirm.
 template<>
 inline int ParquetPlainEncoder::Decode(uint8_t* buffer, int fixed_len_size, int16_t* v) {
   memcpy(v, buffer, sizeof(*v));
   return ByteSize(*v);
 }


 // Writes an int8_t to 'buffer' widened to a 32-bit integer, so sizeof(int32_t)
 // bytes are written. Returns ByteSize(v). 'fixed_len_size' is unused.
 template<>
 inline int ParquetPlainEncoder::Encode(
     uint8_t* buffer, int fixed_len_size, const int8_t& v) {
   const int32_t widened = v;
   memcpy(buffer, &widened, sizeof(widened));
   return ByteSize(v);
 }


 // Writes an int16_t to 'buffer' widened to a 32-bit integer, so sizeof(int32_t)
 // bytes are written. Returns ByteSize(v). 'fixed_len_size' is unused.
 template<>
 inline int ParquetPlainEncoder::Encode(
     uint8_t* buffer, int fixed_len_size, const int16_t& v) {
   const int32_t widened = v;
   memcpy(buffer, &widened, sizeof(widened));
   return ByteSize(v);
 }


 // Writes a string as a sizeof(int32_t)-byte length prefix followed by v.len bytes
 // of character data. Returns ByteSize(v), the total number of bytes written.
 // 'fixed_len_size' is unused.
 template<>
 inline int ParquetPlainEncoder::Encode(
     uint8_t* buffer, int fixed_len_size, const StringValue& v) {
   uint8_t* const payload = buffer + sizeof(int32_t);
   // Length prefix first, then the characters immediately after it.
   memcpy(buffer, &v.len, sizeof(int32_t));
   memcpy(payload, v.ptr, v.len);
   return ByteSize(v);
 }


 // Reads a length-prefixed string from 'buffer'. No character data is copied:
 // v->ptr is pointed at the bytes inside 'buffer' that follow the length prefix, so
 // the result is only valid while 'buffer' is. When 'fixed_len_size' is positive,
 // v->len is capped at that value. Returns the number of bytes the encoded string
 // occupies in the buffer, computed from the stored (uncapped) length.
 // NOTE(review): the length read from the buffer is not validated here (negative or
 // oversized values are passed through) -- confirm callers check it.
 template<>
 inline int ParquetPlainEncoder::Decode(
     uint8_t* buffer, int fixed_len_size, StringValue* v) {
   char* const payload = reinterpret_cast<char*>(buffer) + sizeof(int32_t);
   memcpy(&v->len, buffer, sizeof(int32_t));
   v->ptr = payload;
   // Capture the consumed size before any truncation: the full encoded value is
   // skipped even when only a prefix of it is exposed.
   const int bytes_consumed = ByteSize(*v);
   if (fixed_len_size > 0 && v->len > fixed_len_size) v->len = fixed_len_size;
   return bytes_consumed;
 }


 // Writes a Decimal4Value into exactly 'fixed_len_size' bytes of 'buffer' by
 // delegating to DecimalUtil::EncodeToFixedLenByteArray. Returns 'fixed_len_size'.
 template<>

 inline int ParquetPlainEncoder::Encode(

     uint8_t* buffer, int fixed_len_size, const Decimal4Value& v) {

   DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);

   return fixed_len_size;

 }


 // Decimal8Value counterpart of the overload above: delegates to
 // DecimalUtil::EncodeToFixedLenByteArray and returns 'fixed_len_size'.
 template<>

 inline int ParquetPlainEncoder::Encode(

     uint8_t* buffer, int fixed_len_size, const Decimal8Value& v) {

   DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);

   return fixed_len_size;

 }


 // Decimal16Value counterpart of the overload above: delegates to
 // DecimalUtil::EncodeToFixedLenByteArray and returns 'fixed_len_size'.
 template<>

 inline int ParquetPlainEncoder::Encode(

     uint8_t* buffer, int fixed_len_size, const Decimal16Value& v) {

   DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);

   return fixed_len_size;

 }


 // Reads a Decimal4Value from 'fixed_len_size' bytes of 'buffer' by delegating to
 // DecimalUtil::DecodeFromFixedLenByteArray. Returns 'fixed_len_size' as the number
 // of bytes consumed.
 template<>

 inline int ParquetPlainEncoder::Decode(

     uint8_t* buffer, int fixed_len_size, Decimal4Value* v) {

   DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);

   return fixed_len_size;

 }


 // Decimal8Value counterpart of the overload above: delegates to
 // DecimalUtil::DecodeFromFixedLenByteArray and returns 'fixed_len_size'.
 template<>

 inline int ParquetPlainEncoder::Decode(

     uint8_t* buffer, int fixed_len_size, Decimal8Value* v) {

   DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);

   return fixed_len_size;

 }


 // Decimal16Value counterpart of the overload above: delegates to
 // DecimalUtil::DecodeFromFixedLenByteArray and returns 'fixed_len_size'.
 template<>

 inline int ParquetPlainEncoder::Decode(

     uint8_t* buffer, int fixed_len_size, Decimal16Value* v) {

   DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);

   return fixed_len_size;

 }


 }


 #endif

impala::IMPALA_TO_PARQUET_CODEC
const parquet::CompressionCodec::type IMPALA_TO_PARQUET_CODEC[]
Mapping of Impala codec enums to Parquet enums.
Definition: parquet-common.h:64

impala::ParquetPlainEncoder::DecimalSize
static int DecimalSize(const ColumnType &t)
The minimum byte size needed to store decimals with precision t.precision.
Definition: parquet-common.h:116

impala::ParquetPlainEncoder::ByteSize
static int ByteSize(const ColumnType &t)
Definition: parquet-common.h:88

impala::TYPE_DOUBLE
Definition: types.h:36

impala::DecimalUtil::DecodeFromFixedLenByteArray
static void DecodeFromFixedLenByteArray(const uint8_t *buffer, int fixed_len_size, T *v)
Definition: decimal-util.h:87

impala::TYPE_CHAR
Definition: types.h:47

impala::StringValue
Definition: string-value.h:33

impala::ParquetPlainEncoder::DecodeFromFixedLenByteArray
static int DecodeFromFixedLenByteArray(uint8_t *buffer, int fixed_len_size, T *v)

impala::TYPE_VARCHAR
Definition: types.h:48

impala::ColumnType::precision
int precision
Only set if type == TYPE_DECIMAL.
Definition: types.h:68

impala::StringValue::len
int len
Definition: string-value.h:38

impala::PARQUET_VERSION_NUMBER
const uint8_t PARQUET_VERSION_NUMBER[4]
Definition: parquet-common.h:30

impala::DecimalUtil::EncodeToFixedLenByteArray
static void EncodeToFixedLenByteArray(uint8_t *buffer, int fixed_len_size, const T &v)
Write decimals as big endian (byte comparable) in fixed_len_size bytes.
Definition: decimal-util.h:61

impala::PARQUET_TO_IMPALA_CODEC
const THdfsCompression::type PARQUET_TO_IMPALA_CODEC[]
Mapping of Parquet codec enums to Impala enums.
Definition: parquet-common.h:56

impala::ParquetPlainEncoder::ByteSize
static int ByteSize(const T &v)
Returns the byte size of 'v'.
Definition: parquet-common.h:84

impala::TYPE_TIMESTAMP
Definition: types.h:37

impala::ParquetPlainEncoder::Encode
static int Encode(uint8_t *buffer, int fixed_len_size, const T &t)
Definition: parquet-common.h:166

impala::TYPE_INT
Definition: types.h:33

impala::ColumnType::type
PrimitiveType type
Definition: types.h:60

impala::IMPALA_TO_PARQUET_TYPES
const parquet::Type::type IMPALA_TO_PARQUET_TYPES[]
Definition: parquet-common.h:35

bit-util.h

impala::TYPE_SMALLINT
Definition: types.h:32

impala::TimestampValue
Definition: timestamp-value.h:65

impala::DecimalValue< int32_t >

impala::TYPE_BOOLEAN
Definition: types.h:30

impala::ParquetPlainEncoder::Decode
static int Decode(uint8_t *buffer, int fixed_len_size, T *v)
Definition: parquet-common.h:176

impala::TYPE_BIGINT
Definition: types.h:34

impala::ParquetPlainEncoder::EncodeToFixedLenByteArray
static int EncodeToFixedLenByteArray(uint8_t *buffer, int fixed_len_size, const T &t)

impala::ParquetPlainEncoder
Definition: parquet-common.h:80

impala::StringValue::ptr
char * ptr
Definition: string-value.h:37

impala::TYPE_NULL
Definition: types.h:29

impala::ColumnType
Definition: types.h:59

decimal-value.h

impala::PARQUET_CURRENT_VERSION
const uint32_t PARQUET_CURRENT_VERSION
Definition: parquet-common.h:31

impala::TYPE_STRING
Definition: types.h:38

impala::TYPE_TINYINT
Definition: types.h:31

string-value.h

impala::TYPE_FLOAT
Definition: types.h:35

impala::TYPE_DECIMAL
Definition: types.h:42