Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
compression-test.cc
Go to the documentation of this file.
1 // Copyright (c) 2012 Cloudera, Inc. All rights reserved.
2 
3 #include <string>
4 #include <gtest/gtest.h>
5 
6 #include "util/compress.h"
7 
8 #include "common/names.h"
9 
10 namespace impala {
11 
12 // Utility benchmark to test how well we can compress random string data.
13 // NumStrings=1000000 MinLen=10 MaxLen=10 Codec=SNAPPY
14 // Uncompressed len: 10000000
15 // Compressed len: 10006377
16 // Sorted Compressed len: 9346971
17 // NumStrings=1000000 MinLen=10 MaxLen=10 Codec=GZIP
18 // Uncompressed len: 10000000
19 // Compressed len: 6352396
20 // Sorted Compressed len: 5712650
21 // NumStrings=1000000 MinLen=5 MaxLen=15 Codec=SNAPPY
22 // Uncompressed len: 9498531
23 // Compressed len: 9503924
24 // Sorted Compressed len: 8825841
25 // NumStrings=1000000 MinLen=5 MaxLen=15 Codec=GZIP
26 // Uncompressed len: 9497973
27 // Compressed len: 6033310
28 // Sorted Compressed len: 5429661
29 
30 // Generates num strings between min_len and max_len.
31 // Outputs the uncompressed/compressed/sorted_compressed sizes.
32 void TestCompression(int num, int min_len, int max_len, THdfsCompression::type codec) {
33  vector<string> strings;
34  uint8_t* buffer = (uint8_t*)malloc(max_len * num);
35  int offset = 0;
36  int len_delta = max_len - min_len;
37  len_delta = max(len_delta, 1);
38  for (int i = 0; i < num; ++i) {
39  int len = rand() % len_delta + min_len;
40  int start = offset;
41  for (int j = 0; j < len; ++j) {
42  buffer[offset++] = rand() % 26 + 'a';
43  }
44  strings.push_back(string((char*)buffer + start, len));
45  }
46 
47  // Sort the input and make a new buffer
48  uint8_t* sorted_buffer = (uint8_t*)malloc(offset);
49  int sorted_offset = 0;
50  sort(strings.begin(), strings.end());
51  for (int i = 0; i < strings.size(); ++i) {
52  memcpy(sorted_buffer + sorted_offset, strings[i].data(), strings[i].size());
53  sorted_offset += strings[i].size();
54  }
55 
56  scoped_ptr<Codec> compressor;
57  Codec::CreateCompressor(NULL, false, codec, &compressor);
58 
59  int64_t compressed_len = compressor->MaxOutputLen(offset);
60  uint8_t* compressed_buffer = (uint8_t*)malloc(compressed_len);
61  compressor->ProcessBlock(true, offset, buffer, &compressed_len, &compressed_buffer);
62 
63  int64_t sorted_compressed_len = compressor->MaxOutputLen(offset);
64  uint8_t* sorted_compressed_buffer = (uint8_t*)malloc(sorted_compressed_len);
65  compressor->ProcessBlock(true, offset, sorted_buffer, &sorted_compressed_len,
66  &sorted_compressed_buffer);
67 
68  cout << "NumStrings=" << num << " MinLen=" << min_len << " MaxLen=" << max_len
69  << " Codec=" << codec << endl;
70  cout << " Uncompressed len: " << offset << endl;
71  cout << " Compressed len: " << compressed_len << endl;
72  cout << " Sorted Compressed len: " << sorted_compressed_len << endl;
73 
74  compressor->Close();
75  free(buffer);
76  free(compressed_buffer);
77  free(sorted_buffer);
78  free(sorted_compressed_buffer);
79 }
80 
81 }
82 
83 int main(int argc, char **argv) {
84  impala::TestCompression(1000000, 10, 10, impala::THdfsCompression::SNAPPY);
85  impala::TestCompression(1000000, 10, 10, impala::THdfsCompression::GZIP);
86  impala::TestCompression(1000000, 5, 15, impala::THdfsCompression::SNAPPY);
87  impala::TestCompression(1000000, 5, 15, impala::THdfsCompression::GZIP);
88  return 0;
89 }
90 
static Status CreateCompressor(MemPool *mem_pool, bool reuse, THdfsCompression::type format, boost::scoped_ptr< Codec > *compressor)
int main(int argc, char **argv)
uint8_t offset[7 *64-sizeof(uint64_t)]
void TestCompression(int num, int min_len, int max_len, THdfsCompression::type codec)