Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
delimited-text-parser-test.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <string>
16 #include <gtest/gtest.h>
17 
19 #include "util/cpu-info.h"
20 
21 #include "common/names.h"
22 
23 namespace impala {
24 
25 void Validate(DelimitedTextParser* parser, const string& data,
26  int expected_offset, char tuple_delim, int expected_num_tuples,
27  int expected_num_fields) {
28  parser->ParserReset();
29  char* data_ptr = const_cast<char*>(data.c_str());
30  int remaining_len = data.size();
31  int offset = parser->FindFirstInstance(data_ptr, remaining_len);
32 
33  EXPECT_EQ(offset, expected_offset) << data;
34  if (offset == -1) return;
35 
36  EXPECT_GE(offset, 1) << data;
37  EXPECT_LT(offset, data.size()) << data;
38  EXPECT_EQ(data[offset - 1], tuple_delim) << data;
39 
40  data_ptr += offset;
41  remaining_len -= offset;
42 
43  char* row_end_locs[100];
44  vector<FieldLocation> field_locations(100);
45  int num_tuples = 0;
46  int num_fields = 0;
47  char* next_column_start;
48  Status status = parser->ParseFieldLocations(
49  100, remaining_len, &data_ptr, &row_end_locs[0], &field_locations[0], &num_tuples,
50  &num_fields, &next_column_start);
51  EXPECT_EQ(num_tuples, expected_num_tuples) << data;
52  EXPECT_EQ(num_fields, expected_num_fields) << data;
53 }
54 
56  const char TUPLE_DELIM = '|';
57  const char FIELD_DELIM = ',';
58  const char COLLECTION_DELIM = ',';
59  const char ESCAPE_CHAR = '@';
60 
61  const int NUM_COLS = 1;
62 
63  // Test without escape
64  bool is_materialized_col[NUM_COLS];
65  for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
66 
67  DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
68  TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM);
69  // Note that only complete tuples "count"
70  Validate(&no_escape_parser, "no_delims", -1, TUPLE_DELIM, 0, 0);
71  Validate(&no_escape_parser, "abc||abc", 4, TUPLE_DELIM, 1, 1);
72  Validate(&no_escape_parser, "|abcd", 1, TUPLE_DELIM, 0, 0);
73  Validate(&no_escape_parser, "a|bcd", 2, TUPLE_DELIM, 0, 0);
74 
75  // Test with escape char
76  DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
77  TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
78  ESCAPE_CHAR);
79  Validate(&escape_parser, "a@|a|bcd", 5, TUPLE_DELIM, 0, 0);
80  Validate(&escape_parser, "a@@|a|bcd", 4, TUPLE_DELIM, 1, 1);
81  Validate(&escape_parser, "a@@@|a|bcd", 7, TUPLE_DELIM, 0, 0);
82  Validate(&escape_parser, "a@@@@|a|bcd", 6, TUPLE_DELIM, 1, 1);
83  Validate(&escape_parser, "a|@@@|a|bcd", 2, TUPLE_DELIM, 1, 1);
84 
85  // // The parser doesn't support this case.
86  // // TODO: update test when it is fixed
87  // Validate(&escape_parser, "@|no_delims", -1, TUPLE_DELIM);
88 
89  // Test null characters
90  const string str1("\0no_delims", 10);
91  const string str2("ab\0||abc", 8);
92  const string str3("\0|\0|\0", 5);
93  const string str4("abc|\0a|abc", 10);
94  const string str5("\0|aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 32);
95  Validate(&no_escape_parser, str1, -1, TUPLE_DELIM, 0, 0);
96  Validate(&no_escape_parser, str2, 4, TUPLE_DELIM, 1, 1);
97  Validate(&no_escape_parser, str3, 2, TUPLE_DELIM, 1, 1);
98  Validate(&no_escape_parser, str4, 4, TUPLE_DELIM, 1, 1);
99  Validate(&no_escape_parser, str5, 2, TUPLE_DELIM, 0, 0);
100 
101  const string str6("\0@|\0|\0", 6);
102  const string str7("\0@@|\0|\0", 6);
103  const string str8("\0@\0@|\0|\0", 8);
104  const string str9("\0@||aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 34);
105  Validate(&escape_parser, str6, 5, TUPLE_DELIM, 0, 0);
106  Validate(&escape_parser, str7, 4, TUPLE_DELIM, 1, 1);
107  Validate(&escape_parser, str8, 7, TUPLE_DELIM, 0, 0);
108  Validate(&escape_parser, str9, 4, TUPLE_DELIM, 0, 0);
109 }
110 
112  const char TUPLE_DELIM = '|';
113  const char FIELD_DELIM = ',';
114  const char COLLECTION_DELIM = ',';
115  const char ESCAPE_CHAR = '@';
116 
117  const int NUM_COLS = 2;
118 
119  bool is_materialized_col[NUM_COLS];
120  for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
121 
122  DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
123  TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM);
124 
125  Validate(&no_escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3);
126  Validate(&no_escape_parser, "b|c,d|e,f", 2, TUPLE_DELIM, 1, 3);
127  Validate(&no_escape_parser, "a,|c,d|", 3, TUPLE_DELIM, 1, 2);
128  Validate(&no_escape_parser, "a,|c|e", 3, TUPLE_DELIM, 1, 2);
129  const string str10("a,\0|c,d|e", 9);
130  Validate(&no_escape_parser, str10, 4, TUPLE_DELIM, 1, 2);
131 
132  DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
133  TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
134  ESCAPE_CHAR);
135 
136  Validate(&escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3);
137  Validate(&escape_parser, "a,@|c|e,f", 6, TUPLE_DELIM, 0, 1);
138  Validate(&escape_parser, "a|b,c|d@,e", 2, TUPLE_DELIM, 1, 2);
139 }
140 
141 // TODO: expand test for other delimited text parser functions/cases.
142 // Not all of them work without creating a HdfsScanNode but we can expand
143 // these tests quite a bit more.
144 
145 }
146 
147 int main(int argc, char **argv) {
148  ::testing::InitGoogleTest(&argc, argv);
150  return RUN_ALL_TESTS();
151 }
152 
TEST(AtomicTest, Basic)
Definition: atomic-test.cc:28
int FindFirstInstance(const char *buffer, int len)
Status ParseFieldLocations(int max_tuples, int64_t remaining_len, char **byte_buffer_ptr, char **row_end_locations, FieldLocation *field_locations, int *num_tuples, int *num_fields, char **next_column_start)
int main(int argc, char **argv)
uint8_t offset[7 *64-sizeof(uint64_t)]
void Validate(DelimitedTextParser *parser, const string &data, int expected_offset, char tuple_delim, int expected_num_tuples, int expected_num_fields)
static void Init()
Initialize CpuInfo.
Definition: cpu-info.cc:75
void ParserReset()
Called to initialize parser at beginning of scan range.