Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
text-converter.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <boost/algorithm/string.hpp>
16 
17 #include "codegen/llvm-codegen.h"
18 #include "runtime/descriptors.h"
19 #include "runtime/mem-pool.h"
20 #include "runtime/runtime-state.h"
21 #include "runtime/string-value.h"
23 #include "runtime/tuple.h"
24 #include "text-converter.h"
25 #include "util/string-parser.h"
26 
27 #include "common/names.h"
28 
29 using namespace impala;
30 using namespace llvm;
31 
32 TextConverter::TextConverter(char escape_char, const string& null_col_val,
33  bool check_null)
34  : escape_char_(escape_char),
35  null_col_val_(null_col_val),
36  check_null_(check_null) {
37 }
38 
40  char* new_data = reinterpret_cast<char*>(pool->Allocate(value->len));
41  UnescapeString(value->ptr, new_data, &value->len);
42  value->ptr = new_data;
43 }
44 
45 void TextConverter::UnescapeString(const char* src, char* dest, int* len,
46  int64_t maxlen) {
47  const char* src_end = src + *len;
48  char* dest_end = dest + *len;
49  if (maxlen > 0) dest_end = dest + maxlen;
50  char* dest_ptr = dest;
51  bool escape_next_char = false;
52 
53  while ((src < src_end) && (dest_ptr < dest_end)) {
54  if (*src == escape_char_) {
55  escape_next_char = !escape_next_char;
56  } else {
57  escape_next_char = false;
58  }
59  if (escape_next_char) {
60  ++src;
61  } else {
62  *dest_ptr++ = *src++;
63  }
64  }
65  char* dest_start = reinterpret_cast<char*>(dest);
66  *len = dest_ptr - dest_start;
67 }
68 
69 // Codegen for a function to parse one slot. The IR for an int slot looks like:
70 // define i1 @WriteSlot({ i8, i32 }* %tuple_arg, i8* %data, i32 %len) {
71 // entry:
72 // %parse_result = alloca i32
73 // %0 = call i1 @IsNullString(i8* %data, i32 %len)
74 // br i1 %0, label %set_null, label %check_zero
75 //
76 // set_null: ; preds = %check_zero, %entry
77 // call void @SetNull({ i8, i32 }* %tuple_arg)
78 // ret i1 true
79 //
80 // parse_slot: ; preds = %check_zero
81 // %slot = getelementptr inbounds { i8, i32 }* %tuple_arg, i32 0, i32 1
82 // %1 = call i32 @IrStringToInt32(i8* %data, i32 %len, i32* %parse_result)
83 // %parse_result1 = load i32* %parse_result
84 // %failed = icmp eq i32 %parse_result1, 1
85 // br i1 %failed, label %parse_fail, label %parse_success
86 //
87 // check_zero: ; preds = %entry
88 // %2 = icmp sle i32 %len, 0
89 // br i1 %2, label %set_null, label %parse_slot
90 //
91 // parse_success: ; preds = %parse_slot
92 // store i32 %1, i32* %slot
93 // ret i1 true
94 //
95 // parse_fail: ; preds = %parse_slot
96 // call void @SetNull({ i8, i32 }* %tuple_arg)
97 // ret i1 false
98 // }
100  TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc,
101  const char* null_col_val, int len, bool check_null) {
102  if (slot_desc->type().type == TYPE_CHAR) {
103  LOG(INFO) << "Char isn't supported for CodegenWriteSlot";
104  return NULL;
105  }
106  SCOPED_TIMER(codegen->codegen_timer());
107 
108  // Codegen is_null_string
109  bool is_default_null = (len == 2 && null_col_val[0] == '\\' && null_col_val[1] == 'N');
110  Function* is_null_string_fn;
111  if (is_default_null) {
112  is_null_string_fn = codegen->GetFunction(IRFunction::IS_NULL_STRING);
113  } else {
114  is_null_string_fn = codegen->GetFunction(IRFunction::GENERIC_IS_NULL_STRING);
115  }
116  if (is_null_string_fn == NULL) return NULL;
117 
118  StructType* tuple_type = tuple_desc->GenerateLlvmStruct(codegen);
119  if (tuple_type == NULL) return NULL;
120  PointerType* tuple_ptr_type = PointerType::get(tuple_type, 0);
121 
122  Function* set_null_fn = slot_desc->CodegenUpdateNull(codegen, tuple_type, true);
123  if (set_null_fn == NULL) {
124  LOG(ERROR) << "Could not codegen WriteSlot because slot update codegen failed.";
125  return NULL;
126  }
127 
128  LlvmCodeGen::FnPrototype prototype(
129  codegen, "WriteSlot", codegen->GetType(TYPE_BOOLEAN));
130  prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple_arg", tuple_ptr_type));
131  prototype.AddArgument(LlvmCodeGen::NamedVariable("data", codegen->ptr_type()));
132  prototype.AddArgument(LlvmCodeGen::NamedVariable("len", codegen->GetType(TYPE_INT)));
133 
134  LlvmCodeGen::LlvmBuilder builder(codegen->context());
135  Value* args[3];
136  Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
137 
138  BasicBlock* set_null_block, *parse_slot_block, *check_zero_block = NULL;
139  codegen->CreateIfElseBlocks(fn, "set_null", "parse_slot",
140  &set_null_block, &parse_slot_block);
141 
142  if (!slot_desc->type().IsVarLen()) {
143  check_zero_block = BasicBlock::Create(codegen->context(), "check_zero", fn);
144  }
145 
146  // Check if the data matches the configured NULL string.
147  Value* is_null;
148  if (check_null) {
149  if (is_default_null) {
150  is_null = builder.CreateCall2(is_null_string_fn, args[1], args[2]);
151  } else {
152  is_null = builder.CreateCall4(is_null_string_fn, args[1], args[2],
153  codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
154  const_cast<char*>(null_col_val)),
155  codegen->GetIntConstant(TYPE_INT, len));
156  }
157  } else {
158  // Constant FALSE as branch condition. We rely on later optimization passes
159  // to remove the branch and THEN block.
160  is_null = codegen->false_value();
161  }
162  builder.CreateCondBr(is_null, set_null_block,
163  (slot_desc->type().IsVarLen()) ? parse_slot_block : check_zero_block);
164 
165  if (!slot_desc->type().IsVarLen()) {
166  builder.SetInsertPoint(check_zero_block);
167  // If len <= 0 and it is not a string col, set slot to NULL
168  // The len can be less than 0 if the field contained an escape character which
169  // is only valid for string cols.
170  Value* null_len = builder.CreateICmpSLE(
171  args[2], codegen->GetIntConstant(TYPE_INT, 0));
172  builder.CreateCondBr(null_len, set_null_block, parse_slot_block);
173  }
174 
175  // Codegen parse slot block
176  builder.SetInsertPoint(parse_slot_block);
177  Value* slot = builder.CreateStructGEP(args[0], slot_desc->field_idx(), "slot");
178 
179  if (slot_desc->type().IsVarLen()) {
180  Value* ptr = builder.CreateStructGEP(slot, 0, "string_ptr");
181  Value* len = builder.CreateStructGEP(slot, 1, "string_len");
182 
183  builder.CreateStore(args[1], ptr);
184  // TODO codegen memory allocation for CHAR
185  DCHECK(slot_desc->type().type != TYPE_CHAR);
186  if (slot_desc->type().type == TYPE_VARCHAR) {
187  // determine if we need to truncate the string
188  Value* maxlen = codegen->GetIntConstant(TYPE_INT, slot_desc->type().len);
189  Value* len_lt_maxlen = builder.CreateICmpSLT(args[2], maxlen, "len_lt_maxlen");
190  Value* minlen = builder.CreateSelect(len_lt_maxlen, args[2], maxlen,
191  "select_min_len");
192  builder.CreateStore(minlen, len);
193  } else {
194  builder.CreateStore(args[2], len);
195  }
196  builder.CreateRet(codegen->true_value());
197  } else {
198  IRFunction::Type parse_fn_enum;
199  Function* parse_fn = NULL;
200  switch (slot_desc->type().type) {
201  case TYPE_BOOLEAN:
202  parse_fn_enum = IRFunction::STRING_TO_BOOL;
203  break;
204  case TYPE_TINYINT:
205  parse_fn_enum = IRFunction::STRING_TO_INT8;
206  break;
207  case TYPE_SMALLINT:
208  parse_fn_enum = IRFunction::STRING_TO_INT16;
209  break;
210  case TYPE_INT:
211  parse_fn_enum = IRFunction::STRING_TO_INT32;
212  break;
213  case TYPE_BIGINT:
214  parse_fn_enum = IRFunction::STRING_TO_INT64;
215  break;
216  case TYPE_FLOAT:
217  parse_fn_enum = IRFunction::STRING_TO_FLOAT;
218  break;
219  case TYPE_DOUBLE:
220  parse_fn_enum = IRFunction::STRING_TO_DOUBLE;
221  break;
222  default:
223  DCHECK(false);
224  return NULL;
225  }
226  parse_fn = codegen->GetFunction(parse_fn_enum);
227  DCHECK(parse_fn != NULL);
228 
229  // Set up trying to parse the string to the slot type
230  BasicBlock* parse_success_block, *parse_failed_block;
231  codegen->CreateIfElseBlocks(fn, "parse_success", "parse_fail",
232  &parse_success_block, &parse_failed_block);
233  LlvmCodeGen::NamedVariable parse_result("parse_result", codegen->GetType(TYPE_INT));
234  Value* parse_result_ptr = codegen->CreateEntryBlockAlloca(fn, parse_result);
235  Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE);
236 
237  // Call Impala's StringTo* function
238  Value* result = builder.CreateCall3(parse_fn, args[1], args[2], parse_result_ptr);
239  Value* parse_result_val = builder.CreateLoad(parse_result_ptr, "parse_result");
240 
241  // Check for parse error. TODO: handle overflow
242  Value* parse_failed = builder.CreateICmpEQ(parse_result_val, failed_value, "failed");
243  builder.CreateCondBr(parse_failed, parse_failed_block, parse_success_block);
244 
245  // Parse succeeded
246  builder.SetInsertPoint(parse_success_block);
247  builder.CreateStore(result, slot);
248  builder.CreateRet(codegen->true_value());
249 
250  // Parse failed, set slot to null and return false
251  builder.SetInsertPoint(parse_failed_block);
252  builder.CreateCall(set_null_fn, args[0]);
253  builder.CreateRet(codegen->false_value());
254  }
255 
256  // Case where data is \N or len == 0 and it is not a string col
257  builder.SetInsertPoint(set_null_block);
258  builder.CreateCall(set_null_fn, args[0]);
259  builder.CreateRet(codegen->true_value());
260 
261  return codegen->FinalizeFunction(fn);
262 }
bool IsVarLen() const
Definition: types.h:172
RuntimeProfile::Counter * codegen_timer()
Definition: llvm-codegen.h:135
Utility struct that wraps a variable name and llvm type.
Definition: llvm-codegen.h:149
void UnescapeString(const char *src, char *dest, int *len, int64_t maxlen=-1)
TextConverter(char escape_char, const std::string &null_col_val, bool check_null=true)
llvm::StructType * GenerateLlvmStruct(LlvmCodeGen *codegen)
Definition: descriptors.cc:556
#define SCOPED_TIMER(c)
LLVM code generator. This is the top level object to generate jitted code.
Definition: llvm-codegen.h:107
PrimitiveType type
Definition: types.h:60
llvm::Value * CastPtrToLlvmPtr(llvm::Type *type, const void *ptr)
void AddArgument(const NamedVariable &var)
Add argument.
Definition: llvm-codegen.h:171
const ColumnType & type() const
Definition: descriptors.h:78
ObjectPool pool
llvm::Function * GetFunction(IRFunction::Type)
int len
Only set if type == TYPE_CHAR or type == TYPE_VARCHAR.
Definition: types.h:62
void CreateIfElseBlocks(llvm::Function *fn, const std::string &if_name, const std::string &else_name, llvm::BasicBlock **if_block, llvm::BasicBlock **else_block, llvm::BasicBlock *insert_before=NULL)
llvm::Function * CodegenUpdateNull(LlvmCodeGen *, llvm::StructType *tuple, bool set_null)
Definition: descriptors.cc:510
llvm::Value * true_value()
Returns true/false constants (bool type)
Definition: llvm-codegen.h:380
llvm::Value * false_value()
Definition: llvm-codegen.h:381
llvm::Type * GetType(const ColumnType &type)
Returns llvm type for the column type.
int field_idx() const
Returns the field index in the generated llvm struct for this slot's tuple.
Definition: descriptors.h:87
llvm::Value * GetIntConstant(PrimitiveType type, int64_t val)
Returns the constant 'val' of 'type'.
llvm::Function * FinalizeFunction(llvm::Function *function)
llvm::LLVMContext & context()
Definition: llvm-codegen.h:214
llvm::AllocaInst * CreateEntryBlockAlloca(llvm::Function *f, const NamedVariable &var)
uint8_t * Allocate(int size)
Definition: mem-pool.h:92
llvm::PointerType * ptr_type()
Definition: llvm-codegen.h:393
static llvm::Function * CodegenWriteSlot(LlvmCodeGen *codegen, TupleDescriptor *tuple_desc, SlotDescriptor *slot_desc, const char *null_col_val, int len, bool check_null)