Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
string-functions.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "exprs/string-functions.h"
16 
17 #include <cctype>
18 #include <stdint.h>
19 #include <re2/re2.h>
20 #include <re2/stringpiece.h>
21 
22 #include "exprs/anyval-util.h"
23 #include "exprs/expr.h"
25 #include "runtime/tuple-row.h"
26 #include "util/url-parser.h"
27 
28 #include "common/names.h"
29 
30 using namespace impala_udf;
31 
32 // NOTE: be careful not to use string::append. It is not performant.
33 namespace impala {
34 
35 // This behaves identically to the mysql implementation, namely:
36 // - 1-indexed positions
37 // - supported negative positions (count from the end of the string)
38 // - [optional] len. No len indicates longest substr possible
39 StringVal StringFunctions::Substring(FunctionContext* context,
40  const StringVal& str, const BigIntVal& pos, const BigIntVal& len) {
41  if (str.is_null || pos.is_null || len.is_null) return StringVal::null();
42  int fixed_pos = pos.val;
43  if (fixed_pos < 0) fixed_pos = str.len + fixed_pos + 1;
44  int max_len = str.len - fixed_pos + 1;
45  int fixed_len = ::min(static_cast<int>(len.val), max_len);
46  if (fixed_pos > 0 && fixed_pos <= str.len && fixed_len > 0) {
47  return StringVal(str.ptr + fixed_pos - 1, fixed_len);
48  } else {
49  return StringVal();
50  }
51 }
52 
53 StringVal StringFunctions::Substring(FunctionContext* context,
54  const StringVal& str, const BigIntVal& pos) {
55  // StringVal.len is an int => INT32_MAX
56  return Substring(context, str, pos, BigIntVal(INT32_MAX));
57 }
58 
59 // This behaves identically to the mysql implementation.
60 StringVal StringFunctions::Left(
61  FunctionContext* context, const StringVal& str, const BigIntVal& len) {
62  return Substring(context, str, 1, len);
63 }
64 
65 // This behaves identically to the mysql implementation.
66 StringVal StringFunctions::Right(
67  FunctionContext* context, const StringVal& str, const BigIntVal& len) {
68  // Don't index past the beginning of str, otherwise we'll get an empty string back
69  int64_t pos = ::max(-len.val, static_cast<int64_t>(-str.len));
70  return Substring(context, str, BigIntVal(pos), len);
71 }
72 
73 StringVal StringFunctions::Space(FunctionContext* context, const BigIntVal& len) {
74  if (len.is_null) return StringVal::null();
75  if (len.val <= 0) return StringVal();
76  StringVal result(context, len.val);
77  memset(result.ptr, ' ', len.val);
78  return result;
79 }
80 
81 StringVal StringFunctions::Repeat(
82  FunctionContext* context, const StringVal& str, const BigIntVal& n) {
83  if (str.is_null || n.is_null) return StringVal::null();
84  if (str.len == 0 || n.val <= 0) return StringVal();
85  StringVal result(context, str.len * n.val);
86  uint8_t* ptr = result.ptr;
87  for (int64_t i = 0; i < n.val; ++i) {
88  memcpy(ptr, str.ptr, str.len);
89  ptr += str.len;
90  }
91  return result;
92 }
93 
94 StringVal StringFunctions::Lpad(FunctionContext* context, const StringVal& str,
95  const BigIntVal& len, const StringVal& pad) {
96  if (str.is_null || len.is_null || pad.is_null || len.val < 0) return StringVal::null();
97  // Corner cases: Shrink the original string, or leave it alone.
98  // TODO: Hive seems to go into an infinite loop if pad.len == 0,
99  // so we should pay attention to Hive's future solution to be compatible.
100  if (len.val <= str.len || pad.len == 0) return StringVal(str.ptr, len.val);
101 
102  StringVal result(context, len.val);
103  int padded_prefix_len = len.val - str.len;
104  int pad_index = 0;
105  int result_index = 0;
106  uint8_t* ptr = result.ptr;
107 
108  // Prepend chars of pad.
109  while (result_index < padded_prefix_len) {
110  ptr[result_index++] = pad.ptr[pad_index++];
111  pad_index = pad_index % pad.len;
112  }
113 
114  // Append given string.
115  memcpy(ptr + result_index, str.ptr, str.len);
116  return result;
117 }
118 
119 StringVal StringFunctions::Rpad(FunctionContext* context, const StringVal& str,
120  const BigIntVal& len, const StringVal& pad) {
121  if (str.is_null || len.is_null || pad.is_null || len.val < 0) return StringVal::null();
122  // Corner cases: Shrink the original string, or leave it alone.
123  // TODO: Hive seems to go into an infinite loop if pad->len == 0,
124  // so we should pay attention to Hive's future solution to be compatible.
125  if (len.val <= str.len || pad.len == 0) {
126  return StringVal(str.ptr, len.val);
127  }
128 
129  StringVal result(context, len.val);
130  memcpy(result.ptr, str.ptr, str.len);
131 
132  // Append chars of pad until desired length
133  uint8_t* ptr = result.ptr;
134  int pad_index = 0;
135  int result_len = str.len;
136  while (result_len < len.val) {
137  ptr[result_len++] = pad.ptr[pad_index++];
138  pad_index = pad_index % pad.len;
139  }
140  return result;
141 }
142 
143 IntVal StringFunctions::Length(FunctionContext* context, const StringVal& str) {
144  if (str.is_null) return IntVal::null();
145  return IntVal(str.len);
146 }
147 
148 IntVal StringFunctions::CharLength(FunctionContext* context, const StringVal& str) {
149  if (str.is_null) return IntVal::null();
150  const FunctionContext::TypeDesc* t = context->GetArgType(0);
152  return StringValue::UnpaddedCharLength(reinterpret_cast<char*>(str.ptr), t->len);
153 }
154 
155 StringVal StringFunctions::Lower(FunctionContext* context, const StringVal& str) {
156  if (str.is_null) return StringVal::null();
157  StringVal result(context, str.len);
158  for (int i = 0; i < str.len; ++i) {
159  result.ptr[i] = ::tolower(str.ptr[i]);
160  }
161  return result;
162 }
163 
164 StringVal StringFunctions::Upper(FunctionContext* context, const StringVal& str) {
165  if (str.is_null) return StringVal::null();
166  StringVal result(context, str.len);
167  for (int i = 0; i < str.len; ++i) {
168  result.ptr[i] = ::toupper(str.ptr[i]);
169  }
170  return result;
171 }
172 
173 // Returns a string identical to the input, but with the first character
174 // of each word mapped to its upper-case equivalent. All other characters
175 // will be mapped to their lower-case equivalents. If input == NULL it
176 // will return NULL
177 StringVal StringFunctions::InitCap(FunctionContext* context, const StringVal& str) {
178  if (str.is_null) return StringVal::null();
179  StringVal result(context, str.len);
180  uint8_t* result_ptr = result.ptr;
181  bool word_start = true;
182  for (int i = 0; i < str.len; ++i) {
183  if (isspace(str.ptr[i])) {
184  result_ptr[i] = str.ptr[i];
185  word_start = true;
186  } else {
187  result_ptr[i] = (word_start ? toupper(str.ptr[i]) : tolower(str.ptr[i]));
188  word_start = false;
189  }
190  }
191  return result;
192 }
193 
194 StringVal StringFunctions::Reverse(FunctionContext* context, const StringVal& str) {
195  if (str.is_null) return StringVal::null();
196  StringVal result(context, str.len);
197  std::reverse_copy(str.ptr, str.ptr + str.len, result.ptr);
198  return result;
199 }
200 
201 StringVal StringFunctions::Translate(FunctionContext* context, const StringVal& str,
202  const StringVal& src, const StringVal& dst) {
203  if (str.is_null || src.is_null || dst.is_null) return StringVal::null();
204  StringVal result(context, str.len);
205 
206  // TODO: if we know src and dst are constant, we can prebuild a conversion
207  // table to remove the inner loop.
208  int result_len = 0;
209  for (int i = 0; i < str.len; ++i) {
210  bool matched_src = false;
211  for (int j = 0; j < src.len; ++j) {
212  if (str.ptr[i] == src.ptr[j]) {
213  if (j < dst.len) {
214  result.ptr[result_len++] = dst.ptr[j];
215  } else {
216  // src[j] doesn't map to any char in dst, the char is dropped.
217  }
218  matched_src = true;
219  break;
220  }
221  }
222  if (!matched_src) result.ptr[result_len++] = str.ptr[i];
223  }
224  result.len = result_len;
225  return result;
226 }
227 
228 StringVal StringFunctions::Trim(FunctionContext* context, const StringVal& str) {
229  if (str.is_null) return StringVal::null();
230  // Find new starting position.
231  int32_t begin = 0;
232  while (begin < str.len && str.ptr[begin] == ' ') {
233  ++begin;
234  }
235  // Find new ending position.
236  int32_t end = str.len - 1;
237  while (end > begin && str.ptr[end] == ' ') {
238  --end;
239  }
240  return StringVal(str.ptr + begin, end - begin + 1);
241 }
242 
243 StringVal StringFunctions::Ltrim(FunctionContext* context, const StringVal& str) {
244  if (str.is_null) return StringVal::null();
245  // Find new starting position.
246  int32_t begin = 0;
247  while (begin < str.len && str.ptr[begin] == ' ') {
248  ++begin;
249  }
250  return StringVal(str.ptr + begin, str.len - begin);
251 }
252 
253 StringVal StringFunctions::Rtrim(FunctionContext* context, const StringVal& str) {
254  if (str.is_null) return StringVal::null();
255  if (str.len == 0) return str;
256  // Find new ending position.
257  int32_t end = str.len - 1;
258  while (end > 0 && str.ptr[end] == ' ') {
259  --end;
260  }
261  DCHECK_GE(end, 0);
262  return StringVal(str.ptr, (str.ptr[end] == ' ') ? end : end + 1);
263 }
264 
265 IntVal StringFunctions::Ascii(FunctionContext* context, const StringVal& str) {
266  if (str.is_null) return IntVal::null();
267  // Hive returns 0 when given an empty string.
268  return IntVal((str.len == 0) ? 0 : static_cast<int32_t>(str.ptr[0]));
269 }
270 
271 IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
272  const StringVal& substr) {
273  if (str.is_null || substr.is_null) return IntVal::null();
274  StringValue str_sv = StringValue::FromStringVal(str);
275  StringValue substr_sv = StringValue::FromStringVal(substr);
276  StringSearch search(&substr_sv);
277  // Hive returns positions starting from 1.
278  return IntVal(search.Search(&str_sv) + 1);
279 }
280 
281 IntVal StringFunctions::Locate(FunctionContext* context, const StringVal& substr,
282  const StringVal& str) {
283  return Instr(context, str, substr);
284 }
285 
286 IntVal StringFunctions::LocatePos(FunctionContext* context, const StringVal& substr,
287  const StringVal& str, const BigIntVal& start_pos) {
288  if (str.is_null || substr.is_null || start_pos.is_null) return IntVal::null();
289  // Hive returns 0 for *start_pos <= 0,
290  // but throws an exception for *start_pos > str->len.
291  // Since returning 0 seems to be Hive's error condition, return 0.
292  if (start_pos.val <= 0 || start_pos.val > str.len) return IntVal(0);
293  StringValue substr_sv = StringValue::FromStringVal(substr);
294  StringSearch search(&substr_sv);
295  // Input start_pos.val starts from 1.
296  StringValue adjusted_str(reinterpret_cast<char*>(str.ptr) + start_pos.val - 1,
297  str.len - start_pos.val + 1);
298  int32_t match_pos = search.Search(&adjusted_str);
299  if (match_pos >= 0) {
300  // Hive returns the position in the original string starting from 1.
301  return IntVal(start_pos.val + match_pos);
302  } else {
303  return IntVal(0);
304  }
305 }
306 
307 // The caller owns the returned regex. Returns NULL if the pattern could not be compiled.
308 re2::RE2* CompileRegex(const StringVal& pattern, string* error_str) {
309  re2::StringPiece pattern_sp(reinterpret_cast<char*>(pattern.ptr), pattern.len);
310  re2::RE2::Options options;
311  // Disable error logging in case e.g. every row causes an error
312  options.set_log_errors(false);
313  // Return the leftmost longest match (rather than the first match).
314  options.set_longest_match(true);
315  re2::RE2* re = new re2::RE2(pattern_sp, options);
316  if (!re->ok()) {
317  stringstream ss;
318  ss << "Could not compile regexp pattern: " << AnyValUtil::ToString(pattern) << endl
319  << "Error: " << re->error();
320  *error_str = ss.str();
321  delete re;
322  return NULL;
323  }
324  return re;
325 }
326 
327 void StringFunctions::RegexpPrepare(
329  if (scope != FunctionContext::FRAGMENT_LOCAL) return;
330  if (!context->IsArgConstant(1)) return;
331  DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
332  StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1));
333  if (pattern->is_null) return;
334 
335  string error_str;
336  re2::RE2* re = CompileRegex(*pattern, &error_str);
337  if (re == NULL) {
338  context->SetError(error_str.c_str());
339  return;
340  }
341  context->SetFunctionState(scope, re);
342 }
343 
344 void StringFunctions::RegexpClose(
346  if (scope != FunctionContext::FRAGMENT_LOCAL) return;
347  re2::RE2* re = reinterpret_cast<re2::RE2*>(context->GetFunctionState(scope));
348  delete re;
349 }
350 
351 StringVal StringFunctions::RegexpExtract(FunctionContext* context, const StringVal& str,
352  const StringVal& pattern, const BigIntVal& index) {
353  if (str.is_null || pattern.is_null || index.is_null) return StringVal::null();
354  if (index.val < 0) return StringVal();
355 
356  re2::RE2* re = reinterpret_cast<re2::RE2*>(
358  scoped_ptr<re2::RE2> scoped_re; // destroys re if we have to locally compile it
359  if (re == NULL) {
360  DCHECK(!context->IsArgConstant(1));
361  string error_str;
362  re = CompileRegex(pattern, &error_str);
363  if (re == NULL) {
364  context->AddWarning(error_str.c_str());
365  return StringVal::null();
366  }
367  scoped_re.reset(re);
368  }
369 
370  re2::StringPiece str_sp(reinterpret_cast<char*>(str.ptr), str.len);
371  int max_matches = 1 + re->NumberOfCapturingGroups();
372  if (index.val >= max_matches) return StringVal();
373  // Use a vector because clang complains about non-POD varlen arrays
374  // TODO: fix this
375  vector<re2::StringPiece> matches(max_matches);
376  bool success =
377  re->Match(str_sp, 0, str.len, re2::RE2::UNANCHORED, &matches[0], max_matches);
378  if (!success) return StringVal();
379  // matches[0] is the whole string, matches[1] the first group, etc.
380  const re2::StringPiece& match = matches[index.val];
381  return AnyValUtil::FromBuffer(context, match.data(), match.size());
382 }
383 
384 StringVal StringFunctions::RegexpReplace(FunctionContext* context, const StringVal& str,
385  const StringVal& pattern, const StringVal& replace) {
386  if (str.is_null || pattern.is_null || replace.is_null) return StringVal::null();
387 
388  re2::RE2* re = reinterpret_cast<re2::RE2*>(
390  scoped_ptr<re2::RE2> scoped_re; // destroys re if state->re is NULL
391  if (re == NULL) {
392  DCHECK(!context->IsArgConstant(1));
393  string error_str;
394  re = CompileRegex(pattern, &error_str);
395  if (re == NULL) {
396  context->AddWarning(error_str.c_str());
397  return StringVal::null();
398  }
399  scoped_re.reset(re);
400  }
401 
402  re2::StringPiece replace_str =
403  re2::StringPiece(reinterpret_cast<char*>(replace.ptr), replace.len);
404  string result_str = AnyValUtil::ToString(str);
405  re2::RE2::GlobalReplace(&result_str, *re, replace_str);
406  return AnyValUtil::FromString(context, result_str);
407 }
408 
410  const StringVal* strs) {
411  return ConcatWs(context, StringVal(), num_children, strs);
412 }
413 
414 StringVal StringFunctions::ConcatWs(FunctionContext* context, const StringVal& sep,
415  int num_children, const StringVal* strs) {
416  DCHECK_GE(num_children, 1);
417  if (sep.is_null) return StringVal::null();
418 
419  // Pass through if there's only one argument
420  if (num_children == 1) return strs[0];
421 
422  if (strs[0].is_null) return StringVal::null();
423  int32_t total_size = strs[0].len;
424 
425  // Loop once to compute the final size and reserve space.
426  for (int32_t i = 1; i < num_children; ++i) {
427  if (strs[i].is_null) return StringVal::null();
428  total_size += sep.len + strs[i].len;
429  }
430  StringVal result(context, total_size);
431  uint8_t* ptr = result.ptr;
432 
433  // Loop again to append the data.
434  memcpy(ptr, strs[0].ptr, strs[0].len);
435  ptr += strs[0].len;
436  for (int32_t i = 1; i < num_children; ++i) {
437  memcpy(ptr, sep.ptr, sep.len);
438  ptr += sep.len;
439  memcpy(ptr, strs[i].ptr, strs[i].len);
440  ptr += strs[i].len;
441  }
442  return result;
443 }
444 
445 IntVal StringFunctions::FindInSet(FunctionContext* context, const StringVal& str,
446  const StringVal& str_set) {
447  if (str.is_null || str_set.is_null) return IntVal::null();
448  // Check str for commas.
449  for (int i = 0; i < str.len; ++i) {
450  if (str.ptr[i] == ',') return IntVal(0);
451  }
452  // The result index starts from 1 since 0 is an error condition.
453  int32_t token_index = 1;
454  int32_t start = 0;
455  int32_t end;
456  StringValue str_sv = StringValue::FromStringVal(str);
457  do {
458  end = start;
459  // Position end.
460  while(str_set.ptr[end] != ',' && end < str_set.len) ++end;
461  StringValue token(reinterpret_cast<char*>(str_set.ptr) + start, end - start);
462  if (str_sv.Eq(token)) return IntVal(token_index);
463 
464  // Re-position start and end past ','
465  start = end + 1;
466  ++token_index;
467  } while (start < str_set.len);
468  return IntVal(0);
469 }
470 
471 void StringFunctions::ParseUrlPrepare(
473  if (scope != FunctionContext::FRAGMENT_LOCAL) return;
474  if (!ctx->IsArgConstant(1)) return;
475  DCHECK_EQ(ctx->GetArgType(1)->type, FunctionContext::TYPE_STRING);
476  StringVal* part = reinterpret_cast<StringVal*>(ctx->GetConstantArg(1));
477  if (part->is_null) return;
478  UrlParser::UrlPart* url_part = new UrlParser::UrlPart;
479  *url_part = UrlParser::GetUrlPart(StringValue::FromStringVal(*part));
480  if (*url_part == UrlParser::INVALID) {
481  stringstream ss;
482  ss << "Invalid URL part: " << AnyValUtil::ToString(*part) << endl
483  << "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', "
484  << "'USERINFO', and 'QUERY')";
485  ctx->SetError(ss.str().c_str());
486  return;
487  }
488  ctx->SetFunctionState(scope, url_part);
489 }
490 
491 StringVal StringFunctions::ParseUrl(
492  FunctionContext* ctx, const StringVal& url, const StringVal& part) {
493  if (url.is_null || part.is_null) return StringVal::null();
495  UrlParser::UrlPart url_part;
496  if (state != NULL) {
497  url_part = *reinterpret_cast<UrlParser::UrlPart*>(state);
498  } else {
499  DCHECK(!ctx->IsArgConstant(1));
500  url_part = UrlParser::GetUrlPart(StringValue::FromStringVal(part));
501  }
502 
503  StringValue result;
504  if (!UrlParser::ParseUrl(StringValue::FromStringVal(url), url_part, &result)) {
505  // url is malformed, or url_part is invalid.
506  if (url_part == UrlParser::INVALID) {
507  stringstream ss;
508  ss << "Invalid URL part: " << AnyValUtil::ToString(part);
509  ctx->AddWarning(ss.str().c_str());
510  } else {
511  stringstream ss;
512  ss << "Could not parse URL: " << AnyValUtil::ToString(url);
513  ctx->AddWarning(ss.str().c_str());
514  }
515  return StringVal::null();
516  }
517  StringVal result_sv;
518  result.ToStringVal(&result_sv);
519  return result_sv;
520 }
521 
522 void StringFunctions::ParseUrlClose(
524  if (scope != FunctionContext::FRAGMENT_LOCAL) return;
525  UrlParser::UrlPart* url_part =
526  reinterpret_cast<UrlParser::UrlPart*>(ctx->GetFunctionState(scope));
527  if (url_part == NULL) return;
528  delete url_part;
529 }
530 
531 StringVal StringFunctions::ParseUrlKey(FunctionContext* ctx, const StringVal& url,
532  const StringVal& part, const StringVal& key) {
533  if (url.is_null || part.is_null || key.is_null) return StringVal::null();
535  UrlParser::UrlPart url_part;
536  if (state != NULL) {
537  url_part = *reinterpret_cast<UrlParser::UrlPart*>(state);
538  } else {
539  DCHECK(!ctx->IsArgConstant(1));
540  url_part = UrlParser::GetUrlPart(StringValue::FromStringVal(part));
541  }
542 
543  StringValue result;
544  if (!UrlParser::ParseUrlKey(StringValue::FromStringVal(url), url_part,
545  StringValue::FromStringVal(key), &result)) {
546  // url is malformed, or url_part is invalid.
547  if (url_part == UrlParser::INVALID) {
548  stringstream ss;
549  ss << "Invalid URL part: " << AnyValUtil::ToString(part);
550  ctx->AddWarning(ss.str().c_str());
551  } else {
552  stringstream ss;
553  ss << "Could not parse URL: " << AnyValUtil::ToString(url);
554  ctx->AddWarning(ss.str().c_str());
555  }
556  return StringVal::null();
557  }
558  StringVal result_sv;
559  result.ToStringVal(&result_sv);
560  return result_sv;
561 }
562 
563 }
bool Eq(const StringValue &other) const
==
static IntVal null()
Definition: udf.h:425
int Search(const StringValue *str) const
uint8_t * ptr
Definition: udf.h:523
re2::RE2 * CompileRegex(const StringVal &pattern, string *error_str)
bool AddWarning(const char *warning_msg)
Definition: udf.cc:345
bool is_null
Definition: udf.h:359
const TypeDesc * GetArgType(int arg_idx) const
Definition: udf.cc:425
void * GetFunctionState(FunctionStateScope scope) const
Definition: udf-ir.cc:38
bool IsArgConstant(int arg_idx) const
Definition: udf-ir.cc:20
void SetFunctionState(FunctionStateScope scope, void *ptr)
Definition: udf.cc:370
static StringVal null()
Definition: udf.h:536
int len
Only valid if type == TYPE_FIXED_BUFFER || type == TYPE_VARCHAR.
Definition: udf.h:79
void SetError(const char *error_msg)
Definition: udf.cc:332
UrlPart
Parts of a URL that can be requested.
Definition: url-parser.h:44
void ToStringVal(impala_udf::StringVal *sv) const
Definition: string-value.h:99
AnyVal * GetConstantArg(int arg_idx) const
Definition: udf-ir.cc:25
StringVal Concat(FunctionContext *context, int n, const StringVal *args)
Definition: udf-test.cc:84