Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
like-predicate.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "exprs/like-predicate.h"
16 
17 #include <string.h>
18 #include <re2/re2.h>
19 #include <re2/stringpiece.h>
20 #include <sstream>
21 
22 #include "gutil/strings/substitute.h"
24 
25 #include "common/names.h"
26 using namespace impala_udf;
27 using namespace re2;
28 
29 namespace impala {
30 // A regex to match any regex pattern which is equivalent to a substring search.
31 static const RE2 SUBSTRING_RE(
32  "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");  // group 1: metacharacter-free literal
33 
34 // A regex to match any regex pattern which is equivalent to matching a constant string
35 // at the end of the string values: optional leading ".*" groups, a literal, then "$".
36 static const RE2 ENDS_WITH_RE(
37  "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
38 
39 // A regex to match any regex pattern which is equivalent to matching a constant string
40 // at the start of the string values: "^", a literal, then optional trailing ".*" groups.
41 static const RE2 STARTS_WITH_RE(
42  "\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
43 
44 // A regex to match any regex pattern of the form "^<literal>$", i.e. a constant string match.
45 static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
46 
47 LikePredicate::LikePredicate(const TExprNode& node)
48  : Predicate(node) {  // only forwards the node to the Predicate base; the body is empty
49 }
50 
52 }
53 
56  if (scope != FunctionContext::THREAD_LOCAL) return;  // only THREAD_LOCAL scope is prepared here
58  state->function_ = LikeFn;  // default evaluator unless a branch below overrides it
59  context->SetFunctionState(scope, state);
60  if (context->IsArgConstant(1)) {  // constant pattern: classify it once, up front
61  StringVal pattern_val = *reinterpret_cast<StringVal*>(context->GetConstantArg(1));
62  if (pattern_val.is_null) return;  // NULL pattern: keep the default evaluator, no regex is built
63  StringValue pattern = StringValue::FromStringVal(pattern_val);
64  re2::RE2 substring_re("(?:%+)([^%_]*)(?:%+)");  // %literal%
65  re2::RE2 ends_with_re("(?:%+)([^%_]*)");  // %literal
66  re2::RE2 starts_with_re("([^%_]*)(?:%+)");  // literal%
67  re2::RE2 equals_re("([^%_]*)");  // literal with no '%' or '_' wildcard
68  string pattern_str(pattern.ptr, pattern.len);  // length-bounded copy of the pattern bytes
69  string search_string;  // receives capture group 1 (the literal) on a match
70  if (RE2::FullMatch(pattern_str, substring_re, &search_string)) {
71  state->SetSearchString(search_string);
73  } else if (RE2::FullMatch(pattern_str, starts_with_re, &search_string)) {
74  state->SetSearchString(search_string);
76  } else if (RE2::FullMatch(pattern_str, ends_with_re, &search_string)) {
77  state->SetSearchString(search_string);
79  } else if (RE2::FullMatch(pattern_str, equals_re, &search_string)) {
80  state->SetSearchString(search_string);
81  state->function_ = ConstantEqualsFn;
82  } else {  // no simple shape matched: translate the LIKE pattern to a regex and compile it
83  string re_pattern;
84  ConvertLikePattern(context,
85  *reinterpret_cast<StringVal*>(context->GetConstantArg(1)), &re_pattern);
86  state->regex_.reset(new RE2(re_pattern));
87  if (!state->regex_->ok()) {
88  context->SetError(  // report the length-bounded copy, not the raw uint8_t* buffer
89  strings::Substitute("Invalid regex: $0", pattern_str).c_str());
90  }
91  }
92  }
93 }
94 
96  const StringVal& pattern) {
97  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
98  context->GetFunctionState(FunctionContext::THREAD_LOCAL));  // state stored under THREAD_LOCAL scope
99  return (state->function_)(context, val, pattern);  // dispatch to the evaluator recorded in the state
100 }
101 
104  if (scope == FunctionContext::THREAD_LOCAL) {  // other scopes are ignored
105  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
106  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
107  delete state;  // frees the state object fetched from the THREAD_LOCAL slot
108  }
109 }
110 
113  if (scope != FunctionContext::THREAD_LOCAL) return;  // only THREAD_LOCAL scope is prepared here
114  LikePredicateState* state = new LikePredicateState();
115  context->SetFunctionState(scope, state);
116  state->function_ = RegexFn;  // default evaluator unless a branch below overrides it
117  if (context->IsArgConstant(1)) {
118  StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1));
119  if (pattern->is_null) {
120  return;  // NULL pattern: keep the default evaluator, no regex is built
121  }
122  string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len);
123  string search_string;
124  // The following four conditionals check if the pattern is a constant string,
125  // starts with a constant string and is followed by any number of wildcard characters,
126  // ends with a constant string and is preceded by any number of wildcard characters or
127  // has a constant substring surrounded on both sides by any number of wildcard
128  // characters. In any of these conditions, we can search for the pattern more
129  // efficiently by using our own string match functions rather than regex matching.
130  if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
131  state->SetSearchString(search_string);
132  state->function_ = ConstantEqualsFn;
133  } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
134  state->SetSearchString(search_string);
136  } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
137  state->SetSearchString(search_string);
138  state->function_ = ConstantEndsWithFn;
139  } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
140  state->SetSearchString(search_string);
142  } else {
143  state->regex_.reset(new RE2(pattern_str));
144  if (!state->regex_->ok()) {
145  // pattern->ptr is a length-delimited buffer; stream the bounded copy in pattern_str.
146  stringstream error;
147  error << "Invalid regex expression: " << pattern_str;
148  context->SetError(error.str().c_str());
149  }
151  }
152  }
153 }
154 
156  const StringVal& pattern) {
157  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
158  context->GetFunctionState(FunctionContext::THREAD_LOCAL));  // state stored under THREAD_LOCAL scope
159  return (state->function_)(context, val, pattern);  // dispatch to the evaluator recorded in the state
160 }
161 
164  if (scope == FunctionContext::THREAD_LOCAL) {  // other scopes are ignored
165  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
166  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
167  delete state;  // frees the state object fetched from the THREAD_LOCAL slot
168  }
169 }
170 
172  const StringVal& pattern) {
173  return RegexMatch(context, val, pattern, false);  // false: pattern is used as a raw regex, partial match
174 }
175 
177  const StringVal& pattern) {
178  return RegexMatch(context, val, pattern, true);  // true: pattern is LIKE syntax, full match
179 }
180 
182  const StringVal& val, const StringVal& pattern) {  // 'pattern' is not read; the search string comes from state
183  if (val.is_null) return BooleanVal::null();  // NULL input yields NULL
184  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
185  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
186  if (state->search_string_sv_.len == 0) return BooleanVal(true);  // empty search string matches any value
187  StringValue pattern_value = StringValue::FromStringVal(val);  // despite its name, this wraps the input value
188  return BooleanVal(state->substring_pattern_.Search(&pattern_value) != -1);  // -1 presumably means "not found"
189 }
190 
192  const StringVal& val, const StringVal& pattern) {  // 'pattern' is not read; the search string comes from state
193  if (val.is_null) return BooleanVal::null();  // NULL input yields NULL
194  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
195  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
196  if (val.len < state->search_string_sv_.len) {  // value shorter than the prefix cannot match
197  return BooleanVal(false);
198  } else {
199  StringValue v =
200  StringValue(reinterpret_cast<char*>(val.ptr), state->search_string_sv_.len);  // leading bytes of val
201  return BooleanVal(state->search_string_sv_.Eq((v)));  // compare that prefix with the search string
202  }
203 }
204 
206  const StringVal& val, const StringVal& pattern) {  // 'pattern' is not read; the search string comes from state
207  if (val.is_null) return BooleanVal::null();  // NULL input yields NULL
208  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
209  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
210  if (val.len < state->search_string_sv_.len) {  // value shorter than the suffix cannot match
211  return BooleanVal(false);
212  } else {
213  char* ptr =
214  reinterpret_cast<char*>(val.ptr) + val.len - state->search_string_sv_.len;  // start of the trailing bytes
215  int len = state->search_string_sv_.len;
216  StringValue v = StringValue(ptr, len);  // trailing 'len' bytes of val
217  return BooleanVal(state->search_string_sv_.Eq(v));  // compare that suffix with the search string
218  }
219 }
220 
222  const StringVal& pattern) {
223  if (val.is_null) return BooleanVal::null();  // NULL input yields NULL
224  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
225  context->GetFunctionState(FunctionContext::THREAD_LOCAL));  // NOTE(review): no return statement is visible after this line - confirm against the real file
227 }
228 
230  const StringVal& val, const StringVal& pattern) {  // 'pattern' is not read; the regex comes from state
231  if (val.is_null) return BooleanVal::null();  // NULL input yields NULL
232  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
233  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
234  re2::StringPiece operand_sp(reinterpret_cast<const char*>(val.ptr), val.len);  // length-bounded view of val
235  return RE2::PartialMatch(operand_sp, *state->regex_);  // partial match against the stored regex
236 }
237 
239  const StringVal& val, const StringVal& pattern) {  // 'pattern' is not read; the regex comes from state
240  if (val.is_null) return BooleanVal::null();  // NULL input yields NULL
241  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
242  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
243  re2::StringPiece operand_sp(reinterpret_cast<const char*>(val.ptr), val.len);  // length-bounded view of val
244  return RE2::FullMatch(operand_sp, *state->regex_);  // whole-value match against the stored regex
245 }
246 
248  const StringVal& operand_value, const StringVal& pattern_value,
249  bool is_like_pattern) {  // true: LIKE syntax + full match; false: raw regex + partial match
250  if (operand_value.is_null || pattern_value.is_null) return BooleanVal::null();  // NULL in, NULL out
251  if (context->IsArgConstant(1)) {  // constant pattern: use the regex held in the function state
252  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
253  context->GetFunctionState(FunctionContext::THREAD_LOCAL));
254  if (is_like_pattern) {
255  return RE2::FullMatch(re2::StringPiece(reinterpret_cast<const char*>(
256  operand_value.ptr), operand_value.len), *state->regex_.get());
257  } else {
258  return RE2::PartialMatch(re2::StringPiece(reinterpret_cast<const char*>(
259  operand_value.ptr), operand_value.len), *state->regex_.get());
260  }
261  } else {  // non-constant pattern: build and compile a regex for this call
262  string re_pattern;
263  if (is_like_pattern) {
264  ConvertLikePattern(context, pattern_value, &re_pattern);
265  } else {
266  re_pattern =
267  string(reinterpret_cast<const char*>(pattern_value.ptr), pattern_value.len);
268  }
269  re2::RE2 re(re_pattern);
270  if (re.ok()) {
271  if (is_like_pattern) {
272  return RE2::FullMatch(re2::StringPiece(
273  reinterpret_cast<const char*>(operand_value.ptr), operand_value.len), re);
274  } else {
275  return RE2::PartialMatch(re2::StringPiece(
276  reinterpret_cast<const char*>(operand_value.ptr), operand_value.len), re);
277  }
278  } else {  // pattern_value.ptr is a uint8_t* with an explicit len, so report a bounded copy
279  context->SetError(strings::Substitute("Invalid regex: $0", string(
280  reinterpret_cast<const char*>(pattern_value.ptr), pattern_value.len)).c_str());
281  return BooleanVal(false);  // an invalid pattern is reported as an error and evaluates to false
282  }
283  }
284 }
285 
287  string* re_pattern) {  // output: regex text equivalent to the LIKE pattern
288  re_pattern->clear();  // any previous contents of the output are discarded
289  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
290  context->GetFunctionState(FunctionContext::THREAD_LOCAL));  // read only for escape_char_
291  bool is_escaped = false;  // true when the previous character was an unescaped escape char
292  for (int i = 0; i < pattern.len; ++i) {
293  if (!is_escaped && pattern.ptr[i] == '%') {
294  re_pattern->append(".*");  // unescaped '%' becomes "any run of characters"
295  } else if (!is_escaped && pattern.ptr[i] == '_') {
296  re_pattern->append(".");  // unescaped '_' becomes "any single character"
297  // check for escape char before checking for regex special chars, they might overlap
298  } else if (!is_escaped && pattern.ptr[i] == state->escape_char_) {
299  is_escaped = true;  // emits nothing itself; a trailing escape char is therefore dropped
300  } else if (
301  pattern.ptr[i] == '.'
302  || pattern.ptr[i] == '['
303  || pattern.ptr[i] == ']'
304  || pattern.ptr[i] == '{'
305  || pattern.ptr[i] == '}'
306  || pattern.ptr[i] == '('
307  || pattern.ptr[i] == ')'
308  || pattern.ptr[i] == '\\'
309  || pattern.ptr[i] == '*'
310  || pattern.ptr[i] == '+'
311  || pattern.ptr[i] == '?'
312  || pattern.ptr[i] == '|'
313  || pattern.ptr[i] == '^'
314  || pattern.ptr[i] == '$'
315  ) {
316  // escape all regex special characters; see list at
317  // http://www.boost.org/doc/libs/1_47_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html
318  re_pattern->append("\\");
319  re_pattern->append(1, pattern.ptr[i]);
320  is_escaped = false;
321  } else {
322  // regular character or escaped special character
323  re_pattern->append(1, pattern.ptr[i]);  // an escaped '%' or '_' also lands here and is copied literally
324  is_escaped = false;
325  }
326  }
327 }
328 
329 } // namespace impala
bool Eq(const StringValue &other) const
==
static impala_udf::BooleanVal ConstantStartsWithFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
Handling of like predicates that can be implemented using strncmp.
static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*")
static void LikePrepare(impala_udf::FunctionContext *context, impala_udf::FunctionContext::FunctionStateScope scope)
static impala_udf::BooleanVal ConstantEndsWithFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
Handling of like predicates that can be implemented using strncmp.
static impala_udf::BooleanVal ConstantSubstringFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
Handling of like predicates that map to strstr.
static void RegexClose(impala_udf::FunctionContext *, impala_udf::FunctionContext::FunctionStateScope scope)
static impala_udf::BooleanVal ConstantRegexFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
boost::scoped_ptr< re2::RE2 > regex_
Used for RLIKE and REGEXP predicates if the pattern is a constant argument.
int Search(const StringValue *str) const
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$")
static impala_udf::BooleanVal Like(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
static impala_udf::BooleanVal ConstantEqualsFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
Handling of like predicates that can be implemented using strcmp.
static void RegexPrepare(impala_udf::FunctionContext *context, impala_udf::FunctionContext::FunctionStateScope scope)
uint8_t * ptr
Definition: udf.h:523
bool is_null
Definition: udf.h:359
static impala_udf::BooleanVal RegexMatch(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern, bool is_like_pattern)
static void ConvertLikePattern(impala_udf::FunctionContext *context, const impala_udf::StringVal &pattern, std::string *re_pattern)
static impala_udf::BooleanVal Regex(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
void * GetFunctionState(FunctionStateScope scope) const
Definition: udf-ir.cc:38
bool IsArgConstant(int arg_idx) const
Definition: udf-ir.cc:20
void SetFunctionState(FunctionStateScope scope, void *ptr)
Definition: udf.cc:370
static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$")
static impala_udf::BooleanVal ConstantRegexFnPartial(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
static StringValue FromStringVal(const impala_udf::StringVal &sv)
Definition: string-value.h:103
void SetSearchString(const std::string &search_string)
static impala_udf::BooleanVal LikeFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)
void SetError(const char *error_msg)
Definition: udf.cc:332
AnyVal * GetConstantArg(int arg_idx) const
Definition: udf-ir.cc:25
static void LikeClose(impala_udf::FunctionContext *context, impala_udf::FunctionContext::FunctionStateScope scope)
static const RE2 SUBSTRING_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*")
static impala_udf::BooleanVal RegexFn(impala_udf::FunctionContext *context, const impala_udf::StringVal &val, const impala_udf::StringVal &pattern)