Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
string-search.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_RUNTIME_STRING_SEARCH_H
17 #define IMPALA_RUNTIME_STRING_SEARCH_H
18 
19 #include <vector>
20 #include <cstring>
21 #include <boost/cstdint.hpp>
22 
23 #include "common/logging.h"
24 #include "runtime/string-value.h"
25 
26 namespace impala {
27 
30 //
34 //
37 //
42 //
52 //
58 //
65 //
70 //
73 //
79 //
83 class StringSearch {
84 
85  public:
86  StringSearch() : pattern_(NULL), mask_(0) {}
87 
89  StringSearch(const StringValue* pattern) : pattern_(pattern), mask_(0), skip_(0) {
90  // Special cases
91  if (pattern_->len <= 1) {
92  return;
93  }
94 
95  // Build compressed lookup table
96  int mlast = pattern_->len - 1;
97  skip_ = mlast - 1;
98 
99  for (int i = 0; i < mlast; ++i) {
100  BloomAdd(pattern_->ptr[i]);
101  if (pattern_->ptr[i] == pattern_->ptr[mlast])
102  skip_ = mlast - i - 1;
103  }
104  BloomAdd(pattern_->ptr[mlast]);
105  }
106 
110  int Search(const StringValue* str) const {
111  // Special cases
112  if (!str || !pattern_ || pattern_->len == 0) {
113  return -1;
114  }
115 
116  int mlast = pattern_->len - 1;
117  int w = str->len - pattern_->len;
118  int n = str->len;
119  int m = pattern_->len;
120  const char* s = str->ptr;
121  const char* p = pattern_->ptr;
122 
123  // Special case if pattern->len == 1
124  if (m == 1) {
125  const char* result = reinterpret_cast<const char*>(memchr(s, p[0], n));
126  if (result != NULL) return result - s;
127  return -1;
128  }
129 
130  // General case.
131  int j;
132  // TODO: the original code seems to have an off by one error. It is possible
133  // to index at w + m which is the length of the input string. Checks have
134  // been added to make sure that w + m < str->len.
135  for (int i = 0; i <= w; i++) {
136  // note: using mlast in the skip path slows things down on x86
137  if (s[i+m-1] == p[m-1]) {
138  // candidate match
139  for (j = 0; j < mlast; j++)
140  if (s[i+j] != p[j]) break;
141  if (j == mlast) {
142  return i;
143  }
144  // miss: check if next character is part of pattern
145  if (i + m < n && !BloomQuery(s[i+m]))
146  i = i + m;
147  else
148  i = i + skip_;
149  } else {
150  // skip: check if next character is part of pattern
151  if (i + m < n && !BloomQuery(s[i+m])) {
152  i = i + m;
153  }
154  }
155  }
156  return -1;
157  }
158 
159  private:
160  static const int BLOOM_WIDTH = 64;
161 
162  void BloomAdd(char c) {
163  mask_ |= (1UL << (c & (BLOOM_WIDTH - 1)));
164  }
165 
166  bool BloomQuery(char c) const {
167  return mask_ & (1UL << (c & (BLOOM_WIDTH - 1)));
168  }
169 
171  int64_t mask_;
172  int64_t skip_;
173 };
174 
175 }
176 
177 #endif
const StringValue * pattern_
int Search(const StringValue *str) const
StringSearch(const StringValue *pattern)
Initialize/Precompute a StringSearch object from the pattern.
Definition: string-search.h:89
static const int BLOOM_WIDTH
bool BloomQuery(char c) const
void BloomAdd(char c)