Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
string-search-sse.h
Go to the documentation of this file.
1 // Copyright (c) 2012 Cloudera, Inc. All rights reserved.
2 
3 #ifndef IMPALA_EXPERIMENTS_STRING_SEARCH_SSE_H
4 #define IMPALA_EXPERIMENTS_STRING_SEARCH_SSE_H
5 
6 #include "common/compiler-util.h"
7 #include "common/logging.h"
8 #include "runtime/string-value.h"
9 
10 namespace impala {
11 
12 // This class implements strstr for non-null terminated strings. It wraps the
13 // standard strstr function (which is sse4 optimized).
19  public:
25  StringSearchSSE(const char* needle) :
26  needle_str_val_(NULL), needle_cstr_(needle) {
27  needle_len_ = strlen(needle);
28  }
29 
32  StringSearchSSE(const StringValue* needle) :
33  needle_str_val_(needle), needle_cstr_(NULL) {
34  needle_len_ = needle->len;
35  }
36 
38 
43  int Search(const StringValue& haystack) const {
44  // Edge cases
45  if (UNLIKELY(haystack.len == 0 && needle_len_ == 0)) return 0;
46  if (UNLIKELY(haystack.len == 0)) return -1;
47  if (UNLIKELY(needle_len_ == 0)) return 0;
48  if (UNLIKELY(haystack.len < needle_len_)) return -1;
49 
50  int result = -1;
51 
52  // temporarily null terminated input string
53  char last_char_haystack = haystack.ptr[haystack.len - 1];
54  haystack.ptr[haystack.len - 1] = '\0';
55 
56  // Use strchr if needle_len_ is 1
57  if (needle_len_ == 1) {
58  char c = (needle_str_val_ == NULL) ? needle_cstr_[0] : needle_str_val_->ptr[0];
59  char* s = strchr(haystack.ptr, c);
60  if (s != NULL) {
61  result = s - haystack.ptr;
62  } else if (last_char_haystack == c) {
63  result = haystack.len - 1;
64  }
65  // Undo change to haystack
66  haystack.ptr[haystack.len - 1] = last_char_haystack;
67  return result;
68  }
69 
70  // needle is null terminated. We just need to run strstr on the
71  // null terminated haystack, and if there is no match, try a match
72  // on the last needle_len chars.
73  if (LIKELY(needle_cstr_ != NULL)) {
74  char* s = strstr(haystack.ptr, needle_cstr_);
75  // Undo change to haystack
76  haystack.ptr[haystack.len - 1] = last_char_haystack;
77 
78  if (s != NULL) {
79  result = s - haystack.ptr;
80  } else {
81  // If we didn't find a match, try the last needle->len chars
82  char* end = haystack.ptr + haystack.len - needle_len_;
83  bool match = true;
84  for (int i = 0; i < needle_len_; ++i) {
85  if (LIKELY(end[i] != needle_cstr_[i])) {
86  match = false;
87  break;
88  }
89  }
90  if (UNLIKELY(match)) result = haystack.len - needle_len_;
91  }
92  return result;
93  } else {
94  // Needle is not null terminated. Terminate it on the fly.
95  const char last_char_needle = needle_str_val_->ptr[needle_len_ - 1];
96  needle_str_val_->ptr[needle_len_ - 1] = '\0';
97 
98  int offset = 0;
99  char* haystack_pos = haystack.ptr;
100  while (offset <= haystack.len - needle_len_) {
101  char* search = strstr(haystack_pos, needle_str_val_->ptr);
102 
103  // If the shortened strings didn't match, then the full strings
104  // must not match
105  if (search == NULL) break;
106 
107  offset += (search - haystack_pos);
108 
109  // The match happened at the very end of string. This is the case where:
110  // needle = "abc" (null terminated to "ab")
111  // haystack is "aaabc" (null terminated to "aaab")
112  // In this case, we just need to compare the last chars from both.
113  if (offset == haystack.len - needle_len_) {
114  if (last_char_needle == last_char_haystack) result = offset;
115  break;
116  } else {
117  // If the shortened strings match, match the last character
118  // Partial match within the string. Compare the last and if doesn't
119  // match, continue searching
120  if (search[needle_len_ - 1] == last_char_needle) {
121  result = offset;
122  break;
123  } else {
124  // Advance the haystack. This can be made more efficient by using
125  // a boyer-moore like approach (instead of just advancing by one).
126  haystack_pos = search + 1;
127  ++offset;
128  }
129  }
130  }
131 
132  // Undo change to haystack
133  haystack.ptr[haystack.len - 1] = last_char_haystack;
134  // Undo changes to needle
135  needle_str_val_->ptr[needle_len_ - 1] = last_char_needle;
136  return result;
137  }
138  }
139 
140  private:
143  const char* needle_cstr_;
144 
146 };
147 
148 }
149 
150 #endif
StringSearchSSE(const StringValue *needle)
const StringValue * needle_str_val_
Only one of these two will be non-null. Both are unowned.
StringSearchSSE(const char *needle)
int Search(const StringValue &haystack) const
#define UNLIKELY(expr)
Definition: compiler-util.h:33
#define LIKELY(expr)
Definition: compiler-util.h:32
uint8_t offset[7 *64-sizeof(uint64_t)]