Impala
Impalaistheopensource,nativeanalyticdatabaseforApacheHadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
string-parser.h
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 
16 #ifndef IMPALA_UTIL_STRING_PARSER_H
17 #define IMPALA_UTIL_STRING_PARSER_H
18 
19 #include <limits>
20 #include <boost/type_traits.hpp>
21 #include "common/compiler-util.h"
22 
23 #include "common/logging.h"
24 #include "runtime/decimal-value.h"
25 #include "util/decimal-util.h"
26 
27 namespace impala {
28 
31 //
38 //
42 //
49 class StringParser {
50  public:
51  enum ParseResult {
56  };
57 
58  template <typename T>
59  static inline T StringToInt(const char* s, int len, ParseResult* result) {
60  T ans = StringToIntInternal<T>(s, len, result);
61  if (LIKELY(*result == PARSE_SUCCESS)) return ans;
62 
63  int i = SkipLeadingWhitespace(s, len);
64  return StringToIntInternal<T>(s + i, len - i, result);
65  }
66 
68  template <typename T>
69  static inline T StringToInt(const char* s, int len, int base, ParseResult* result) {
70  T ans = StringToIntInternal<T>(s, len, base, result);
71  if (LIKELY(*result == PARSE_SUCCESS)) return ans;
72 
73  int i = SkipLeadingWhitespace(s, len);
74  return StringToIntInternal<T>(s + i, len - i, base, result);
75  }
76 
77  template <typename T>
78  static inline T StringToFloat(const char* s, int len, ParseResult* result) {
79  T ans = StringToFloatInternal<T>(s, len, result);
80  if (LIKELY(*result == PARSE_SUCCESS)) return ans;
81 
82  int i = SkipLeadingWhitespace(s, len);
83  return StringToFloatInternal<T>(s + i, len - i, result);
84  }
85 
87  static inline bool StringToBool(const char* s, int len, ParseResult* result) {
88  bool ans = StringToBoolInternal(s, len, result);
89  if (LIKELY(*result == PARSE_SUCCESS)) return ans;
90 
91  int i = SkipLeadingWhitespace(s, len);
92  return StringToBoolInternal(s + i, len - i, result);
93  }
94 
99  template <typename T>
100  static inline DecimalValue<T> StringToDecimal(const uint8_t* s, int len,
101  const ColumnType& type, StringParser::ParseResult* result) {
102  return StringToDecimal<T>(reinterpret_cast<const char*>(s), len, type, result);
103  }
104 
105  template <typename T>
106  static inline DecimalValue<T> StringToDecimal(const char* s, int len,
107  const ColumnType& type, StringParser::ParseResult* result) {
108  // Special cases:
109  // 1) '' == Fail, an empty string fails to parse.
110  // 2) ' # ' == #, leading and trailing white space is ignored.
111  // 3) '.' == 0, a single dot parses as zero (for consistency with other types).
112  // 4) '#.' == '#', a trailing dot is ignored.
113 
114  // Ignore leading and trailing spaces.
115  while (len > 0 && IsWhitespace(*s)) {
116  ++s;
117  --len;
118  }
119  while (len > 0 && IsWhitespace(s[len - 1])) {
120  --len;
121  }
122 
123  bool is_negative = false;
124  if (len > 0) {
125  switch (*s) {
126  case '-':
127  is_negative = true;
128  case '+':
129  ++s;
130  --len;
131  }
132  }
133 
134  // Ignore leading zeros.
135  bool found_value = false;
136  while (len > 0 && UNLIKELY(*s == '0')) {
137  found_value = true;
138  ++s;
139  --len;
140  }
141 
142  // Ignore leading zeros even after a dot. This allows for differetiating between
143  // cases like 0.01e2, which would fit in a DECIMAL(1, 0), and 0.10e2, which would
144  // overflow.
145  int scale = 0;
146  int found_dot = 0;
147  if (len > 0 && *s == '.') {
148  found_dot = 1;
149  ++s;
150  --len;
151  while (len > 0 && UNLIKELY(*s == '0')) {
152  found_value = true;
153  ++scale;
154  ++s;
155  --len;
156  }
157  }
158 
159  int precision = 0;
160  bool found_exponent = false;
161  int8_t exponent = 0;
162  T value = 0;
163  for (int i = 0; i < len; ++i) {
164  const char& c = s[i];
165  if (LIKELY('0' <= c && c <= '9')) {
166  found_value = true;
167  // Ignore digits once the type's precision limit is reached. This avoids
168  // overflowing the underlying storage while handling a string like
169  // 10000000000e-10 into a DECIMAL(1, 0). Adjustments for ignored digits and
170  // an exponent will be made later.
171  if (LIKELY(type.precision > precision)) {
172  value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis...
173  }
174  DCHECK(value >= 0); // For some reason DCHECK_GE doesn't work with int128_t.
175  ++precision;
176  scale += found_dot;
177  } else if (c == '.' && LIKELY(!found_dot)) {
178  found_dot = 1;
179  } else if ((c == 'e' || c == 'E') && LIKELY(!found_exponent)) {
180  found_exponent = true;
181  exponent = StringToIntInternal<int8_t>(s + i + 1, len - i - 1, result);
182  if (UNLIKELY(*result != StringParser::PARSE_SUCCESS)) return DecimalValue<T>(0);
183  break;
184  } else {
185  *result = StringParser::PARSE_FAILURE;
186  return DecimalValue<T>(0);
187  }
188  }
189 
190  // Find the number of truncated digits before adjusting the precision for an exponent.
191  int truncated_digit_count = precision - type.precision;
192  if (exponent > scale) {
193  // Ex: 0.1e3 (which at this point would have precision == 1 and scale == 1), the
194  // scale must be set to 0 and the value set to 100 which means a precision of 3.
195  precision += exponent - scale;
196  value *= DecimalUtil::GetScaleMultiplier<T>(exponent - scale);
197  scale = 0;
198  } else {
199  // Ex: 100e-4, the scale must be set to 4 but no adjustment to the value is needed,
200  // the precision must also be set to 4 but that will be done below for the
201  // non-exponent case anyways.
202  scale -= exponent;
203  }
204  // Ex: 0.001, at this point would have precision 1 and scale 3 since leading zeros
205  // were ignored during previous parsing.
206  if (scale > precision) precision = scale;
207 
208  // Microbenchmarks show that beyond this point, returning on parse failure is slower
209  // than just letting the function run out.
210  *result = StringParser::PARSE_SUCCESS;
211  if (UNLIKELY(precision - scale > type.precision - type.scale)) {
213  } else if (UNLIKELY(scale > type.scale)) {
215  int shift = scale - type.scale;
216  if (truncated_digit_count > 0) shift -= truncated_digit_count;
217  if (shift > 0) value /= DecimalUtil::GetScaleMultiplier<T>(shift);
218  DCHECK(value >= 0);
219  } else if (UNLIKELY(!found_value && !found_dot)) {
220  *result = StringParser::PARSE_FAILURE;
221  }
222 
223  if (type.scale > scale) {
224  value *= DecimalUtil::GetScaleMultiplier<T>(type.scale - scale);
225  }
226 
227  return DecimalValue<T>(is_negative ? -value : value);
228  }
229 
230  private:
235  template <typename T>
236  static inline T StringToIntInternal(const char* s, int len, ParseResult* result) {
237  if (UNLIKELY(len <= 0)) {
238  *result = PARSE_FAILURE;
239  return 0;
240  }
241 
242  typedef typename boost::make_unsigned<T>::type UnsignedT;
243  UnsignedT val = 0;
244  UnsignedT max_val = std::numeric_limits<T>::max();
245  bool negative = false;
246  int i = 0;
247  switch (*s) {
248  case '-':
249  negative = true;
250  max_val = std::numeric_limits<T>::max() + 1;
251  case '+': ++i;
252  }
253 
254  // This is the fast path where the string cannot overflow.
255  if (LIKELY(len - i < StringParseTraits<T>::max_ascii_len())) {
256  val = StringToIntNoOverflow<UnsignedT>(s + i, len - i, result);
257  return static_cast<T>(negative ? -val : val);
258  }
259 
260  const T max_div_10 = max_val / 10;
261  const T max_mod_10 = max_val % 10;
262 
263  int first = i;
264  for (; i < len; ++i) {
265  if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
266  T digit = s[i] - '0';
267  // This is a tricky check to see if adding this digit will cause an overflow.
268  if (UNLIKELY(val > (max_div_10 - (digit > max_mod_10)))) {
269  *result = PARSE_OVERFLOW;
270  return negative ? -max_val : max_val;
271  }
272  val = val * 10 + digit;
273  } else {
274  if ((UNLIKELY(i == first || !IsAllWhitespace(s + i, len - i)))) {
275  // Reject the string because either the first char was not a digit,
276  // or the remaining chars are not all whitespace
277  *result = PARSE_FAILURE;
278  return 0;
279  }
280  // Returning here is slightly faster than breaking the loop.
281  *result = PARSE_SUCCESS;
282  return static_cast<T>(negative ? -val : val);
283  }
284  }
285  *result = PARSE_SUCCESS;
286  return static_cast<T>(negative ? -val : val);
287  }
288 
291  template <typename T>
292  static inline T StringToIntInternal(const char* s, int len, int base,
293  ParseResult* result) {
294  typedef typename boost::make_unsigned<T>::type UnsignedT;
295  UnsignedT val = 0;
296  UnsignedT max_val = std::numeric_limits<T>::max();
297  bool negative = false;
298  if (UNLIKELY(len <= 0)) {
299  *result = PARSE_FAILURE;
300  return 0;
301  }
302  int i = 0;
303  switch (*s) {
304  case '-':
305  negative = true;
306  max_val = std::numeric_limits<T>::max() + 1;
307  case '+': i = 1;
308  }
309 
310  const T max_div_base = max_val / base;
311  const T max_mod_base = max_val % base;
312 
313  int first = i;
314  for (; i < len; ++i) {
315  T digit;
316  if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
317  digit = s[i] - '0';
318  } else if (s[i] >= 'a' && s[i] <= 'z') {
319  digit = (s[i] - 'a' + 10);
320  } else if (s[i] >= 'A' && s[i] <= 'Z') {
321  digit = (s[i] - 'A' + 10);
322  } else {
323  if ((UNLIKELY(i == first || !IsAllWhitespace(s + i, len - i)))) {
324  // Reject the string because either the first char was not an alpha/digit,
325  // or the remaining chars are not all whitespace
326  *result = PARSE_FAILURE;
327  return 0;
328  }
329  // skip trailing whitespace.
330  break;
331  }
332 
333  // Bail, if we encounter a digit that is not available in base.
334  if (digit >= base) break;
335 
336  // This is a tricky check to see if adding this digit will cause an
337  // overflow.
338  if (UNLIKELY(val > (max_div_base - (digit > max_mod_base)))) {
339  *result = PARSE_OVERFLOW;
340  return static_cast<T>(negative ? -max_val : max_val);
341  }
342  val = val * base + digit;
343  }
344  *result = PARSE_SUCCESS;
345  return static_cast<T>(negative ? -val : val);
346  }
347 
356  template <typename T>
357  static inline T StringToFloatInternal(const char* s, int len, ParseResult* result) {
358  if (UNLIKELY(len <= 0)) {
359  *result = PARSE_FAILURE;
360  return 0;
361  }
362 
363  // Use double here to not lose precision while accumulating the result
364  double val = 0;
365  bool negative = false;
366  int i = 0;
367  double divide = 1;
368  bool decimal = false;
369  int64_t remainder = 0;
370  // The number of significant figures we've encountered so far (i.e., digits excluding
371  // leading 0s). This technically shouldn't count trailing 0s either, but for us it
372  // doesn't matter if we count them based on the implementation below.
373  int sig_figs = 0;
374  switch (*s) {
375  case '-': negative = true;
376  case '+': i = 1;
377  }
378  int first = i;
379  for (; i < len; ++i) {
380  if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
381  if (s[i] != '0' || sig_figs > 0) ++sig_figs;
382  if (decimal) {
383  // According to the IEEE floating-point spec, a double has up to 15-17
384  // significant decimal digits (see
385  // http://en.wikipedia.org/wiki/Double-precision_floating-point_format). We stop
386  // processing digits after we've already seen at least 18 sig figs to avoid
387  // overflowing 'remainder' (we stop after 18 instead of 17 to get the rounding
388  // right).
389  if (sig_figs <= 18) {
390  remainder = remainder * 10 + s[i] - '0';
391  divide *= 10;
392  }
393  } else {
394  val = val * 10 + s[i] - '0';
395  }
396  } else if (s[i] == '.') {
397  decimal = true;
398  } else if (s[i] == 'e' || s[i] == 'E') {
399  break;
400  } else if (s[i] == 'i' || s[i] == 'I') {
401  if (len > i + 2 && (s[i+1] == 'n' || s[i+1] == 'N') &&
402  (s[i+2] == 'f' || s[i+2] == 'F')) {
403  // Note: Hive writes inf as Infinity, at least for text. We'll be a little loose
404  // here and interpret any column with inf as a prefix as infinity rather than
405  // checking every remaining byte.
406  *result = PARSE_SUCCESS;
407  return negative ? -INFINITY : INFINITY;
408  } else {
409  // Starts with 'i', but isn't inf...
410  *result = PARSE_FAILURE;
411  return 0;
412  }
413  } else if (s[i] == 'n' || s[i] == 'N') {
414  if (len > i + 2 && (s[i+1] == 'a' || s[i+1] == 'A') &&
415  (s[i+2] == 'n' || s[i+2] == 'N')) {
416  *result = PARSE_SUCCESS;
417  return negative ? -NAN : NAN;
418  } else {
419  // Starts with 'n', but isn't NaN...
420  *result = PARSE_FAILURE;
421  return 0;
422  }
423  } else {
424  if ((UNLIKELY(i == first || !IsAllWhitespace(s + i, len - i)))) {
425  // Reject the string because either the first char was not a digit, "," or "e",
426  // or the remaining chars are not all whitespace
427  *result = PARSE_FAILURE;
428  return 0;
429  }
430  // skip trailing whitespace.
431  break;
432  }
433  }
434 
435  val += remainder / divide;
436 
437  if (i < len && (s[i] == 'e' || s[i] == 'E')) {
438  // Create a C-string from s starting after the optional '-' sign and fall back to
439  // strtod to avoid conversion inaccuracy for scientific notation.
440  // Do not use boost::lexical_cast because it causes codegen to crash for an
441  // unknown reason (exception handling?).
442  char c_str[len - negative + 1];
443  memcpy(c_str, s + negative, len - negative);
444  c_str[len - negative] = '\0';
445  char* s_end;
446  val = strtod(c_str, &s_end);
447  if (s_end != c_str + len - negative) {
448  // skip trailing whitespace
449  int trailing_len = len - negative - (int)(s_end - c_str);
450  if (UNLIKELY(!IsAllWhitespace(s_end, trailing_len))) {
451  *result = PARSE_FAILURE;
452  return val;
453  }
454  }
455  }
456 
457  // Determine if it is an overflow case and update the result
458  if (UNLIKELY(val == std::numeric_limits<T>::infinity())) {
459  *result = PARSE_OVERFLOW;
460  } else {
461  *result = PARSE_SUCCESS;
462  }
463  return (T)(negative ? -val : val);
464  }
465 
468  static inline bool StringToBoolInternal(const char* s, int len, ParseResult* result) {
469  *result = PARSE_SUCCESS;
470  if (len >= 4 && (s[0] == 't' || s[0] == 'T')) {
471  bool match = (s[1] == 'r' || s[1] == 'R') &&
472  (s[2] == 'u' || s[2] == 'U') &&
473  (s[3] == 'e' || s[3] == 'E');
474  if (match && LIKELY(IsAllWhitespace(s+4, len-4))) return true;
475  } else if (len >= 5 && (s[0] == 'f' || s[0] == 'F')) {
476  bool match = (s[1] == 'a' || s[1] == 'A') &&
477  (s[2] == 'l' || s[2] == 'L') &&
478  (s[3] == 's' || s[3] == 'S') &&
479  (s[4] == 'e' || s[4] == 'E');
480  if (match && LIKELY(IsAllWhitespace(s+5, len-5))) return false;
481  }
482  *result = PARSE_FAILURE;
483  return false;
484  }
485 
487  static inline int SkipLeadingWhitespace(const char* s, int len) {
488  int i = 0;
489  while(i < len && IsWhitespace(s[i])) ++i;
490  return i;
491  }
492 
494  static inline bool IsAllWhitespace(const char* s, int len) {
495  for (int i = 0; i < len; ++i) {
496  if (!LIKELY(IsWhitespace(s[i]))) return false;
497  }
498  return true;
499  }
500 
501  template<typename T>
503  public:
506  static int max_ascii_len();
507  };
508 
512  template <typename T>
513  static inline T StringToIntNoOverflow(const char* s, int len, ParseResult* result) {
514  T val = 0;
515  if (UNLIKELY(len == 0)) {
516  *result = PARSE_SUCCESS;
517  return val;
518  }
519  // Factor out the first char for error handling speeds up the loop.
520  if (LIKELY(s[0] >= '0' && s[0] <= '9')) {
521  val = s[0] - '0';
522  } else {
523  *result = PARSE_FAILURE;
524  return 0;
525  }
526  for (int i = 1; i < len; ++i) {
527  if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
528  T digit = s[i] - '0';
529  val = val * 10 + digit;
530  } else {
531  if ((UNLIKELY(!IsAllWhitespace(s + i, len - i)))) {
532  *result = PARSE_FAILURE;
533  return 0;
534  }
535  *result = PARSE_SUCCESS;
536  return val;
537  }
538  }
539  *result = PARSE_SUCCESS;
540  return val;
541  }
542 
543  static inline bool IsWhitespace(const char& c) {
544  return c == ' ' || UNLIKELY(c == '\t' || c == '\n' || c == '\v' || c == '\f'
545  || c == '\r');
546  }
547 };
548 
549 template<>
551 
552 template<>
554 
555 template<>
557 
558 template<>
560 
561 }
562 
563 #endif
static DecimalValue< T > StringToDecimal(const uint8_t *s, int len, const ColumnType &type, StringParser::ParseResult *result)
static DecimalValue< T > StringToDecimal(const char *s, int len, const ColumnType &type, StringParser::ParseResult *result)
static T StringToFloat(const char *s, int len, ParseResult *result)
Definition: string-parser.h:78
static T StringToIntInternal(const char *s, int len, int base, ParseResult *result)
int precision
Only set if type == TYPE_DECIMAL.
Definition: types.h:68
static T StringToFloatInternal(const char *s, int len, ParseResult *result)
static int SkipLeadingWhitespace(const char *s, int len)
Returns the position of the first non-whitespace character in s.
static bool IsWhitespace(const char &c)
static bool StringToBool(const char *s, int len, ParseResult *result)
Parses a string for 'true' or 'false', case insensitive.
Definition: string-parser.h:87
static bool IsAllWhitespace(const char *s, int len)
Returns true if s only contains whitespace.
static T StringToInt(const char *s, int len, int base, ParseResult *result)
Convert a string s representing a number in given base into a decimal number.
Definition: string-parser.h:69
static T StringToIntInternal(const char *s, int len, ParseResult *result)
#define UNLIKELY(expr)
Definition: compiler-util.h:33
static bool StringToBoolInternal(const char *s, int len, ParseResult *result)
#define LIKELY(expr)
Definition: compiler-util.h:32
static T StringToIntNoOverflow(const char *s, int len, ParseResult *result)
static T StringToInt(const char *s, int len, ParseResult *result)
Definition: string-parser.h:59