Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
timestamp-parse-util.h
Go to the documentation of this file.
1 // Copyright 2013 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef IMPALA_RUNTIME_TIMESTAMP_PARSE_UTIL_H
16 #define IMPALA_RUNTIME_TIMESTAMP_PARSE_UTIL_H
17 
18 #include <boost/assign/list_of.hpp>
19 #include <boost/date_time/posix_time/posix_time.hpp>
20 #include <boost/foreach.hpp>
21 #include <boost/unordered_map.hpp>
22 #include "common/status.h"
24 #include "util/string-parser.h"
25 
26 namespace impala {
27 
30  UNKNOWN = 0,
44 };
45 
52  int pos;
54  int len;
57  const char* val;
58 
60  : type(type),
61  pos(pos),
62  len(len),
63  val(val) {
64  }
65 };
66 
72  const char* fmt;
73  int fmt_len;
81  std::vector<DateTimeFormatToken> toks;
84 
86  Reset(NULL, 0);
87  }
88 
// Builds a context for the format string fmt of length fmt_len by delegating to Reset().
89  DateTimeFormatContext(const char* fmt, int fmt_len) {
90  Reset(fmt, fmt_len);
91  }
92 
// Points this context at a new format string and discards all derived state:
// fmt_out_len starts out equal to fmt_len, both has_date_toks and has_time_toks are
// cleared, and the token vector is emptied. The fmt pointer is stored as-is (no copy).
93  void Reset(const char* fmt, int fmt_len) {
94  this->fmt = fmt;
95  this->fmt_len = fmt_len;
96  this->fmt_out_len = fmt_len;
97  this->has_date_toks = false;
98  this->has_time_toks = false;
99  this->toks.clear();
100  }
101 };
102 
105  int year;
106  int month;
107  int day;
108  int hour;
109  int minute;
110  int second;
111  int32_t fraction;
112 
114  : year(0),
115  month(0),
116  day(0),
117  hour(0),
118  minute(0),
119  second(0),
120  fraction(0) {
121  }
122 };
123 
126  public:
130  static void Init();
131 
// Tokenizes the format string held in dt_ctx->fmt and appends the tokens to dt_ctx->toks.
//   - 'T', 'Z' and every non-alphabetic character become one-character SEPARATOR tokens.
//   - A run of one of the letters y, M, d, H, m, s or S becomes a single token whose
//     length is the run length; a run of exactly three 'M' is typed MONTH_IN_YEAR_SLT.
// dt_ctx must be non-NULL, with a non-NULL, non-empty fmt and an empty toks vector
// (all DCHECKed).
// Returns false if the format contains a digit, an unsupported letter, or more than
// three consecutive 'M', or if no date/time token was found at all. On an early false
// return, tokens pushed before the offending character remain in dt_ctx->toks.
// NOTE(review): isdigit()/isalpha() are called with a plain char. The C library only
// defines them for unsigned char values and EOF, so a negative char is undefined
// behavior - consider casting to unsigned char.
135  static inline bool ParseFormatTokens(DateTimeFormatContext* dt_ctx) {
136  DCHECK(dt_ctx != NULL);
137  DCHECK(dt_ctx->fmt != NULL);
138  DCHECK(dt_ctx->fmt_len > 0);
139  DCHECK(dt_ctx->toks.size() == 0);
140  const char* str_begin = dt_ctx->fmt;
141  const char* str_end = str_begin + dt_ctx->fmt_len;
142  const char* str = str_begin;
143  // Parse the tokens from the format string
144  while (str < str_end) {
     // Digits are never valid in a format string.
145  if (isdigit(*str)) return false;
146  // Ignore T|Z|non aA-zZ chars but track them as separators (required for printing).
147  if ((*str == 'T') || (*str == 'Z') || (!isalpha(*str))) {
148  dt_ctx->toks.push_back(DateTimeFormatToken(SEPARATOR, str - str_begin, 1, str));
149  ++str;
150  continue;
151  }
152  DateTimeFormatTokenType tok_type = UNKNOWN;
153  switch (*str) {
154  case 'y': tok_type = YEAR; break;
155  case 'M': tok_type = MONTH_IN_YEAR; break;
156  case 'd': tok_type = DAY_IN_MONTH; break;
157  case 'H': tok_type = HOUR_IN_DAY; break;
158  case 'm': tok_type = MINUTE_IN_HOUR; break;
159  case 's': tok_type = SECOND_IN_MINUTE; break;
160  case 'S': tok_type = FRACTION; break;
161  // Error on aA-zZ reserved characters that are not used yet.
162  default: return false;
163  }
     // Date vs. time classification is done by comparing against HOUR_IN_DAY, so it
     // depends on the enumerator order (enum body not visible here - confirm ordering).
164  dt_ctx->has_date_toks |= tok_type < HOUR_IN_DAY;
165  dt_ctx->has_time_toks |= tok_type >= HOUR_IN_DAY;
166  // Get the token group length
167  int tok_len = 1;
168  char tok_chr = *str;
169  const char* curr_tok_chr = str + 1;
170  while (curr_tok_chr < str_end) {
171  if (*curr_tok_chr != tok_chr) break;
172  ++tok_len;
173  ++curr_tok_chr;
174  }
     // Months accept at most three letters; exactly three switches to the
     // MONTH_IN_YEAR_SLT token type.
175  if (tok_type == MONTH_IN_YEAR) {
176  if (UNLIKELY(tok_len > 3)) return false;
177  if (tok_len == 3) tok_type = MONTH_IN_YEAR_SLT;
178  }
179  // In an output scenario, fmt_out_len is used to determine the print buffer size.
180  // If the format uses short token groups e.g. yyyy-MM-d, there must be enough
181  // room in the buffer for wider values e.g. 2013-12-16.
182  if (tok_len == 1) ++dt_ctx->fmt_out_len;
     // The token records its offset within the format string and a pointer to its
     // first character; the loop then skips past the whole group.
183  DateTimeFormatToken tok(tok_type, str - str_begin, tok_len, str);
184  str += tok.len;
185  dt_ctx->toks.push_back(tok);
186  }
     // A format made up solely of separators is rejected.
187  return dt_ctx->has_date_toks || dt_ctx->has_time_toks;
188  }
189 
// Parses a timestamp string using one of the built-in default format contexts, writing
// the date to *d and the time to *t. Leading and trailing whitespace (per isspace) is
// skipped first. The context is then picked from the trimmed length and from marker
// characters: '-' at index 4 selects the date-first contexts (with ' ' or 'T' at index
// 10 choosing between the plain and ISO variants), ':' at index 2 selects the
// DEFAULT_TIME_* contexts (with '.' at index 8 selecting a fractional one).
// Returns the result of the context-based Parse() overload. If str is NULL, len <= 0,
// or no context matches, *d is set to a default-constructed date, *t to
// not_a_date_time, and false is returned.
// NOTE(review): several lines of this function (some case labels and the index
// expressions of two assignments) are absent from this listing; the comments here
// describe the visible lines only.
202  static inline bool Parse(const char* str, int len, boost::gregorian::date* d,
203  boost::posix_time::time_duration* t) {
205  DCHECK(d != NULL);
206  DCHECK(t != NULL);
207  if (UNLIKELY(str == NULL || len <= 0)) {
208  *d = boost::gregorian::date();
209  *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
210  return false;
211  }
212  // Remove leading white space.
213  while (len > 0 && isspace(*str)) {
214  ++str;
215  --len;
216  }
217  // Strip the trailing blanks.
218  while (len > 0 && isspace(str[len - 1])) --len;
219  // Only process what we have to.
221  // Determine the default formatting context that's required for parsing.
     // dt_ctx stays NULL when nothing matches, which is reported as a parse failure below.
222  DateTimeFormatContext* dt_ctx = NULL;
223  if (LIKELY(len >= DEFAULT_TIME_FMT_LEN)) {
224  // This string starts with a date component
225  if (str[4] == '-') {
226  switch (len) {
227  case DEFAULT_DATE_FMT_LEN: {
228  dt_ctx = &DEFAULT_DATE_CTX;
229  break;
230  }
232  switch (str[10]) {
233  case ' ': dt_ctx = &DEFAULT_SHORT_DATE_TIME_CTX; break;
234  case 'T': dt_ctx = &DEFAULT_SHORT_ISO_DATE_TIME_CTX; break;
235  }
236  break;
237  }
239  switch (str[10]) {
240  case ' ': dt_ctx = &DEFAULT_DATE_TIME_CTX[9]; break;
241  case 'T': dt_ctx = &DEFAULT_ISO_DATE_TIME_CTX[9]; break;
242  }
243  break;
244  }
245  default: {
246  // There is likely a fractional component that's below the expected 9 chars.
247  // We will need to work out which default context to use that corresponds to
248  // the fractional length in the string.
     // NOTE(review): str[10] is read here; confirm that a line missing from this
     // listing guarantees len > 10 on this path.
250  switch (str[10]) {
251  case ' ': {
252  dt_ctx =
254  break;
255  }
256  case 'T': {
257  dt_ctx = &DEFAULT_ISO_DATE_TIME_CTX
259  break;
260  }
261  }
262  }
263  break;
264  }
265  }
266  } else if (str[2] == ':') {
     // The index below equals the number of characters following "HH:mm:ss.".
     // NOTE(review): DEFAULT_TIME_FRAC_CTX is declared with 10 elements; confirm len
     // is bounded before this point so the index stays in range.
268  if (len > DEFAULT_TIME_FMT_LEN && str[8] == '.') {
269  dt_ctx = &DEFAULT_TIME_FRAC_CTX[len - DEFAULT_TIME_FMT_LEN - 1];
270  } else {
271  dt_ctx = &DEFAULT_TIME_CTX;
272  }
273  }
274  }
275  if (LIKELY(dt_ctx != NULL)) {
276  return Parse(str, len, *dt_ctx, d, t);
277  } else {
     // No default layout matched: report failure with both outputs reset.
278  *d = boost::gregorian::date();
279  *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
280  return false;
281  }
282  }
283 
// Parses the first len characters of str according to the already-tokenized format
// context dt_ctx, writing the date to *d and the time to *t.
// Returns true on success. Returns false - with *d set to a default-constructed date
// and *t set to not_a_date_time - if str is NULL, len <= 0, ParseDateTime() rejects the
// input, or constructing the boost date throws.
// When the context has no date tokens *d is a default-constructed date; when it has no
// time tokens *t is a zero duration.
292  static inline bool Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx,
293  boost::gregorian::date* d, boost::posix_time::time_duration* t) {
295  DCHECK(dt_ctx.toks.size() > 0);
296  DCHECK(d != NULL);
297  DCHECK(t != NULL);
298  DateTimeParseResult dt_result;
299  if (UNLIKELY(str == NULL || len <= 0 ||
300  !ParseDateTime(str, len, dt_ctx, &dt_result))) {
301  *d = boost::gregorian::date();
302  *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
303  return false;
304  }
305  if (dt_ctx.has_date_toks) {
     // ParseDateTime() only range-checks the individual fields; a boost::exception
     // raised while building the date is treated as an invalid date.
306  try {
307  *d = boost::gregorian::date(dt_result.year, dt_result.month, dt_result.day);
308  } catch (boost::exception& e) {
309  VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-"
310  << dt_result.day;
311  *d = boost::gregorian::date();
312  *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
313  return false;
314  }
315  } else {
316  *d = boost::gregorian::date();
317  }
318  if (dt_ctx.has_time_toks) {
319  *t = boost::posix_time::time_duration(dt_result.hour, dt_result.minute,
320  dt_result.second, dt_result.fraction);
321  } else {
322  *t = boost::posix_time::time_duration(0, 0, 0, 0);
323  }
324  return true;
325  }
326 
// Renders date d and time t into buff following the tokens of dt_ctx and terminates the
// result with '\0'.
// Returns the number of characters written, not counting the terminator, or -1 when
// the context contains date (time) tokens but d (t) is a special value.
// len is the capacity of buff and must exceed dt_ctx.fmt_out_len; this is DCHECKed only.
// NOTE(review): sprintf() below is unbounded and len is not consulted in the visible
// lines outside the DCHECK - confirm callers always size buff from fmt_out_len.
335  static inline int Format(const DateTimeFormatContext& dt_ctx,
336  const boost::gregorian::date& d, const boost::posix_time::time_duration& t,
337  int len, char* buff) {
339  DCHECK(dt_ctx.toks.size() > 0);
340  DCHECK(len > dt_ctx.fmt_out_len);
341  DCHECK(buff != NULL);
342  if (dt_ctx.has_date_toks && d.is_special()) return -1;
343  if (dt_ctx.has_time_toks && t.is_special()) return -1;
344  char* str = buff;
345  BOOST_FOREACH(const DateTimeFormatToken& tok, dt_ctx.toks) {
     // Each token yields either a number (num_val >= 0) or a character run
     // (str_val/str_val_len); num_val == -1 marks the non-numeric case.
346  int32_t num_val = -1;
347  const char* str_val = NULL;
348  int str_val_len = 0;
349  switch (tok.type) {
350  case YEAR: {
351  num_val = d.year();
     // Year groups of up to three letters print only the last two digits of the year.
352  if (tok.len <= 3) num_val %= 100;
353  break;
354  }
355  case MONTH_IN_YEAR: num_val = d.month().as_number(); break;
356  case MONTH_IN_YEAR_SLT: {
357  str_val = d.month().as_short_string();
358  str_val_len = 3;
359  break;
360  }
361  case DAY_IN_MONTH: num_val = d.day(); break;
362  case HOUR_IN_DAY: num_val = t.hours(); break;
363  case MINUTE_IN_HOUR: num_val = t.minutes(); break;
364  case SECOND_IN_MINUTE: num_val = t.seconds(); break;
365  case FRACTION: {
366  num_val = t.fractional_seconds();
     // Drops one low-order digit for every position the token is shorter than 9;
     // presumably fractional_seconds() carries 9 digits (nanoseconds) - TODO confirm.
367  if (num_val > 0) for (int j = tok.len; j < 9; ++j) num_val /= 10;
368  break;
369  }
370  case SEPARATOR: {
371  str_val = tok.val;
372  str_val_len = tok.len;
373  break;
374  }
375  default: DCHECK(false) << "Unknown date/time format token";
376  }
377  if (num_val > -1) {
     // Numeric fields are zero-padded to the width of the token group.
378  str += sprintf(str, "%0*d", tok.len, num_val);
379  } else {
380  memcpy(str, str_val, str_val_len);
381  str += str_val_len;
382  }
383  }
385  *str = '\0';
386  return str - buff;
387  }
388 
389  private:
// Extracts the date/time fields of str into *dt_result by walking the tokens of dt_ctx.
// Every field is read at the offset (tok.pos) and width (tok.len) the token has in the
// format string, so the input must be at least dt_ctx.fmt_len characters long.
// Returns false if str is NULL, too short, a separator character does not match, a
// field fails integer parsing, or a field is outside its accepted range
// (year 1-9999, month 1-12, day 1-31, hour 0-23, minute 0-59, second 0-59).
// Fields whose token type is absent from the context are left untouched in *dt_result.
// NOTE(review): `status` is not declared in the visible lines; its declaration is
// presumably on a line missing from this listing.
390  static inline bool ParseDateTime(const char* str, int str_len,
391  const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result) {
392  DCHECK(dt_ctx.fmt_len > 0);
393  DCHECK(dt_ctx.toks.size() > 0);
394  DCHECK(dt_result != NULL);
395  if (str_len <= 0 || str_len < dt_ctx.fmt_len || str == NULL) return false;
397  BOOST_FOREACH(const DateTimeFormatToken& tok, dt_ctx.toks) {
398  const char* tok_val = str + tok.pos;
399  if (tok.type == SEPARATOR) {
     // Only the first character of the separator is compared with the input.
400  if (UNLIKELY(*tok_val != *tok.val)) return false;
401  continue;
402  }
403  switch (tok.type) {
404  case YEAR: {
405  dt_result->year = StringParser::StringToInt<int>(tok_val, tok.len, &status);
406  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
407  if (UNLIKELY(dt_result->year < 1 || dt_result->year > 9999)) return false;
     // Short year groups (fewer than 4 letters) are shifted into the 2000s.
     // NOTE(review): the `< 99` bound leaves a parsed value of exactly 99 unshifted;
     // confirm whether `<= 99` was intended.
408  if (tok.len < 4 && dt_result->year < 99) dt_result->year += 2000;
409  break;
410  }
411  case MONTH_IN_YEAR: {
412  dt_result->month = StringParser::StringToInt<int>(tok_val, tok.len, &status);
413  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
414  if (UNLIKELY(dt_result->month < 1 || dt_result->month > 12)) return false;
415  break;
416  }
417  case MONTH_IN_YEAR_SLT: {
     // The month name is lower-cased into a scratch buffer and looked up in
     // REV_MONTH_INDEX. NOTE(review): raw_buff is a variable-length array, which is
     // a compiler extension in C++.
418  char raw_buff[tok.len];
419  std::transform(tok_val, tok_val + tok.len, raw_buff, ::tolower);
420  StringValue buff(raw_buff, tok.len);
421  boost::unordered_map<StringValue, int>::const_iterator iter =
422  REV_MONTH_INDEX.find(buff);
423  if (UNLIKELY(iter == REV_MONTH_INDEX.end())) return false;
424  dt_result->month = iter->second;
425  break;
426  }
427  case DAY_IN_MONTH: {
428  dt_result->day = StringParser::StringToInt<int>(tok_val, tok.len, &status);
429  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
430  // TODO: Validate that the value of day is correct for the given month.
431  if (UNLIKELY(dt_result->day < 1 || dt_result->day > 31)) return false;
432  break;
433  }
434  case HOUR_IN_DAY: {
435  dt_result->hour = StringParser::StringToInt<int>(tok_val, tok.len, &status);
436  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
437  if (UNLIKELY(dt_result->hour < 0 || dt_result->hour > 23)) return false;
438  break;
439  }
440  case MINUTE_IN_HOUR: {
441  dt_result->minute = StringParser::StringToInt<int>(tok_val, tok.len, &status);
442  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
443  if (UNLIKELY(dt_result->minute < 0 || dt_result->minute > 59)) return false;
444  break;
445  }
446  case SECOND_IN_MINUTE: {
447  dt_result->second = StringParser::StringToInt<int>(tok_val, tok.len, &status);
448  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
449  if (UNLIKELY(dt_result->second < 0 || dt_result->second > 59)) return false;
450  break;
451  }
452  case FRACTION: {
453  dt_result->fraction =
454  StringParser::StringToInt<int32_t>(tok_val, tok.len, &status);
455  if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
456  // A user may specify a time of 04:30:22.1238, the parser will return 1238 for
457  // the fractional portion. This does not represent the intended value of
458  // 123800000, therefore the number must be scaled up.
459  for (int i = tok.len; i < 9; ++i) dt_result->fraction *= 10;
460  break;
461  }
462  default: DCHECK(false) << "Unknown date/time format token";
463  }
464  }
465  return true;
466  }
467 
// Constants to hold default format lengths.
469  static const int DEFAULT_DATE_FMT_LEN = 10;
470  static const int DEFAULT_TIME_FMT_LEN = 8;
471  static const int DEFAULT_TIME_FRAC_FMT_LEN = 18;
472  static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN = 19;
473  static const int DEFAULT_DATE_TIME_FMT_LEN = 29;
474 
// Used to indicate if the parsing state has been initialized.
476  static bool initialized_;
477 
// Lazily initialized pseudo-constant hashmap for mapping month names to an index.
479  static boost::unordered_map<StringValue, int> REV_MONTH_INDEX;
480 
492 };
493 
494 }
495 
496 #endif
static DateTimeFormatContext DEFAULT_DATE_TIME_CTX[10]
Used to store metadata about a token group within a date/time format.
Used for parsing both default and custom formatted timestamp values.
static const int DEFAULT_TIME_FRAC_FMT_LEN
DateTimeFormatContext(const char *fmt, int fmt_len)
static bool ParseFormatTokens(DateTimeFormatContext *dt_ctx)
DateTimeFormatTokenType
Used to indicate the type of a date/time format token group.
static bool Parse(const char *str, int len, boost::gregorian::date *d, boost::posix_time::time_duration *t)
static boost::unordered_map< StringValue, int > REV_MONTH_INDEX
Lazily initialized pseudo-constant hashmap for mapping month names to an index.
void Reset(const char *fmt, int fmt_len)
static bool initialized_
Used to indicate if the parsing state has been initialized.
static const int DEFAULT_DATE_TIME_FMT_LEN
static const int DEFAULT_DATE_FMT_LEN
Constants to hold default format lengths.
DateTimeFormatTokenType type
Indicates the type of date/time format token e.g. year.
static DateTimeFormatContext DEFAULT_TIME_FRAC_CTX[10]
static DateTimeFormatContext DEFAULT_TIME_CTX
int len
The length of the token group.
static DateTimeFormatContext DEFAULT_SHORT_DATE_TIME_CTX
static int Format(const DateTimeFormatContext &dt_ctx, const boost::gregorian::date &d, const boost::posix_time::time_duration &t, int len, char *buff)
static DateTimeFormatContext DEFAULT_ISO_DATE_TIME_CTX[10]
#define VLOG_ROW
Definition: logging.h:59
static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN
static bool Parse(const char *str, int len, const DateTimeFormatContext &dt_ctx, boost::gregorian::date *d, boost::posix_time::time_duration *t)
static bool ParseDateTime(const char *str, int str_len, const DateTimeFormatContext &dt_ctx, DateTimeParseResult *dt_result)
DateTimeFormatToken(DateTimeFormatTokenType type, int pos, int len, const char *val)
#define UNLIKELY(expr)
Definition: compiler-util.h:33
#define LIKELY(expr)
Definition: compiler-util.h:32
std::vector< DateTimeFormatToken > toks
static DateTimeFormatContext DEFAULT_DATE_CTX
Stores the results of parsing a date/time string.
static DateTimeFormatContext DEFAULT_SHORT_ISO_DATE_TIME_CTX
static const int DEFAULT_TIME_FMT_LEN