Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
url-coding.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "util/url-coding.h"
16 
17 #include <exception>
18 #include <sstream>
19 #include <boost/algorithm/string.hpp>
20 #include <boost/archive/iterators/base64_from_binary.hpp>
21 #include <boost/archive/iterators/binary_from_base64.hpp>
22 #include <boost/archive/iterators/transform_width.hpp>
23 #include <boost/foreach.hpp>
24 
25 #include "common/logging.h"
26 
27 #include "common/names.h"
28 
29 using boost::algorithm::is_any_of;
30 using boost::archive::iterators::base64_from_binary;
31 using boost::archive::iterators::binary_from_base64;
32 using boost::archive::iterators::transform_width;
33 using namespace impala;
34 using std::uppercase;
35 
36 namespace impala {
37 
38 // Hive selectively encodes characters. This is the whitelist of
39 // characters it will encode.
40 // See common/src/java/org/apache/hadoop/hive/common/FileUtils.java
41 // in the Hive source code for the source of this list.
42 static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF");
43 
44 // It is more convenient to maintain the complement of the set of
45 // characters to escape when not in Hive-compat mode.
46 static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
47 
48 static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
49  (*out).reserve(in_len);
50  stringstream ss;
51  for (int i = 0; i < in_len; ++i) {
52  const char ch = in[i];
53  // Escape the character iff a) we are in Hive-compat mode and the
54  // character is in the Hive whitelist or b) we are not in
55  // Hive-compat mode, and the character is not alphanumeric or one
56  // of the four commonly excluded characters.
57  if ((hive_compat && HiveShouldEscape(ch)) ||
58  (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) {
59  ss << '%' << uppercase << hex << static_cast<uint32_t>(ch);
60  } else {
61  ss << ch;
62  }
63  }
64 
65  (*out) = ss.str();
66 }
67 
68 void UrlEncode(const vector<uint8_t>& in, string* out, bool hive_compat) {
69  if (in.empty()) {
70  *out = "";
71  } else {
72  UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out, hive_compat);
73  }
74 }
75 
76 void UrlEncode(const string& in, string* out, bool hive_compat) {
77  UrlEncode(in.c_str(), in.size(), out, hive_compat);
78 }
79 
// Returns the value (0-15) of the hexadecimal digit 'c', or -1 if 'c' is not one.
static inline int HexDigitValue(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}

// Adapted from
// http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/
// example/http/server3/request_handler.cpp
// See http://www.boost.org/LICENSE_1_0.txt for license for this method.
//
// Decodes the URL-encoded string 'in' into '*out', whose previous contents are
// discarded. Each "%XY" sequence, where X and Y are hex digits of either case, becomes
// the byte with that value. Unless 'hive_compat' is set, '+' becomes a space. Every
// other character is copied through unchanged.
//
// Returns false if a '%' is not followed by exactly two hex digits; in that case '*out'
// holds whatever was decoded before the malformed escape. Returns true otherwise.
bool UrlDecode(const string& in, string* out, bool hive_compat) {
  out->clear();
  out->reserve(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    const char ch = in[i];
    if (ch == '%') {
      // An escape needs the '%' plus two more characters.
      if (in.size() - i < 3) return false;
      // Check both digits explicitly. Stream extraction would also accept a single
      // digit, a sign or leading whitespace ("%4G", "%-1", "% 1").
      const int high = HexDigitValue(in[i + 1]);
      const int low = HexDigitValue(in[i + 2]);
      if (high < 0 || low < 0) return false;
      (*out) += static_cast<char>((high << 4) | low);
      i += 2;
    } else if (!hive_compat && ch == '+') { // Hive does not encode ' ' as '+'
      (*out) += ' ';
    } else {
      (*out) += ch;
    }
  }
  return true;
}
109 
110 static inline void Base64Encode(const char* in, int in_len, stringstream* out) {
111  typedef base64_from_binary<transform_width<const char*, 6, 8> > base64_encode;
112  // Base64 encodes 8 byte chars as 6 bit values.
113  stringstream::pos_type len_before = out->tellp();
114  copy(base64_encode(in), base64_encode(in + in_len), std::ostream_iterator<char>(*out));
115  int bytes_written = out->tellp() - len_before;
116  // Pad with = to make it valid base64 encoded string
117  int num_pad = bytes_written % 4;
118  if (num_pad != 0) {
119  num_pad = 4 - num_pad;
120  for (int i = 0; i < num_pad; ++i) {
121  (*out) << "=";
122  }
123  }
124  DCHECK_EQ((out->tellp() - len_before) % 4, 0);
125 }
126 
127 void Base64Encode(const vector<uint8_t>& in, string* out) {
128  if (in.empty()) {
129  *out = "";
130  } else {
131  stringstream ss;
132  Base64Encode(in, &ss);
133  *out = ss.str();
134  }
135 }
136 
137 void Base64Encode(const vector<uint8_t>& in, stringstream* out) {
138  if (!in.empty()) {
139  // Boost does not like non-null terminated strings
140  string tmp(reinterpret_cast<const char*>(&in[0]), in.size());
141  Base64Encode(tmp.c_str(), tmp.size(), out);
142  }
143 }
144 
145 void Base64Encode(const string& in, string* out) {
146  stringstream ss;
147  Base64Encode(in.c_str(), in.size(), &ss);
148  *out = ss.str();
149 }
150 
151 void Base64Encode(const string& in, stringstream* out) {
152  Base64Encode(in.c_str(), in.size(), out);
153 }
154 
155 bool Base64Decode(const string& in, string* out) {
156  typedef transform_width<
157  binary_from_base64<string::const_iterator> , 8, 6> base64_decode;
158  string tmp = in;
159  // Replace padding with base64 encoded NULL
160  replace(tmp.begin(), tmp.end(), '=', 'A');
161  try {
162  *out = string(base64_decode(tmp.begin()), base64_decode(tmp.end()));
163  } catch(std::exception& e) {
164  return false;
165  }
166 
167  // Remove trailing '\0' that were added as padding. Since \0 is special,
168  // the boost functions get confused so do this manually.
169  int num_padded_chars = 0;
170  for (int i = out->size() - 1; i >= 0; --i) {
171  if ((*out)[i] != '\0') break;
172  ++num_padded_chars;
173  }
174  out->resize(out->size() - num_padded_chars);
175  return true;
176 }
177 
178 void EscapeForHtml(const string& in, stringstream* out) {
179  DCHECK(out != NULL);
180  BOOST_FOREACH(const char& c, in) {
181  switch (c) {
182  case '<': (*out) << "&lt;";
183  break;
184  case '>': (*out) << "&gt;";
185  break;
186  case '&': (*out) << "&amp;";
187  break;
188  default: (*out) << c;
189  }
190  }
191 }
192 
193 }
static function< bool(char)> HiveShouldEscape
Definition: url-coding.cc:42
bool UrlDecode(const string &in, string *out, bool hive_compat)
Definition: url-coding.cc:84
static void Base64Encode(const char *in, int in_len, stringstream *out)
Definition: url-coding.cc:110
static void UrlEncode(const char *in, int in_len, string *out, bool hive_compat)
Definition: url-coding.cc:48
bool Base64Decode(const string &in, string *out)
Definition: url-coding.cc:155
void EscapeForHtml(const string &in, stringstream *out)
Definition: url-coding.cc:178
static function< bool(char)> ShouldNotEscape
Definition: url-coding.cc:46