Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
url-parser.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "util/url-parser.h"
17 
18 #include "common/names.h"
19 
20 namespace impala {
21 
22 const StringValue UrlParser::url_authority(const_cast<char*>("AUTHORITY"), 9);
23 const StringValue UrlParser::url_file(const_cast<char*>("FILE"), 4);
24 const StringValue UrlParser::url_host(const_cast<char*>("HOST"), 4);
25 const StringValue UrlParser::url_path(const_cast<char*>("PATH"), 4);
26 const StringValue UrlParser::url_protocol(const_cast<char*>("PROTOCOL"), 8);
27 const StringValue UrlParser::url_query(const_cast<char*>("QUERY"), 5);
28 const StringValue UrlParser::url_ref(const_cast<char*>("REF"), 3);
29 const StringValue UrlParser::url_userinfo(const_cast<char*>("USERINFO"), 8);
30 const StringValue UrlParser::protocol(const_cast<char*>("://"), 3);
31 const StringValue UrlParser::at(const_cast<char*>("@"), 1);
32 const StringValue UrlParser::slash(const_cast<char*>("/"), 1);
33 const StringValue UrlParser::colon(const_cast<char*>(":"), 1);
34 const StringValue UrlParser::question(const_cast<char*>("?"), 1);
35 const StringValue UrlParser::hash(const_cast<char*>("#"), 1);
42 
43 bool UrlParser::ParseUrl(const StringValue& url, UrlPart part, StringValue* result) {
44  result->ptr = NULL;
45  result->len = 0;
46  // Remove leading and trailing spaces.
47  StringValue trimmed_url = url.Trim();
48 
49  // All parts require checking for the protocol.
50  int32_t protocol_pos = protocol_search.Search(&trimmed_url);
51  if (protocol_pos < 0) return false;
52  // Positioned to first char after '://'.
53  StringValue protocol_end = trimmed_url.Substring(protocol_pos + protocol.len);
54 
55  switch(part) {
56  case AUTHORITY: {
57  // Find first '/'.
58  int32_t end_pos = slash_search.Search(&protocol_end);
59  *result = protocol_end.Substring(0, end_pos);
60  break;
61  }
62 
63  case FILE:
64  case PATH: {
65  // Find first '/'.
66  int32_t start_pos = slash_search.Search(&protocol_end);
67  if (start_pos < 0) {
68  // Return empty string. This is what Hive does.
69  return true;
70  }
71  StringValue path_start = protocol_end.Substring(start_pos);
72  int32_t end_pos;
73  if (part == FILE) {
74  // End at '#'.
75  end_pos = hash_search.Search(&path_start);
76  } else {
77  // End string at next '?' or '#'.
78  end_pos = question_search.Search(&path_start);
79  if (end_pos < 0) {
80  // No '?' was found, look for '#'.
81  end_pos = hash_search.Search(&path_start);
82  }
83  }
84  *result = path_start.Substring(0, end_pos);
85  break;
86  }
87 
88  case HOST: {
89  // Find '@'.
90  int32_t start_pos = at_search.Search(&protocol_end);
91  if (start_pos < 0) {
92  // No '@' was found, i.e., no user:pass info was given, start after protocol.
93  start_pos = 0;
94  } else {
95  // Skip '@'.
96  start_pos += at.len;
97  }
98  StringValue host_start = protocol_end.Substring(start_pos);
99 
100  // Find the start of the query
101  int32_t query_start_pos = question_search.Search(&host_start);
102  StringValue url_only = host_start.Substring(0, query_start_pos);
103 
104  // Find the first '/' in url_only to determine host<:port>
105  int32_t hostport_end_pos = slash_search.Search(&url_only);
106  StringValue hostport = url_only.Substring(0, hostport_end_pos);
107 
108  // Find ':' to strip out port.
109  int32_t end_pos = colon_search.Search(&hostport);
110  *result = hostport.Substring(0, end_pos);
111  break;
112  }
113 
114  case PROTOCOL: {
115  *result = trimmed_url.Substring(0, protocol_pos);
116  break;
117  }
118 
119  case QUERY: {
120  // Find first '?'.
121  int32_t start_pos = question_search.Search(&protocol_end);
122  if (start_pos < 0) {
123  // Indicate no query was found.
124  return false;
125  }
126  StringValue query_start = protocol_end.Substring(start_pos + question.len);
127  // End string at next '#'.
128  int32_t end_pos = hash_search.Search(&query_start);
129  *result = query_start.Substring(0, end_pos);
130  break;
131  }
132 
133  case REF: {
134  // Find '#'.
135  int32_t start_pos = hash_search.Search(&protocol_end);
136  if (start_pos < 0) {
137  // Indicate no user and pass were given.
138  return false;
139  }
140  *result = protocol_end.Substring(start_pos + hash.len);
141  break;
142  }
143 
144  case USERINFO: {
145  // Find '@'.
146  int32_t end_pos = at_search.Search(&protocol_end);
147  if (end_pos < 0) {
148  // Indicate no user and pass were given.
149  return false;
150  }
151  *result = protocol_end.Substring(0, end_pos);
152  break;
153  }
154 
155  case INVALID: return false;
156  }
157  return true;
158 }
159 
161  const StringValue& key, StringValue* result) {
162  // Part must be query to ask for a specific query key.
163  if (part != QUERY) {
164  return false;
165  }
166  // Remove leading and trailing spaces.
167  StringValue trimmed_url = url.Trim();
168 
169  // Search for the key in the url, ignoring malformed URLs for now.
170  StringSearch key_search(&key);
171  while(trimmed_url.len > 0) {
172  // Search for the key in the current substring.
173  int32_t key_pos = key_search.Search(&trimmed_url);
174  bool match = true;
175  if (key_pos < 0) {
176  return false;
177  }
178  // Key pos must be != 0 because it must be preceded by a '?' or a '&'.
179  // Check that the char before key_pos is either '?' or '&'.
180  if (key_pos == 0 ||
181  (trimmed_url.ptr[key_pos - 1] != '?' && trimmed_url.ptr[key_pos - 1] != '&')) {
182  match = false;
183  }
184  // Advance substring beyond matching key.
185  trimmed_url = trimmed_url.Substring(key_pos + key.len);
186  if (!match) {
187  continue;
188  }
189  if (trimmed_url.len <= 0) {
190  break;
191  }
192  // Next character must be '=', otherwise the match cannot be a key in the query part.
193  if (trimmed_url.ptr[0] != '=') {
194  continue;
195  }
196  int32_t pos = 1;
197  // Find ending position of key's value by matching '#' or '&'.
198  while(pos < trimmed_url.len) {
199  switch(trimmed_url.ptr[pos]) {
200  case '#':
201  case '&':
202  *result = trimmed_url.Substring(1, pos - 1);
203  return true;
204  }
205  ++pos;
206  }
207  // Ending position is end of string.
208  *result = trimmed_url.Substring(1);
209  return true;
210  }
211  return false;
212 }
213 
215  // Quick filter on requested URL part, based on first character.
216  // Hive requires the requested URL part to be all upper case.
217  switch(part.ptr[0]) {
218  case 'A': {
219  if (!part.Eq(url_authority)) return INVALID;
220  return AUTHORITY;
221  }
222  case 'F': {
223  if (!part.Eq(url_file)) return INVALID;
224  return FILE;
225  }
226  case 'H': {
227  if (!part.Eq(url_host)) return INVALID;
228  return HOST;
229  }
230  case 'P': {
231  if (part.Eq(url_path)) {
232  return PATH;
233  } else if (part.Eq(url_protocol)) {
234  return PROTOCOL;
235  } else {
236  return INVALID;
237  }
238  }
239  case 'Q': {
240  if (!part.Eq(url_query)) return INVALID;
241  return QUERY;
242  }
243  case 'R': {
244  if (!part.Eq(url_ref)) return INVALID;
245  return REF;
246  }
247  case 'U': {
248  if (!part.Eq(url_userinfo)) return INVALID;
249  return USERINFO;
250  }
251  default: return INVALID;
252  }
253 }
254 
255 }
bool Eq(const StringValue &other) const
==
static const StringValue url_path
Definition: url-parser.h:77
static const StringValue url_authority
Definition: url-parser.h:74
const StringSearch UrlParser::colon_search(&colon)
Definition: url-parser.cc:39
const StringSearch UrlParser::slash_search(&slash)
Definition: url-parser.cc:38
const StringSearch UrlParser::protocol_search(&protocol)
Definition: url-parser.cc:36
const StringSearch UrlParser::at_search(&at)
Definition: url-parser.cc:37
int Search(const StringValue *str) const
const StringSearch UrlParser::question_search(&question)
Definition: url-parser.cc:40
const StringSearch UrlParser::hash_search(&hash)
Definition: url-parser.cc:41
static const StringValue question
Definition: url-parser.h:87
static const StringSearch protocol_search
Definition: url-parser.h:89
StringValue Trim() const
Trims leading and trailing spaces.
static const StringValue protocol
Definition: url-parser.h:83
static const StringSearch question_search
Definition: url-parser.h:93
static UrlPart GetUrlPart(const StringValue &part)
Definition: url-parser.cc:214
static const StringValue url_userinfo
Definition: url-parser.h:81
static const StringValue url_file
Definition: url-parser.h:75
static const StringValue hash
Definition: url-parser.h:88
static const StringSearch at_search
Definition: url-parser.h:90
static bool ParseUrlKey(const StringValue &url, UrlPart part, const StringValue &key, StringValue *result)
Definition: url-parser.cc:160
static const StringSearch colon_search
Definition: url-parser.h:92
static const StringSearch hash_search
Definition: url-parser.h:94
static const StringValue colon
Definition: url-parser.h:86
static const StringValue slash
Definition: url-parser.h:85
static const StringValue url_protocol
Definition: url-parser.h:78
static const StringValue url_host
Definition: url-parser.h:76
static bool ParseUrl(const StringValue &url, UrlPart part, StringValue *result)
Definition: url-parser.cc:43
StringValue Substring(int start_pos) const
Returns the substring starting at start_pos until the end of string.
static const StringValue url_ref
Definition: url-parser.h:80
UrlPart
Parts of a URL that can be requested.
Definition: url-parser.h:44
static const StringValue url_query
Definition: url-parser.h:79
static const StringSearch slash_search
Definition: url-parser.h:91
static const StringValue at
Definition: url-parser.h:84