Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
symbols-util.cc
Go to the documentation of this file.
1 // Copyright 2012 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "util/symbols-util.h"
16 #include <cxxabi.h>
17 #include <sstream>
18 #include <boost/algorithm/string.hpp>
19 #include <boost/algorithm/string/regex.hpp>
20 
21 #include "common/names.h"
22 
23 using boost::algorithm::split_regex;
24 using boost::regex;
25 using namespace impala;
26 
27 // For the rules about gcc-compatible name mangling, see:
28 // http://mentorembedded.github.io/cxx-abi/abi.html#mangling
29 // This implementation is *not* generally compatible. It is hard coded to
30 // only work with functions that implement the UDF or UDA signature. That is,
31 // functions of the form:
32 // namespace::Function(impala_udf::FunctionContext*, const impala_udf::AnyVal&, etc)
33 //
34 // The general idea is to walk the types left to right and output them. This happens
35 // in a single pass. User literals are output as <len><literal>. There are many reserved,
36 // usually single character tokens for native types and specifying if something is a
37 // pointer.
38 //
39 // One additional piece of complexity is that repeated literals are compressed out.
40 // As literals are output, they are associated with an ID. The next time that
41 // we encounter the literal, we output the ID instead.
42 // We don't implement this generally since the way the literals are added to the
43 // dictionary is much more general than we need.
44 // e.g. for the literal ns1::ns2::class::type,
45 // the dictionary would add 4 literals: 'ns1', 'ns1::ns2', 'ns1::ns2::class',
46 // 'ns1::ns2::class::type'
47 // We instead take some shortcuts since we know all the argument types are
48 // types we define.
49 
50 // Mangled symbols must start with this.
51 const char* MANGLE_PREFIX = "_Z";
52 
53 bool SymbolsUtil::IsMangled(const string& symbol) {
54  return strncmp(symbol.c_str(), MANGLE_PREFIX, strlen(MANGLE_PREFIX)) == 0;
55 }
56 
57 string SymbolsUtil::Demangle(const string& name) {
58  string result;
59  int status = 0;
60  char* mangled_name = abi::__cxa_demangle(name.c_str(), NULL, NULL, &status);
61  if (status == 0) result = mangled_name;
62  if (mangled_name != NULL) free(mangled_name);
63  return result;
64 }
65 
66 string SymbolsUtil::DemangleNameOnly(const string& symbol) {
67  string fn_name = Demangle(symbol);
68  // Chop off argument list (e.g. "foo(int)" => "foo")
69  fn_name = fn_name.substr(0, fn_name.find('('));
70  // Chop off namespace and/or class name if present (e.g. "impala::foo" => "foo")
71  fn_name = fn_name.substr(fn_name.find_last_of(':') + 1);
72  return fn_name;
73 }
74 
75 // Appends <Length><String> to the stream.
76 // e.g. Hello --> "5Hello"
77 static void AppendMangledToken(const string& s, stringstream* out) {
78  DCHECK(!s.empty());
79  (*out) << s.size() << s;
80 }
81 
82 // Outputs the seq_id. This is base 36 encoded with an S prefix and _ suffix.
83 // As an added optimization, the "seq_id - 1" value is output with the first
84 // token as just "S".
85 // e.g. seq_id 0: "S_"
86 // seq_id 1: "S0_"
87 // seq_id 2: "S1_"
88 static void AppendSeqId(int seq_id, stringstream* out) {
89  DCHECK_GE(seq_id, 0);
90  if (seq_id == 0) {
91  (*out) << "S_";
92  return;
93  }
94  --seq_id;
95  char buffer[10];
96  char* ptr = buffer + 10;
97  if (seq_id == 0) *--ptr = '0';
98  while (seq_id != 0) {
99  DCHECK(ptr > buffer);
100  char c = static_cast<char>(seq_id % 36);
101  *--ptr = (c < 10 ? '0' + c : 'A' + c - 10);
102  seq_id /=36;
103  }
104  (*out) << "S";
105  out->write(ptr, 10 - (ptr - buffer));
106  (*out) << "_";
107 }
108 
109 static void AppendAnyValType(int namespace_id, const ColumnType& type, stringstream* s) {
110  (*s) << "N";
111  // All the AnyVal types are in the impala_udf namespace, that token
112  // already came with impala_udf::FunctionContext
113  AppendSeqId(namespace_id, s);
114 
115  switch (type.type) {
116  case TYPE_BOOLEAN:
117  AppendMangledToken("BooleanVal", s);
118  break;
119  case TYPE_TINYINT:
120  AppendMangledToken("TinyIntVal", s);
121  break;
122  case TYPE_SMALLINT:
123  AppendMangledToken("SmallIntVal", s);
124  break;
125  case TYPE_INT:
126  AppendMangledToken("IntVal", s);
127  break;
128  case TYPE_BIGINT:
129  AppendMangledToken("BigIntVal", s);
130  break;
131  case TYPE_FLOAT:
132  AppendMangledToken("FloatVal", s);
133  break;
134  case TYPE_DOUBLE:
135  AppendMangledToken("DoubleVal", s);
136  break;
137  case TYPE_STRING:
138  case TYPE_VARCHAR:
139  case TYPE_CHAR:
140  AppendMangledToken("StringVal", s);
141  break;
142  case TYPE_TIMESTAMP:
143  AppendMangledToken("TimestampVal", s);
144  break;
145  case TYPE_DECIMAL:
146  AppendMangledToken("DecimalVal", s);
147  break;
148  default:
149  DCHECK(false) << "NYI: " << type.DebugString();
150  }
151  (*s) << "E"; // end impala_udf namespace
152 }
153 
155  const vector<ColumnType>& arg_types, bool has_var_args,
156  ColumnType* ret_arg_type) {
157  // We need to split fn_name by :: to separate scoping from tokens
158  vector<string> name_tokens;
159  split_regex(name_tokens, fn_name, regex("::"));
160 
161  // Mangled names use substitution as a builtin compression. The first time a token
162  // is seen, we output the raw token string and store the index ("seq_id"). The
163  // next time we see the same token, we output the index instead.
164  int seq_id = 0;
165 
166  // Sequence id for the impala_udf namespace token
167  int impala_udf_seq_id = -1;
168 
169  stringstream ss;
170  ss << MANGLE_PREFIX;
171  if (name_tokens.size() > 1) {
172  ss << "N"; // Start namespace
173  seq_id += name_tokens.size() - 1; // Append for all the name space tokens.
174  }
175  for (int i = 0; i < name_tokens.size(); ++i) {
176  AppendMangledToken(name_tokens[i], &ss);
177  }
178  if (name_tokens.size() > 1) ss << "E"; // End fn namespace
179  ss << "PN"; // First argument and start of FunctionContext namespace
180  AppendMangledToken("impala_udf", &ss);
181  impala_udf_seq_id = seq_id++;
182  AppendMangledToken("FunctionContext", &ss);
183  ++seq_id;
184  ss << "E"; // E indicates end of namespace
185 
186  map<PrimitiveType, int> argument_map;
187  for (int i = 0; i < arg_types.size(); ++i) {
188  int repeated_symbol_idx = -1; // Set to >0, if we've seen the symbol.
189  if (argument_map.find(arg_types[i].type) != argument_map.end()) {
190  repeated_symbol_idx = argument_map[arg_types[i].type];
191  }
192 
193  if (has_var_args && i == arg_types.size() - 1) {
194  // We always specify varargs as int32 followed by the type.
195  ss << "i"; // The argument for the number of varargs.
196  ss << "P"; // This indicates what follows is a ptr (that is the array of varargs)
197  ++seq_id; // For "P"
198  if (repeated_symbol_idx > 0) {
199  AppendSeqId(repeated_symbol_idx - 1, &ss);
200  continue;
201  }
202  } else {
203  if (repeated_symbol_idx > 0) {
204  AppendSeqId(repeated_symbol_idx, &ss);
205  continue;
206  }
207  ss << "R"; // This indicates it is a reference type
208  ++seq_id; // For R.
209  }
210 
211  ss << "K"; // This indicates it is const
212  seq_id += 2; // For impala_udf::*Val, which is two tokens.
213  AppendAnyValType(impala_udf_seq_id, arg_types[i], &ss);
214  argument_map[arg_types[i].type] = seq_id;
215  }
216 
217  // Output return argument.
218  if (ret_arg_type != NULL) {
219  int repeated_symbol_idx = -1;
220  if (argument_map.find(ret_arg_type->type) != argument_map.end()) {
221  repeated_symbol_idx = argument_map[ret_arg_type->type];
222  }
223  ss << "P"; // Return argument is a pointer
224 
225  if (repeated_symbol_idx != -1) {
226  // This is always last and a pointer type.
227  AppendSeqId(argument_map[ret_arg_type->type] - 2, &ss);
228  } else {
229  AppendAnyValType(impala_udf_seq_id, *ret_arg_type, &ss);
230  }
231  }
232 
233  return ss.str();
234 }
235 
237  // We need to split fn_name by :: to separate scoping from tokens
238  vector<string> name_tokens;
239  split_regex(name_tokens, fn_name, regex("::"));
240 
241  // Mangled names use substitution as a builtin compression. The first time a token
242  // is seen, we output the raw token string and store the index ("seq_id"). The
243  // next time we see the same token, we output the index instead.
244  int seq_id = 0;
245 
246  stringstream ss;
247  ss << MANGLE_PREFIX;
248  if (name_tokens.size() > 1) {
249  ss << "N"; // Start namespace
250  seq_id += name_tokens.size() - 1; // Append for all the name space tokens.
251  }
252  for (int i = 0; i < name_tokens.size(); ++i) {
253  AppendMangledToken(name_tokens[i], &ss);
254  }
255  if (name_tokens.size() > 1) ss << "E"; // End fn namespace
256 
257  ss << "PN"; // FunctionContext* argument and start of FunctionContext namespace
258  AppendMangledToken("impala_udf", &ss);
259  AppendMangledToken("FunctionContext", &ss);
260  ss << "E"; // E indicates end of namespace
261 
262  ss << "NS"; // FunctionStateScope argument
263  ss << seq_id;
264  ss << "_";
265  AppendMangledToken("FunctionStateScope", &ss);
266  ss << "E"; // E indicates end of namespace
267 
268  return ss.str();
269 }
static void AppendMangledToken(const string &s, stringstream *out)
Definition: symbols-util.cc:77
static std::string ManglePrepareOrCloseFunction(const std::string &fn_name)
std::string DebugString() const
Definition: types.cc:194
static bool IsMangled(const std::string &symbol)
Returns true if this symbol is mangled.
Definition: symbols-util.cc:53
PrimitiveType type
Definition: types.h:60
static void AppendAnyValType(int namespace_id, const ColumnType &type, stringstream *s)
static std::string Demangle(const std::string &name)
Definition: symbols-util.cc:57
const char * MANGLE_PREFIX
Definition: symbols-util.cc:51
static std::string MangleUserFunction(const std::string &fn_name, const std::vector< ColumnType > &arg_types, bool has_var_args=false, ColumnType *ret_argument=NULL)
static void AppendSeqId(int seq_id, stringstream *out)
Definition: symbols-util.cc:88
string name
Definition: cpu-info.cc:50
static std::string DemangleNameOnly(const std::string &symbol)
Definition: symbols-util.cc:66