Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
redactor.cc
Go to the documentation of this file.
1 // Copyright 2015 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
#include "redactor.h"

#include <cerrno>
#include <cstring> // strcmp, strcasestr
#include <new> // placement new
#include <ostream>
#include <sstream>
#include <sys/stat.h>
#include <vector>

#include <gutil/strings/substitute.h>
#include <rapidjson/document.h>
#include <rapidjson/filestream.h>
#include <rapidjson/rapidjson.h>
#include <rapidjson/reader.h>
#include <re2/re2.h>
#include <re2/stringpiece.h>

#include "common/logging.h"
33 
34 namespace impala {
35 
36 using rapidjson::Document;
37 using rapidjson::Value;
38 using std::endl;
39 using std::map;
40 using std::ostream;
41 using std::ostringstream;
42 using std::string;
43 using std::vector;
44 using strings::Substitute;
45 
// Short alias for the RE2 regular expression class; Rule stores its search pattern as
// a Regex.
46 typedef re2::RE2 Regex;
47 
48 struct Rule {
49  // Factory constructor. The factory pattern is used because constructing a
50  // case-insensitive Regex requires multiple lines and a Rule should be immutable so
51  // the Regex should be const. Const members must be initialized in the initialization
52  // list but multi-line statements cannot be used there. Keeping the Rule class
53  // immutable was preferred over having a direct constructor, though either should be
54  // fine.
55  static Rule Create(const string& trigger, const string& search_regex,
56  const string& replacement, bool case_sensitive) {
57  Regex::Options options;
58  options.set_case_sensitive(case_sensitive);
59  Regex re(search_regex, options);
60  return Rule(trigger, re, replacement);
61  }
62 
63  // For use with vector.
64  Rule(const Rule& other)
65  : trigger(other.trigger),
66  search_pattern(other.search_pattern.pattern(), other.search_pattern.options()),
67  replacement(other.replacement) {}
68 
69  const string trigger;
71  const string replacement;
72 
73  bool case_sensitive() const { return search_pattern.options().case_sensitive(); }
74 
75  // For use with vector.
76  const Rule& operator=(const Rule& other) {
77  *this = Rule(other);
78  return *this;
79  }
80 
81  private:
82  // For use with the factory constructor. The case-sensitivity option in
83  // 'regex_options' also applies to 'trigger'.
84  Rule(const string& trigger, const Regex& search_pattern, const string& replacement)
85  : trigger(trigger),
86  search_pattern(search_pattern.pattern(), search_pattern.options()),
87  replacement(replacement) {}
88 };
89 
// The list of redaction rules. Redact() walks it from first to last.
90 typedef vector<Rule> Rules;
91 
92 // The actual rules in effect, if any. Allocated on the first call to
// SetRedactionRulesFromFile() and cleared at the start of every such call; rules are
// appended by RulesParser as they are parsed.
93 static Rules* g_rules;
94 
95 string NameOfTypeOfJsonValue(const Value& value) {
96  switch (value.GetType()) {
97  case rapidjson::kNullType:
98  return "Null";
99  case rapidjson::kFalseType:
100  case rapidjson::kTrueType:
101  return "Bool";
102  case rapidjson::kObjectType:
103  return "Object";
104  case rapidjson::kArrayType:
105  return "Array";
106  case rapidjson::kStringType:
107  return "String";
108  case rapidjson::kNumberType:
109  if (value.IsInt()) return "Integer";
110  if (value.IsDouble()) return "Float";
111  default:
112  DCHECK(false);
113  return "Unknown";
114  }
115 }
116 
117 // This class will parse a version 1 rule file and populate g_rules.
118 class RulesParser {
119  public:
120  // Perform the parsing and populate g_rules. Any errors encountered will be written
121  // to error_message.
122  string Parse(const Document& rules_doc) {
123  error_message_.str("");
124  bool found_rules = false;
125  for (Value::ConstMemberIterator member = rules_doc.MemberBegin();
126  member != rules_doc.MemberEnd(); ++member) {
127  if (strcmp("rules", member->name.GetString()) == 0) {
128  found_rules = true;
129  ParseRules(member->value);
130  } else if (strcmp("version", member->name.GetString()) == 0) {
131  // Ignore
132  } else {
134  << "unexpected property '" << member->name.GetString() << "' must be removed";
135  }
136  }
137  if (!found_rules) {
138  AddDocParseError() << "an array of rules is required";
139  }
140  return error_message_.str();
141  }
142 
143  private:
144  ostringstream error_message_;
145 
146  // The current index of the rule being parsed. 'SizeType' avoids an ambiguity between
147  // json_value[int] (array) and json_value[const char*] (object), otherwise 0 could be
148  // a null pointer.
149  rapidjson::SizeType rule_idx_;
150 
151  // Parse an array of rules.
152  void ParseRules(const Value& rules) {
153  if (!rules.IsArray()) {
154  AddDocParseError() << "'rules' must be of type Array but is a "
155  << NameOfTypeOfJsonValue(rules);
156  return;
157  }
158  for (rule_idx_ = 0; rule_idx_ < rules.Size(); ++rule_idx_) {
159  const Value& rule = rules[rule_idx_];
160  if (!rule.IsObject()) {
161  AddRuleParseError() << "rule should be a JSON Object but is a "
162  << NameOfTypeOfJsonValue(rule);
163  continue;
164  }
165  ParseRule(rule);
166  }
167  }
168 
169  // Parse a rule and populate g_rules.
170  void ParseRule(const Value& json_rule) {
171  bool found_replace = false;
172  bool case_sensitive = true;
173  string search_text, replace, trigger;
174  for (Value::ConstMemberIterator member = json_rule.MemberBegin();
175  member != json_rule.MemberEnd(); ++member) {
176  if (strcmp("search", member->name.GetString()) == 0) {
177  if (!ReadRuleProperty("search", json_rule, &search_text)) return;
178  if (search_text.empty()) {
179  AddRuleParseError() << "search property must be a non-empty regex";
180  return;
181  }
182  } else if (strcmp("replace", member->name.GetString()) == 0) {
183  found_replace = true;
184  if (!ReadRuleProperty("replace", json_rule, &replace)) return;
185  } else if (strcmp("trigger", member->name.GetString()) == 0) {
186  if (!ReadRuleProperty("trigger", json_rule, &trigger, /*required*/ false)) return;
187  } else if (strcmp("caseSensitive", member->name.GetString()) == 0) {
188  if (!ReadRuleProperty("caseSensitive", json_rule, &case_sensitive, false)) return;
189  } else if (strcmp("description", member->name.GetString()) == 0) {
190  // Ignore, this property is for user documentation.
191  } else {
192  // Future properties may change the meaning of current properties so ignoring
193  // unknown properties is not safe.
194  AddRuleParseError() << "unexpected property '" << member->name.GetString()
195  << "' must be removed";
196  return;
197  }
198  }
199  if (search_text.empty()) { // Only empty if not found
200  AddRuleParseError() << "a 'search' property is required";
201  return;
202  } else if (!found_replace) {
203  AddRuleParseError() << "a 'replace' property is required";
204  return;
205  }
206  const Rule& rule = Rule::Create(trigger, search_text, replace, case_sensitive);
207  if (!rule.search_pattern.ok()) {
208  AddRuleParseError() << "search regex is invalid; " << rule.search_pattern.error();
209  return;
210  }
211  (*g_rules).push_back(rule);
212  }
213 
214  // Reads a rule property of the given name and assigns the property value to the out
215  // parameter. A true return value indicates success.
216  template<typename T>
217  bool ReadRuleProperty(const string& name, const Value& rule, T* value,
218  bool required = true) {
219  const Value& json_value = rule[name.c_str()];
220  if (json_value.IsNull()) {
221  if (required) {
222  AddRuleParseError() << name << " property is required and cannot be null";
223  return false;
224  }
225  return true;
226  }
227  return ValidateTypeAndExtractValue(name, json_value, value);
228  }
229 
230 // Extract a value stored in a rapidjson::Value and assign it to the out parameter.
231 // The type will be validated before extraction. A true return value indicates success.
232 // The name parameter is only used to generate an error message upon failure.
233 #define EXTRACT_VALUE(json_type, cpp_type) \
234  bool ValidateTypeAndExtractValue(const string& name, const Value& json_value, \
235  cpp_type* value) { \
236  if (!json_value.Is ## json_type()) { \
237  AddRuleParseError() << name << " property must be of type " #json_type \
238  << " but is a " << NameOfTypeOfJsonValue(json_value); \
239  return false; \
240  } \
241  *value = json_value.Get ## json_type(); \
242  return true; \
243  }
244 EXTRACT_VALUE(String, string)
245 EXTRACT_VALUE(Bool, bool)
246 
247  ostream& AddDocParseError() {
248  if (error_message_.tellp()) error_message_ << endl;
249  error_message_ << "Error parsing redaction rules; ";
250  return error_message_;
251  }
252 
253  ostream& AddRuleParseError() {
254  if (error_message_.tellp()) error_message_ << endl;
255  error_message_ << "Error parsing redaction rule #" << (rule_idx_ + 1) << "; ";
256  return error_message_;
257  }
258 };
259 
260 string SetRedactionRulesFromFile(const string& rules_file_path) {
261  if (g_rules == NULL) g_rules = new Rules();
262  g_rules->clear();
263 
264  // Read the file.
265  FILE* rules_file = fopen(rules_file_path.c_str(), "r");
266  if (rules_file == NULL) {
267  return Substitute("Could not open redaction rules file '$0'; $1",
268  rules_file_path, strerror(errno));
269  }
270  // Check for an empty file and ignore it. This is done to play nice with automated
271  // cluster configuration tools that will generate empty files when no rules are in
272  // effect. Without this the JSON parser would produce an error.
273  struct stat rules_file_stats;
274  if (fstat(fileno(rules_file), &rules_file_stats)) {
275  fclose(rules_file);
276  return Substitute("Error reading redaction rules file; $0", strerror(errno));
277  }
278  if (rules_file_stats.st_size == 0) {
279  fclose(rules_file);
280  return "";
281  }
282 
283  rapidjson::FileStream stream(rules_file);
284  Document rules_doc;
285  rules_doc.ParseStream<rapidjson::kParseDefaultFlags>(stream);
286  fclose(rules_file);
287  if (rules_doc.HasParseError()) {
288  return Substitute("Error parsing redaction rules; $0", rules_doc.GetParseError());
289  }
290  if (!rules_doc.IsObject()) {
291  return "Error parsing redaction rules; root element must be a JSON Object.";
292  }
293  const Value& version = rules_doc["version"];
294  if (version.IsNull()) {
295  return "Error parsing redaction rules; a document version is required.";
296  }
297  if (!version.IsInt()) {
298  return Substitute("Error parsing redaction rules; version must be an Integer but "
299  "is a $0", NameOfTypeOfJsonValue(version));
300  }
301  if (version.GetInt() != 1) {
302  return "Error parsing redaction rules; only version 1 is supported.";
303  }
304 
305  RulesParser rules_parser;
306  return rules_parser.Parse(rules_doc);
307 }
308 
309 void Redact(string* value, bool* changed) {
310  DCHECK(value != NULL);
311  if (g_rules == NULL || g_rules->empty()) return;
312  for (Rules::const_iterator rule = g_rules->begin(); rule != g_rules->end(); ++rule) {
313  if (rule->case_sensitive()) {
314  if (value->find(rule->trigger) == string::npos) continue;
315  } else {
316  if (strcasestr(value->c_str(), rule->trigger.c_str()) == NULL) continue;
317  }
318  int replacement_count = re2::RE2::GlobalReplace(
319  value, rule->search_pattern, rule->replacement);
320  if (changed != NULL && !*changed) *changed = replacement_count;
321  }
322 }
323 
324 }
bool ReadRuleProperty(const string &name, const Value &rule, T *value, bool required=true)
Definition: redactor.cc:217
const string replacement
Definition: redactor.cc:71
Rule(const string &trigger, const Regex &search_pattern, const string &replacement)
Definition: redactor.cc:84
void Redact(string *value, bool *changed)
Definition: redactor.cc:309
string Parse(const Document &rules_doc)
Definition: redactor.cc:122
rapidjson::SizeType rule_idx_
Definition: redactor.cc:149
static Rules * g_rules
Definition: redactor.cc:93
const Rule & operator=(const Rule &other)
Definition: redactor.cc:76
ostringstream error_message_
Definition: redactor.cc:144
string SetRedactionRulesFromFile(const string &rules_file_path)
Definition: redactor.cc:260
ostream & AddDocParseError()
Definition: redactor.cc:247
Rule(const Rule &other)
Definition: redactor.cc:64
const string trigger
Definition: redactor.cc:69
static Rule Create(const string &trigger, const string &search_regex, const string &replacement, bool case_sensitive)
Definition: redactor.cc:55
#define EXTRACT_VALUE(json_type, cpp_type)
Definition: redactor.cc:233
const Regex search_pattern
Definition: redactor.cc:70
void ParseRules(const Value &rules)
Definition: redactor.cc:152
string NameOfTypeOfJsonValue(const Value &value)
Definition: redactor.cc:95
re2::RE2 Regex
Definition: redactor.cc:46
bool case_sensitive() const
Definition: redactor.cc:73
string name
Definition: cpu-info.cc:50
void ParseRule(const Value &json_rule)
Definition: redactor.cc:170
vector< Rule > Rules
Definition: redactor.cc:90
ostream & AddRuleParseError()
Definition: redactor.cc:253