Impala
Impala is the open source, native analytic database for Apache Hadoop.
HdfsStorageDescriptor.java
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.cloudera.impala.catalog;

import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;

/**
 * Represents the file-format and row-format metadata (delimiters, escape character,
 * block size) that Impala reads from the Hive metastore for an HDFS table.
 */
public class HdfsStorageDescriptor {
  private static final char DEFAULT_LINE_DELIM = '\n';
  // Hive by default uses ctrl-A as the field delimiter.
  private static final char DEFAULT_FIELD_DELIM = '\u0001';
  // Hive by default has no escape character.
  public static final char DEFAULT_ESCAPE_CHAR = '\u0000';

  // SerDe parameters that are recognized by table writers.
  private static final String BLOCK_SIZE = "blocksize";
  private static final String COMPRESSION = "compression";

  // Important: don't change the ordering of these keys - if e.g. COLLECTION_DELIM is
  // not found, the value already recorded for FIELD_DELIM is used, so FIELD_DELIM
  // must be processed first.
  // Package visible for testing.
  final static List<String> DELIMITER_KEYS = ImmutableList.of(
      serdeConstants.LINE_DELIM, serdeConstants.FIELD_DELIM,
      serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM,
      serdeConstants.ESCAPE_CHAR, serdeConstants.QUOTE_CHAR);

  // The Parquet serde shows up multiple times as the location of the implementation
  // has changed between Impala versions.
  final static List<String> COMPATIBLE_SERDES = ImmutableList.of(
      "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",           // (seq / text / parquet)
      "org.apache.hadoop.hive.serde2.avro.AvroSerDe",                 // (avro)
      "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",         // (rc)
      "parquet.hive.serde.ParquetHiveSerDe",                          // (parquet - legacy)
      // TODO: Verify the following Parquet SerDe works with Impala and add
      // support for the new input/output format classes. See CDH-17085.
      "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"); // (parquet)

  private final static Logger LOG = LoggerFactory.getLogger(HdfsStorageDescriptor.class);

  private HdfsFileFormat fileFormat_;
  private final byte lineDelim_;
  private final byte fieldDelim_;
  private final byte collectionDelim_;
  private final byte mapKeyDelim_;
  private final byte escapeChar_;
  private final byte quoteChar_;
  private final int blockSize_;

  public void setFileFormat(HdfsFileFormat fileFormat) {
    fileFormat_ = fileFormat;
  }

  /**
   * Returns a map from delimiter key to a single delimiter byte, extracted from the
   * table's SerDe parameters. Delimiters that are not set in the metastore are filled
   * in with defaults.
   */
  private static Map<String, Byte> extractDelimiters(SerDeInfo serdeInfo)
      throws InvalidStorageDescriptorException {
    // The metastore may return null for delimiter parameters,
    // which means we need to use a default instead.
    // We tried long and hard to find default values for delimiters in Hive,
    // but could not find them.
    Map<String, Byte> delimMap = Maps.newHashMap();

    for (String delimKey: DELIMITER_KEYS) {
      String delimValue = serdeInfo.getParameters().get(delimKey);
      if (delimValue == null) {
        if (delimKey.equals(serdeConstants.FIELD_DELIM)) {
          delimMap.put(delimKey, (byte) DEFAULT_FIELD_DELIM);
        } else if (delimKey.equals(serdeConstants.ESCAPE_CHAR)) {
          delimMap.put(delimKey, (byte) DEFAULT_ESCAPE_CHAR);
        } else if (delimKey.equals(serdeConstants.LINE_DELIM)) {
          delimMap.put(delimKey, (byte) DEFAULT_LINE_DELIM);
        } else {
          delimMap.put(delimKey, delimMap.get(serdeConstants.FIELD_DELIM));
        }
      } else {
        Byte delimByteValue = parseDelim(delimValue);
        if (delimByteValue == null) {
          throw new InvalidStorageDescriptorException("Invalid delimiter: '" +
              delimValue + "'. Delimiter must be specified as a single character or " +
              "as a decimal value in the range [-128:127]");
        }
        delimMap.put(delimKey, parseDelim(delimValue));
      }
    }
    return delimMap;
  }

  /**
   * Parses a delimiter from its string representation in the metastore: either a
   * decimal value in the range [-128:127] or a single literal character. Returns
   * null if the string cannot be interpreted as a delimiter.
   */
  public static Byte parseDelim(String delimVal) {
    Preconditions.checkNotNull(delimVal);
    try {
      // In the future we could support delimiters specified in hex format, but we
      // would need support from the Hive side.
      return Byte.parseByte(delimVal);
    } catch (NumberFormatException e) {
      if (delimVal.length() == 1) return (byte) delimVal.charAt(0);
    }
    return null;
  }

  public HdfsStorageDescriptor(String tblName, HdfsFileFormat fileFormat, byte lineDelim,
      byte fieldDelim, byte collectionDelim, byte mapKeyDelim, byte escapeChar,
      byte quoteChar, int blockSize) {
    this.fileFormat_ = fileFormat;
    this.lineDelim_ = lineDelim;
    this.fieldDelim_ = fieldDelim;
    this.collectionDelim_ = collectionDelim;
    this.mapKeyDelim_ = mapKeyDelim;
    this.quoteChar_ = quoteChar;
    this.blockSize_ = blockSize;

    // The metastore allows the escape character to be set to the same character as a
    // field, line, or collection delimiter. Empirically, Hive ignores the escape
    // character in that case, so we do the same.
    if (escapeChar == fieldDelim ||
        escapeChar == lineDelim ||
        escapeChar == collectionDelim) {
      this.escapeChar_ = DEFAULT_ESCAPE_CHAR;
      LOG.warn("Escape character for table " + tblName + " is set to "
          + "the same character as one of the delimiters. Ignoring escape character.");
    } else {
      this.escapeChar_ = escapeChar;
    }
  }

  /**
   * Thrown when the metastore's storage descriptor cannot be mapped to a
   * configuration that Impala supports.
   */
  public static class InvalidStorageDescriptorException extends Exception {
    // Mandatory since Exception implements Serializable.
    private static final long serialVersionUID = -555234913768134760L;
    public InvalidStorageDescriptorException(String s) { super(s); }
    public InvalidStorageDescriptorException(Exception ex) {
      super(ex.getMessage(), ex);
    }
    public InvalidStorageDescriptorException(String msg, Throwable cause) {
      super(msg, cause);
    }
  }

  /**
   * Constructs an HdfsStorageDescriptor from a Hive metastore StorageDescriptor.
   * Throws an InvalidStorageDescriptorException if the SerDe library or the delimiter
   * specification is not supported by Impala.
   */
  public static HdfsStorageDescriptor fromStorageDescriptor(String tblName,
      StorageDescriptor sd)
      throws InvalidStorageDescriptorException {
    Map<String, Byte> delimMap = extractDelimiters(sd.getSerdeInfo());
    if (!COMPATIBLE_SERDES.contains(sd.getSerdeInfo().getSerializationLib())) {
      throw new InvalidStorageDescriptorException(String.format("Impala does not " +
          "support tables of this type. REASON: SerDe library '%s' is not " +
          "supported.", sd.getSerdeInfo().getSerializationLib()));
    }
    // Extract the blocksize and compression specification from the SerDe parameters,
    // if present.
    Map<String, String> parameters = sd.getSerdeInfo().getParameters();
    int blockSize = 0;
    String blockValue = parameters.get(BLOCK_SIZE);
    if (blockValue != null) {
      blockSize = Integer.parseInt(blockValue);
    }

    try {
      return new HdfsStorageDescriptor(tblName,
          HdfsFileFormat.fromJavaClassName(sd.getInputFormat()),
          delimMap.get(serdeConstants.LINE_DELIM),
          delimMap.get(serdeConstants.FIELD_DELIM),
          delimMap.get(serdeConstants.COLLECTION_DELIM),
          delimMap.get(serdeConstants.MAPKEY_DELIM),
          delimMap.get(serdeConstants.ESCAPE_CHAR),
          delimMap.get(serdeConstants.QUOTE_CHAR),
          blockSize);
    } catch (IllegalArgumentException ex) {
      // Thrown by fromJavaClassName if the input format class is not recognized.
      throw new InvalidStorageDescriptorException(ex);
    }
  }

  public byte getLineDelim() { return lineDelim_; }
  public byte getFieldDelim() { return fieldDelim_; }
  public byte getCollectionDelim() { return collectionDelim_; }
  public byte getMapKeyDelim() { return mapKeyDelim_; }
  public byte getEscapeChar() { return escapeChar_; }
  public byte getQuoteChar() { return quoteChar_; }
  public HdfsFileFormat getFileFormat() { return fileFormat_; }
  public int getBlockSize() { return blockSize_; }
}
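
The usual entry point is fromStorageDescriptor(), which converts Hive metastore metadata into an HdfsStorageDescriptor, applying the delimiter defaults from extractDelimiters(). The following is a minimal sketch of that flow and is not part of the Impala sources: the HdfsStorageDescriptorExample class, the table name, and the SerDe parameters are hypothetical, and it assumes the Hive metastore API classes are available and that HdfsFileFormat.fromJavaClassName() recognizes org.apache.hadoop.mapred.TextInputFormat.

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.serde.serdeConstants;

import com.cloudera.impala.catalog.HdfsStorageDescriptor;
import com.cloudera.impala.catalog.HdfsStorageDescriptor.InvalidStorageDescriptorException;

public class HdfsStorageDescriptorExample {
  public static void main(String[] args) throws InvalidStorageDescriptorException {
    // Hypothetical metastore metadata for a '|'-delimited text table.
    SerDeInfo serdeInfo = new SerDeInfo();
    serdeInfo.setSerializationLib("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe");
    serdeInfo.putToParameters(serdeConstants.FIELD_DELIM, "|");

    StorageDescriptor sd = new StorageDescriptor();
    sd.setSerdeInfo(serdeInfo);
    sd.setInputFormat("org.apache.hadoop.mapred.TextInputFormat");

    HdfsStorageDescriptor hsd =
        HdfsStorageDescriptor.fromStorageDescriptor("example_table", sd);

    // The field delimiter was set explicitly; the line delimiter falls back to '\n'
    // and the escape character to '\0'.
    System.out.println((char) hsd.getFieldDelim());  // |
    System.out.println((int) hsd.getLineDelim());    // 10
    System.out.println((int) hsd.getEscapeChar());   // 0
  }
}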
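
parseDelim() accepts either a decimal byte value or a single literal character, with the decimal interpretation tried first. The small driver below (ParseDelimExample is a hypothetical class, not part of Impala) illustrates the results implied by the code above; the expected values in the comments follow from the code rather than from a recorded test run.

import com.cloudera.impala.catalog.HdfsStorageDescriptor;

public class ParseDelimExample {
  public static void main(String[] args) {
    // Decimal values are tried before the single-character interpretation.
    System.out.println(HdfsStorageDescriptor.parseDelim("9"));   // 9 (tab, read as a decimal byte)
    System.out.println(HdfsStorageDescriptor.parseDelim("|"));   // 124 (the single character '|')
    System.out.println(HdfsStorageDescriptor.parseDelim("-2"));  // -2 (negative byte values are allowed)
    System.out.println(HdfsStorageDescriptor.parseDelim("||"));  // null (rejected by extractDelimiters)
  }
}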