Impala
Impala is the open source, native analytic database for Apache Hadoop.
HdfsFileFormat.java
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.cloudera.impala.catalog;

import java.util.Map;

import com.cloudera.impala.thrift.THdfsFileFormat;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

// Supported HDFS file formats and their mapping to Hive input/output format classes.
public enum HdfsFileFormat {
  RC_FILE,
  TEXT,
  SEQUENCE_FILE,
  AVRO,
  PARQUET;

  // Input format class for RCFile tables read by Hive.
  private static final String RCFILE_INPUT_FORMAT =
      "org.apache.hadoop.hive.ql.io.RCFileInputFormat";

  // Input format class for Text tables read by Hive.
  private static final String TEXT_INPUT_FORMAT =
      "org.apache.hadoop.mapred.TextInputFormat";

  // Input format class for LZO compressed Text tables read by Hive.
  public static final String LZO_TEXT_INPUT_FORMAT =
      "com.hadoop.mapred.DeprecatedLzoTextInputFormat";

  // Output format class for LZO compressed Text tables written by Hive.
  public static final String LZO_TEXT_OUTPUT_FORMAT =
      "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";

  // Input format class for Sequence file tables read by Hive.
  private static final String SEQUENCE_INPUT_FORMAT =
      "org.apache.hadoop.mapred.SequenceFileInputFormat";

  // Input format classes for Parquet tables read by Hive.
  // The location (i.e. java class path) for the SerDe has
  // changed during its development. Impala will treat any
  // of these format classes as Parquet.
  private static final String[] PARQUET_INPUT_FORMATS = {
      "com.cloudera.impala.hive.serde.ParquetInputFormat",
      "parquet.hive.DeprecatedParquetInputFormat",
      "parquet.hive.MapredParquetInputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
  };

  // Input format class for Avro tables read by Hive.
  private static final String AVRO_INPUT_FORMAT =
      "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat";

  // Maps the Hive input format class names above to the corresponding file format.
  private static final Map<String, HdfsFileFormat> VALID_FORMATS =
      ImmutableMap.<String, HdfsFileFormat>builder()
          .put(RCFILE_INPUT_FORMAT, RC_FILE)
          .put(TEXT_INPUT_FORMAT, TEXT)
          .put(LZO_TEXT_INPUT_FORMAT, TEXT)
          .put(SEQUENCE_INPUT_FORMAT, SEQUENCE_FILE)
          .put(AVRO_INPUT_FORMAT, AVRO)
          .put(PARQUET_INPUT_FORMATS[0], PARQUET)
          .put(PARQUET_INPUT_FORMATS[1], PARQUET)
          .put(PARQUET_INPUT_FORMATS[2], PARQUET)
          .put(PARQUET_INPUT_FORMATS[3], PARQUET)
          .build();
  // Returns true if the given class name is a recognized Hive input format class.
  public static boolean isHdfsFormatClass(String formatClass) {
    return VALID_FORMATS.containsKey(formatClass);
  }

  // Returns the file format for the given Hive input format class, or null if the
  // class is not recognized.
  public static HdfsFileFormat fromHdfsInputFormatClass(String inputFormatClass) {
    Preconditions.checkNotNull(inputFormatClass);
    return VALID_FORMATS.get(inputFormatClass);
  }

  // Returns the file format for the given Hive input format class. Throws
  // IllegalArgumentException if the class is not recognized.
  public static HdfsFileFormat fromJavaClassName(String className) {
    Preconditions.checkNotNull(className);
    if (isHdfsFormatClass(className)) {
      return VALID_FORMATS.get(className);
    }
    throw new IllegalArgumentException(className);
  }

  // Returns a Hive input format class name corresponding to this file format.
  public String toJavaClassName() {
    for (Map.Entry<String, HdfsFileFormat> e: VALID_FORMATS.entrySet()) {
      if (e.getValue().equals(this)) return e.getKey();
    }

    throw new IllegalArgumentException(this.toString());
  }

  public static HdfsFileFormat fromThrift(THdfsFileFormat thriftFormat) {
    switch (thriftFormat) {
      case RC_FILE: return HdfsFileFormat.RC_FILE;
      case TEXT: return HdfsFileFormat.TEXT;
      case SEQUENCE_FILE: return HdfsFileFormat.SEQUENCE_FILE;
      case AVRO: return HdfsFileFormat.AVRO;
      case PARQUET: return HdfsFileFormat.PARQUET;
      default:
        throw new RuntimeException("Unknown THdfsFileFormat: "
            + thriftFormat + " - should never happen!");
    }
  }

  public THdfsFileFormat toThrift() {
    switch (this) {
      case RC_FILE: return THdfsFileFormat.RC_FILE;
      case TEXT: return THdfsFileFormat.TEXT;
      case SEQUENCE_FILE: return THdfsFileFormat.SEQUENCE_FILE;
      case AVRO: return THdfsFileFormat.AVRO;
      case PARQUET: return THdfsFileFormat.PARQUET;
      default:
        throw new RuntimeException("Unknown HdfsFormat: "
            + this + " - should never happen!");
    }
  }

  public String toSql(HdfsCompression compressionType) {
    switch (this) {
      case RC_FILE: return "RCFILE";
      case TEXT:
        if (compressionType == HdfsCompression.LZO ||
            compressionType == HdfsCompression.LZO_INDEX) {
          // TODO: Update this when we can write LZO text.
          // It is not currently possible to create a table with LZO compressed text
          // files in Impala, but this is valid in Hive.
          return String.format("INPUTFORMAT '%s' OUTPUTFORMAT '%s'",
              LZO_TEXT_INPUT_FORMAT,
              LZO_TEXT_OUTPUT_FORMAT);
        }
        return "TEXTFILE";
      case SEQUENCE_FILE: return "SEQUENCEFILE";
      case AVRO: return "AVRO";
      case PARQUET: return "PARQUET";
      default:
        throw new RuntimeException("Unknown HdfsFormat: "
            + this + " - should never happen!");
    }
  }

  /*
   * Checks whether a file is supported in Impala based on the file extension.
   * Returns true if the file format is supported. If the file format is not
   * supported, then it returns false and 'errorMsg' contains details on the
   * incompatibility.
   *
   * Impala supports LZO, GZIP, SNAPPY and BZIP2 on text files for partitions that
   * have been declared in the metastore as TEXT. LZO files can have their own input
   * format. For now, raise an error on any other type.
   */
  public boolean isFileCompressionTypeSupported(String fileName,
      StringBuilder errorMsg) {
    // Check to see if the file has a compression suffix.
    // TODO: Add LZ4
    HdfsCompression compressionType = HdfsCompression.fromFileName(fileName);
    switch (compressionType) {
      case LZO:
      case LZO_INDEX:
        // Index files are read by the LZO scanner directly.
      case GZIP:
      case SNAPPY:
      case BZIP2:
      case NONE:
        return true;
      case DEFLATE:
        // TODO: Ensure that text/deflate works correctly
        if (this == TEXT) {
          errorMsg.append("Expected compressed text file with {.lzo,.gzip,.snappy,.bz2} "
              + "suffix: " + fileName);
          return false;
        } else {
          return true;
        }
      default:
        errorMsg.append("Unknown compression suffix: " + fileName);
        return false;
    }
  }

  // Returns true if this file format, stored with the given compression, can be split
  // into multiple scan ranges.
  public boolean isSplittable(HdfsCompression compression) {
    switch (this) {
      case TEXT:
        return compression == HdfsCompression.NONE;
      case RC_FILE:
      case SEQUENCE_FILE:
      case AVRO:
        return true;
      case PARQUET:
        return false;
      default:
        throw new RuntimeException("Unknown HdfsFormat: "
            + this + " - should never happen!");
    }
  }
}
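
For reference, a minimal usage sketch follows, showing how the enum maps a Hive input format class name back to a file format, renders the format's SQL keyword, and reports splittability. The wrapper class and main method below are hypothetical illustrations and not part of this file; the HdfsFileFormat methods and HdfsCompression values they use are the ones defined above.

// Hypothetical usage sketch; not part of HdfsFileFormat.java.
package com.cloudera.impala.catalog;

public class HdfsFileFormatDemo {
  public static void main(String[] args) {
    // Map a Hive input format class name back to Impala's enum value.
    HdfsFileFormat format =
        HdfsFileFormat.fromJavaClassName("org.apache.hadoop.mapred.TextInputFormat");
    System.out.println(format);  // TEXT

    // Render the format's SQL keyword, e.g. for a STORED AS clause.
    System.out.println(format.toSql(HdfsCompression.NONE));  // TEXTFILE

    // Uncompressed text is splittable; gzip-compressed text is not.
    System.out.println(format.isSplittable(HdfsCompression.NONE));  // true
    System.out.println(format.isSplittable(HdfsCompression.GZIP));  // false

    // Check membership without throwing; unknown class names return false.
    System.out.println(
        HdfsFileFormat.isHdfsFormatClass("some.other.InputFormat"));  // false
  }
}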