author     Xiu Guo <xguo27@gmail.com>          2015-11-23 08:53:40 -0800
committer  Yin Huai <yhuai@databricks.com>     2015-11-23 08:53:40 -0800
commit     94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336 (patch)
tree       65d3262c5538a085058d873fcbce72b39b55b910
parent     4be360d4ee6cdb4d06306feca38ddef5212608cf (diff)
[SPARK-11628][SQL] support column datatype of char(x) to recognize HiveChar
Can someone review my code to make sure I'm not missing anything? Thanks!

Author: Xiu Guo <xguo27@gmail.com>
Author: Xiu Guo <guoxi@us.ibm.com>

Closes #9612 from xguo27/SPARK-11628.
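In short, char(x) columns now parse the same way varchar(x) already did: to catalyst StringType, with the declared length accepted and then discarded. A minimal sketch of the expected behavior, assuming the private[sql] DataTypeParser object in org.apache.spark.sql.catalyst.util (so it has to run from code inside that package's visibility):

  import org.apache.spark.sql.catalyst.util.DataTypeParser
  import org.apache.spark.sql.types._

  // char(n), like varchar(n), is parsed to StringType; the length is dropped
  assert(DataTypeParser.parse("char(5)") == StringType)
  // char(n) also works anywhere a data type can nest, e.g. inside arrays
  assert(DataTypeParser.parse("array<char(9)>") == ArrayType(StringType, true))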
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala       6
-rw-r--r--  sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala  8
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala                 5
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala                    25
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala                       3
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala                   3
6 files changed, 43 insertions(+), 7 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala
index 2b83651f90..515c071c28 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala
@@ -52,7 +52,8 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
"(?i)decimal".r ^^^ DecimalType.USER_DEFAULT |
"(?i)date".r ^^^ DateType |
"(?i)timestamp".r ^^^ TimestampType |
- varchar
+ varchar |
+ char
protected lazy val fixedDecimalType: Parser[DataType] =
("(?i)decimal".r ~> "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
@@ -60,6 +61,9 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
DecimalType(precision.toInt, scale.toInt)
}
+ protected lazy val char: Parser[DataType] =
+ "(?i)char".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
+
protected lazy val varchar: Parser[DataType] =
"(?i)varchar".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala
index 1e3409a9db..bebf708965 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala
@@ -49,7 +49,9 @@ class DataTypeParserSuite extends SparkFunSuite {
checkDataType("DATE", DateType)
checkDataType("timestamp", TimestampType)
checkDataType("string", StringType)
+ checkDataType("ChaR(5)", StringType)
checkDataType("varchAr(20)", StringType)
+ checkDataType("cHaR(27)", StringType)
checkDataType("BINARY", BinaryType)
checkDataType("array<doublE>", ArrayType(DoubleType, true))
@@ -83,7 +85,8 @@ class DataTypeParserSuite extends SparkFunSuite {
|struct<
| struct:struct<deciMal:DECimal, anotherDecimal:decimAL(5,2)>,
| MAP:Map<timestamp, varchar(10)>,
- | arrAy:Array<double>>
+ | arrAy:Array<double>,
+ | anotherArray:Array<char(9)>>
""".stripMargin,
StructType(
StructField("struct",
@@ -91,7 +94,8 @@ class DataTypeParserSuite extends SparkFunSuite {
StructField("deciMal", DecimalType.USER_DEFAULT, true) ::
StructField("anotherDecimal", DecimalType(5, 2), true) :: Nil), true) ::
StructField("MAP", MapType(TimestampType, StringType), true) ::
- StructField("arrAy", ArrayType(DoubleType, true), true) :: Nil)
+ StructField("arrAy", ArrayType(DoubleType, true), true) ::
+ StructField("anotherArray", ArrayType(StringType, true), true) :: Nil)
)
// A column name can be a reserved word in our DDL parser and SqlParser.
checkDataType(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 12af8068c3..26c1ff5204 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -85,6 +85,7 @@ case class AllDataTypesScan(
Date.valueOf("1970-01-01"),
new Timestamp(20000 + i),
s"varchar_$i",
+ s"char_$i",
Seq(i, i + 1),
Seq(Map(s"str_$i" -> Row(i.toLong))),
Map(i -> i.toString),
@@ -115,6 +116,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
Date.valueOf("1970-01-01"),
new Timestamp(20000 + i),
s"varchar_$i",
+ s"char_$i",
Seq(i, i + 1),
Seq(Map(s"str_$i" -> Row(i.toLong))),
Map(i -> i.toString),
@@ -154,6 +156,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
|dateField dAte,
|timestampField tiMestamp,
|varcharField varchaR(12),
+ |charField ChaR(18),
|arrayFieldSimple Array<inT>,
|arrayFieldComplex Array<Map<String, Struct<key:bigInt>>>,
|mapFieldSimple MAP<iNt, StRing>,
@@ -207,6 +210,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
StructField("dateField", DateType, true) ::
StructField("timestampField", TimestampType, true) ::
StructField("varcharField", StringType, true) ::
+ StructField("charField", StringType, true) ::
StructField("arrayFieldSimple", ArrayType(IntegerType), true) ::
StructField("arrayFieldComplex",
ArrayType(
@@ -248,6 +252,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
| dateField,
| timestampField,
| varcharField,
+ | charField,
| arrayFieldSimple,
| arrayFieldComplex,
| mapFieldSimple,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 36f0708f9d..95b57d6ad1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.hive
import scala.collection.JavaConverters._
-import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar}
+import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveVarchar}
import org.apache.hadoop.hive.serde2.objectinspector.primitive._
import org.apache.hadoop.hive.serde2.objectinspector.{StructField => HiveStructField, _}
import org.apache.hadoop.hive.serde2.typeinfo.{DecimalTypeInfo, TypeInfoFactory}
@@ -61,6 +61,7 @@ import org.apache.spark.unsafe.types.UTF8String
* Primitive Type
* Java Boxed Primitives:
* org.apache.hadoop.hive.common.type.HiveVarchar
+ * org.apache.hadoop.hive.common.type.HiveChar
* java.lang.String
* java.lang.Integer
* java.lang.Boolean
@@ -75,6 +76,7 @@ import org.apache.spark.unsafe.types.UTF8String
* java.sql.Timestamp
* Writables:
* org.apache.hadoop.hive.serde2.io.HiveVarcharWritable
+ * org.apache.hadoop.hive.serde2.io.HiveCharWritable
* org.apache.hadoop.io.Text
* org.apache.hadoop.io.IntWritable
* org.apache.hadoop.hive.serde2.io.DoubleWritable
@@ -93,7 +95,8 @@ import org.apache.spark.unsafe.types.UTF8String
* Struct: Object[] / java.util.List / java POJO
* Union: class StandardUnion { byte tag; Object object }
*
- * NOTICE: HiveVarchar is not supported by catalyst, it will be simply considered as String type.
+ * NOTICE: HiveVarchar/HiveChar is not supported by catalyst, it will be simply considered as
+ * String type.
*
*
* 2. Hive ObjectInspector is a group of flexible APIs to inspect value in different data
@@ -137,6 +140,7 @@ import org.apache.spark.unsafe.types.UTF8String
* Primitive Object Inspectors:
* WritableConstantStringObjectInspector
* WritableConstantHiveVarcharObjectInspector
+ * WritableConstantHiveCharObjectInspector
* WritableConstantHiveDecimalObjectInspector
* WritableConstantTimestampObjectInspector
* WritableConstantIntObjectInspector
@@ -259,6 +263,8 @@ private[hive] trait HiveInspectors {
UTF8String.fromString(poi.getWritableConstantValue.toString)
case poi: WritableConstantHiveVarcharObjectInspector =>
UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue)
+ case poi: WritableConstantHiveCharObjectInspector =>
+ UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue)
case poi: WritableConstantHiveDecimalObjectInspector =>
HiveShim.toCatalystDecimal(
PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector,
@@ -303,11 +309,15 @@ private[hive] trait HiveInspectors {
case _ if data == null => null
case poi: VoidObjectInspector => null // always be null for void object inspector
case pi: PrimitiveObjectInspector => pi match {
- // We think HiveVarchar is also a String
+ // We think HiveVarchar/HiveChar is also a String
case hvoi: HiveVarcharObjectInspector if hvoi.preferWritable() =>
UTF8String.fromString(hvoi.getPrimitiveWritableObject(data).getHiveVarchar.getValue)
case hvoi: HiveVarcharObjectInspector =>
UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue)
+ case hvoi: HiveCharObjectInspector if hvoi.preferWritable() =>
+ UTF8String.fromString(hvoi.getPrimitiveWritableObject(data).getHiveChar.getValue)
+ case hvoi: HiveCharObjectInspector =>
+ UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue)
case x: StringObjectInspector if x.preferWritable() =>
UTF8String.fromString(x.getPrimitiveWritableObject(data).toString)
case x: StringObjectInspector =>
@@ -377,6 +387,15 @@ private[hive] trait HiveInspectors {
null
}
+ case _: JavaHiveCharObjectInspector =>
+ (o: Any) =>
+ if (o != null) {
+ val s = o.asInstanceOf[UTF8String].toString
+ new HiveChar(s, s.size)
+ } else {
+ null
+ }
+
case _: JavaHiveDecimalObjectInspector =>
(o: Any) =>
if (o != null) {
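As the NOTICE above says, HiveChar, like HiveVarchar, is not modeled by catalyst and is simply treated as a string. A minimal standalone sketch of the round-trip these hunks implement (an illustration assuming hive-exec and the spark unsafe module on the classpath, not the inspector code itself):

  import org.apache.hadoop.hive.common.`type`.HiveChar
  import org.apache.spark.unsafe.types.UTF8String

  // unwrap: a Hive char value (fixed-length on the Hive side) becomes an
  // ordinary catalyst UTF8String
  val hc = new HiveChar("abc", 10)
  val unwrapped: UTF8String = UTF8String.fromString(hc.getValue)

  // wrap: going back to Hive, the JavaHiveCharObjectInspector case above
  // builds a new HiveChar using the string's own length
  val s = unwrapped.toString
  val wrapped = new HiveChar(s, s.size)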
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 69f481c49a..70ee02823e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -382,6 +382,9 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
case oi: HiveVarcharObjectInspector =>
(value: Any, row: MutableRow, ordinal: Int) =>
row.update(ordinal, UTF8String.fromString(oi.getPrimitiveJavaObject(value).getValue))
+ case oi: HiveCharObjectInspector =>
+ (value: Any, row: MutableRow, ordinal: Int) =>
+ row.update(ordinal, UTF8String.fromString(oi.getPrimitiveJavaObject(value).getValue))
case oi: HiveDecimalObjectInspector =>
(value: Any, row: MutableRow, ordinal: Int) =>
row.update(ordinal, HiveShim.toCatalystDecimal(oi, value))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 48bbb21e6c..346840079b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -321,7 +321,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
def convertFilters(table: Table, filters: Seq[Expression]): String = {
// hive varchar is treated as catalyst string, but hive varchar can't be pushed down.
val varcharKeys = table.getPartitionKeys.asScala
- .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME))
+ .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME) ||
+ col.getType.startsWith(serdeConstants.CHAR_TYPE_NAME))
.map(col => col.getName).toSet
filters.collect {
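The comment above applies equally to char: a char partition key is treated as a catalyst string, but predicates over it cannot be pushed down to the metastore, so both char(...) and varchar(...) keys are collected and skipped. A simplified, hypothetical sketch of that exclusion (the real code reads the types from table.getPartitionKeys and the serdeConstants type names):

  // partition keys typed char(...)/varchar(...) are collected so that
  // predicates over them are left out of the metastore filter string
  val keyTypes = Seq("ds" -> "string", "region" -> "varchar(10)", "code" -> "char(2)")
  val skipped = keyTypes.collect {
    case (name, t) if t.startsWith("varchar") || t.startsWith("char") => name
  }.toSet
  assert(skipped == Set("region", "code"))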