[SPARK-11628][SQL] support column datatype of char(x) to recognize HiveChar

Can someone review my code to make sure I'm not missing anything? Thanks! Author: Xiu Guo <xguo27@gmail.com> Author: Xiu Guo <guoxi@us.ibm.com> Closes #9612 from xguo27/SPARK-11628.
author: Xiu Guo <xguo27@gmail.com> 2015-11-23 08:53:40 -0800
committer: Yin Huai <yhuai@databricks.com> 2015-11-23 08:53:40 -0800
commit: 94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336 (patch)
tree: 65d3262c5538a085058d873fcbce72b39b55b910
parent: 4be360d4ee6cdb4d06306feca38ddef5212608cf (diff)
download: spark-94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336.tar.gz
spark-94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336.tar.bz2
spark-94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336.zip
6 files changed, 43 insertions, 7 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala
index 2b83651f90..515c071c28 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DataTypeParser.scala
@@ -52,7 +52,8 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
     "(?i)decimal".r ^^^ DecimalType.USER_DEFAULT |
     "(?i)date".r ^^^ DateType |
     "(?i)timestamp".r ^^^ TimestampType |
-    varchar
+    varchar |
+    char
 
   protected lazy val fixedDecimalType: Parser[DataType] =
     ("(?i)decimal".r ~> "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
@@ -60,6 +61,9 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
         DecimalType(precision.toInt, scale.toInt)
     }
 
+  protected lazy val char: Parser[DataType] =
+    "(?i)char".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
+
   protected lazy val varchar: Parser[DataType] =
     "(?i)varchar".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala
index 1e3409a9db..bebf708965 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DataTypeParserSuite.scala
@@ -49,7 +49,9 @@ class DataTypeParserSuite extends SparkFunSuite {
   checkDataType("DATE", DateType)
   checkDataType("timestamp", TimestampType)
   checkDataType("string", StringType)
+  checkDataType("ChaR(5)", StringType)
   checkDataType("varchAr(20)", StringType)
+  checkDataType("cHaR(27)", StringType)
   checkDataType("BINARY", BinaryType)
 
   checkDataType("array<doublE>", ArrayType(DoubleType, true))
@@ -83,7 +85,8 @@ class DataTypeParserSuite extends SparkFunSuite {
       |struct<
       |  struct:struct<deciMal:DECimal, anotherDecimal:decimAL(5,2)>,
       |  MAP:Map<timestamp, varchar(10)>,
-      |  arrAy:Array<double>>
+      |  arrAy:Array<double>,
+      |  anotherArray:Array<char(9)>>
     """.stripMargin,
     StructType(
       StructField("struct",
@@ -91,7 +94,8 @@ class DataTypeParserSuite extends SparkFunSuite {
           StructField("deciMal", DecimalType.USER_DEFAULT, true) ::
           StructField("anotherDecimal", DecimalType(5, 2), true) :: Nil), true) ::
       StructField("MAP", MapType(TimestampType, StringType), true) ::
-      StructField("arrAy", ArrayType(DoubleType, true), true) :: Nil)
+      StructField("arrAy", ArrayType(DoubleType, true), true) ::
+      StructField("anotherArray", ArrayType(StringType, true), true) :: Nil)
   )
   // A column name can be a reserved word in our DDL parser and SqlParser.
   checkDataType(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 12af8068c3..26c1ff5204 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -85,6 +85,7 @@ case class AllDataTypesScan(
         Date.valueOf("1970-01-01"),
         new Timestamp(20000 + i),
         s"varchar_$i",
+        s"char_$i",
         Seq(i, i + 1),
         Seq(Map(s"str_$i" -> Row(i.toLong))),
         Map(i -> i.toString),
@@ -115,6 +116,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
       Date.valueOf("1970-01-01"),
       new Timestamp(20000 + i),
       s"varchar_$i",
+      s"char_$i",
       Seq(i, i + 1),
       Seq(Map(s"str_$i" -> Row(i.toLong))),
       Map(i -> i.toString),
@@ -154,6 +156,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
         |dateField dAte,
         |timestampField tiMestamp,
         |varcharField varchaR(12),
+        |charField ChaR(18),
         |arrayFieldSimple Array<inT>,
         |arrayFieldComplex Array<Map<String, Struct<key:bigInt>>>,
         |mapFieldSimple MAP<iNt, StRing>,
@@ -207,6 +210,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
       StructField("dateField", DateType, true) ::
       StructField("timestampField", TimestampType, true) ::
       StructField("varcharField", StringType, true) ::
+      StructField("charField", StringType, true) ::
       StructField("arrayFieldSimple", ArrayType(IntegerType), true) ::
       StructField("arrayFieldComplex",
         ArrayType(
@@ -248,6 +252,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
           | dateField,
           | timestampField,
           | varcharField,
+          | charField,
           | arrayFieldSimple,
           | arrayFieldComplex,
           | mapFieldSimple,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 36f0708f9d..95b57d6ad1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.hive
 
 import scala.collection.JavaConverters._
 
-import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar}
+import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveVarchar}
 import org.apache.hadoop.hive.serde2.objectinspector.primitive._
 import org.apache.hadoop.hive.serde2.objectinspector.{StructField => HiveStructField, _}
 import org.apache.hadoop.hive.serde2.typeinfo.{DecimalTypeInfo, TypeInfoFactory}
@@ -61,6 +61,7 @@ import org.apache.spark.unsafe.types.UTF8String
  * Primitive Type
  *   Java Boxed Primitives:
  *       org.apache.hadoop.hive.common.type.HiveVarchar
+ *       org.apache.hadoop.hive.common.type.HiveChar
  *       java.lang.String
  *       java.lang.Integer
  *       java.lang.Boolean
@@ -75,6 +76,7 @@ import org.apache.spark.unsafe.types.UTF8String
  *       java.sql.Timestamp
  *   Writables:
  *       org.apache.hadoop.hive.serde2.io.HiveVarcharWritable
+ *       org.apache.hadoop.hive.serde2.io.HiveCharWritable
  *       org.apache.hadoop.io.Text
  *       org.apache.hadoop.io.IntWritable
  *       org.apache.hadoop.hive.serde2.io.DoubleWritable
@@ -93,7 +95,8 @@ import org.apache.spark.unsafe.types.UTF8String
  *   Struct: Object[] / java.util.List / java POJO
  *   Union: class StandardUnion { byte tag; Object object }
  *
- * NOTICE: HiveVarchar is not supported by catalyst, it will be simply considered as String type.
+ * NOTICE: HiveVarchar/HiveChar is not supported by catalyst, it will be simply considered as
+ *  String type.
  *
  *
  * 2. Hive ObjectInspector is a group of flexible APIs to inspect value in different data
@@ -137,6 +140,7 @@ import org.apache.spark.unsafe.types.UTF8String
  * Primitive Object Inspectors:
  *     WritableConstantStringObjectInspector
  *     WritableConstantHiveVarcharObjectInspector
+ *     WritableConstantHiveCharObjectInspector
  *     WritableConstantHiveDecimalObjectInspector
  *     WritableConstantTimestampObjectInspector
  *     WritableConstantIntObjectInspector
@@ -259,6 +263,8 @@ private[hive] trait HiveInspectors {
       UTF8String.fromString(poi.getWritableConstantValue.toString)
     case poi: WritableConstantHiveVarcharObjectInspector =>
       UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue)
+    case poi: WritableConstantHiveCharObjectInspector =>
+      UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue)
     case poi: WritableConstantHiveDecimalObjectInspector =>
       HiveShim.toCatalystDecimal(
         PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector,
@@ -303,11 +309,15 @@ private[hive] trait HiveInspectors {
     case _ if data == null => null
     case poi: VoidObjectInspector => null // always be null for void object inspector
     case pi: PrimitiveObjectInspector => pi match {
-      // We think HiveVarchar is also a String
+      // We think HiveVarchar/HiveChar is also a String
       case hvoi: HiveVarcharObjectInspector if hvoi.preferWritable() =>
         UTF8String.fromString(hvoi.getPrimitiveWritableObject(data).getHiveVarchar.getValue)
       case hvoi: HiveVarcharObjectInspector =>
         UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue)
+      case hvoi: HiveCharObjectInspector if hvoi.preferWritable() =>
+        UTF8String.fromString(hvoi.getPrimitiveWritableObject(data).getHiveChar.getValue)
+      case hvoi: HiveCharObjectInspector =>
+        UTF8String.fromString(hvoi.getPrimitiveJavaObject(data).getValue)
       case x: StringObjectInspector if x.preferWritable() =>
         UTF8String.fromString(x.getPrimitiveWritableObject(data).toString)
       case x: StringObjectInspector =>
@@ -377,6 +387,15 @@ private[hive] trait HiveInspectors {
           null
         }
 
+    case _: JavaHiveCharObjectInspector =>
+      (o: Any) =>
+        if (o != null) {
+          val s = o.asInstanceOf[UTF8String].toString
+          new HiveChar(s, s.size)
+        } else {
+          null
+        }
+
     case _: JavaHiveDecimalObjectInspector =>
       (o: Any) =>
         if (o != null) {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 69f481c49a..70ee02823e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -382,6 +382,9 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
         case oi: HiveVarcharObjectInspector =>
           (value: Any, row: MutableRow, ordinal: Int) =>
             row.update(ordinal, UTF8String.fromString(oi.getPrimitiveJavaObject(value).getValue))
+        case oi: HiveCharObjectInspector =>
+          (value: Any, row: MutableRow, ordinal: Int) =>
+            row.update(ordinal, UTF8String.fromString(oi.getPrimitiveJavaObject(value).getValue))
         case oi: HiveDecimalObjectInspector =>
           (value: Any, row: MutableRow, ordinal: Int) =>
             row.update(ordinal, HiveShim.toCatalystDecimal(oi, value))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 48bbb21e6c..346840079b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -321,7 +321,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
   def convertFilters(table: Table, filters: Seq[Expression]): String = {
     // hive varchar is treated as catalyst string, but hive varchar can't be pushed down.
     val varcharKeys = table.getPartitionKeys.asScala
-      .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME))
+      .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME) ||
+        col.getType.startsWith(serdeConstants.CHAR_TYPE_NAME))
       .map(col => col.getName).toSet
 
     filters.collect {
author	Xiu Guo <xguo27@gmail.com>	2015-11-23 08:53:40 -0800
committer	Yin Huai <yhuai@databricks.com>	2015-11-23 08:53:40 -0800
commit	94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336 (patch)
tree	65d3262c5538a085058d873fcbce72b39b55b910
parent	4be360d4ee6cdb4d06306feca38ddef5212608cf (diff)
download	spark-94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336.tar.gz spark-94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336.tar.bz2 spark-94ce65dfcbba1fe3a1fc9d8002c37d9cd1a11336.zip