author     chirag <chirag.aggarwal@guavus.com>          2014-10-13 13:47:26 -0700
committer  Michael Armbrust <michael@databricks.com>    2014-10-13 13:47:51 -0700
commit     e6e37701f12be82fa77dc28d825ddd36a1ab7594 (patch)
tree       9eb5855f43cd6b3ee90945681a71bff3cb844809
parent     371321cadee7df39258bd374eb59c1e32451d96b (diff)
SPARK-3807: SparkSql does not work for tables created using custom serde
SparkSql crashes on selecting from tables created with a custom SerDe. Example:

    CREATE EXTERNAL TABLE table_name PARTITIONED BY (a int)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer'
    WITH SERDEPROPERTIES (
      "serialization.format" = "org.apache.thrift.protocol.TBinaryProtocol",
      "serialization.class"  = "ser_class")
    STORED AS SEQUENCEFILE;

Running a query like 'select * from table_name limit 1' then fails with:

    ERROR CliDriver: org.apache.hadoop.hive.serde2.SerDeException: java.lang.NullPointerException
        at org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer.initialize(ThriftDeserializer.java:68)
        at org.apache.hadoop.hive.ql.plan.TableDesc.getDeserializer(TableDesc.java:80)
        at org.apache.spark.sql.hive.execution.HiveTableScan.addColumnMetadataToConf(HiveTableScan.scala:86)
        at org.apache.spark.sql.hive.execution.HiveTableScan.<init>(HiveTableScan.scala:100)
        at org.apache.spark.sql.hive.HiveStrategies$HiveTableScans$$anonfun$14.apply(HiveStrategies.scala:188)
        at org.apache.spark.sql.hive.HiveStrategies$HiveTableScans$$anonfun$14.apply(HiveStrategies.scala:188)
        at org.apache.spark.sql.SQLContext$SparkPlanner.pruneFilterProject(SQLContext.scala:364)
        at org.apache.spark.sql.hive.HiveStrategies$HiveTableScans$.apply(HiveStrategies.scala:184)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
        at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner.apply(QueryPlanner.scala:59)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner.planLater(QueryPlanner.scala:54)
        at org.apache.spark.sql.execution.SparkStrategies$BasicOperators$.apply(SparkStrategies.scala:280)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
        at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
        at org.apache.spark.sql.catalyst.planning.QueryPlanner.apply(QueryPlanner.scala:59)
        at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:402)
        at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:400)
        at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:406)
        at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:406)
        at org.apache.spark.sql.hive.HiveContext$QueryExecution.stringResult(HiveContext.scala:406)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:59)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:291)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:413)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:226)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
        at java.lang.reflect.Method.invoke(Unknown Source)
        at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:328)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
    Caused by: java.lang.NullPointerException

Author: chirag <chirag.aggarwal@guavus.com>

Closes #2674 from chiragaggarwal/branch-1.1 and squashes the following commits:

370c31b [chirag] SPARK-3807: Add a test case to validate the fix.
1f26805 [chirag] SPARK-3807: SparkSql does not work for tables created using custom serde (Incorporated Review Comments)
ba4bc0c [chirag] SPARK-3807: SparkSql does not work for tables created using custom serde
5c73b72 [chirag] SPARK-3807: SparkSql does not work for tables created using custom serde

(cherry picked from commit 925e22d3132b983a2fcee31e3878b680c7ff92da)
Signed-off-by: Michael Armbrust <michael@databricks.com>
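The heart of the fix is in HiveTableScan: TableDesc.getDeserializer (second frame
of the stack trace above) appears to build the SerDe without a usable Hive
configuration, which is what ThriftDeserializer trips over. The patch instead
instantiates and initializes the deserializer explicitly. A minimal sketch of the
fixed path, assuming Spark 1.1-era Hive APIs, with `hiveConf` and `relation`
standing in for the members available inside HiveTableScan:

    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

    // Instantiate the SerDe ourselves and hand it both the HiveConf and the
    // table's full property set, so keys declared via WITH SERDEPROPERTIES
    // (e.g. serialization.class) are visible during initialize().
    val tableDesc = relation.tableDesc
    val deserializer = tableDesc.getDeserializerClass.newInstance
    deserializer.initialize(hiveConf, tableDesc.getProperties)

    // The object inspector now comes from a fully initialized deserializer.
    val structOI = ObjectInspectorUtils
      .getStandardObjectInspector(
        deserializer.getObjectInspector,
        ObjectInspectorCopyOption.JAVA)
      .asInstanceOf[StructObjectInspector]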
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala       2
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala    6
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala   3
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index c5fee5e470..75a19656af 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -305,7 +305,7 @@ private[hive] case class MetastoreRelation
val partitionKeys = hiveQlTable.getPartitionKeys.map(_.toAttribute)
/** Non-partitionKey attributes */
- val attributes = table.getSd.getCols.map(_.toAttribute)
+ val attributes = hiveQlTable.getCols.map(_.toAttribute)
val output = attributes ++ partitionKeys
}
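Why this one-line change matters: table.getSd.getCols reads only the column list
persisted in the metastore's StorageDescriptor, which is empty when the schema is
supplied by the SerDe. hiveQlTable is Hive's ql.metadata.Table wrapper, whose
getCols (in the Hive versions Spark 1.1 builds against) falls back to deriving
the field list from the table's deserializer when no columns are stored, so a
SerDe-defined schema like the Thrift one above resolves to real attributes.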
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index a32147584f..5b83b77d80 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -86,10 +86,14 @@ case class HiveTableScan(
ColumnProjectionUtils.appendReadColumnIDs(hiveConf, neededColumnIDs)
ColumnProjectionUtils.appendReadColumnNames(hiveConf, attributes.map(_.name))
+ val tableDesc = relation.tableDesc
+ val deserializer = tableDesc.getDeserializerClass.newInstance
+ deserializer.initialize(hiveConf, tableDesc.getProperties)
+
// Specifies types and object inspectors of columns to be scanned.
val structOI = ObjectInspectorUtils
.getStandardObjectInspector(
- relation.tableDesc.getDeserializer.getObjectInspector,
+ deserializer.getObjectInspector,
ObjectInspectorCopyOption.JAVA)
.asInstanceOf[StructObjectInspector]
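Note on this hunk: getDeserializerClass.newInstance assumes the SerDe has a
public no-arg constructor (true for standard Hive SerDes), and passing hiveConf
together with tableDesc.getProperties should give initialize() access to the
serialization.* keys set via WITH SERDEPROPERTIES, which is exactly what the old
TableDesc.getDeserializer path was missing.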
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 2829105f43..3e100775e4 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -802,6 +802,9 @@ class HiveQuerySuite extends HiveComparisonTest {
clear()
}
+ createQueryTest("select from thrift based table",
+ "SELECT * from src_thrift")
+
// Put tests that depend on specific Hive settings before these last two test,
// since they modify /clear stuff.
}
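The new createQueryTest exercises this end to end: src_thrift is a fixture table
that Spark's Hive test harness creates with ThriftDeserializer, so selecting from
it goes through the custom-SerDe path fixed above. A hedged sketch of the same
check from a Scala shell (TestHive and the fixture DDL are assumed from the Spark
test harness, not quoted from this patch):

    import org.apache.spark.sql.hive.test.TestHive

    // Before this patch: fails while planning the scan with
    //   SerDeException: java.lang.NullPointerException
    // After: returns rows deserialized through ThriftDeserializer.
    TestHive.sql("SELECT * FROM src_thrift LIMIT 1").collect().foreach(println)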