path: root/sql/core
author    Yin Huai <yhuai@databricks.com>      2015-02-26 20:46:05 -0800
committer Reynold Xin <rxin@databricks.com>    2015-02-26 20:46:05 -0800
commit    5e5ad6558d60cfbf360708584e883e80d363e33e (patch)
tree      fbd88acd39179dd93d38f9b1fb48696c3a8aa638 /sql/core
parent    4ad5153f5449319a7e82c9013ccff4494ab58ef1 (diff)
[SPARK-6024][SQL] When a data source table has too many columns, its schema cannot be stored in the metastore.
JIRA: https://issues.apache.org/jira/browse/SPARK-6024

Author: Yin Huai <yhuai@databricks.com>

Closes #4795 from yhuai/wideSchema and squashes the following commits:

4882e6f [Yin Huai] Address comments.
73e71b4 [Yin Huai] Address comments.
143927a [Yin Huai] Simplify code.
cc1d472 [Yin Huai] Make the schema wider.
12bacae [Yin Huai] If the JSON string of a schema is too large, split it before storing it in metastore.
e9b4f70 [Yin Huai] Failed test.
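The mechanism behind the fix, for reference: instead of writing the whole schema JSON into one metastore table property (whose value is limited to 4000 characters by default), the JSON string is cut into pieces no longer than the configured threshold and reassembled when the table is loaded. Below is a minimal sketch of that idea in Scala; the property keys (schema.part.N) and helper names are hypothetical, since the actual metastore write path lives in Spark's Hive support module and is not part of this diff.

    import org.apache.spark.sql.types.{DataType, StructType}

    // Hypothetical helpers illustrating the split/reassemble idea.
    object SchemaStringSplitter {
      // Cut the schema's JSON string into pieces of at most `threshold` characters,
      // each small enough to fit in a single metastore table property value.
      def split(schema: StructType, threshold: Int): Seq[(String, String)] =
        schema.json.grouped(threshold).toSeq.zipWithIndex.map {
          case (part, i) => (s"schema.part.$i", part)
        }

      // Reassemble the schema by concatenating the stored pieces in order.
      def merge(props: Map[String, String], numParts: Int): StructType = {
        val json = (0 until numParts).map(i => props(s"schema.part.$i")).mkString
        DataType.fromJson(json).asInstanceOf[StructType]
      }
    }

On the write side each (key, part) pair would be stored as a table property; on the read side the parts are looked up in order and concatenated before the JSON is parsed back into a StructType.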
Diffstat (limited to 'sql/core')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala  10
1 file changed, 10 insertions(+), 0 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index a08c0f5ce3..4815620c6f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -51,6 +51,11 @@ private[spark] object SQLConf {
// This is used to set the default data source
val DEFAULT_DATA_SOURCE_NAME = "spark.sql.sources.default"
+ // This is used to control when we split a schema's JSON string into multiple pieces
+ // in order to fit the JSON string in the metastore's table properties (by default, a
+ // property value has a length restriction of 4000 characters). We split the JSON string
+ // of a schema into multiple pieces if its length exceeds the threshold.
+ val SCHEMA_STRING_LENGTH_THRESHOLD = "spark.sql.sources.schemaStringLengthThreshold"
// Whether to perform eager analysis when constructing a dataframe.
// Set to false when debugging requires the ability to look at invalid query plans.
@@ -177,6 +182,11 @@ private[sql] class SQLConf extends Serializable {
private[spark] def defaultDataSourceName: String =
  getConf(DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.parquet")
+ // Do not use a value larger than 4000 as the default value of this property.
+ // See the comments of SCHEMA_STRING_LENGTH_THRESHOLD above for more information.
+ private[spark] def schemaStringLengthThreshold: Int =
+   getConf(SCHEMA_STRING_LENGTH_THRESHOLD, "4000").toInt
+
private[spark] def dataFrameEagerAnalysis: Boolean =
  getConf(DATAFRAME_EAGER_ANALYSIS, "true").toBoolean
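For completeness, a usage sketch assuming a Spark 1.3-style SQLContext: a user with a very wide table can tune the new setting through the standard conf API, keeping in mind the comment above that the value should not exceed 4000, since that is the metastore's default length limit for a table property value.

    import org.apache.spark.SparkContext
    import org.apache.spark.sql.SQLContext

    val sc = new SparkContext("local", "wide-schema-example")
    val sqlContext = new SQLContext(sc)

    // Each stored schema chunk must fit within the metastore's default
    // 4000-character limit on table property values, so do not go above 4000.
    sqlContext.setConf("spark.sql.sources.schemaStringLengthThreshold", "4000")

Whether a saved data source table actually goes through this splitting path depends on the Hive metastore integration; this diff only adds the configuration knob in sql/core.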