From 603f4453a16825cc5773cfe24d6ae4cee5ec949a Mon Sep 17 00:00:00 2001 From: Bill Chambers Date: Wed, 11 May 2016 17:42:13 -0700 Subject: [SPARK-15264][SPARK-15274][SQL] CSV Reader Error on Blank Column Names ## What changes were proposed in this pull request? When a CSV begins with: - `,,` OR - `"","",` meaning that the first column names are either empty or blank strings and `header` is specified to be `true`, then the column name is replaced with `C` + the index number of that given column. For example, if you were to read in the CSV: ``` "","second column" "hello", "there" ``` Then column names would become `"C0", "second column"`. This behavior aligns with what currently happens when `header` is specified to be `false` in recent versions of Spark. ### Current Behavior in Spark <=1.6 In Spark <=1.6, a CSV with a blank column name becomes a blank string, `""`, meaning that this column cannot be accessed. However the CSV reads in without issue. ### Current Behavior in Spark 2.0 Spark throws a NullPointerError and will not read in the file. #### Reproduction in 2.0 https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/346304/2828750690305044/484361/latest.html ## How was this patch tested? A new test was added to `CSVSuite` to account for this issue. We then have asserts that test for being able to select both the empty column names as well as the regular column names. Author: Bill Chambers Author: Bill Chambers Closes #13041 from anabranch/master. --- python/pyspark/sql/readwriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python') diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 7fd7583972..5cb186016e 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -358,7 +358,7 @@ class DataFrameReader(object): >>> df = spark.read.csv('python/test_support/sql/ages.csv') >>> df.dtypes - [('C0', 'string'), ('C1', 'string')] + [('_c0', 'string'), ('_c1', 'string')] """ if schema is not None: self.schema(schema) -- cgit v1.2.3