diff options
author | Liang-Chi Hsieh <simonh@tw.ibm.com> | 2016-05-18 11:18:33 -0700 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2016-05-18 11:18:33 -0700 |
commit | 3d1e67f903ab3512fcad82b94b1825578f8117c9 (patch) | |
tree | 6f392bcbfbf0836ce44bccd95fab53a4d27e4b6b /python | |
parent | 8fb1d1c7f3ed1b62625052a532b7388ebec71bbf (diff) | |
download | spark-3d1e67f903ab3512fcad82b94b1825578f8117c9.tar.gz spark-3d1e67f903ab3512fcad82b94b1825578f8117c9.tar.bz2 spark-3d1e67f903ab3512fcad82b94b1825578f8117c9.zip |
[SPARK-15342] [SQL] [PYSPARK] PySpark test for non ascii column name does not actually test with unicode column name
## What changes were proposed in this pull request?
The PySpark SQL test `test_column_name_with_non_ascii` is meant to exercise a non-ASCII column name, but it doesn't actually do so. We need to construct a unicode string explicitly, using `unicode`, under Python 2.
## How was this patch tested?
Existing tests.
Author: Liang-Chi Hsieh <simonh@tw.ibm.com>
Closes #13134 from viirya/correct-non-ascii-colname-pytest.
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql/tests.py | 11 | ||||
-rw-r--r-- | python/pyspark/sql/types.py | 3 |
2 files changed, 11 insertions, 3 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index e86f44281d..1790432edd 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1044,8 +1044,15 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(TypeError, lambda: df[{}]) def test_column_name_with_non_ascii(self): - df = self.spark.createDataFrame([(1,)], ["数量"]) - self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema) + if sys.version >= '3': + columnName = "数量" + self.assertTrue(isinstance(columnName, str)) + else: + columnName = unicode("数量", "utf-8") + self.assertTrue(isinstance(columnName, unicode)) + schema = StructType([StructField(columnName, LongType(), True)]) + df = self.spark.createDataFrame([(1,)], schema) + self.assertEqual(schema, df.schema) self.assertEqual("DataFrame[数量: bigint]", str(df)) self.assertEqual([("数量", 'bigint')], df.dtypes) self.assertEqual(1, df.select("数量").first()[0]) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 30ab130f29..7d8d0230b4 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -27,7 +27,7 @@ from array import array if sys.version >= "3": long = int - unicode = str + basestring = unicode = str from py4j.protocol import register_input_converter from py4j.java_gateway import JavaClass @@ -401,6 +401,7 @@ class StructField(DataType): False """ assert isinstance(dataType, DataType), "dataType should be DataType" + assert isinstance(name, basestring), "field name should be string" if not isinstance(name, str): name = name.encode('utf-8') self.name = name |