aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2015-07-01 16:43:18 -0700
committerDavies Liu <davies@databricks.com>2015-07-01 17:18:04 -0700
commit17def395798dfc3af962d34b9a0260fa8880fe7d (patch)
tree0816ccde53808df614dc642afc6da7ec5a627fde
parent228aabe244d03886cd1c106c73df51054f882e73 (diff)
downloadspark-17def395798dfc3af962d34b9a0260fa8880fe7d.tar.gz
spark-17def395798dfc3af962d34b9a0260fa8880fe7d.tar.bz2
spark-17def395798dfc3af962d34b9a0260fa8880fe7d.zip
[SPARK-8766] support non-ascii character in column names
Use UTF-8 to encode column names in Python 2; otherwise they may fail to encode with the default encoding ('ascii'). This PR also fixes a bug that occurs when a Java exception has no error message. Author: Davies Liu <davies@databricks.com> Closes #7165 from davies/non_ascii and squashes the following commits: 02cb61a [Davies Liu] fix tests 3b09d31 [Davies Liu] add encoding in header 867754a [Davies Liu] support non-ascii character in column names (cherry picked from commit f958f27e2056f9e380373c2807d8bb5977ecf269) Signed-off-by: Davies Liu <davies@databricks.com> Conflicts: python/pyspark/sql/utils.py
-rw-r--r--python/pyspark/sql/dataframe.py3
-rw-r--r--python/pyspark/sql/tests.py9
-rw-r--r--python/pyspark/sql/types.py2
3 files changed, 12 insertions, 2 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 2d8c59518b..e9dd00e31d 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -476,13 +476,12 @@ class DataFrame(object):
return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
@property
- @ignore_unicode_prefix
@since(1.3)
def columns(self):
"""Returns all column names as a list.
>>> df.columns
- [u'age', u'name']
+ ['age', 'name']
"""
return [f.name for f in self.schema.fields]
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index f90277697d..27c2ad16b8 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -583,6 +584,14 @@ class SQLTests(ReusedPySparkTestCase):
self.assertRaises(IndexError, lambda: df["bad_key"])
self.assertRaises(TypeError, lambda: df[{}])
+ def test_column_name_with_non_ascii(self):
+ df = self.sqlCtx.createDataFrame([(1,)], ["数量"])
+ self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema)
+ self.assertEqual("DataFrame[数量: bigint]", str(df))
+ self.assertEqual([("数量", 'bigint')], df.dtypes)
+ self.assertEqual(1, df.select("数量").first()[0])
+ self.assertEqual(1, df.select(df["数量"]).first()[0])
+
def test_access_nested_types(self):
df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF()
self.assertEqual(1, df.select(df.l[0]).first()[0])
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index b6ec6137c9..e4cb006515 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -323,6 +323,8 @@ class StructField(DataType):
False
"""
assert isinstance(dataType, DataType), "dataType should be DataType"
+ if not isinstance(name, str):
+ name = name.encode('utf-8')
self.name = name
self.dataType = dataType
self.nullable = nullable