diff options
author | Davies Liu <davies@databricks.com> | 2015-07-01 16:43:18 -0700 |
---|---|---|
committer | Davies Liu <davies@databricks.com> | 2015-07-01 16:43:18 -0700 |
commit | f958f27e2056f9e380373c2807d8bb5977ecf269 (patch) | |
tree | f2cc331beedc426b0574126473b6172a3d16ed6a /python/pyspark | |
parent | 1ce6428907b4ddcf52dbf0c86196d82ab7392442 (diff) | |
download | spark-f958f27e2056f9e380373c2807d8bb5977ecf269.tar.gz spark-f958f27e2056f9e380373c2807d8bb5977ecf269.tar.bz2 spark-f958f27e2056f9e380373c2807d8bb5977ecf269.zip |
[SPARK-8766] support non-ascii character in column names
Use UTF-8 to encode the name of column in Python 2, or it may failed to encode with default encoding ('ascii').
This PR also fix a bug when there is Java exception without error message.
Author: Davies Liu <davies@databricks.com>
Closes #7165 from davies/non_ascii and squashes the following commits:
02cb61a [Davies Liu] fix tests
3b09d31 [Davies Liu] add encoding in header
867754a [Davies Liu] support non-ascii character in column names
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 3 | ||||
-rw-r--r-- | python/pyspark/sql/tests.py | 9 | ||||
-rw-r--r-- | python/pyspark/sql/types.py | 2 | ||||
-rw-r--r-- | python/pyspark/sql/utils.py | 6 |
4 files changed, 15 insertions, 5 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 4b9efa0a21..273a40dd52 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -484,13 +484,12 @@ class DataFrame(object): return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] @property - @ignore_unicode_prefix @since(1.3) def columns(self): """Returns all column names as a list. >>> df.columns - [u'age', u'name'] + ['age', 'name'] """ return [f.name for f in self.schema.fields] diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 5af2ce09bc..333378c7f1 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1,3 +1,4 @@ +# -*- encoding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -628,6 +629,14 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(IndexError, lambda: df["bad_key"]) self.assertRaises(TypeError, lambda: df[{}]) + def test_column_name_with_non_ascii(self): + df = self.sqlCtx.createDataFrame([(1,)], ["数量"]) + self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema) + self.assertEqual("DataFrame[数量: bigint]", str(df)) + self.assertEqual([("数量", 'bigint')], df.dtypes) + self.assertEqual(1, df.select("数量").first()[0]) + self.assertEqual(1, df.select(df["数量"]).first()[0]) + def test_access_nested_types(self): df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() self.assertEqual(1, df.select(df.l[0]).first()[0]) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index ae9344e610..160df40d65 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -324,6 +324,8 @@ class StructField(DataType): False """ assert isinstance(dataType, DataType), "dataType should be DataType" + if not isinstance(name, str): + name = name.encode('utf-8') self.name = name self.dataType = dataType self.nullable = nullable diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 8096802e73..cc5b2c088b 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -29,9 +29,9 @@ def capture_sql_exception(f): try: return f(*a, **kw) except py4j.protocol.Py4JJavaError as e: - cls, msg = e.java_exception.toString().split(': ', 1) - if cls == 'org.apache.spark.sql.AnalysisException': - raise AnalysisException(msg) + s = e.java_exception.toString() + if s.startswith('org.apache.spark.sql.AnalysisException: '): + raise AnalysisException(s.split(': ', 1)[1]) raise return deco |