-rw-r--r--  python/pyspark/sql/dataframe.py                                 12
-rw-r--r--  python/pyspark/sql/tests.py                                      4
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala     3
3 files changed, 12 insertions, 7 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index da742d7ce7..025811f519 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1202,7 +1202,9 @@ class DataFrame(object):
     @ignore_unicode_prefix
     @since(1.3)
     def withColumn(self, colName, col):
-        """Returns a new :class:`DataFrame` by adding a column.
+        """
+        Returns a new :class:`DataFrame` by adding a column or replacing the
+        existing column that has the same name.
 
         :param colName: string, name of the new column.
         :param col: a :class:`Column` expression for the new column.
@@ -1210,7 +1212,8 @@ class DataFrame(object):
         >>> df.withColumn('age2', df.age + 2).collect()
         [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)]
         """
-        return self.select('*', col.alias(colName))
+        assert isinstance(col, Column), "col should be Column"
+        return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)
 
     @ignore_unicode_prefix
     @since(1.3)
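
With this change, withColumn can overwrite a column that already exists, since the projection is now built on the JVM side rather than via select('*', col.alias(colName)). A minimal PySpark sketch of the new behavior (the DataFrame below is illustrative, mirroring the doctest data; it is not part of this patch):

    df = sqlContext.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])
    # 'age' already exists, so withColumn replaces it in place instead of
    # producing a second, duplicate 'age' column as the old projection did.
    df.withColumn('age', df.age + 10).collect()
    # [Row(age=12, name=u'Alice'), Row(age=15, name=u'Bob')]
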
@@ -1223,10 +1226,7 @@ class DataFrame(object):
         >>> df.withColumnRenamed('age', 'age2').collect()
         [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')]
         """
-        cols = [Column(_to_java_column(c)).alias(new)
-                if c == existing else c
-                for c in self.columns]
-        return self.select(*cols)
+        return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx)
 
     @since(1.4)
     @ignore_unicode_prefix
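
withColumnRenamed now delegates to the JVM DataFrame instead of rebuilding the projection column-by-column in Python. A usage sketch matching the doctest above (the DataFrame is illustrative, not part of this patch):

    df = sqlContext.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])
    # The rename happens on the Scala side; the Python wrapper no longer has
    # to alias each column and issue a select(*cols) projection itself.
    df.withColumnRenamed('age', 'age2').collect()
    # [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')]
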
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 13cf647b66..aacfb34c77 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1035,6 +1035,10 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values",
                                 lambda: df.select(sha2(df.a, 1024)).collect())
 
+    def test_with_column_with_existing_name(self):
+        keys = self.df.withColumn("key", self.df.key).select("key").collect()
+        self.assertEqual([r.key for r in keys], list(range(100)))
+
 
 class HiveContextSQLTests(ReusedPySparkTestCase):
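
Because withColumn now asserts that col is a Column, passing a plain Python value fails fast with an AssertionError. A companion test one could write, sketched here only as an illustration (it is not part of this patch):

    def test_with_column_requires_column(self):
        # The new isinstance assert rejects non-Column arguments up front
        # instead of failing later inside the JVM call.
        self.assertRaises(AssertionError, lambda: self.df.withColumn("key", 1))
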
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index fd0ead4401..d6688b24ae 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1133,7 +1133,8 @@ class DataFrame private[sql](
   /////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Returns a new [[DataFrame]] by adding a column.
+   * Returns a new [[DataFrame]] by adding a column or replacing the existing column that has
+   * the same name.
    * @group dfops
    * @since 1.3.0
    */