[SPARK-6876] [PySpark] [SQL] add DataFrame na.replace in pyspark

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6003 from adrian-wang/pynareplace and squashes the following commits:

672efba [Daoyuan Wang] remove py2.7 feature
4a148f7 [Daoyuan Wang] to_replace support dict, value support single value, and add full tests
9e232e7 [Daoyuan Wang] rename scala map
af0268a [Daoyuan Wang] remove na
63ac579 [Daoyuan Wang] add na.replace in pyspark
author: Daoyuan Wang <daoyuan.wang@intel.com> 2015-05-12 10:23:41 -0700
committer: Reynold Xin <rxin@databricks.com> 2015-05-12 10:23:41 -0700
commit: d86ce845840a92b4dde7975082738ed94ab8c570 (patch)
tree: 7c1f437169cf8132bd5c6b70bc374bea717e13ab /python/pyspark/sql/tests.py
parent: ec6f2a9774167014566fb9608ee4394d2ce5fd6a (diff)
download: spark-d86ce845840a92b4dde7975082738ed94ab8c570.tar.gz
spark-d86ce845840a92b4dde7975082738ed94ab8c570.tar.bz2
spark-d86ce845840a92b4dde7975082738ed94ab8c570.zip
1 files changed, 48 insertions, 0 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 7e63f4d646..1922d03af6 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -665,6 +665,54 @@ class SQLTests(ReusedPySparkTestCase):
         result = df.select(functions.bitwiseNOT(df.b)).collect()[0].asDict()
         self.assertEqual(~75, result['~b'])
 
+    def test_replace(self):
+        schema = StructType([
+            StructField("name", StringType(), True),
+            StructField("age", IntegerType(), True),
+            StructField("height", DoubleType(), True)])
+
+        # replace an int value: both the integer column and the double column are updated
+        row = self.sqlCtx.createDataFrame([(u'Alice', 10, 10.0)], schema).replace(10, 20).first()
+        self.assertEqual(row.age, 20)
+        self.assertEqual(row.height, 20.0)
+
+        # replace a double value: the integer column is expected to end up as 82, not 82.1
+        row = self.sqlCtx.createDataFrame(
+            [(u'Alice', 80, 80.0)], schema).replace(80.0, 82.1).first()
+        self.assertEqual(row.age, 82)
+        self.assertEqual(row.height, 82.1)
+
+        # replace a string value: the name changes while the numeric age stays untouched
+        row = self.sqlCtx.createDataFrame(
+            [(u'Alice', 10, 80.1)], schema).replace(u'Alice', u'Ann').first()
+        self.assertEqual(row.name, u"Ann")
+        self.assertEqual(row.age, 10)
+
+        # subset given as a single column-name string that contains the value: age is replaced
+        row = self.sqlCtx.createDataFrame(
+            [(u'Alice', 10, 80.1)], schema).replace(10, 20, subset='age').first()
+        self.assertEqual(row.age, 20)
+
+        # subset given as a single column-name string that excludes age: age keeps its value
+        row = self.sqlCtx.createDataFrame(
+            [(u'Alice', 10, 80.1)], schema).replace(10, 20, subset='height').first()
+        self.assertEqual(row.age, 10)
+
+        # subset given as a list: age is in the subset and is replaced, while height holds
+        # the same value but is outside the subset, so it stays unchanged.
+        row = self.sqlCtx.createDataFrame(
+            [(u'Alice', 10, 10.0)], schema).replace(10, 20, subset=['name', 'age']).first()
+        self.assertEqual(row.name, u'Alice')
+        self.assertEqual(row.age, 20)
+        self.assertEqual(row.height, 10.0)
+
+        # subset given as a list in which no column holds the value: the row is unchanged
+        row = self.sqlCtx.createDataFrame(
+            [(u'Alice', 10, None)], schema).replace(10, 20, subset=['name', 'height']).first()
+        self.assertEqual(row.name, u'Alice')
+        self.assertEqual(row.age, 10)
+        self.assertEqual(row.height, None)
+
 
 class HiveContextSQLTests(ReusedPySparkTestCase):
author	Daoyuan Wang <daoyuan.wang@intel.com>	2015-05-12 10:23:41 -0700
committer	Reynold Xin <rxin@databricks.com>	2015-05-12 10:23:41 -0700
commit	d86ce845840a92b4dde7975082738ed94ab8c570 (patch)
tree	7c1f437169cf8132bd5c6b70bc374bea717e13ab /python/pyspark/sql/tests.py
parent	ec6f2a9774167014566fb9608ee4394d2ce5fd6a (diff)
download	spark-d86ce845840a92b4dde7975082738ed94ab8c570.tar.gz spark-d86ce845840a92b4dde7975082738ed94ab8c570.tar.bz2 spark-d86ce845840a92b4dde7975082738ed94ab8c570.zip