aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql/dataframe.py
diff options
context:
space:
mode:
author: Reynold Xin <rxin@databricks.com> 2015-06-07 01:21:02 -0700
committer: Reynold Xin <rxin@databricks.com> 2015-06-07 01:21:02 -0700
commit: 0ac47083f7ef5fca9847bca2f0490719e1ccf50a (patch)
tree: dffb943acb9ac7fe7c82dc30132fe94e90e7d7bb /python/pyspark/sql/dataframe.py
parent: 26d07f1ece4174788b0bcdc338a14d0bbc0e3602 (diff)
download: spark-0ac47083f7ef5fca9847bca2f0490719e1ccf50a.tar.gz
spark-0ac47083f7ef5fca9847bca2f0490719e1ccf50a.tar.bz2
spark-0ac47083f7ef5fca9847bca2f0490719e1ccf50a.zip
[SPARK-8146] DataFrame Python API: Alias replace in df.na
Author: Reynold Xin <rxin@databricks.com>
Closes #6688 from rxin/df-alias-replace and squashes the following commits:
774c19c [Reynold Xin] [SPARK-8146] DataFrame Python API: Alias replace in DataFrameNaFunctions.
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r-- python/pyspark/sql/dataframe.py 47
1 files changed, 22 insertions, 25 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 902504df5b..2d8c59518b 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -909,8 +909,7 @@ class DataFrame(object):
@since("1.3.1")
def dropna(self, how='any', thresh=None, subset=None):
"""Returns a new :class:`DataFrame` omitting rows with null values.
-
- This is an alias for ``na.drop()``.
+ :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
:param how: 'any' or 'all'.
If 'any', drop a row if it contains any nulls.
@@ -920,13 +919,6 @@ class DataFrame(object):
This overwrites the `how` parameter.
:param subset: optional list of column names to consider.
- >>> df4.dropna().show()
- +---+------+-----+
- |age|height| name|
- +---+------+-----+
- | 10| 80|Alice|
- +---+------+-----+
-
>>> df4.na.drop().show()
+---+------+-----+
|age|height| name|
@@ -952,6 +944,7 @@ class DataFrame(object):
@since("1.3.1")
def fillna(self, value, subset=None):
"""Replace null values, alias for ``na.fill()``.
+ :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.
:param value: int, long, float, string, or dict.
Value to replace null values with.
@@ -963,7 +956,7 @@ class DataFrame(object):
For example, if `value` is a string, and subset contains a non-string column,
then the non-string column is simply ignored.
- >>> df4.fillna(50).show()
+ >>> df4.na.fill(50).show()
+---+------+-----+
|age|height| name|
+---+------+-----+
@@ -973,16 +966,6 @@ class DataFrame(object):
| 50| 50| null|
+---+------+-----+
- >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
- +---+------+-------+
- |age|height| name|
- +---+------+-------+
- | 10| 80| Alice|
- | 5| null| Bob|
- | 50| null| Tom|
- | 50| null|unknown|
- +---+------+-------+
-
>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+---+------+-------+
|age|height| name|
@@ -1014,6 +997,8 @@ class DataFrame(object):
@since(1.4)
def replace(self, to_replace, value, subset=None):
"""Returns a new :class:`DataFrame` replacing a value with another value.
+ :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are
+ aliases of each other.
:param to_replace: int, long, float, string, or list.
Value to be replaced.
@@ -1029,7 +1014,7 @@ class DataFrame(object):
For example, if `value` is a string, and subset contains a non-string column,
then the non-string column is simply ignored.
- >>> df4.replace(10, 20).show()
+ >>> df4.na.replace(10, 20).show()
+----+------+-----+
| age|height| name|
+----+------+-----+
@@ -1039,7 +1024,7 @@ class DataFrame(object):
|null| null| null|
+----+------+-----+
- >>> df4.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+ >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
+----+------+----+
@@ -1090,9 +1075,9 @@ class DataFrame(object):
@since(1.4)
def corr(self, col1, col2, method=None):
"""
- Calculates the correlation of two columns of a DataFrame as a double value. Currently only
- supports the Pearson Correlation Coefficient.
- :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases.
+ Calculates the correlation of two columns of a DataFrame as a double value.
+ Currently only supports the Pearson Correlation Coefficient.
+ :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.
:param col1: The name of the first column
:param col2: The name of the second column
@@ -1241,7 +1226,10 @@ class DataFrame(object):
import pandas as pd
return pd.DataFrame.from_records(self.collect(), columns=self.columns)
+ ##########################################################################################
# Pandas compatibility
+ ##########################################################################################
+
groupby = groupBy
drop_duplicates = dropDuplicates
@@ -1261,6 +1249,8 @@ def _to_scala_map(sc, jm):
class DataFrameNaFunctions(object):
"""Functionality for working with missing data in :class:`DataFrame`.
+
+ .. versionadded:: 1.4
"""
def __init__(self, df):
@@ -1276,9 +1266,16 @@ class DataFrameNaFunctions(object):
fill.__doc__ = DataFrame.fillna.__doc__
+ def replace(self, to_replace, value, subset=None):
+ return self.df.replace(to_replace, value, subset)
+
+ replace.__doc__ = DataFrame.replace.__doc__
+
class DataFrameStatFunctions(object):
"""Functionality for statistic functions with :class:`DataFrame`.
+
+ .. versionadded:: 1.4
"""
def __init__(self, df):