aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2015-05-11 19:15:14 -0700
committerMichael Armbrust <michael@databricks.com>2015-05-11 19:15:14 -0700
commitb6bf4f76c78abfaafa99b3c3c08b498aa9644346 (patch)
tree02578c66f048c2f8ada178e0f6fadbe9fc335210 /python
parentf9c7580adadce75a94bd2854cf4f743d8cbd1d23 (diff)
downloadspark-b6bf4f76c78abfaafa99b3c3c08b498aa9644346.tar.gz
spark-b6bf4f76c78abfaafa99b3c3c08b498aa9644346.tar.bz2
spark-b6bf4f76c78abfaafa99b3c3c08b498aa9644346.zip
[SPARK-7324] [SQL] DataFrame.dropDuplicates
This should also close https://github.com/apache/spark/pull/5870 Author: Reynold Xin <rxin@databricks.com> Closes #6066 from rxin/dropDups and squashes the following commits: 130692f [Reynold Xin] [SPARK-7324][SQL] DataFrame.dropDuplicates
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/sql/dataframe.py36
1 file changed, 34 insertions, 2 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index c2fa6c8738..4eaa8d9c57 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -755,8 +755,6 @@ class DataFrame(object):
jdf = self._jdf.groupBy(self._jcols(*cols))
return GroupedData(jdf, self.sql_ctx)
- groupby = groupBy
-
def agg(self, *exprs):
""" Aggregate on the entire :class:`DataFrame` without groups
(shorthand for ``df.groupBy.agg()``).
@@ -793,6 +791,36 @@ class DataFrame(object):
"""
return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx)
+ def dropDuplicates(self, subset=None):
+ """Return a new :class:`DataFrame` with duplicate rows removed,
+ optionally only considering certain columns.
+
+ >>> from pyspark.sql import Row
+ >>> df = sc.parallelize([ \
+ Row(name='Alice', age=5, height=80), \
+ Row(name='Alice', age=5, height=80), \
+ Row(name='Alice', age=10, height=80)]).toDF()
+ >>> df.dropDuplicates().show()
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 5| 80|Alice|
+ | 10| 80|Alice|
+ +---+------+-----+
+
+ >>> df.dropDuplicates(['name', 'height']).show()
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 5| 80|Alice|
+ +---+------+-----+
+ """
+ if subset is None:
+ jdf = self._jdf.dropDuplicates()
+ else:
+ jdf = self._jdf.dropDuplicates(self._jseq(subset))
+ return DataFrame(jdf, self.sql_ctx)
+
def dropna(self, how='any', thresh=None, subset=None):
"""Returns a new :class:`DataFrame` omitting rows with null values.
@@ -1012,6 +1040,10 @@ class DataFrame(object):
import pandas as pd
return pd.DataFrame.from_records(self.collect(), columns=self.columns)
+ # Pandas compatibility
+ groupby = groupBy
+ drop_duplicates = dropDuplicates
+
# Having SchemaRDD for backward compatibility (for docs)
class SchemaRDD(DataFrame):