diff options
author | Reynold Xin <rxin@databricks.com> | 2015-05-11 19:15:14 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2015-05-11 19:15:14 -0700 |
commit | b6bf4f76c78abfaafa99b3c3c08b498aa9644346 (patch) | |
tree | 02578c66f048c2f8ada178e0f6fadbe9fc335210 /python/pyspark/sql/dataframe.py | |
parent | f9c7580adadce75a94bd2854cf4f743d8cbd1d23 (diff) | |
download | spark-b6bf4f76c78abfaafa99b3c3c08b498aa9644346.tar.gz spark-b6bf4f76c78abfaafa99b3c3c08b498aa9644346.tar.bz2 spark-b6bf4f76c78abfaafa99b3c3c08b498aa9644346.zip |
[SPARK-7324] [SQL] DataFrame.dropDuplicates
This should also close https://github.com/apache/spark/pull/5870
Author: Reynold Xin <rxin@databricks.com>
Closes #6066 from rxin/dropDups and squashes the following commits:
130692f [Reynold Xin] [SPARK-7324][SQL] DataFrame.dropDuplicates
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 36 |
1 file changed, 34 insertions, 2 deletions
```diff
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index c2fa6c8738..4eaa8d9c57 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -755,8 +755,6 @@ class DataFrame(object):
         jdf = self._jdf.groupBy(self._jcols(*cols))
         return GroupedData(jdf, self.sql_ctx)
 
-    groupby = groupBy
-
     def agg(self, *exprs):
         """ Aggregate on the entire :class:`DataFrame` without groups
         (shorthand for ``df.groupBy.agg()``).
@@ -793,6 +791,36 @@ class DataFrame(object):
         """
         return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx)
 
+    def dropDuplicates(self, subset=None):
+        """Return a new :class:`DataFrame` with duplicate rows removed,
+        optionally only considering certain columns.
+
+        >>> from pyspark.sql import Row
+        >>> df = sc.parallelize([ \
+            Row(name='Alice', age=5, height=80), \
+            Row(name='Alice', age=5, height=80), \
+            Row(name='Alice', age=10, height=80)]).toDF()
+        >>> df.dropDuplicates().show()
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        |  5|    80|Alice|
+        | 10|    80|Alice|
+        +---+------+-----+
+
+        >>> df.dropDuplicates(['name', 'height']).show()
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        |  5|    80|Alice|
+        +---+------+-----+
+        """
+        if subset is None:
+            jdf = self._jdf.dropDuplicates()
+        else:
+            jdf = self._jdf.dropDuplicates(self._jseq(subset))
+        return DataFrame(jdf, self.sql_ctx)
+
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.
 
@@ -1012,6 +1040,10 @@ class DataFrame(object):
         import pandas as pd
         return pd.DataFrame.from_records(self.collect(), columns=self.columns)
 
+    # Pandas compatibility
+    groupby = groupBy
+    drop_duplicates = dropDuplicates
+
 
 # Having SchemaRDD for backward compatibility (for docs)
 class SchemaRDD(DataFrame):
```