diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 6 |
1 files changed, 6 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 70efeaf016..bb6df22682 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1158,6 +1158,12 @@ class DataFrame(object): """Return a new :class:`DataFrame` with duplicate rows removed, optionally only considering certain columns. + For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming + :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop + duplicate rows. You can use :func:`withWatermark` to limit how late the duplicate data can + be and the system will accordingly limit the state. In addition, data older than the + watermark will be dropped to avoid any possibility of duplicates. + :func:`drop_duplicates` is an alias for :func:`dropDuplicates`. >>> from pyspark.sql import Row |