author    Tommy YU <tummyyu@163.com>    2016-02-06 17:29:09 +0000
committer Sean Owen <sowen@cloudera.com>    2016-02-06 17:29:09 +0000
commit    81da3bee669aaeb79ec68baaf7c99bff6e5d14fe (patch)
tree      269905def89ac9cd4cf438d1b45e2d261b0531b4 /python
parent    4f28291f851b9062da3941e63de4eabb0c77f5d0 (diff)
[SPARK-5865][API DOC] Add doc warnings for methods that return local data structures
rxin srowen I worked out the note message for the rdd.take function; please help to review. If it's fine, I can apply it to all the other functions later. Author: Tommy YU <tummyyu@163.com> Closes #10874 from Wenpei/spark-5865-add-warning-for-localdatastructure.
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/rdd.py            17
-rw-r--r--  python/pyspark/sql/dataframe.py   6
2 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index c285946254..fe2264a63c 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -426,6 +426,9 @@ class RDD(object):
"""
Return a fixed-size sampled subset of this RDD.
+ Note that this method should only be used if the resulting array is expected
+ to be small, as all the data is loaded into the driver's memory.
+
>>> rdd = sc.parallelize(range(0, 10))
>>> len(rdd.takeSample(True, 20, 1))
20
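A quick sketch of bounded usage, assuming a live SparkContext ``sc`` as the module's doctests do (not part of the patch): keeping the requested sample size small keeps the list returned to the driver small.
>>> rdd = sc.parallelize(range(0, 1000))
>>> len(rdd.takeSample(False, 5, 1))  # a 5-element sample is safe to hold on the driver
5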
@@ -766,6 +769,8 @@ class RDD(object):
def collect(self):
"""
Return a list that contains all of the elements in this RDD.
+ Note that this method should only be used if the resulting array is expected
+ to be small, as all the data is loaded into the driver's memory.
"""
with SCCallSiteSync(self.context) as css:
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
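To illustrate what the new warning guards against (a sketch, assuming the same ``sc``): shrink the data on the executors, for example with ``filter``, before calling ``collect()``, since every collected element lands in driver memory.
>>> sc.parallelize(range(100)).filter(lambda x: x % 10 == 0).collect()
[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]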
@@ -1213,6 +1218,9 @@ class RDD(object):
"""
Get the top N elements from an RDD.
+ Note that this method should only be used if the resulting array is expected
+ to be small, as all the data is loaded into the driver's memory.
+
Note: It returns the list sorted in descending order.
>>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
[12]
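A sketch of the intended use (assuming ``sc``): request only a small N, since all N elements are returned to the driver in descending order.
>>> sc.parallelize([10, 4, 2, 12, 3]).top(3)
[12, 10, 4]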
@@ -1235,6 +1243,9 @@ class RDD(object):
Get the N elements from an RDD ordered in ascending order or as
specified by the optional key function.
+ Note that this method should only be used if the resulting array is expected
+ to be small, as all the data is loaded into the driver's memory.
+
>>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
[1, 2, 3, 4, 5, 6]
>>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)
[10, 9, 7, 6, 5, 4]
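Another small sketch (assuming ``sc``): a modest N keeps the ascending result list driver-friendly even when the source RDD is large or unsorted.
>>> sc.parallelize(range(100, 0, -1)).takeOrdered(4)
[1, 2, 3, 4]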
@@ -1254,6 +1265,9 @@ class RDD(object):
that partition to estimate the number of additional partitions needed
to satisfy the limit.
+ Note that this method should only be used if the resulting array is expected
+ to be small, as all the data is loaded into the driver's memory.
+
Translated from the Scala implementation in RDD#take().
>>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
[2, 3]
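As a sketch of the pattern the warning encourages (assuming ``sc``): use ``take(n)`` with a small ``n`` to peek at a large RDD instead of collecting all of it.
>>> big = sc.parallelize(range(10 ** 6))
>>> big.take(3)  # inspects a few elements without pulling the whole RDD to the driver
[0, 1, 2]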
@@ -1511,6 +1525,9 @@ class RDD(object):
"""
Return the key-value pairs in this RDD to the master as a dictionary.
+ Note that this method should only be used if the resulting data is expected
+ to be small, as all the data is loaded into the driver's memory.
+
>>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
>>> m[1]
2
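A sketch (assuming ``sc``): ``collectAsMap()`` is fine when the pair RDD is small; ``sorted`` is used here only to make the doctest output deterministic.
>>> m = sc.parallelize([(i, i * i) for i in range(5)]).collectAsMap()
>>> sorted(m.items())
[(0, 0), (1, 1), (2, 4), (3, 9), (4, 16)]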
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 90a6b5d9c0..3a8c8305ee 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -739,6 +739,9 @@ class DataFrame(object):
def head(self, n=None):
"""Returns the first ``n`` rows.
+ Note that this method should only be used if the resulting array is expected
+ to be small, as all the data is loaded into the driver's memory.
+
:param n: int, default 1. Number of rows to return.
:return: If n is greater than 1, return a list of :class:`Row`.
If n is 1, return a single Row.
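A sketch of both return shapes, assuming the module's standard doctest fixture ``df`` (columns ``age`` and ``name``) and Python 3 string reprs:
>>> df.head()
Row(age=2, name='Alice')
>>> df.head(2)
[Row(age=2, name='Alice'), Row(age=5, name='Bob')]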
@@ -1330,6 +1333,9 @@ class DataFrame(object):
def toPandas(self):
"""Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.
+ Note that this method should only be used if the resulting pandas DataFrame is expected
+ to be small, as all the data is loaded into the driver's memory.
+
This is only available if Pandas is installed and available.
>>> df.toPandas() # doctest: +SKIP
   age   name
0    2  Alice
1    5    Bob
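A sketch of a safer pattern (assuming the same ``df``): bound the row count on the cluster side, for example with ``limit``, before converting, since ``toPandas()`` materializes every row on the driver.
>>> df.limit(1).toPandas()  # doctest: +SKIP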