From 9952217749118ae78fe794ca11e1c4a87a4ae8ba Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 23 Sep 2015 16:43:21 -0700 Subject: [SPARK-10731] [SQL] Delegate to Scala's DataFrame.take implementation in Python DataFrame. Python DataFrame.head/take now requires scanning all the partitions. This pull request changes them to delegate the actual implementation to Scala DataFrame (by calling DataFrame.take). This is more of a hack for fixing this issue in 1.5.1. A more proper fix is to change executeCollect and executeTake to return InternalRow rather than Row, and thus eliminate the extra round-trip conversion. Author: Reynold Xin Closes #8876 from rxin/SPARK-10731. --- python/pyspark/sql/dataframe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'python/pyspark/sql/dataframe.py') diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 80f8d8a0eb..b09422aade 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -300,7 +300,10 @@ class DataFrame(object): >>> df.take(2) [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] """ - return self.limit(num).collect() + with SCCallSiteSync(self._sc) as css: + port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe( + self._jdf, num) + return list(_load_from_socket(port, BatchedSerializer(PickleSerializer()))) @ignore_unicode_prefix @since(1.3) -- cgit v1.2.3