about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: Kan Zhang <kzhang@apache.org> 2014-05-25 00:06:42 -0700
committer: Reynold Xin <rxin@apache.org> 2014-05-25 00:06:57 -0700
commit: 64d0fb52713baaa24c7734aa5c7a991785635230 (patch)
tree: 4dab594aef07c94248800055364465ba92ffe055 /python
parent: 7e59335eb958bea495c43e687a1b425b28121d75 (diff)
download: spark-64d0fb52713baaa24c7734aa5c7a991785635230.tar.gz
spark-64d0fb52713baaa24c7734aa5c7a991785635230.tar.bz2
spark-64d0fb52713baaa24c7734aa5c7a991785635230.zip
[SPARK-1822] SchemaRDD.count() should use query optimizer
Author: Kan Zhang <kzhang@apache.org>

Closes #841 from kanzhang/SPARK-1822 and squashes the following commits:

2f8072a [Kan Zhang] [SPARK-1822] Minor style update
cf4baa4 [Kan Zhang] [SPARK-1822] Adding Scaladoc
e67c910 [Kan Zhang] [SPARK-1822] SchemaRDD.count() should use optimizer

(cherry picked from commit 6052db9dc10c996215658485e805200e4f0cf549)
Signed-off-by: Reynold Xin <rxin@apache.org>
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/sql.py 14
1 files changed, 13 insertions, 1 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index bbe69e7d8f..f2001afae4 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -268,7 +268,7 @@ class SchemaRDD(RDD):
def _jrdd(self):
"""
Lazy evaluation of PythonRDD object. Only done when a user calls methods defined by the
- L{pyspark.rdd.RDD} super class (map, count, etc.).
+ L{pyspark.rdd.RDD} super class (map, filter, etc.).
"""
if not hasattr(self, '_lazy_jrdd'):
self._lazy_jrdd = self._toPython()._jrdd
@@ -321,6 +321,18 @@ class SchemaRDD(RDD):
"""
self._jschema_rdd.saveAsTable(tableName)
+ def count(self):
+ """
+ Return the number of elements in this RDD.
+
+ >>> srdd = sqlCtx.inferSchema(rdd)
+ >>> srdd.count()
+ 3L
+ >>> srdd.count() == srdd.map(lambda x: x).count()
+ True
+ """
+ return self._jschema_rdd.count()
+
def _toPython(self):
# We have to import the Row class explicitly, so that the reference Pickler has is
# pyspark.sql.Row instead of __main__.Row