diff options
author | Kan Zhang <kzhang@apache.org> | 2014-05-25 00:06:42 -0700 |
---|---|---|
committer | Reynold Xin <rxin@apache.org> | 2014-05-25 00:06:42 -0700 |
commit | 6052db9dc10c996215658485e805200e4f0cf549 (patch) | |
tree | c5575603da300b1dbdcce95f7bdff9457bc26094 /python | |
parent | 6e9fb6320bec3371bc9c010ccbc1b915f500486b (diff) | |
download | spark-6052db9dc10c996215658485e805200e4f0cf549.tar.gz spark-6052db9dc10c996215658485e805200e4f0cf549.tar.bz2 spark-6052db9dc10c996215658485e805200e4f0cf549.zip |
[SPARK-1822] SchemaRDD.count() should use query optimizer
Author: Kan Zhang <kzhang@apache.org>
Closes #841 from kanzhang/SPARK-1822 and squashes the following commits:
2f8072a [Kan Zhang] [SPARK-1822] Minor style update
cf4baa4 [Kan Zhang] [SPARK-1822] Adding Scaladoc
e67c910 [Kan Zhang] [SPARK-1822] SchemaRDD.count() should use optimizer
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql.py | 14 |
1 file changed, 13 insertions, 1 deletion
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index bbe69e7d8f..f2001afae4 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -268,7 +268,7 @@ class SchemaRDD(RDD): def _jrdd(self): """ Lazy evaluation of PythonRDD object. Only done when a user calls methods defined by the - L{pyspark.rdd.RDD} super class (map, count, etc.). + L{pyspark.rdd.RDD} super class (map, filter, etc.). """ if not hasattr(self, '_lazy_jrdd'): self._lazy_jrdd = self._toPython()._jrdd @@ -321,6 +321,18 @@ class SchemaRDD(RDD): """ self._jschema_rdd.saveAsTable(tableName) + def count(self): + """ + Return the number of elements in this RDD. + + >>> srdd = sqlCtx.inferSchema(rdd) + >>> srdd.count() + 3L + >>> srdd.count() == srdd.map(lambda x: x).count() + True + """ + return self._jschema_rdd.count() + def _toPython(self): # We have to import the Row class explicitly, so that the reference Pickler has is # pyspark.sql.Row instead of __main__.Row |