From eaa8a68ff08304f713f4f75d39c61c020e0e691d Mon Sep 17 00:00:00 2001
From: Matei Zaharia
Date: Sun, 29 Dec 2013 20:15:07 -0500
Subject: Fix some Python docs and make sure to unset SPARK_TESTING in Python
 tests so we don't get the test spark.conf on the classpath.

---
 python/epydoc.conf          |  2 +-
 python/pyspark/__init__.py  | 31 +++++++++++++++++--------------
 python/pyspark/broadcast.py | 11 +++++++++++
 python/pyspark/conf.py      | 10 +++++-----
 python/pyspark/context.py   |  3 ++-
 python/run-tests            |  2 +-
 6 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/python/epydoc.conf b/python/epydoc.conf
index 0b42e729f8..95a6af0974 100644
--- a/python/epydoc.conf
+++ b/python/epydoc.conf
@@ -34,4 +34,4 @@ private: no
 
 exclude: pyspark.cloudpickle pyspark.worker pyspark.join
          pyspark.java_gateway pyspark.examples pyspark.shell pyspark.test
-         pyspark.rddsampler pyspark.daemon
+         pyspark.rddsampler pyspark.daemon pyspark.mllib._common

diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index f1b95acf09..2b2c3a061a 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -20,21 +20,24 @@
 PySpark is the Python API for Spark.
 
 Public classes:
-
-  - L{SparkContext}
-      Main entry point for Spark functionality.
-  - L{RDD}
-      A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
-  - L{Broadcast}
-      A broadcast variable that gets reused across tasks.
-  - L{Accumulator}
-      An "add-only" shared variable that tasks can only add values to.
-  - L{SparkConf}
-      Access files shipped with jobs.
-  - L{StorageLevel}
-      Finer-grained cache persistence levels.
-
+    - L{SparkContext}
+          Main entry point for Spark functionality.
+    - L{RDD}
+          A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
+    - L{Broadcast}
+          A broadcast variable that gets reused across tasks.
+    - L{Accumulator}
+          An "add-only" shared variable that tasks can only add values to.
+    - L{SparkConf}
+          For configuring Spark.
+    - L{SparkFiles}
+          Access files shipped with jobs.
+    - L{StorageLevel}
+          Finer-grained cache persistence levels.
 """
+
+
+
 import sys
 import os
 sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "python/lib/py4j0.7.egg"))

diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py
index dfdaba274f..43f40f8783 100644
--- a/python/pyspark/broadcast.py
+++ b/python/pyspark/broadcast.py
@@ -45,7 +45,18 @@ def _from_id(bid):
 
 
 class Broadcast(object):
+    """
+    A broadcast variable created with
+    L{SparkContext.broadcast()}.
+    Access its value through C{.value}.
+    """
+
     def __init__(self, bid, value, java_broadcast=None, pickle_registry=None):
+        """
+        Should not be called directly by users -- use
+        L{SparkContext.broadcast()}
+        instead.
+        """
         self.value = value
         self.bid = bid
         self._jbroadcast = java_broadcast

diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index a79f348b52..cf98b0e071 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -55,11 +55,11 @@ class SparkConf(object):
     parameters as key-value pairs.
 
     Most of the time, you would create a SparkConf object with
-    C{SparkConf()}, which will load values from `spark.*` Java system
-    properties and any `spark.conf` on your application's classpath.
-    In this case, system properties take priority over `spark.conf`,
-    and any parameters you set directly on the `SparkConf` object take
-    priority over both of those.
+    C{SparkConf()}, which will load values from C{spark.*} Java system
+    properties and any C{spark.conf} on your Spark classpath. In this
+    case, system properties take priority over C{spark.conf}, and any
+    parameters you set directly on the C{SparkConf} object take priority
+    over both of those.
 
     For unit tests, you can also call C{SparkConf(false)} to skip
     loading external settings and get the same configuration no matter

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 1244a1495f..8b028027eb 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -267,7 +267,8 @@ class SparkContext(object):
 
     def broadcast(self, value):
         """
-        Broadcast a read-only variable to the cluster, returning a C{Broadcast}
+        Broadcast a read-only variable to the cluster, returning a
+        L{Broadcast}
         object for reading it in distributed functions. The variable will be
         sent to each cluster only once.
         """

diff --git a/python/run-tests b/python/run-tests
index a0898b3c21..4b71fff7c1 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -29,7 +29,7 @@ FAILED=0
 rm -f unit-tests.log
 
 function run_test() {
-    $FWDIR/pyspark $1 2>&1 | tee -a unit-tests.log
+    SPARK_TESTING=0 $FWDIR/pyspark $1 2>&1 | tee -a unit-tests.log
     FAILED=$((PIPESTATUS[0]||$FAILED))
 }
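
For readers skimming the patch, the behavior the revised C{SparkConf} and
C{Broadcast} docstrings describe looks roughly like this from user code.
This is a minimal sketch, not part of the commit; the master URL, app name,
and sample data are illustrative only:

    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext

    # SparkConf() loads spark.* Java system properties and any spark.conf on
    # the Spark classpath; values set explicitly here override both sources.
    conf = SparkConf().setMaster("local").setAppName("docs-example")
    sc = SparkContext(conf=conf)

    # Broadcast variables are created with SparkContext.broadcast(), never by
    # constructing Broadcast directly, and are read through .value.
    lookup = sc.broadcast({"a": 1, "b": 2})
    print(sc.parallelize(["a", "b"]).map(lambda k: lookup.value[k]).collect())

The run-tests change feeds into the same lookup: per the commit message, with
SPARK_TESTING cleared the test-only spark.conf no longer lands on the
classpath that C{SparkConf()} consults.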