-rw-r--r-- | docs/python-programming-guide.md | 11
-rw-r--r-- | python/pyspark/context.py | 50
2 files changed, 49 insertions, 12 deletions
diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
index 6c2336ad0c..55e39b1de1 100644
--- a/docs/python-programming-guide.md
+++ b/docs/python-programming-guide.md
@@ -131,6 +131,17 @@ sc = SparkContext("local", "App Name", pyFiles=['MyFile.py', 'lib.zip', 'app.egg
 Files listed here will be added to the `PYTHONPATH` and shipped to remote worker machines.
 Code dependencies can be added to an existing SparkContext using its `addPyFile()` method.
 
+You can set [system properties](configuration.html#system-properties)
+using `SparkContext.setSystemProperty()` class method *before*
+instantiating SparkContext. For example, to set the amount of memory
+per executor process:
+
+{% highlight python %}
+from pyspark import SparkContext
+SparkContext.setSystemProperty('spark.executor.memory', '2g')
+sc = SparkContext("local", "App Name")
+{% endhighlight %}
+
 # API Docs
 
 [API documentation](api/pyspark/index.html) for PySpark is available as Epydoc.
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 597110321a..a7ca8bc888 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -49,6 +49,7 @@ class SparkContext(object):
     _lock = Lock()
     _python_includes = None  # zip and egg files that need to be added to PYTHONPATH
 
+
     def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
         environment=None, batchSize=1024):
         """
@@ -66,19 +67,18 @@ class SparkContext(object):
         @param batchSize: The number of Python objects represented as a single
                Java object.  Set 1 to disable batching or -1 to use an
                unlimited batch size.
+
+
+        >>> from pyspark.context import SparkContext
+        >>> sc = SparkContext('local', 'test')
+
+        >>> sc2 = SparkContext('local', 'test2')  # doctest: +IGNORE_EXCEPTION_DETAIL
+        Traceback (most recent call last):
+            ...
+        ValueError:...
         """
-        with SparkContext._lock:
-            if SparkContext._active_spark_context:
-                raise ValueError("Cannot run multiple SparkContexts at once")
-            else:
-                SparkContext._active_spark_context = self
-            if not SparkContext._gateway:
-                SparkContext._gateway = launch_gateway()
-                SparkContext._jvm = SparkContext._gateway.jvm
-                SparkContext._writeIteratorToPickleFile = \
-                    SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
-                SparkContext._takePartition = \
-                    SparkContext._jvm.PythonRDD.takePartition
+        SparkContext._ensure_initialized(self)
+
         self.master = master
         self.jobName = jobName
         self.sparkHome = sparkHome or None # None becomes null in Py4J
@@ -119,6 +119,32 @@ class SparkContext(object):
         self._temp_dir = \
             self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
 
+    @classmethod
+    def _ensure_initialized(cls, instance=None):
+        with SparkContext._lock:
+            if not SparkContext._gateway:
+                SparkContext._gateway = launch_gateway()
+                SparkContext._jvm = SparkContext._gateway.jvm
+                SparkContext._writeIteratorToPickleFile = \
+                    SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
+                SparkContext._takePartition = \
+                    SparkContext._jvm.PythonRDD.takePartition
+
+            if instance:
+                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
+                    raise ValueError("Cannot run multiple SparkContexts at once")
+                else:
+                    SparkContext._active_spark_context = instance
+
+    @classmethod
+    def setSystemProperty(cls, key, value):
+        """
+        Set a system property, such as spark.executor.memory. This must be
+        invoked before instantiating SparkContext.
+        """
+        SparkContext._ensure_initialized()
+        SparkContext._jvm.java.lang.System.setProperty(key, value)
+
     @property
     def defaultParallelism(self):
         """