about summary refs log tree commit diff
path: root/bin/pyspark
diff options
context:
space:
mode:
author Josh Rosen <joshrosen@apache.org> 2014-10-09 16:08:07 -0700
committer Josh Rosen <joshrosen@apache.org> 2014-10-09 16:08:07 -0700
commit 4e9b551a0b807f5a2cc6679165c8be4e88a3d077 (patch)
tree d7d6be00b5dbb04d38eab034c9dc18e73b5002de /bin/pyspark
parent ac302052870a650d56f2d3131c27755bb2960ad7 (diff)
download spark-4e9b551a0b807f5a2cc6679165c8be4e88a3d077.tar.gz
spark-4e9b551a0b807f5a2cc6679165c8be4e88a3d077.tar.bz2
spark-4e9b551a0b807f5a2cc6679165c8be4e88a3d077.zip
[SPARK-3772] Allow `ipython` to be used by Pyspark workers; IPython support improvements:
This pull request addresses a few issues related to PySpark's IPython support: - Fix the remaining uses of the '-u' flag, which IPython doesn't support (see SPARK-3772). - Change PYSPARK_PYTHON_OPTS to PYSPARK_DRIVER_PYTHON_OPTS, so that the old name is reserved in case we ever want to allow the worker Python options to be customized (this variable was introduced in #2554 and hasn't landed in a release yet, so this doesn't break any compatibility). - Introduce a PYSPARK_DRIVER_PYTHON option that allows the driver to use `ipython` while the workers use a different Python version. - Attempt to use Python 2.7 by default if PYSPARK_PYTHON is not specified. - Retain the old semantics for IPYTHON=1 and IPYTHON_OPTS (to avoid breaking existing example programs). There are more details in a block comment in `bin/pyspark`. Author: Josh Rosen <joshrosen@apache.org> Closes #2651 from JoshRosen/SPARK-3772 and squashes the following commits: 7b8eb86 [Josh Rosen] More changes to PySpark python executable configuration: c4f5778 [Josh Rosen] [SPARK-3772] Allow ipython to be used by Pyspark workers; IPython fixes:
Diffstat (limited to 'bin/pyspark')
-rwxr-xr-x bin/pyspark 51
1 file changed, 38 insertions, 13 deletions
diff --git a/bin/pyspark b/bin/pyspark
index 6655725ef8..96f30a260a 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -50,22 +50,47 @@ fi
. "$FWDIR"/bin/load-spark-env.sh
-# Figure out which Python executable to use
+# In Spark <= 1.1, setting IPYTHON=1 would cause the driver to be launched using the `ipython`
+# executable, while the worker would still be launched using PYSPARK_PYTHON.
+#
+# In Spark 1.2, we removed the documentation of the IPYTHON and IPYTHON_OPTS variables and added
+# PYSPARK_DRIVER_PYTHON and PYSPARK_DRIVER_PYTHON_OPTS to allow IPython to be used for the driver.
+# Now, users can simply set PYSPARK_DRIVER_PYTHON=ipython to use IPython and set
+# PYSPARK_DRIVER_PYTHON_OPTS to pass options when starting the Python driver
+# (e.g. PYSPARK_DRIVER_PYTHON_OPTS='notebook'). This supports full customization of the IPython
+# and executor Python executables.
+#
+# For backwards-compatibility, we retain the old IPYTHON and IPYTHON_OPTS variables.
+
+# Determine the Python executable to use if PYSPARK_PYTHON or PYSPARK_DRIVER_PYTHON isn't set:
+if hash python2.7 2>/dev/null; then
+ # Attempt to use Python 2.7, if installed:
+ DEFAULT_PYTHON="python2.7"
+else
+ DEFAULT_PYTHON="python"
+fi
+
+# Determine the Python executable to use for the driver:
+if [[ -n "$IPYTHON_OPTS" || "$IPYTHON" == "1" ]]; then
+ # If IPython options are specified, assume user wants to run IPython
+ # (for backwards-compatibility)
+ PYSPARK_DRIVER_PYTHON_OPTS="$PYSPARK_DRIVER_PYTHON_OPTS $IPYTHON_OPTS"
+ PYSPARK_DRIVER_PYTHON="ipython"
+elif [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
+ PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"$DEFAULT_PYTHON"}"
+fi
+
+# Determine the Python executable to use for the executors:
if [[ -z "$PYSPARK_PYTHON" ]]; then
- if [[ "$IPYTHON" = "1" || -n "$IPYTHON_OPTS" ]]; then
- # for backward compatibility
- PYSPARK_PYTHON="ipython"
+ if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && $DEFAULT_PYTHON != "python2.7" ]]; then
+ echo "IPython requires Python 2.7+; please install python2.7 or set PYSPARK_PYTHON" 1>&2
+ exit 1
else
- PYSPARK_PYTHON="python"
+ PYSPARK_PYTHON="$DEFAULT_PYTHON"
fi
fi
export PYSPARK_PYTHON
-if [[ -z "$PYSPARK_PYTHON_OPTS" && -n "$IPYTHON_OPTS" ]]; then
- # for backward compatibility
- PYSPARK_PYTHON_OPTS="$IPYTHON_OPTS"
-fi
-
# Add the PySpark classes to the Python path:
export PYTHONPATH="$SPARK_HOME/python/:$PYTHONPATH"
export PYTHONPATH="$SPARK_HOME/python/lib/py4j-0.8.2.1-src.zip:$PYTHONPATH"
@@ -93,9 +118,9 @@ if [[ -n "$SPARK_TESTING" ]]; then
unset YARN_CONF_DIR
unset HADOOP_CONF_DIR
if [[ -n "$PYSPARK_DOC_TEST" ]]; then
- exec "$PYSPARK_PYTHON" -m doctest $1
+ exec "$PYSPARK_DRIVER_PYTHON" -m doctest $1
else
- exec "$PYSPARK_PYTHON" $1
+ exec "$PYSPARK_DRIVER_PYTHON" $1
fi
exit
fi
@@ -111,5 +136,5 @@ if [[ "$1" =~ \.py$ ]]; then
else
# PySpark shell requires special handling downstream
export PYSPARK_SHELL=1
- exec "$PYSPARK_PYTHON" $PYSPARK_PYTHON_OPTS
+ exec "$PYSPARK_DRIVER_PYTHON" $PYSPARK_DRIVER_PYTHON_OPTS
fi