Diffstat (limited to 'python')
-rw-r--r--  python/MANIFEST.in                 22
-rw-r--r--  python/README.md                   32
-rw-r--r--  python/pyspark/__init__.py          1
-rwxr-xr-x  python/pyspark/find_spark_home.py  74
-rw-r--r--  python/pyspark/java_gateway.py      3
-rw-r--r--  python/pyspark/version.py          19
-rw-r--r--  python/setup.cfg                   22
-rw-r--r--  python/setup.py                   209
8 files changed, 381 insertions(+), 1 deletion(-)
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
new file mode 100644
index 0000000000..bbcce1baa4
--- /dev/null
+++ b/python/MANIFEST.in
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+global-exclude *.py[cod] __pycache__ .DS_Store
+recursive-include deps/jars *.jar
+graft deps/bin
+recursive-include deps/examples *.py
+recursive-include lib *.zip
+include README.md
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000000..0a5c8010b8
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,32 @@
+# Apache Spark
+
+Spark is a fast and general cluster computing system for Big Data. It provides
+high-level APIs in Scala, Java, Python, and R, and an optimized engine that
+supports general computation graphs for data analysis. It also supports a
+rich set of higher-level tools including Spark SQL for SQL and DataFrames,
+MLlib for machine learning, GraphX for graph processing,
+and Spark Streaming for stream processing.
+
+<http://spark.apache.org/>
+
+## Online Documentation
+
+You can find the latest Spark documentation, including a programming
+guide, on the [project web page](http://spark.apache.org/documentation.html)
+
+
+## Python Packaging
+
+This README file only contains basic information related to pip installed PySpark.
+This packaging is currently experimental and may change in future versions (although we will do our best to keep compatibility).
+Using PySpark requires the Spark JARs, and if you are building this from source, please see the build instructions at
+["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html).
+
+The Python packaging for Spark is not intended to replace all of the other use cases. This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to set up your own standalone Spark cluster. You can download the full version of Spark from the [Apache Spark downloads page](http://spark.apache.org/downloads.html).
+
+
+**NOTE:** If you are using this with a Spark standalone cluster, you must ensure that the versions match (including the minor version), or you may experience odd errors.
+
+## Python Requirements
+
+At its core PySpark depends on Py4J (currently version 0.10.4), but additional sub-packages have their own requirements (including numpy and pandas).
\ No newline at end of file
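To make the packaging notes above concrete, here is a minimal illustrative sketch (not part of this patch) of using a pip-installed PySpark; it assumes the install succeeded and uses a local master so no cluster is needed.

    from pyspark.sql import SparkSession

    # With the pip package installed, SPARK_HOME does not need to be exported by hand;
    # the bundled launcher scripts and jars are resolved from the installed package.
    spark = (SparkSession.builder
             .master("local[2]")
             .appName("pip-install-smoke-test")
             .getOrCreate())
    print(spark.range(100).count())  # expected output: 100
    spark.stop()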
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index ec1687415a..5f93586a48 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -50,6 +50,7 @@ from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.profiler import Profiler, BasicProfiler
+from pyspark.version import __version__
def since(version):
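Re-exporting `__version__` from the package root gives pip users a direct way to check the version-match requirement noted in the README; a tiny illustrative check (assuming an installed pyspark) is:

    import pyspark

    # Module-level version string re-exported by this change.
    print(pyspark.__version__)  # prints "2.1.0.dev0" on this branch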
diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py
new file mode 100755
index 0000000000..212a618b76
--- /dev/null
+++ b/python/pyspark/find_spark_home.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script attempts to determine the correct setting for SPARK_HOME given
+# that Spark may have been installed on the system with pip.
+
+from __future__ import print_function
+import os
+import sys
+
+
+def _find_spark_home():
+    """Find the SPARK_HOME."""
+    # If the environment has SPARK_HOME set, trust it.
+    if "SPARK_HOME" in os.environ:
+        return os.environ["SPARK_HOME"]
+
+    def is_spark_home(path):
+        """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME"""
+        return (os.path.isfile(os.path.join(path, "bin/spark-submit")) and
+                (os.path.isdir(os.path.join(path, "jars")) or
+                 os.path.isdir(os.path.join(path, "assembly"))))
+
+    paths = ["../", os.path.dirname(os.path.realpath(__file__))]
+
+    # Add the path of the PySpark module if it exists
+    if sys.version < "3":
+        import imp
+        try:
+            module_home = imp.find_module("pyspark")[1]
+            paths.append(module_home)
+            # If we are installed in edit mode also look two dirs up
+            paths.append(os.path.join(module_home, "../../"))
+        except ImportError:
+            # Not pip installed, no worries
+            pass
+    else:
+        from importlib.util import find_spec
+        try:
+            module_home = os.path.dirname(find_spec("pyspark").origin)
+            paths.append(module_home)
+            # If we are installed in edit mode also look two dirs up
+            paths.append(os.path.join(module_home, "../../"))
+        except ImportError:
+            # Not pip installed, no worries
+            pass
+
+    # Normalize the paths
+    paths = [os.path.abspath(p) for p in paths]
+
+    try:
+        return next(path for path in paths if is_spark_home(path))
+    except StopIteration:
+        print("Could not find valid SPARK_HOME while searching {0}".format(paths), file=sys.stderr)
+        exit(-1)
+
+if __name__ == "__main__":
+    print(_find_spark_home())
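As an illustration of the lookup order implemented above (not part of the patch): an explicitly set SPARK_HOME is always trusted, and only otherwise are the script and module locations probed for bin/spark-submit plus a jars/ or assembly/ directory.

    import os
    from pyspark.find_spark_home import _find_spark_home

    # 1. An explicitly set SPARK_HOME is returned unchanged.
    os.environ["SPARK_HOME"] = "/opt/spark"  # hypothetical path
    assert _find_spark_home() == "/opt/spark"

    # 2. Without it, the candidate paths are probed; if none qualifies,
    #    the function prints an error to stderr and exits.
    del os.environ["SPARK_HOME"]
    print(_find_spark_home())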
diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py
index c1cf843d84..3c783ae541 100644
--- a/python/pyspark/java_gateway.py
+++ b/python/pyspark/java_gateway.py
@@ -29,6 +29,7 @@ if sys.version >= '3':
xrange = range
from py4j.java_gateway import java_import, JavaGateway, GatewayClient
+from pyspark.find_spark_home import _find_spark_home
from pyspark.serializers import read_int
@@ -41,7 +42,7 @@ def launch_gateway(conf=None):
if "PYSPARK_GATEWAY_PORT" in os.environ:
gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
else:
- SPARK_HOME = os.environ["SPARK_HOME"]
+ SPARK_HOME = _find_spark_home()
# Launch the Py4j gateway using Spark's run command so that we pick up the
# proper classpath and settings from spark-env.sh
on_windows = platform.system() == "Windows"
diff --git a/python/pyspark/version.py b/python/pyspark/version.py
new file mode 100644
index 0000000000..08a301695f
--- /dev/null
+++ b/python/pyspark/version.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "2.1.0.dev0"
diff --git a/python/setup.cfg b/python/setup.cfg
new file mode 100644
index 0000000000..d100b932bb
--- /dev/null
+++ b/python/setup.cfg
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+[bdist_wheel]
+universal = 1
+
+[metadata]
+description-file = README.md
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000000..625aea0407
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import glob
+import os
+import sys
+from setuptools import setup, find_packages
+from shutil import copyfile, copytree, rmtree
+
+if sys.version_info < (2, 7):
+    print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
+          file=sys.stderr)
+    exit(-1)
+
+try:
+    exec(open('pyspark/version.py').read())
+except IOError:
+    print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
+          file=sys.stderr)
+    sys.exit(-1)
+VERSION = __version__
+# A temporary path so we can access above the Python project root and fetch scripts and jars we need
+TEMP_PATH = "deps"
+SPARK_HOME = os.path.abspath("../")
+
+# Provide guidance about how to use setup.py
+incorrect_invocation_message = """
+If you are installing pyspark from spark source, you must first build Spark and
+run sdist.
+
+    To build Spark with maven you can run:
+      ./build/mvn -DskipTests clean package
+    Building the source dist is done in the Python directory:
+      cd python
+      python setup.py sdist
+      pip install dist/*.tar.gz"""
+
+# Figure out where the jars we need to package with PySpark are located.
+JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))
+
+if len(JARS_PATH) == 1:
+    JARS_PATH = JARS_PATH[0]
+elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1):
+    # Release mode puts the jars in a jars directory
+    JARS_PATH = os.path.join(SPARK_HOME, "jars")
+elif len(JARS_PATH) > 1:
+    print("Assembly jars exist for multiple Scala versions ({0}), please clean up assembly/target".format(
+        JARS_PATH), file=sys.stderr)
+    sys.exit(-1)
+elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
+    print(incorrect_invocation_message, file=sys.stderr)
+    sys.exit(-1)
+
+EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
+SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
+SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
+JARS_TARGET = os.path.join(TEMP_PATH, "jars")
+EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
+
+
+# Check whether we are under the Spark source path, in which case we need to build the symlink farm.
+# This is important because we only want to build the symlink farm while under Spark; otherwise we
+# want to use the existing symlink farm. If the symlink farm already exists while we are under Spark
+# (e.g. a partially built sdist), we should error and have the user sort it out.
+in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or
+            (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1))
+
+
+def _supports_symlinks():
+    """Check if the system supports symlinks (e.g. *nix) or not."""
+    return getattr(os, "symlink", None) is not None
+
+
+if (in_spark):
+    # Construct links for setup
+    try:
+        os.mkdir(TEMP_PATH)
+    except:
+        print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH),
+              file=sys.stderr)
+        exit(-1)
+
+try:
+    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
+    # find it where expected. The rest of the files aren't copied because they are accessed
+    # using Python imports instead, which will be resolved correctly.
+    try:
+        os.makedirs("pyspark/python/pyspark")
+    except OSError:
+        # Don't worry if the directory already exists.
+        pass
+    copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")
+
+    if (in_spark):
+        # Construct the symlink farm - this is necessary since we can't refer to the path above the
+        # package root and we need to copy the jars and scripts which are above the python root.
+        if _supports_symlinks():
+            os.symlink(JARS_PATH, JARS_TARGET)
+            os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
+            os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
+        else:
+            # For Windows fall back to the slower copytree
+            copytree(JARS_PATH, JARS_TARGET)
+            copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
+            copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
+    else:
+        # If we are not inside SPARK_HOME, verify we have the required symlink farm
+        if not os.path.exists(JARS_TARGET):
+            print("To build the packaging, you must be in the python directory under SPARK_HOME.",
+                  file=sys.stderr)
+
+    if not os.path.isdir(SCRIPTS_TARGET):
+        print(incorrect_invocation_message, file=sys.stderr)
+        exit(-1)
+
+    # Scripts directive requires a list of each script path and does not take wild cards.
+    script_names = os.listdir(SCRIPTS_TARGET)
+    scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names))
+    # We add find_spark_home.py to the bin directory we install so that pip installed PySpark
+    # will search for SPARK_HOME with Python.
+    scripts.append("pyspark/find_spark_home.py")
+
+    # Parse the README markdown file into rst for PyPI
+    long_description = "!!!!! missing pandoc do not upload to PyPI !!!!"
+    try:
+        import pypandoc
+        long_description = pypandoc.convert('README.md', 'rst')
+    except ImportError:
+        print("Could not import pypandoc - required to package PySpark", file=sys.stderr)
+
+    setup(
+        name='pyspark',
+        version=VERSION,
+        description='Apache Spark Python API',
+        long_description=long_description,
+        author='Spark Developers',
+        author_email='dev@spark.apache.org',
+        url='https://github.com/apache/spark/tree/master/python',
+        packages=['pyspark',
+                  'pyspark.mllib',
+                  'pyspark.ml',
+                  'pyspark.sql',
+                  'pyspark.streaming',
+                  'pyspark.bin',
+                  'pyspark.jars',
+                  'pyspark.python.pyspark',
+                  'pyspark.python.lib',
+                  'pyspark.examples.src.main.python'],
+        include_package_data=True,
+        package_dir={
+            'pyspark.jars': 'deps/jars',
+            'pyspark.bin': 'deps/bin',
+            'pyspark.python.lib': 'lib',
+            'pyspark.examples.src.main.python': 'deps/examples',
+        },
+        package_data={
+            'pyspark.jars': ['*.jar'],
+            'pyspark.bin': ['*'],
+            'pyspark.python.lib': ['*.zip'],
+            'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
+        scripts=scripts,
+        license='http://www.apache.org/licenses/LICENSE-2.0',
+        install_requires=['py4j==0.10.4'],
+        setup_requires=['pypandoc'],
+        extras_require={
+            'ml': ['numpy>=1.7'],
+            'mllib': ['numpy>=1.7'],
+            'sql': ['pandas']
+        },
+        classifiers=[
+            'Development Status :: 5 - Production/Stable',
+            'License :: OSI Approved :: Apache Software License',
+            'Programming Language :: Python :: 2.7',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.4',
+            'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: Implementation :: CPython',
+            'Programming Language :: Python :: Implementation :: PyPy']
+    )
+finally:
+    # We only clean up the symlink farm if we were in Spark; otherwise we are installing rather than
+    # packaging.
+    if (in_spark):
+        # Depending on the platform, remove either the symlink farm or the copied version
+        if _supports_symlinks():
+            os.remove(os.path.join(TEMP_PATH, "jars"))
+            os.remove(os.path.join(TEMP_PATH, "bin"))
+            os.remove(os.path.join(TEMP_PATH, "examples"))
+        else:
+            rmtree(os.path.join(TEMP_PATH, "jars"))
+            rmtree(os.path.join(TEMP_PATH, "bin"))
+            rmtree(os.path.join(TEMP_PATH, "examples"))
+        os.rmdir(TEMP_PATH)
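As a rough post-install sanity check (assuming the sdist built by this setup.py was installed with pip), the package_dir mapping above should leave the bundled jars and scripts inside the installed pyspark package itself:

    import os
    import pyspark

    # 'pyspark.jars' -> deps/jars and 'pyspark.bin' -> deps/bin install as
    # jars/ and bin/ directories under the pyspark package.
    pkg_root = os.path.dirname(pyspark.__file__)
    for sub in ("jars", "bin"):
        target = os.path.join(pkg_root, sub)
        print(sub, "->", target, "exists:", os.path.isdir(target))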