path: root/python/setup.py
Diffstat (limited to 'python/setup.py')
-rw-r--r--  python/setup.py  209
1 file changed, 209 insertions, 0 deletions
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000000..625aea0407
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import glob
+import os
+import sys
+from setuptools import setup, find_packages
+from shutil import copyfile, copytree, rmtree
+
+if sys.version_info < (2, 7):
+    print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
+          file=sys.stderr)
+    sys.exit(-1)
+
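+# pyspark/version.py defines __version__; exec'ing it here makes that value available so it
+# can be captured as VERSION below.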
+try:
+    exec(open('pyspark/version.py').read())
+except IOError:
+    print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
+          file=sys.stderr)
+    sys.exit(-1)
+VERSION = __version__
+# A temporary path so we can reach above the Python project root and fetch the scripts and jars we need
+TEMP_PATH = "deps"
+SPARK_HOME = os.path.abspath("../")
+
+# Provide guidance about how to use setup.py
+incorrect_invocation_message = """
+If you are installing PySpark from the Spark source, you must first build Spark and
+run sdist.
+
+    To build Spark with Maven you can run:
+      ./build/mvn -DskipTests clean package
+    Building the source dist is done in the Python directory:
+      cd python
+      python setup.py sdist
+      pip install dist/*.tar.gz"""
+
+# Figure out where the jars we need to package with PySpark are.
+JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))
+
+if len(JARS_PATH) == 1:
+    JARS_PATH = JARS_PATH[0]
+elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1):
+    # Release mode puts the jars in a jars directory
+    JARS_PATH = os.path.join(SPARK_HOME, "jars")
+elif len(JARS_PATH) > 1:
+    print("Assembly jars exist for multiple Scala versions ({0}), please clean up assembly/target".format(
+        JARS_PATH), file=sys.stderr)
+    sys.exit(-1)
+elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
+    print(incorrect_invocation_message, file=sys.stderr)
+    sys.exit(-1)
+
+EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
+SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
+SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
+JARS_TARGET = os.path.join(TEMP_PATH, "jars")
+EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
+
+
+# Check and see if we are under the Spark path, in which case we need to build the symlink farm.
+# This is important because we only want to build the symlink farm while under Spark; otherwise we
+# want to use the existing symlink farm. And if the symlink farm already exists while under Spark
+# (e.g. a partially built sdist) we should error out and have the user sort it out.
+in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or
+            (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1))
+
+
+def _supports_symlinks():
+    """Check if the system supports symlinks (e.g. *nix) or not."""
+    return getattr(os, "symlink", None) is not None
+
+
+if (in_spark):
+    # Construct links for setup
+    try:
+        os.mkdir(TEMP_PATH)
+    except OSError:
+        print("Temp path for symlinks to the parent already exists: {0}".format(TEMP_PATH),
+              file=sys.stderr)
+        sys.exit(-1)
+
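+# Everything below runs inside try/finally so that the temporary symlink farm (or copied tree)
+# is cleaned up even if packaging fails partway through.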
+try:
+    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
+    # can find it where expected. The rest of the files aren't copied because they are accessed
+    # via Python imports, which will be resolved correctly.
+    try:
+        os.makedirs("pyspark/python/pyspark")
+    except OSError:
+        # Don't worry if the directory already exists.
+        pass
+    copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")
+
+    if (in_spark):
+        # Construct the symlink farm - this is necessary since we can't refer to paths above the
+        # package root, and we need to include the jars and scripts which live above the python root.
+        if _supports_symlinks():
+            os.symlink(JARS_PATH, JARS_TARGET)
+            os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
+            os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
+        else:
+            # For Windows, fall back to the slower copytree
+            copytree(JARS_PATH, JARS_TARGET)
+            copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
+            copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
+    else:
+        # If we are not inside SPARK_HOME, verify we have the required symlink farm
+        if not os.path.exists(JARS_TARGET):
+            print("To build packaging you must be in the python directory under SPARK_HOME.",
+                  file=sys.stderr)
+
+        if not os.path.isdir(SCRIPTS_TARGET):
+            print(incorrect_invocation_message, file=sys.stderr)
+            sys.exit(-1)
+
+    # Scripts directive requires a list of each script path and does not take wild cards.
+    script_names = os.listdir(SCRIPTS_TARGET)
+    scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names))
+    # We add find_spark_home.py to the bin directory we install so that pip installed PySpark
+    # will search for SPARK_HOME with Python.
+    scripts.append("pyspark/find_spark_home.py")
+
+    # Parse the README markdown file into rst for PyPI
+    long_description = "!!!!! missing pandoc do not upload to PyPI !!!!"
+    try:
+        import pypandoc
+        long_description = pypandoc.convert('README.md', 'rst')
+    except ImportError:
+        print("Could not import pypandoc - required to package PySpark", file=sys.stderr)
+
+    setup(
+        name='pyspark',
+        version=VERSION,
+        description='Apache Spark Python API',
+        long_description=long_description,
+        author='Spark Developers',
+        author_email='dev@spark.apache.org',
+        url='https://github.com/apache/spark/tree/master/python',
+        packages=['pyspark',
+                  'pyspark.mllib',
+                  'pyspark.ml',
+                  'pyspark.sql',
+                  'pyspark.streaming',
+                  'pyspark.bin',
+                  'pyspark.jars',
+                  'pyspark.python.pyspark',
+                  'pyspark.python.lib',
+                  'pyspark.examples.src.main.python'],
+        include_package_data=True,
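+        # package_dir points the jar, script, and example packages at the deps/ farm assembled
+        # above (and the zipped libraries at python/lib) so they are shipped inside the package.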
+        package_dir={
+            'pyspark.jars': 'deps/jars',
+            'pyspark.bin': 'deps/bin',
+            'pyspark.python.lib': 'lib',
+            'pyspark.examples.src.main.python': 'deps/examples',
+        },
+        package_data={
+            'pyspark.jars': ['*.jar'],
+            'pyspark.bin': ['*'],
+            'pyspark.python.lib': ['*.zip'],
+            'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
+        scripts=scripts,
+        license='http://www.apache.org/licenses/LICENSE-2.0',
+        install_requires=['py4j==0.10.4'],
+        setup_requires=['pypandoc'],
+        extras_require={
+            'ml': ['numpy>=1.7'],
+            'mllib': ['numpy>=1.7'],
+            'sql': ['pandas']
+        },
+        classifiers=[
+            'Development Status :: 5 - Production/Stable',
+            'License :: OSI Approved :: Apache Software License',
+            'Programming Language :: Python :: 2.7',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.4',
+            'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: Implementation :: CPython',
+            'Programming Language :: Python :: Implementation :: PyPy']
+    )
+finally:
+    # We only clean up the symlink farm if we were in Spark; otherwise we are installing rather
+    # than packaging.
+    if (in_spark):
+        # Depending on symlink support, remove the symlink farm or the copied directories.
+        if _supports_symlinks():
+            os.remove(os.path.join(TEMP_PATH, "jars"))
+            os.remove(os.path.join(TEMP_PATH, "bin"))
+            os.remove(os.path.join(TEMP_PATH, "examples"))
+        else:
+            rmtree(os.path.join(TEMP_PATH, "jars"))
+            rmtree(os.path.join(TEMP_PATH, "bin"))
+            rmtree(os.path.join(TEMP_PATH, "examples"))
+        os.rmdir(TEMP_PATH)