aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShuai Lin <linshuai2012@gmail.com>2016-12-07 06:09:27 +0800
committerSean Owen <sowen@cloudera.com>2016-12-07 06:09:27 +0800
commitbd9a4a5ac3abcc48131d1249df55e7d68266343a (patch)
treef0e912b499d92c696b7eb829209fb56da35d6059
parenteeed38eaf8c6912f3c51ba83903b67835a699f86 (diff)
downloadspark-bd9a4a5ac3abcc48131d1249df55e7d68266343a.tar.gz
spark-bd9a4a5ac3abcc48131d1249df55e7d68266343a.tar.bz2
spark-bd9a4a5ac3abcc48131d1249df55e7d68266343a.zip
[SPARK-18652][PYTHON] Include the example data and third-party licenses in pyspark package.
## What changes were proposed in this pull request? Since we already include the python examples in the pyspark package, we should include the example data with it as well. We should also include the third-party licenses since we distribute their jars with the pyspark package. ## How was this patch tested? Manually tested with python2.7 and python3.4 ```sh $ ./build/mvn -DskipTests -Phive -Phive-thriftserver -Pyarn -Pmesos clean package $ cd python $ python setup.py sdist $ pip install dist/pyspark-2.1.0.dev0.tar.gz $ ls -1 /usr/local/lib/python2.7/dist-packages/pyspark/data/ graphx mllib streaming $ du -sh /usr/local/lib/python2.7/dist-packages/pyspark/data/ 600K /usr/local/lib/python2.7/dist-packages/pyspark/data/ $ ls -1 /usr/local/lib/python2.7/dist-packages/pyspark/licenses/|head -5 LICENSE-AnchorJS.txt LICENSE-DPark.txt LICENSE-Mockito.txt LICENSE-SnapTree.txt LICENSE-antlr.txt ``` Author: Shuai Lin <linshuai2012@gmail.com> Closes #16082 from lins05/include-data-in-pyspark-dist.
-rw-r--r--python/MANIFEST.in2
-rw-r--r--python/setup.py20
2 files changed, 21 insertions, 1 deletion
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index bbcce1baa4..40f1fb2f1e 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -17,6 +17,8 @@
global-exclude *.py[cod] __pycache__ .DS_Store
recursive-include deps/jars *.jar
graft deps/bin
+recursive-include deps/data *.data *.txt
+recursive-include deps/licenses *.txt
recursive-include deps/examples *.py
recursive-include lib *.zip
include README.md
diff --git a/python/setup.py b/python/setup.py
index 625aea0407..bc2eb4ce9d 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -69,10 +69,14 @@ elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
+DATA_PATH = os.path.join(SPARK_HOME, "data")
+LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
+
SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
JARS_TARGET = os.path.join(TEMP_PATH, "jars")
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
-
+DATA_TARGET = os.path.join(TEMP_PATH, "data")
+LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
# Check and see if we are under the spark path in which case we need to build the symlink farm.
# This is important because we only want to build the symlink farm while under Spark otherwise we
@@ -114,11 +118,15 @@ try:
os.symlink(JARS_PATH, JARS_TARGET)
os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
+ os.symlink(DATA_PATH, DATA_TARGET)
+ os.symlink(LICENSES_PATH, LICENSES_TARGET)
else:
# For windows fall back to the slower copytree
copytree(JARS_PATH, JARS_TARGET)
copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
+ copytree(DATA_PATH, DATA_TARGET)
+ copytree(LICENSES_PATH, LICENSES_TARGET)
else:
# If we are not inside of SPARK_HOME verify we have the required symlink farm
if not os.path.exists(JARS_TARGET):
@@ -161,18 +169,24 @@ try:
'pyspark.jars',
'pyspark.python.pyspark',
'pyspark.python.lib',
+ 'pyspark.data',
+ 'pyspark.licenses',
'pyspark.examples.src.main.python'],
include_package_data=True,
package_dir={
'pyspark.jars': 'deps/jars',
'pyspark.bin': 'deps/bin',
'pyspark.python.lib': 'lib',
+ 'pyspark.data': 'deps/data',
+ 'pyspark.licenses': 'deps/licenses',
'pyspark.examples.src.main.python': 'deps/examples',
},
package_data={
'pyspark.jars': ['*.jar'],
'pyspark.bin': ['*'],
'pyspark.python.lib': ['*.zip'],
+ 'pyspark.data': ['*.txt', '*.data'],
+ 'pyspark.licenses': ['*.txt'],
'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
scripts=scripts,
license='http://www.apache.org/licenses/LICENSE-2.0',
@@ -202,8 +216,12 @@ finally:
os.remove(os.path.join(TEMP_PATH, "jars"))
os.remove(os.path.join(TEMP_PATH, "bin"))
os.remove(os.path.join(TEMP_PATH, "examples"))
+ os.remove(os.path.join(TEMP_PATH, "data"))
+ os.remove(os.path.join(TEMP_PATH, "licenses"))
else:
rmtree(os.path.join(TEMP_PATH, "jars"))
rmtree(os.path.join(TEMP_PATH, "bin"))
rmtree(os.path.join(TEMP_PATH, "examples"))
+ rmtree(os.path.join(TEMP_PATH, "data"))
+ rmtree(os.path.join(TEMP_PATH, "licenses"))
os.rmdir(TEMP_PATH)