From c7e348faec45ad1d996d16639015c4bc4fc3bc92 Mon Sep 17 00:00:00 2001
From: Andre Schumacher <schumach@icsi.berkeley.edu>
Date: Thu, 15 Aug 2013 16:01:19 -0700
Subject: Implementing SPARK-878 for PySpark: adding zip and egg files to
 context and passing them down to workers, which add these to their sys.path

---
 python/pyspark/worker.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'python/pyspark/worker.py')

diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 75d692beeb..695f6dfb84 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -49,15 +49,26 @@ def main(infile, outfile):
     split_index = read_int(infile)
     if split_index == -1:  # for unit tests
         return
+
+    # fetch name of workdir
     spark_files_dir = load_pickle(read_with_length(infile))
     SparkFiles._root_directory = spark_files_dir
     SparkFiles._is_running_on_worker = True
-    sys.path.append(spark_files_dir)
+
+    # fetch names and values of broadcast variables
     num_broadcast_variables = read_int(infile)
     for _ in range(num_broadcast_variables):
         bid = read_long(infile)
         value = read_with_length(infile)
         _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
+
+    # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
+    sys.path.append(spark_files_dir) # *.py files that were added will be copied here
+    num_python_includes =  read_int(infile)
+    for _ in range(num_python_includes):
+        sys.path.append(os.path.join(spark_files_dir, load_pickle(read_with_length(infile))))
+
+    # now load function
     func = load_obj(infile)
     bypassSerializer = load_obj(infile)
     if bypassSerializer:
-- 
cgit v1.2.3