diff options
author | Josh Rosen <joshrosen@apache.org> | 2014-08-18 20:42:19 -0700 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-08-18 20:42:19 -0700 |
commit | 1f1819b20f887b487557c31e54b8bcd95b582dc6 (patch) | |
tree | cc77eae373099f65c1d2accd1030b325be0b85f2 /python | |
parent | 217b5e915e2f21f047dfc4be680cd20d58baf9f8 (diff) | |
download | spark-1f1819b20f887b487557c31e54b8bcd95b582dc6.tar.gz spark-1f1819b20f887b487557c31e54b8bcd95b582dc6.tar.bz2 spark-1f1819b20f887b487557c31e54b8bcd95b582dc6.zip |
[SPARK-3114] [PySpark] Fix Python UDFs in Spark SQL.
This fixes SPARK-3114, an issue where we inadvertently broke Python UDFs in Spark SQL.
This PR modifiers the test runner script to always run the PySpark SQL tests, irrespective of whether SparkSQL itself has been modified. It also includes Davies' fix for the bug.
Closes #2026.
Author: Josh Rosen <joshrosen@apache.org>
Author: Davies Liu <davies.liu@gmail.com>
Closes #2027 from JoshRosen/pyspark-sql-fix and squashes the following commits:
9af2708 [Davies Liu] bugfix: disable compression of command
0d8d3a4 [Josh Rosen] Always run Python Spark SQL tests.
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/rdd.py | 2 | ||||
-rw-r--r-- | python/pyspark/worker.py | 2 | ||||
-rwxr-xr-x | python/run-tests | 4 |
3 files changed, 3 insertions, 5 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index c708b69cc1..86cd89b245 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1812,7 +1812,7 @@ class PipelinedRDD(RDD): self._jrdd_deserializer = NoOpSerializer() command = (self.func, self._prev_jrdd_deserializer, self._jrdd_deserializer) - ser = CompressedSerializer(CloudPickleSerializer()) + ser = CloudPickleSerializer() pickled_command = ser.dumps(command) broadcast_vars = ListConverter().convert( [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 77a9c4a0e0..6805063e06 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -72,7 +72,7 @@ def main(infile, outfile): value = ser._read_with_length(infile) _broadcastRegistry[bid] = Broadcast(bid, value) - command = ser._read_with_length(infile) + command = pickleSer._read_with_length(infile) (func, deserializer, serializer) = command init_time = time.time() iterator = deserializer.load_stream(infile) diff --git a/python/run-tests b/python/run-tests index b506559a5e..7b1ee3e1cd 100755 --- a/python/run-tests +++ b/python/run-tests @@ -59,9 +59,7 @@ $PYSPARK_PYTHON --version run_test "pyspark/rdd.py" run_test "pyspark/context.py" run_test "pyspark/conf.py" -if [ -n "$_RUN_SQL_TESTS" ]; then - run_test "pyspark/sql.py" -fi +run_test "pyspark/sql.py" # These tests are included in the module-level docs, and so must # be handled on a higher level rather than within the python file. export PYSPARK_DOC_TEST=1 |