path: root/python/pyspark/worker.py
"""
Worker that receives input from Piped RDD.
"""
import time
preboot_time = time.time()
import os
import sys
import traceback
from base64 import standard_b64decode
# CloudPickler needs to be imported so that the depickling functions it
# defines are registered with the copy_reg module.
from pyspark.accumulators import _accumulatorRegistry
from pyspark.broadcast import Broadcast, _broadcastRegistry
from pyspark.cloudpickle import CloudPickler
from pyspark.files import SparkFiles
from pyspark.serializers import write_with_length, read_with_length, write_int, \
    read_long, write_long, read_int, dump_pickle, load_pickle, read_from_pickle_file


# Duplicate the real stdout and redirect fd 1 to stderr, so that user code
# must return values rather than print them; stray prints then go to stderr
# instead of corrupting the data stream written to old_stdout.
old_stdout = os.fdopen(os.dup(1), 'w')
os.dup2(2, 1)


def load_obj():
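    # Each pickled object from the JVM arrives as one base64-encoded line on
    # stdin.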
    return load_pickle(standard_b64decode(sys.stdin.readline().strip()))


def report_times(preboot, boot, init, finish):
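    # -3 flags a timing report to the reading side; the four timestamps are
    # converted from seconds (time.time()) to milliseconds before sending.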
    write_int(-3, old_stdout)
    write_long(1000 * preboot, old_stdout)
    write_long(1000 * boot, old_stdout)
    write_long(1000 * init, old_stdout)
    write_long(1000 * finish, old_stdout)


def main():
    boot_time = time.time()
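    # The task context is piped in first: the partition index, then the
    # SparkFiles root directory as a length-prefixed pickle.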
    split_index = read_int(sys.stdin)
    spark_files_dir = load_pickle(read_with_length(sys.stdin))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True
    sys.path.append(spark_files_dir)
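    # Broadcast variables follow as (id, length-prefixed pickled value) pairs
    # and are cached in the per-worker registry, keyed by id.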
    num_broadcast_variables = read_int(sys.stdin)
    for _ in range(num_broadcast_variables):
        bid = read_long(sys.stdin)
        value = read_with_length(sys.stdin)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
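    # Next come the user's function and the bypassSerializer flag: when the
    # function already yields serialized bytes, skip pickling on the way out.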
    func = load_obj()
    bypassSerializer = load_obj()
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    init_time = time.time()
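    # Everything remaining on stdin is the pickled input partition.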
    iterator = read_from_pickle_file(sys.stdin)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), old_stdout)
    except Exception as e:
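        # -2 flags a Python exception; the formatted traceback follows so the
        # reading side can report it, and the worker exits nonzero.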
        write_int(-2, old_stdout)
        write_with_length(traceback.format_exc(), old_stdout)
        sys.exit(-1)
    finish_time = time.time()
    report_times(preboot_time, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, old_stdout)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), old_stdout)


if __name__ == '__main__':
    main()