examples/src/main/python/cassandra_outputformat.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

import sys

from pyspark import SparkContext

"""
Create data in Cassandra fist
(following: https://wiki.apache.org/cassandra/GettingStarted)

cqlsh> CREATE KEYSPACE test
   ... WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
cqlsh> use test;
cqlsh:test> CREATE TABLE users (
        ...   user_id int PRIMARY KEY,
        ...   fname text,
        ...   lname text
        ... );

> cassandra_outputformat <host> test users 1745 john smith
> cassandra_outputformat <host> test users 1744 john doe
> cassandra_outputformat <host> test users 1746 john smith

cqlsh:test> SELECT * FROM users;

 user_id | fname | lname
---------+-------+-------
    1745 |  john | smith
    1744 |  john |   doe
    1746 |  john | smith
"""
if __name__ == "__main__":
    if len(sys.argv) != 7:
        print("""
        Usage: cassandra_outputformat <host> <keyspace> <cf> <user_id> <fname> <lname>

        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar \
        /path/to/examples/cassandra_outputformat.py <args>
        Assumes you have created the following table <cf> in Cassandra already,
        running on <host>, in <keyspace>.

        cqlsh:<keyspace>> CREATE TABLE <cf> (
           ...   user_id int PRIMARY KEY,
           ...   fname text,
           ...   lname text
           ... );
        """, file=sys.stderr)
        exit(-1)

    host = sys.argv[1]
    keyspace = sys.argv[2]
    cf = sys.argv[3]
    sc = SparkContext(appName="CassandraOutputFormat")

    conf = {"cassandra.output.thrift.address": host,
            "cassandra.output.thrift.port": "9160",
            "cassandra.output.keyspace": keyspace,
            "cassandra.output.partitioner.class": "Murmur3Partitioner",
            "cassandra.output.cql": "UPDATE " + keyspace + "." + cf + " SET fname = ?, lname = ?",
            "mapreduce.output.basename": cf,
            "mapreduce.outputformat.class": "org.apache.cassandra.hadoop.cql3.CqlOutputFormat",
            "mapreduce.job.output.key.class": "java.util.Map",
            "mapreduce.job.output.value.class": "java.util.List"}
    key = {"user_id": int(sys.argv[4])}
    sc.parallelize([(key, sys.argv[5:])]).saveAsNewAPIHadoopDataset(
        conf=conf,
        keyConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLKeyConverter",
        valueConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLValueConverter")

    sc.stop()