diff options
author | Marcelo Vanzin <vanzin@cloudera.com> | 2016-04-25 10:20:51 -0700 |
---|---|---|
committer | Marcelo Vanzin <vanzin@cloudera.com> | 2016-04-25 10:20:51 -0700 |
commit | a680562a6f87a03a00f71bad1c424267ae75c641 (patch) | |
tree | fe08025da6437124c283e9450dd3bbc0e99c411d /examples/src/main/python | |
parent | bfda09991398ce44be91997252cf8e5ddd361737 (diff) | |
download | spark-a680562a6f87a03a00f71bad1c424267ae75c641.tar.gz spark-a680562a6f87a03a00f71bad1c424267ae75c641.tar.bz2 spark-a680562a6f87a03a00f71bad1c424267ae75c641.zip |
[SPARK-14744][EXAMPLES] Clean up examples packaging, remove outdated examples.
First, make all dependencies in the examples module "provided", and explicitly
list a couple that Maven somehow promotes to "compile" scope. This
means that to run the streaming examples, the streaming connector package needs
to be provided to run-examples using --packages or --jars, just like for regular
apps.
Also, remove a couple of outdated examples. HBase has had Spark bindings for
a while and is even including them in the HBase distribution in the next
version, making the examples obsolete. The same applies to Cassandra, which
seems to have a proper Spark binding library already.
I just tested the build, which passes, and ran SparkPi. The examples jars
directory now has only two jars:
```
$ ls -1 examples/target/scala-2.11/jars/
scopt_2.11-3.3.0.jar
spark-examples_2.11-2.0.0-SNAPSHOT.jar
```
Author: Marcelo Vanzin <vanzin@cloudera.com>
Closes #12544 from vanzin/SPARK-14744.
Diffstat (limited to 'examples/src/main/python')
-rw-r--r-- | examples/src/main/python/cassandra_inputformat.py | 84 | ||||
-rw-r--r-- | examples/src/main/python/cassandra_outputformat.py | 88 | ||||
-rw-r--r-- | examples/src/main/python/hbase_inputformat.py | 90 | ||||
-rw-r--r-- | examples/src/main/python/hbase_outputformat.py | 73 |
4 files changed, 0 insertions, 335 deletions
diff --git a/examples/src/main/python/cassandra_inputformat.py b/examples/src/main/python/cassandra_inputformat.py deleted file mode 100644 index 93ca0cfcc9..0000000000 --- a/examples/src/main/python/cassandra_inputformat.py +++ /dev/null @@ -1,84 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys - -from pyspark import SparkContext - -""" -Create data in Cassandra fist -(following: https://wiki.apache.org/cassandra/GettingStarted) - -cqlsh> CREATE KEYSPACE test - ... WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }; -cqlsh> use test; -cqlsh:test> CREATE TABLE users ( - ... user_id int PRIMARY KEY, - ... fname text, - ... lname text - ... ); -cqlsh:test> INSERT INTO users (user_id, fname, lname) - ... VALUES (1745, 'john', 'smith'); -cqlsh:test> INSERT INTO users (user_id, fname, lname) - ... VALUES (1744, 'john', 'doe'); -cqlsh:test> INSERT INTO users (user_id, fname, lname) - ... 
VALUES (1746, 'john', 'smith'); -cqlsh:test> SELECT * FROM users; - - user_id | fname | lname ----------+-------+------- - 1745 | john | smith - 1744 | john | doe - 1746 | john | smith -""" -if __name__ == "__main__": - if len(sys.argv) != 4: - print(""" - Usage: cassandra_inputformat <host> <keyspace> <cf> - - Run with example jar: - ./bin/spark-submit --driver-class-path /path/to/example/jar \ - /path/to/examples/cassandra_inputformat.py <host> <keyspace> <cf> - Assumes you have some data in Cassandra already, running on <host>, in <keyspace> and <cf> - """, file=sys.stderr) - exit(-1) - - host = sys.argv[1] - keyspace = sys.argv[2] - cf = sys.argv[3] - sc = SparkContext(appName="CassandraInputFormat") - - conf = {"cassandra.input.thrift.address": host, - "cassandra.input.thrift.port": "9160", - "cassandra.input.keyspace": keyspace, - "cassandra.input.columnfamily": cf, - "cassandra.input.partitioner.class": "Murmur3Partitioner", - "cassandra.input.page.row.size": "3"} - cass_rdd = sc.newAPIHadoopRDD( - "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat", - "java.util.Map", - "java.util.Map", - keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter", - valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter", - conf=conf) - output = cass_rdd.collect() - for (k, v) in output: - print((k, v)) - - sc.stop() diff --git a/examples/src/main/python/cassandra_outputformat.py b/examples/src/main/python/cassandra_outputformat.py deleted file mode 100644 index 5d643eac92..0000000000 --- a/examples/src/main/python/cassandra_outputformat.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys - -from pyspark import SparkContext - -""" -Create data in Cassandra fist -(following: https://wiki.apache.org/cassandra/GettingStarted) - -cqlsh> CREATE KEYSPACE test - ... WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }; -cqlsh> use test; -cqlsh:test> CREATE TABLE users ( - ... user_id int PRIMARY KEY, - ... fname text, - ... lname text - ... ); - -> cassandra_outputformat <host> test users 1745 john smith -> cassandra_outputformat <host> test users 1744 john doe -> cassandra_outputformat <host> test users 1746 john smith - -cqlsh:test> SELECT * FROM users; - - user_id | fname | lname ----------+-------+------- - 1745 | john | smith - 1744 | john | doe - 1746 | john | smith -""" -if __name__ == "__main__": - if len(sys.argv) != 7: - print(""" - Usage: cassandra_outputformat <host> <keyspace> <cf> <user_id> <fname> <lname> - - Run with example jar: - ./bin/spark-submit --driver-class-path /path/to/example/jar \ - /path/to/examples/cassandra_outputformat.py <args> - Assumes you have created the following table <cf> in Cassandra already, - running on <host>, in <keyspace>. - - cqlsh:<keyspace>> CREATE TABLE <cf> ( - ... user_id int PRIMARY KEY, - ... fname text, - ... lname text - ... 
); - """, file=sys.stderr) - exit(-1) - - host = sys.argv[1] - keyspace = sys.argv[2] - cf = sys.argv[3] - sc = SparkContext(appName="CassandraOutputFormat") - - conf = {"cassandra.output.thrift.address": host, - "cassandra.output.thrift.port": "9160", - "cassandra.output.keyspace": keyspace, - "cassandra.output.partitioner.class": "Murmur3Partitioner", - "cassandra.output.cql": "UPDATE " + keyspace + "." + cf + " SET fname = ?, lname = ?", - "mapreduce.output.basename": cf, - "mapreduce.outputformat.class": "org.apache.cassandra.hadoop.cql3.CqlOutputFormat", - "mapreduce.job.output.key.class": "java.util.Map", - "mapreduce.job.output.value.class": "java.util.List"} - key = {"user_id": int(sys.argv[4])} - sc.parallelize([(key, sys.argv[5:])]).saveAsNewAPIHadoopDataset( - conf=conf, - keyConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLKeyConverter", - valueConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLValueConverter") - - sc.stop() diff --git a/examples/src/main/python/hbase_inputformat.py b/examples/src/main/python/hbase_inputformat.py deleted file mode 100644 index c5ae5d043b..0000000000 --- a/examples/src/main/python/hbase_inputformat.py +++ /dev/null @@ -1,90 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys -import json - -from pyspark import SparkContext - -""" -Create test data in HBase first: - -hbase(main):016:0> create 'test', 'f1' -0 row(s) in 1.0430 seconds - -hbase(main):017:0> put 'test', 'row1', 'f1:a', 'value1' -0 row(s) in 0.0130 seconds - -hbase(main):018:0> put 'test', 'row1', 'f1:b', 'value2' -0 row(s) in 0.0030 seconds - -hbase(main):019:0> put 'test', 'row2', 'f1', 'value3' -0 row(s) in 0.0050 seconds - -hbase(main):020:0> put 'test', 'row3', 'f1', 'value4' -0 row(s) in 0.0110 seconds - -hbase(main):021:0> scan 'test' -ROW COLUMN+CELL - row1 column=f1:a, timestamp=1401883411986, value=value1 - row1 column=f1:b, timestamp=1401883415212, value=value2 - row2 column=f1:, timestamp=1401883417858, value=value3 - row3 column=f1:, timestamp=1401883420805, value=value4 -4 row(s) in 0.0240 seconds -""" -if __name__ == "__main__": - if len(sys.argv) != 3: - print(""" - Usage: hbase_inputformat <host> <table> - - Run with example jar: - ./bin/spark-submit --driver-class-path /path/to/example/jar \ - /path/to/examples/hbase_inputformat.py <host> <table> [<znode>] - Assumes you have some data in HBase already, running on <host>, in <table> - optionally, you can specify parent znode for your hbase cluster - <znode> - """, file=sys.stderr) - exit(-1) - - host = sys.argv[1] - table = sys.argv[2] - sc = SparkContext(appName="HBaseInputFormat") - - # Other options for configuring scan behavior are available. 
More information available at - # https://github.com/apache/hbase/blob/master/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormat.java - conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table} - if len(sys.argv) > 3: - conf = {"hbase.zookeeper.quorum": host, "zookeeper.znode.parent": sys.argv[3], - "hbase.mapreduce.inputtable": table} - keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter" - valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter" - - hbase_rdd = sc.newAPIHadoopRDD( - "org.apache.hadoop.hbase.mapreduce.TableInputFormat", - "org.apache.hadoop.hbase.io.ImmutableBytesWritable", - "org.apache.hadoop.hbase.client.Result", - keyConverter=keyConv, - valueConverter=valueConv, - conf=conf) - hbase_rdd = hbase_rdd.flatMapValues(lambda v: v.split("\n")).mapValues(json.loads) - - output = hbase_rdd.collect() - for (k, v) in output: - print((k, v)) - - sc.stop() diff --git a/examples/src/main/python/hbase_outputformat.py b/examples/src/main/python/hbase_outputformat.py deleted file mode 100644 index 9e5641789a..0000000000 --- a/examples/src/main/python/hbase_outputformat.py +++ /dev/null @@ -1,73 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys - -from pyspark import SparkContext - -""" -Create test table in HBase first: - -hbase(main):001:0> create 'test', 'f1' -0 row(s) in 0.7840 seconds - -> hbase_outputformat <host> test row1 f1 q1 value1 -> hbase_outputformat <host> test row2 f1 q1 value2 -> hbase_outputformat <host> test row3 f1 q1 value3 -> hbase_outputformat <host> test row4 f1 q1 value4 - -hbase(main):002:0> scan 'test' -ROW COLUMN+CELL - row1 column=f1:q1, timestamp=1405659615726, value=value1 - row2 column=f1:q1, timestamp=1405659626803, value=value2 - row3 column=f1:q1, timestamp=1405659640106, value=value3 - row4 column=f1:q1, timestamp=1405659650292, value=value4 -4 row(s) in 0.0780 seconds -""" -if __name__ == "__main__": - if len(sys.argv) != 7: - print(""" - Usage: hbase_outputformat <host> <table> <row> <family> <qualifier> <value> - - Run with example jar: - ./bin/spark-submit --driver-class-path /path/to/example/jar \ - /path/to/examples/hbase_outputformat.py <args> - Assumes you have created <table> with column family <family> in HBase - running on <host> already - """, file=sys.stderr) - exit(-1) - - host = sys.argv[1] - table = sys.argv[2] - sc = SparkContext(appName="HBaseOutputFormat") - - conf = {"hbase.zookeeper.quorum": host, - "hbase.mapred.outputtable": table, - "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable", - "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"} - keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter" - valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter" - - sc.parallelize([sys.argv[3:]]).map(lambda x: (x[0], x)).saveAsNewAPIHadoopDataset( - conf=conf, - keyConverter=keyConv, - 
valueConverter=valueConv) - - sc.stop() |