From 256704c771d301700af9ebf0d180c1ba7c4116c0 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 9 Mar 2016 18:27:44 +0000 Subject: [SPARK-13595][BUILD] Move docker, extras modules into external ## What changes were proposed in this pull request? Move `docker` dirs out of top level into `external/`; move `extras/*` into `external/` ## How was this patch tested? This is tested with Jenkins tests. Author: Sean Owen Closes #11523 from srowen/SPARK-13595. --- extras/kinesis-asl/pom.xml | 87 ---- .../streaming/JavaKinesisWordCountASL.java | 189 ------- .../examples/streaming/kinesis_wordcount_asl.py | 83 --- .../src/main/resources/log4j.properties | 37 -- .../examples/streaming/KinesisWordCountASL.scala | 276 ---------- .../streaming/kinesis/KinesisBackedBlockRDD.scala | 288 ----------- .../streaming/kinesis/KinesisCheckpointer.scala | 133 ----- .../streaming/kinesis/KinesisInputDStream.scala | 76 --- .../spark/streaming/kinesis/KinesisReceiver.scala | 361 ------------- .../streaming/kinesis/KinesisRecordProcessor.scala | 177 ------- .../spark/streaming/kinesis/KinesisTestUtils.scala | 260 ---------- .../spark/streaming/kinesis/KinesisUtils.scala | 560 --------------------- .../streaming/kinesis/JavaKinesisStreamSuite.java | 62 --- .../src/test/resources/log4j.properties | 27 - .../kinesis/KPLBasedKinesisTestUtils.scala | 72 --- .../kinesis/KinesisBackedBlockRDDSuite.scala | 259 ---------- .../kinesis/KinesisCheckpointerSuite.scala | 152 ------ .../spark/streaming/kinesis/KinesisFunSuite.scala | 46 -- .../streaming/kinesis/KinesisReceiverSuite.scala | 210 -------- .../streaming/kinesis/KinesisStreamSuite.scala | 297 ----------- 20 files changed, 3652 deletions(-) delete mode 100644 extras/kinesis-asl/pom.xml delete mode 100644 extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java delete mode 100644 extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py delete mode 100644 extras/kinesis-asl/src/main/resources/log4j.properties delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala delete mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala delete mode 100644 extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java delete mode 100644 extras/kinesis-asl/src/test/resources/log4j.properties delete mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala delete mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala delete mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala delete mode 100644 
extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
 delete mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
 delete mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala

diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
deleted file mode 100644
index 935155eb5d..0000000000
--- a/extras/kinesis-asl/pom.xml
+++ /dev/null
@@ -1,87 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  ~ Licensed to the Apache Software Foundation (ASF) under one or more
-  ~ contributor license agreements. See the NOTICE file distributed with
-  ~ this work for additional information regarding copyright ownership.
-  ~ The ASF licenses this file to You under the Apache License, Version 2.0
-  ~ (the "License"); you may not use this file except in compliance with
-  ~ the License. You may obtain a copy of the License at
-  ~
-  ~    http://www.apache.org/licenses/LICENSE-2.0
-  ~
-  ~ Unless required by applicable law or agreed to in writing, software
-  ~ distributed under the License is distributed on an "AS IS" BASIS,
-  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  ~ See the License for the specific language governing permissions and
-  ~ limitations under the License.
-  -->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.spark</groupId>
-    <artifactId>spark-parent_2.11</artifactId>
-    <version>2.0.0-SNAPSHOT</version>
-    <relativePath>../../pom.xml</relativePath>
-  </parent>
-
-  <groupId>org.apache.spark</groupId>
-  <artifactId>spark-streaming-kinesis-asl_2.11</artifactId>
-  <packaging>jar</packaging>
-  <name>Spark Kinesis Integration</name>
-
-  <properties>
-    <sbt.project.name>streaming-kinesis-asl</sbt.project.name>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-core_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>com.amazonaws</groupId>
-      <artifactId>amazon-kinesis-client</artifactId>
-      <version>${aws.kinesis.client.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.amazonaws</groupId>
-      <artifactId>amazon-kinesis-producer</artifactId>
-      <version>${aws.kinesis.producer.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.scalacheck</groupId>
-      <artifactId>scalacheck_${scala.binary.version}</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-test-tags_${scala.binary.version}</artifactId>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
-    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-  </build>
-</project>

diff --git a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java
deleted file mode 100644
index 5dc825dfdc..0000000000
--- a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package org.apache.spark.examples.streaming; - -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.regex.Pattern; - -import com.amazonaws.regions.RegionUtils; -import org.apache.log4j.Logger; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.storage.StorageLevel; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaPairDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; -import org.apache.spark.streaming.kinesis.KinesisUtils; - -import scala.Tuple2; - -import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; -import com.amazonaws.services.kinesis.AmazonKinesisClient; -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; - -/** - * Consumes messages from a Amazon Kinesis streams and does wordcount. - * - * This example spins up 1 Kinesis Receiver per shard for the given stream. - * It then starts pulling from the last checkpointed sequence number of the given stream. - * - * Usage: JavaKinesisWordCountASL [app-name] [stream-name] [endpoint-url] [region-name] - * [app-name] is the name of the consumer app, used to track the read data in DynamoDB - * [stream-name] name of the Kinesis stream (ie. mySparkStream) - * [endpoint-url] endpoint of the Kinesis service - * (e.g. https://kinesis.us-east-1.amazonaws.com) - * - * - * Example: - * # export AWS keys if necessary - * $ export AWS_ACCESS_KEY_ID=[your-access-key] - * $ export AWS_SECRET_KEY= - * - * # run the example - * $ SPARK_HOME/bin/run-example streaming.JavaKinesisWordCountASL myAppName mySparkStream \ - * https://kinesis.us-east-1.amazonaws.com - * - * There is a companion helper class called KinesisWordProducerASL which puts dummy data - * onto the Kinesis stream. - * - * This code uses the DefaultAWSCredentialsProviderChain to find credentials - * in the following order: - * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY - * Java System Properties - aws.accessKeyId and aws.secretKey - * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs - * Instance profile credentials - delivered through the Amazon EC2 metadata service - * For more information, see - * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html - * - * See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on - * the Kinesis Spark Streaming integration. - */ -public final class JavaKinesisWordCountASL { // needs to be public for access from run-example - private static final Pattern WORD_SEPARATOR = Pattern.compile(" "); - private static final Logger logger = Logger.getLogger(JavaKinesisWordCountASL.class); - - public static void main(String[] args) { - // Check that all required args were passed in. - if (args.length != 3) { - System.err.println( - "Usage: JavaKinesisWordCountASL \n\n" + - " is the name of the app, used to track the read data in DynamoDB\n" + - " is the name of the Kinesis stream\n" + - " is the endpoint of the Kinesis service\n" + - " (e.g. 
https://kinesis.us-east-1.amazonaws.com)\n" +
-          "Generate data for the Kinesis stream using the example KinesisWordProducerASL.\n" +
-          "See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more\n" +
-          "details.\n"
-      );
-      System.exit(1);
-    }
-
-    // Set default log4j logging level to WARN to hide Spark logs
-    StreamingExamples.setStreamingLogLevels();
-
-    // Populate the appropriate variables from the given args
-    String kinesisAppName = args[0];
-    String streamName = args[1];
-    String endpointUrl = args[2];
-
-    // Create a Kinesis client in order to determine the number of shards for the given stream
-    AmazonKinesisClient kinesisClient =
-        new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain());
-    kinesisClient.setEndpoint(endpointUrl);
-    int numShards =
-        kinesisClient.describeStream(streamName).getStreamDescription().getShards().size();
-
-    // In this example, we're going to create 1 Kinesis Receiver/input DStream for each shard.
-    // This is not a necessity; if there are fewer receivers/DStreams than the number of shards,
-    // then the shards will be automatically distributed among the receivers and each receiver
-    // will receive data from multiple shards.
-    int numStreams = numShards;
-
-    // Spark Streaming batch interval
-    Duration batchInterval = new Duration(2000);
-
-    // Kinesis checkpoint interval. Same as batchInterval for this example.
-    Duration kinesisCheckpointInterval = batchInterval;
-
-    // Get the region name from the endpoint URL to save Kinesis Client Library metadata in
-    // DynamoDB of the same region as the Kinesis stream
-    String regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName();
-
-    // Setup the Spark config and StreamingContext
-    SparkConf sparkConfig = new SparkConf().setAppName("JavaKinesisWordCountASL");
-    JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);
-
-    // Create the Kinesis DStreams
-    List<JavaDStream<byte[]>> streamsList = new ArrayList<>(numStreams);
-    for (int i = 0; i < numStreams; i++) {
-      streamsList.add(
-          KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName,
-              InitialPositionInStream.LATEST, kinesisCheckpointInterval,
-              StorageLevel.MEMORY_AND_DISK_2())
-      );
-    }
-
-    // Union all the streams if there is more than 1 stream
-    JavaDStream<byte[]> unionStreams;
-    if (streamsList.size() > 1) {
-      unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
-    } else {
-      // Otherwise, just use the 1 stream
-      unionStreams = streamsList.get(0);
-    }
-
-    // Convert each line of Array[Byte] to String, and split into words
-    JavaDStream<String> words = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
-      @Override
-      public Iterator<String> call(byte[] line) {
-        String s = new String(line, StandardCharsets.UTF_8);
-        return Arrays.asList(WORD_SEPARATOR.split(s)).iterator();
-      }
-    });
-
-    // Map each word to a (word, 1) tuple so we can reduce by key to count the words
-    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
-        new PairFunction<String, String, Integer>() {
-          @Override
-          public Tuple2<String, Integer> call(String s) {
-            return new Tuple2<>(s, 1);
-          }
-        }
-    ).reduceByKey(
-        new Function2<Integer, Integer, Integer>() {
-          @Override
-          public Integer call(Integer i1, Integer i2) {
-            return i1 + i2;
-          }
-        }
-    );
-
-    // Print the first 10 wordCounts
-    wordCounts.print();
-
-    // Start the streaming context and await termination
-    jssc.start();
-    jssc.awaitTermination();
-  }
-}

diff --git a/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
b/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py deleted file mode 100644 index 51f8c5ca66..0000000000 --- a/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" - Consumes messages from a Amazon Kinesis streams and does wordcount. - - This example spins up 1 Kinesis Receiver per shard for the given stream. - It then starts pulling from the last checkpointed sequence number of the given stream. - - Usage: kinesis_wordcount_asl.py - is the name of the consumer app, used to track the read data in DynamoDB - name of the Kinesis stream (ie. mySparkStream) - endpoint of the Kinesis service - (e.g. https://kinesis.us-east-1.amazonaws.com) - - - Example: - # export AWS keys if necessary - $ export AWS_ACCESS_KEY_ID= - $ export AWS_SECRET_KEY= - - # run the example - $ bin/spark-submit -jar extras/kinesis-asl/target/scala-*/\ - spark-streaming-kinesis-asl-assembly_*.jar \ - extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py \ - myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com - - There is a companion helper class called KinesisWordProducerASL which puts dummy data - onto the Kinesis stream. - - This code uses the DefaultAWSCredentialsProviderChain to find credentials - in the following order: - Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY - Java System Properties - aws.accessKeyId and aws.secretKey - Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs - Instance profile credentials - delivered through the Amazon EC2 metadata service - For more information, see - http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html - - See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on - the Kinesis Spark Streaming integration. 
-""" -from __future__ import print_function - -import sys - -from pyspark import SparkContext -from pyspark.streaming import StreamingContext -from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream - -if __name__ == "__main__": - if len(sys.argv) != 5: - print( - "Usage: kinesis_wordcount_asl.py ", - file=sys.stderr) - sys.exit(-1) - - sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl") - ssc = StreamingContext(sc, 1) - appName, streamName, endpointUrl, regionName = sys.argv[1:] - lines = KinesisUtils.createStream( - ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.LATEST, 2) - counts = lines.flatMap(lambda line: line.split(" ")) \ - .map(lambda word: (word, 1)) \ - .reduceByKey(lambda a, b: a+b) - counts.pprint() - - ssc.start() - ssc.awaitTermination() diff --git a/extras/kinesis-asl/src/main/resources/log4j.properties b/extras/kinesis-asl/src/main/resources/log4j.properties deleted file mode 100644 index 6cdc9286c5..0000000000 --- a/extras/kinesis-asl/src/main/resources/log4j.properties +++ /dev/null @@ -1,37 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -log4j.rootCategory=WARN, console - -# File appender -log4j.appender.file=org.apache.log4j.FileAppender -log4j.appender.file.append=false -log4j.appender.file.file=target/unit-tests.log -log4j.appender.file.layout=org.apache.log4j.PatternLayout -log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n - -# Console appender -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.out -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.spark-project.jetty=WARN -log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO \ No newline at end of file diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala deleted file mode 100644 index 6a73bc0e30..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.streaming - -import java.nio.ByteBuffer - -import scala.util.Random - -import com.amazonaws.auth.{BasicAWSCredentials, DefaultAWSCredentialsProviderChain} -import com.amazonaws.regions.RegionUtils -import com.amazonaws.services.kinesis.AmazonKinesisClient -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream -import com.amazonaws.services.kinesis.model.PutRecordRequest -import org.apache.log4j.{Level, Logger} - -import org.apache.spark.{Logging, SparkConf} -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.{Milliseconds, StreamingContext} -import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions -import org.apache.spark.streaming.kinesis.KinesisUtils - - -/** - * Consumes messages from a Amazon Kinesis streams and does wordcount. - * - * This example spins up 1 Kinesis Receiver per shard for the given stream. - * It then starts pulling from the last checkpointed sequence number of the given stream. - * - * Usage: KinesisWordCountASL - * is the name of the consumer app, used to track the read data in DynamoDB - * name of the Kinesis stream (ie. mySparkStream) - * endpoint of the Kinesis service - * (e.g. https://kinesis.us-east-1.amazonaws.com) - * - * - * Example: - * # export AWS keys if necessary - * $ export AWS_ACCESS_KEY_ID= - * $ export AWS_SECRET_KEY= - * - * # run the example - * $ SPARK_HOME/bin/run-example streaming.KinesisWordCountASL myAppName mySparkStream \ - * https://kinesis.us-east-1.amazonaws.com - * - * There is a companion helper class called KinesisWordProducerASL which puts dummy data - * onto the Kinesis stream. - * - * This code uses the DefaultAWSCredentialsProviderChain to find credentials - * in the following order: - * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY - * Java System Properties - aws.accessKeyId and aws.secretKey - * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs - * Instance profile credentials - delivered through the Amazon EC2 metadata service - * For more information, see - * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html - * - * See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on - * the Kinesis Spark Streaming integration. - */ -object KinesisWordCountASL extends Logging { - def main(args: Array[String]) { - // Check that all required args were passed in. - if (args.length != 3) { - System.err.println( - """ - |Usage: KinesisWordCountASL - | - | is the name of the consumer app, used to track the read data in DynamoDB - | is the name of the Kinesis stream - | is the endpoint of the Kinesis service - | (e.g. https://kinesis.us-east-1.amazonaws.com) - | - |Generate input data for Kinesis stream using the example KinesisWordProducerASL. 
- |See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more - |details. - """.stripMargin) - System.exit(1) - } - - StreamingExamples.setStreamingLogLevels() - - // Populate the appropriate variables from the given args - val Array(appName, streamName, endpointUrl) = args - - - // Determine the number of shards from the stream using the low-level Kinesis Client - // from the AWS Java SDK. - val credentials = new DefaultAWSCredentialsProviderChain().getCredentials() - require(credentials != null, - "No AWS credentials found. Please specify credentials using one of the methods specified " + - "in http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html") - val kinesisClient = new AmazonKinesisClient(credentials) - kinesisClient.setEndpoint(endpointUrl) - val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards().size - - - // In this example, we're going to create 1 Kinesis Receiver/input DStream for each shard. - // This is not a necessity; if there are less receivers/DStreams than the number of shards, - // then the shards will be automatically distributed among the receivers and each receiver - // will receive data from multiple shards. - val numStreams = numShards - - // Spark Streaming batch interval - val batchInterval = Milliseconds(2000) - - // Kinesis checkpoint interval is the interval at which the DynamoDB is updated with information - // on sequence number of records that have been received. Same as batchInterval for this - // example. - val kinesisCheckpointInterval = batchInterval - - // Get the region name from the endpoint URL to save Kinesis Client Library metadata in - // DynamoDB of the same region as the Kinesis stream - val regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName() - - // Setup the SparkConfig and StreamingContext - val sparkConfig = new SparkConf().setAppName("KinesisWordCountASL") - val ssc = new StreamingContext(sparkConfig, batchInterval) - - // Create the Kinesis DStreams - val kinesisStreams = (0 until numStreams).map { i => - KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName, - InitialPositionInStream.LATEST, kinesisCheckpointInterval, StorageLevel.MEMORY_AND_DISK_2) - } - - // Union all the streams - val unionStreams = ssc.union(kinesisStreams) - - // Convert each line of Array[Byte] to String, and split into words - val words = unionStreams.flatMap(byteArray => new String(byteArray).split(" ")) - - // Map each word to a (word, 1) tuple so we can reduce by key to count the words - val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _) - - // Print the first 10 wordCounts - wordCounts.print() - - // Start the streaming context and await termination - ssc.start() - ssc.awaitTermination() - } -} - -/** - * Usage: KinesisWordProducerASL \ - * - * - * is the name of the Kinesis stream (ie. mySparkStream) - * is the endpoint of the Kinesis service - * (ie. https://kinesis.us-east-1.amazonaws.com) - * is the rate of records per second to put onto the stream - * is the rate of records per second to put onto the stream - * - * Example: - * $ SPARK_HOME/bin/run-example streaming.KinesisWordProducerASL mySparkStream \ - * https://kinesis.us-east-1.amazonaws.com us-east-1 10 5 - */ -object KinesisWordProducerASL { - def main(args: Array[String]) { - if (args.length != 4) { - System.err.println( - """ - |Usage: KinesisWordProducerASL - - | - | is the name of the Kinesis stream - | is the endpoint of the Kinesis service - | (e.g. 
https://kinesis.us-east-1.amazonaws.com) - | is the rate of records per second to put onto the stream - | is the rate of records per second to put onto the stream - | - """.stripMargin) - - System.exit(1) - } - - // Set default log4j logging level to WARN to hide Spark logs - StreamingExamples.setStreamingLogLevels() - - // Populate the appropriate variables from the given args - val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args - - // Generate the records and return the totals - val totals = generate(stream, endpoint, recordsPerSecond.toInt, - wordsPerRecord.toInt) - - // Print the array of (word, total) tuples - println("Totals for the words sent") - totals.foreach(println(_)) - } - - def generate(stream: String, - endpoint: String, - recordsPerSecond: Int, - wordsPerRecord: Int): Seq[(String, Int)] = { - - val randomWords = List("spark", "you", "are", "my", "father") - val totals = scala.collection.mutable.Map[String, Int]() - - // Create the low-level Kinesis Client from the AWS Java SDK. - val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) - kinesisClient.setEndpoint(endpoint) - - println(s"Putting records onto stream $stream and endpoint $endpoint at a rate of" + - s" $recordsPerSecond records per second and $wordsPerRecord words per record") - - // Iterate and put records onto the stream per the given recordPerSec and wordsPerRecord - for (i <- 1 to 10) { - // Generate recordsPerSec records to put onto the stream - val records = (1 to recordsPerSecond.toInt).foreach { recordNum => - // Randomly generate wordsPerRecord number of words - val data = (1 to wordsPerRecord.toInt).map(x => { - // Get a random index to a word - val randomWordIdx = Random.nextInt(randomWords.size) - val randomWord = randomWords(randomWordIdx) - - // Increment total count to compare to server counts later - totals(randomWord) = totals.getOrElse(randomWord, 0) + 1 - - randomWord - }).mkString(" ") - - // Create a partitionKey based on recordNum - val partitionKey = s"partitionKey-$recordNum" - - // Create a PutRecordRequest with an Array[Byte] version of the data - val putRecordRequest = new PutRecordRequest().withStreamName(stream) - .withPartitionKey(partitionKey) - .withData(ByteBuffer.wrap(data.getBytes())) - - // Put the record onto the stream and capture the PutRecordResult - val putRecordResult = kinesisClient.putRecord(putRecordRequest) - } - - // Sleep for a second - Thread.sleep(1000) - println("Sent " + recordsPerSecond + " records") - } - // Convert the totals to (index, total) tuple - totals.toSeq.sortBy(_._1) - } -} - -/** - * Utility functions for Spark Streaming examples. - * This has been lifted from the examples/ project to remove the circular dependency. - */ -private[streaming] object StreamingExamples extends Logging { - // Set reasonable logging levels for streaming if the user has not configured log4j. - def setStreamingLogLevels() { - val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements - if (!log4jInitialized) { - // We first log something to initialize Spark's default logging, then we override the - // logging level. - logInfo("Setting log level to [WARN] for streaming example." 
+ - " To override add a custom log4j.properties to the classpath.") - Logger.getRootLogger.setLevel(Level.WARN) - } - } -} -// scalastyle:on println diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala deleted file mode 100644 index 3996f168e6..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.kinesis - -import scala.collection.JavaConverters._ -import scala.reflect.ClassTag -import scala.util.control.NonFatal - -import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} -import com.amazonaws.services.kinesis.AmazonKinesisClient -import com.amazonaws.services.kinesis.clientlibrary.types.UserRecord -import com.amazonaws.services.kinesis.model._ - -import org.apache.spark._ -import org.apache.spark.rdd.{BlockRDD, BlockRDDPartition} -import org.apache.spark.storage.BlockId -import org.apache.spark.util.NextIterator - - -/** Class representing a range of Kinesis sequence numbers. Both sequence numbers are inclusive. */ -private[kinesis] -case class SequenceNumberRange( - streamName: String, shardId: String, fromSeqNumber: String, toSeqNumber: String) - -/** Class representing an array of Kinesis sequence number ranges */ -private[kinesis] -case class SequenceNumberRanges(ranges: Seq[SequenceNumberRange]) { - def isEmpty(): Boolean = ranges.isEmpty - - def nonEmpty(): Boolean = ranges.nonEmpty - - override def toString(): String = ranges.mkString("SequenceNumberRanges(", ", ", ")") -} - -private[kinesis] -object SequenceNumberRanges { - def apply(range: SequenceNumberRange): SequenceNumberRanges = { - new SequenceNumberRanges(Seq(range)) - } -} - - -/** Partition storing the information of the ranges of Kinesis sequence numbers to read */ -private[kinesis] -class KinesisBackedBlockRDDPartition( - idx: Int, - blockId: BlockId, - val isBlockIdValid: Boolean, - val seqNumberRanges: SequenceNumberRanges - ) extends BlockRDDPartition(blockId, idx) - -/** - * A BlockRDD where the block data is backed by Kinesis, which can accessed using the - * sequence numbers of the corresponding blocks. 
- */ -private[kinesis] -class KinesisBackedBlockRDD[T: ClassTag]( - sc: SparkContext, - val regionName: String, - val endpointUrl: String, - @transient private val _blockIds: Array[BlockId], - @transient val arrayOfseqNumberRanges: Array[SequenceNumberRanges], - @transient private val isBlockIdValid: Array[Boolean] = Array.empty, - val retryTimeoutMs: Int = 10000, - val messageHandler: Record => T = KinesisUtils.defaultMessageHandler _, - val awsCredentialsOption: Option[SerializableAWSCredentials] = None - ) extends BlockRDD[T](sc, _blockIds) { - - require(_blockIds.length == arrayOfseqNumberRanges.length, - "Number of blockIds is not equal to the number of sequence number ranges") - - override def isValid(): Boolean = true - - override def getPartitions: Array[Partition] = { - Array.tabulate(_blockIds.length) { i => - val isValid = if (isBlockIdValid.length == 0) true else isBlockIdValid(i) - new KinesisBackedBlockRDDPartition(i, _blockIds(i), isValid, arrayOfseqNumberRanges(i)) - } - } - - override def compute(split: Partition, context: TaskContext): Iterator[T] = { - val blockManager = SparkEnv.get.blockManager - val partition = split.asInstanceOf[KinesisBackedBlockRDDPartition] - val blockId = partition.blockId - - def getBlockFromBlockManager(): Option[Iterator[T]] = { - logDebug(s"Read partition data of $this from block manager, block $blockId") - blockManager.get(blockId).map(_.data.asInstanceOf[Iterator[T]]) - } - - def getBlockFromKinesis(): Iterator[T] = { - val credentials = awsCredentialsOption.getOrElse { - new DefaultAWSCredentialsProviderChain().getCredentials() - } - partition.seqNumberRanges.ranges.iterator.flatMap { range => - new KinesisSequenceRangeIterator(credentials, endpointUrl, regionName, - range, retryTimeoutMs).map(messageHandler) - } - } - if (partition.isBlockIdValid) { - getBlockFromBlockManager().getOrElse { getBlockFromKinesis() } - } else { - getBlockFromKinesis() - } - } -} - - -/** - * An iterator that return the Kinesis data based on the given range of sequence numbers. - * Internally, it repeatedly fetches sets of records starting from the fromSequenceNumber, - * until the endSequenceNumber is reached. 
- */ -private[kinesis] -class KinesisSequenceRangeIterator( - credentials: AWSCredentials, - endpointUrl: String, - regionId: String, - range: SequenceNumberRange, - retryTimeoutMs: Int) extends NextIterator[Record] with Logging { - - private val client = new AmazonKinesisClient(credentials) - private val streamName = range.streamName - private val shardId = range.shardId - - private var toSeqNumberReceived = false - private var lastSeqNumber: String = null - private var internalIterator: Iterator[Record] = null - - client.setEndpoint(endpointUrl, "kinesis", regionId) - - override protected def getNext(): Record = { - var nextRecord: Record = null - if (toSeqNumberReceived) { - finished = true - } else { - - if (internalIterator == null) { - - // If the internal iterator has not been initialized, - // then fetch records from starting sequence number - internalIterator = getRecords(ShardIteratorType.AT_SEQUENCE_NUMBER, range.fromSeqNumber) - } else if (!internalIterator.hasNext) { - - // If the internal iterator does not have any more records, - // then fetch more records after the last consumed sequence number - internalIterator = getRecords(ShardIteratorType.AFTER_SEQUENCE_NUMBER, lastSeqNumber) - } - - if (!internalIterator.hasNext) { - - // If the internal iterator still does not have any data, then throw exception - // and terminate this iterator - finished = true - throw new SparkException( - s"Could not read until the end sequence number of the range: $range") - } else { - - // Get the record, copy the data into a byte array and remember its sequence number - nextRecord = internalIterator.next() - lastSeqNumber = nextRecord.getSequenceNumber() - - // If the this record's sequence number matches the stopping sequence number, then make sure - // the iterator is marked finished next time getNext() is called - if (nextRecord.getSequenceNumber == range.toSeqNumber) { - toSeqNumberReceived = true - } - } - } - nextRecord - } - - override protected def close(): Unit = { - client.shutdown() - } - - /** - * Get records starting from or after the given sequence number. - */ - private def getRecords(iteratorType: ShardIteratorType, seqNum: String): Iterator[Record] = { - val shardIterator = getKinesisIterator(iteratorType, seqNum) - val result = getRecordsAndNextKinesisIterator(shardIterator) - result._1 - } - - /** - * Get the records starting from using a Kinesis shard iterator (which is a progress handle - * to get records from Kinesis), and get the next shard iterator for next consumption. - */ - private def getRecordsAndNextKinesisIterator( - shardIterator: String): (Iterator[Record], String) = { - val getRecordsRequest = new GetRecordsRequest - getRecordsRequest.setRequestCredentials(credentials) - getRecordsRequest.setShardIterator(shardIterator) - val getRecordsResult = retryOrTimeout[GetRecordsResult]( - s"getting records using shard iterator") { - client.getRecords(getRecordsRequest) - } - // De-aggregate records, if KPL was used in producing the records. The KCL automatically - // handles de-aggregation during regular operation. This code path is used during recovery - val recordIterator = UserRecord.deaggregate(getRecordsResult.getRecords) - (recordIterator.iterator().asScala, getRecordsResult.getNextShardIterator) - } - - /** - * Get the Kinesis shard iterator for getting records starting from or after the given - * sequence number. 
- */ - private def getKinesisIterator( - iteratorType: ShardIteratorType, - sequenceNumber: String): String = { - val getShardIteratorRequest = new GetShardIteratorRequest - getShardIteratorRequest.setRequestCredentials(credentials) - getShardIteratorRequest.setStreamName(streamName) - getShardIteratorRequest.setShardId(shardId) - getShardIteratorRequest.setShardIteratorType(iteratorType.toString) - getShardIteratorRequest.setStartingSequenceNumber(sequenceNumber) - val getShardIteratorResult = retryOrTimeout[GetShardIteratorResult]( - s"getting shard iterator from sequence number $sequenceNumber") { - client.getShardIterator(getShardIteratorRequest) - } - getShardIteratorResult.getShardIterator - } - - /** Helper method to retry Kinesis API request with exponential backoff and timeouts */ - private def retryOrTimeout[T](message: String)(body: => T): T = { - import KinesisSequenceRangeIterator._ - - var startTimeMs = System.currentTimeMillis() - var retryCount = 0 - var waitTimeMs = MIN_RETRY_WAIT_TIME_MS - var result: Option[T] = None - var lastError: Throwable = null - - def isTimedOut = (System.currentTimeMillis() - startTimeMs) >= retryTimeoutMs - def isMaxRetryDone = retryCount >= MAX_RETRIES - - while (result.isEmpty && !isTimedOut && !isMaxRetryDone) { - if (retryCount > 0) { // wait only if this is a retry - Thread.sleep(waitTimeMs) - waitTimeMs *= 2 // if you have waited, then double wait time for next round - } - try { - result = Some(body) - } catch { - case NonFatal(t) => - lastError = t - t match { - case ptee: ProvisionedThroughputExceededException => - logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ptee) - case e: Throwable => - throw new SparkException(s"Error while $message", e) - } - } - retryCount += 1 - } - result.getOrElse { - if (isTimedOut) { - throw new SparkException( - s"Timed out after $retryTimeoutMs ms while $message, last exception: ", lastError) - } else { - throw new SparkException( - s"Gave up after $retryCount retries while $message, last exception: ", lastError) - } - } - } -} - -private[streaming] -object KinesisSequenceRangeIterator { - val MAX_RETRIES = 3 - val MIN_RETRY_WAIT_TIME_MS = 100 -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala deleted file mode 100644 index 1ca6d4302c..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.streaming.kinesis - -import java.util.concurrent._ - -import scala.util.control.NonFatal - -import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer -import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason - -import org.apache.spark.Logging -import org.apache.spark.streaming.Duration -import org.apache.spark.streaming.util.RecurringTimer -import org.apache.spark.util.{Clock, SystemClock, ThreadUtils} - -/** - * This is a helper class for managing Kinesis checkpointing. - * - * @param receiver The receiver that keeps track of which sequence numbers we can checkpoint - * @param checkpointInterval How frequently we will checkpoint to DynamoDB - * @param workerId Worker Id of KCL worker for logging purposes - * @param clock In order to use ManualClocks for the purpose of testing - */ -private[kinesis] class KinesisCheckpointer( - receiver: KinesisReceiver[_], - checkpointInterval: Duration, - workerId: String, - clock: Clock = new SystemClock) extends Logging { - - // a map from shardId's to checkpointers - private val checkpointers = new ConcurrentHashMap[String, IRecordProcessorCheckpointer]() - - private val lastCheckpointedSeqNums = new ConcurrentHashMap[String, String]() - - private val checkpointerThread: RecurringTimer = startCheckpointerThread() - - /** Update the checkpointer instance to the most recent one for the given shardId. */ - def setCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = { - checkpointers.put(shardId, checkpointer) - } - - /** - * Stop tracking the specified shardId. - * - * If a checkpointer is provided, e.g. on IRecordProcessor.shutdown [[ShutdownReason.TERMINATE]], - * we will use that to make the final checkpoint. If `null` is provided, we will not make the - * checkpoint, e.g. in case of [[ShutdownReason.ZOMBIE]]. - */ - def removeCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = { - synchronized { - checkpointers.remove(shardId) - checkpoint(shardId, checkpointer) - } - } - - /** Perform the checkpoint. */ - private def checkpoint(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = { - try { - if (checkpointer != null) { - receiver.getLatestSeqNumToCheckpoint(shardId).foreach { latestSeqNum => - val lastSeqNum = lastCheckpointedSeqNums.get(shardId) - // Kinesis sequence numbers are monotonically increasing strings, therefore we can do - // safely do the string comparison - if (lastSeqNum == null || latestSeqNum > lastSeqNum) { - /* Perform the checkpoint */ - KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(latestSeqNum), 4, 100) - logDebug(s"Checkpoint: WorkerId $workerId completed checkpoint at sequence number" + - s" $latestSeqNum for shardId $shardId") - lastCheckpointedSeqNums.put(shardId, latestSeqNum) - } - } - } else { - logDebug(s"Checkpointing skipped for shardId $shardId. Checkpointer not set.") - } - } catch { - case NonFatal(e) => - logWarning(s"Failed to checkpoint shardId $shardId to DynamoDB.", e) - } - } - - /** Checkpoint the latest saved sequence numbers for all active shardId's. 
*/ - private def checkpointAll(): Unit = synchronized { - // if this method throws an exception, then the scheduled task will not run again - try { - val shardIds = checkpointers.keys() - while (shardIds.hasMoreElements) { - val shardId = shardIds.nextElement() - checkpoint(shardId, checkpointers.get(shardId)) - } - } catch { - case NonFatal(e) => - logWarning("Failed to checkpoint to DynamoDB.", e) - } - } - - /** - * Start the checkpointer thread with the given checkpoint duration. - */ - private def startCheckpointerThread(): RecurringTimer = { - val period = checkpointInterval.milliseconds - val threadName = s"Kinesis Checkpointer - Worker $workerId" - val timer = new RecurringTimer(clock, period, _ => checkpointAll(), threadName) - timer.start() - logDebug(s"Started checkpointer thread: $threadName") - timer - } - - /** - * Shutdown the checkpointer. Should be called on the onStop of the Receiver. - */ - def shutdown(): Unit = { - // the recurring timer checkpoints for us one last time. - checkpointerThread.stop(interruptTimer = false) - checkpointers.clear() - lastCheckpointedSeqNums.clear() - logInfo("Successfully shutdown Kinesis Checkpointer.") - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala deleted file mode 100644 index 5223c81a8e..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.streaming.kinesis - -import scala.reflect.ClassTag - -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream -import com.amazonaws.services.kinesis.model.Record - -import org.apache.spark.rdd.RDD -import org.apache.spark.storage.{BlockId, StorageLevel} -import org.apache.spark.streaming.{Duration, StreamingContext, Time} -import org.apache.spark.streaming.dstream.ReceiverInputDStream -import org.apache.spark.streaming.receiver.Receiver -import org.apache.spark.streaming.scheduler.ReceivedBlockInfo - -private[kinesis] class KinesisInputDStream[T: ClassTag]( - _ssc: StreamingContext, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointAppName: String, - checkpointInterval: Duration, - storageLevel: StorageLevel, - messageHandler: Record => T, - awsCredentialsOption: Option[SerializableAWSCredentials] - ) extends ReceiverInputDStream[T](_ssc) { - - private[streaming] - override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { - - // This returns true even for when blockInfos is empty - val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) - - if (allBlocksHaveRanges) { - // Create a KinesisBackedBlockRDD, even when there are no blocks - val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray - val seqNumRanges = blockInfos.map { - _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray - val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray - logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + - s"seq number ranges: ${seqNumRanges.mkString(", ")} ") - new KinesisBackedBlockRDD( - context.sc, regionName, endpointUrl, blockIds, seqNumRanges, - isBlockIdValid = isBlockIdValid, - retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, - messageHandler = messageHandler, - awsCredentialsOption = awsCredentialsOption) - } else { - logWarning("Kinesis sequence number information was not present with some block metadata," + - " it may not be possible to recover from failures") - super.createBlockRDD(time, blockInfos) - } - } - - override def getReceiver(): Receiver[T] = { - new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, - checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala deleted file mode 100644 index 48ee2a9597..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ /dev/null @@ -1,361 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -import java.util.UUID -import java.util.concurrent.ConcurrentHashMap - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.util.control.NonFatal - -import com.amazonaws.auth.{AWSCredentials, AWSCredentialsProvider, DefaultAWSCredentialsProviderChain} -import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer, IRecordProcessorFactory} -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{InitialPositionInStream, KinesisClientLibConfiguration, Worker} -import com.amazonaws.services.kinesis.model.Record - -import org.apache.spark.storage.{StorageLevel, StreamBlockId} -import org.apache.spark.streaming.Duration -import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver} -import org.apache.spark.util.Utils -import org.apache.spark.Logging - -private[kinesis] -case class SerializableAWSCredentials(accessKeyId: String, secretKey: String) - extends AWSCredentials { - override def getAWSAccessKeyId: String = accessKeyId - override def getAWSSecretKey: String = secretKey -} - -/** - * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver. - * This implementation relies on the Kinesis Client Library (KCL) Worker as described here: - * https://github.com/awslabs/amazon-kinesis-client - * - * The way this Receiver works is as follows: - * - * - The receiver starts a KCL Worker, which is essentially runs a threadpool of multiple - * KinesisRecordProcessor - * - Each KinesisRecordProcessor receives data from a Kinesis shard in batches. Each batch is - * inserted into a Block Generator, and the corresponding range of sequence numbers is recorded. - * - When the block generator defines a block, then the recorded sequence number ranges that were - * inserted into the block are recorded separately for being used later. - * - When the block is ready to be pushed, the block is pushed and the ranges are reported as - * metadata of the block. In addition, the ranges are used to find out the latest sequence - * number for each shard that can be checkpointed through the DynamoDB. - * - Periodically, each KinesisRecordProcessor checkpoints the latest successfully stored sequence - * number for it own shard. - * - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Region name used by the Kinesis Client Library for - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointAppName Kinesis application name. Kinesis Apps are mapped to Kinesis Streams - * by the Kinesis Client Library. If you change the App name or Stream name, - * the KCL will throw errors. This usually requires deleting the backing - * DynamoDB table with the same name this Kinesis application. - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. 
- * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects - * @param awsCredentialsOption Optional AWS credentials, used when user directly specifies - * the credentials - */ -private[kinesis] class KinesisReceiver[T]( - val streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointAppName: String, - checkpointInterval: Duration, - storageLevel: StorageLevel, - messageHandler: Record => T, - awsCredentialsOption: Option[SerializableAWSCredentials]) - extends Receiver[T](storageLevel) with Logging { receiver => - - /* - * ================================================================================= - * The following vars are initialize in the onStart() method which executes in the - * Spark worker after this Receiver is serialized and shipped to the worker. - * ================================================================================= - */ - - /** - * workerId is used by the KCL should be based on the ip address of the actual Spark Worker - * where this code runs (not the driver's IP address.) - */ - @volatile private var workerId: String = null - - /** - * Worker is the core client abstraction from the Kinesis Client Library (KCL). - * A worker can process more than one shards from the given stream. - * Each shard is assigned its own IRecordProcessor and the worker run multiple such - * processors. - */ - @volatile private var worker: Worker = null - @volatile private var workerThread: Thread = null - - /** BlockGenerator used to generates blocks out of Kinesis data */ - @volatile private var blockGenerator: BlockGenerator = null - - /** - * Sequence number ranges added to the current block being generated. - * Accessing and updating of this map is synchronized by locks in BlockGenerator. - */ - private val seqNumRangesInCurrentBlock = new mutable.ArrayBuffer[SequenceNumberRange] - - /** Sequence number ranges of data added to each generated block */ - private val blockIdToSeqNumRanges = new ConcurrentHashMap[StreamBlockId, SequenceNumberRanges] - - /** - * The centralized kinesisCheckpointer that checkpoints based on the given checkpointInterval. - */ - @volatile private var kinesisCheckpointer: KinesisCheckpointer = null - - /** - * Latest sequence number ranges that have been stored successfully. - * This is used for checkpointing through KCL */ - private val shardIdToLatestStoredSeqNum = new ConcurrentHashMap[String, String] - - /** - * This is called when the KinesisReceiver starts and must be non-blocking. - * The KCL creates and manages the receiving/processing thread pool through Worker.run(). - */ - override def onStart() { - blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler) - - workerId = Utils.localHostName() + ":" + UUID.randomUUID() - - kinesisCheckpointer = new KinesisCheckpointer(receiver, checkpointInterval, workerId) - // KCL config instance - val awsCredProvider = resolveAWSCredentialsProvider() - val kinesisClientLibConfiguration = - new KinesisClientLibConfiguration(checkpointAppName, streamName, awsCredProvider, workerId) - .withKinesisEndpoint(endpointUrl) - .withInitialPositionInStream(initialPositionInStream) - .withTaskBackoffTimeMillis(500) - .withRegionName(regionName) - - /* - * RecordProcessorFactory creates impls of IRecordProcessor. 
- * IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the - * IRecordProcessor.processRecords() method. - * We're using our custom KinesisRecordProcessor in this case. - */ - val recordProcessorFactory = new IRecordProcessorFactory { - override def createProcessor: IRecordProcessor = - new KinesisRecordProcessor(receiver, workerId) - } - - worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration) - workerThread = new Thread() { - override def run(): Unit = { - try { - worker.run() - } catch { - case NonFatal(e) => - restart("Error running the KCL worker in Receiver", e) - } - } - } - - blockIdToSeqNumRanges.clear() - blockGenerator.start() - - workerThread.setName(s"Kinesis Receiver ${streamId}") - workerThread.setDaemon(true) - workerThread.start() - - logInfo(s"Started receiver with workerId $workerId") - } - - /** - * This is called when the KinesisReceiver stops. - * The KCL worker.shutdown() method stops the receiving/processing threads. - * The KCL will do its best to drain and checkpoint any in-flight records upon shutdown. - */ - override def onStop() { - if (workerThread != null) { - if (worker != null) { - worker.shutdown() - worker = null - } - workerThread.join() - workerThread = null - logInfo(s"Stopped receiver for workerId $workerId") - } - workerId = null - if (kinesisCheckpointer != null) { - kinesisCheckpointer.shutdown() - kinesisCheckpointer = null - } - } - - /** Add records of the given shard to the current block being generated */ - private[kinesis] def addRecords(shardId: String, records: java.util.List[Record]): Unit = { - if (records.size > 0) { - val dataIterator = records.iterator().asScala.map(messageHandler) - val metadata = SequenceNumberRange(streamName, shardId, - records.get(0).getSequenceNumber(), records.get(records.size() - 1).getSequenceNumber()) - blockGenerator.addMultipleDataWithCallback(dataIterator, metadata) - } - } - - /** Get the latest sequence number for the given shard that can be checkpointed through KCL */ - private[kinesis] def getLatestSeqNumToCheckpoint(shardId: String): Option[String] = { - Option(shardIdToLatestStoredSeqNum.get(shardId)) - } - - /** - * Set the checkpointer that will be used to checkpoint sequence numbers to DynamoDB for the - * given shardId. - */ - def setCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = { - assert(kinesisCheckpointer != null, "Kinesis Checkpointer not initialized!") - kinesisCheckpointer.setCheckpointer(shardId, checkpointer) - } - - /** - * Remove the checkpointer for the given shardId. The provided checkpointer will be used to - * checkpoint one last time for the given shard. If `checkpointer` is `null`, then we will not - * checkpoint. - */ - def removeCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = { - assert(kinesisCheckpointer != null, "Kinesis Checkpointer not initialized!") - kinesisCheckpointer.removeCheckpointer(shardId, checkpointer) - } - - /** - * Remember the range of sequence numbers that was added to the currently active block. - * Internally, this is synchronized with `finalizeRangesForCurrentBlock()`. - */ - private def rememberAddedRange(range: SequenceNumberRange): Unit = { - seqNumRangesInCurrentBlock += range - } - - /** - * Finalize the ranges added to the block that was active and prepare the ranges buffer - * for next block. Internally, this is synchronized with `rememberAddedRange()`. 
- */ - private def finalizeRangesForCurrentBlock(blockId: StreamBlockId): Unit = { - blockIdToSeqNumRanges.put(blockId, SequenceNumberRanges(seqNumRangesInCurrentBlock.toArray)) - seqNumRangesInCurrentBlock.clear() - logDebug(s"Generated block $blockId has $blockIdToSeqNumRanges") - } - - /** Store the block along with its associated ranges */ - private def storeBlockWithRanges( - blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[T]): Unit = { - val rangesToReportOption = Option(blockIdToSeqNumRanges.remove(blockId)) - if (rangesToReportOption.isEmpty) { - stop("Error while storing block into Spark, could not find sequence number ranges " + - s"for block $blockId") - return - } - - val rangesToReport = rangesToReportOption.get - var attempt = 0 - var stored = false - var throwable: Throwable = null - while (!stored && attempt <= 3) { - try { - store(arrayBuffer, rangesToReport) - stored = true - } catch { - case NonFatal(th) => - attempt += 1 - throwable = th - } - } - if (!stored) { - stop("Error while storing block into Spark", throwable) - } - - // Update the latest sequence number that have been successfully stored for each shard - // Note that we are doing this sequentially because the array of sequence number ranges - // is assumed to be - rangesToReport.ranges.foreach { range => - shardIdToLatestStoredSeqNum.put(range.shardId, range.toSeqNumber) - } - } - - /** - * If AWS credential is provided, return a AWSCredentialProvider returning that credential. - * Otherwise, return the DefaultAWSCredentialsProviderChain. - */ - private def resolveAWSCredentialsProvider(): AWSCredentialsProvider = { - awsCredentialsOption match { - case Some(awsCredentials) => - logInfo("Using provided AWS credentials") - new AWSCredentialsProvider { - override def getCredentials: AWSCredentials = awsCredentials - override def refresh(): Unit = { } - } - case None => - logInfo("Using DefaultAWSCredentialsProviderChain") - new DefaultAWSCredentialsProviderChain() - } - } - - - /** - * Class to handle blocks generated by this receiver's block generator. Specifically, in - * the context of the Kinesis Receiver, this handler does the following. - * - * - When an array of records is added to the current active block in the block generator, - * this handler keeps track of the corresponding sequence number range. - * - When the currently active block is ready to sealed (not more records), this handler - * keep track of the list of ranges added into this block in another H - */ - private class GeneratedBlockHandler extends BlockGeneratorListener { - - /** - * Callback method called after a data item is added into the BlockGenerator. - * The data addition, block generation, and calls to onAddData and onGenerateBlock - * are all synchronized through the same lock. - */ - def onAddData(data: Any, metadata: Any): Unit = { - rememberAddedRange(metadata.asInstanceOf[SequenceNumberRange]) - } - - /** - * Callback method called after a block has been generated. - * The data addition, block generation, and calls to onAddData and onGenerateBlock - * are all synchronized through the same lock. - */ - def onGenerateBlock(blockId: StreamBlockId): Unit = { - finalizeRangesForCurrentBlock(blockId) - } - - /** Callback method called when a block is ready to be pushed / stored. 
*/ - def onPushBlock(blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = { - storeBlockWithRanges(blockId, - arrayBuffer.asInstanceOf[mutable.ArrayBuffer[T]]) - } - - /** Callback called in case of any error in internal of the BlockGenerator */ - def onError(message: String, throwable: Throwable): Unit = { - reportError(message, throwable) - } - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala deleted file mode 100644 index b5b76cb92d..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -import java.util.List - -import scala.util.Random -import scala.util.control.NonFatal - -import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException, KinesisClientLibDependencyException, ShutdownException, ThrottlingException} -import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer} -import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason -import com.amazonaws.services.kinesis.model.Record - -import org.apache.spark.Logging -import org.apache.spark.streaming.Duration - -/** - * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor. - * This implementation operates on the Array[Byte] from the KinesisReceiver. - * The Kinesis Worker creates an instance of this KinesisRecordProcessor for each - * shard in the Kinesis stream upon startup. This is normally done in separate threads, - * but the KCLs within the KinesisReceivers will balance themselves out if you create - * multiple Receivers. - * - * @param receiver Kinesis receiver - * @param workerId for logging purposes - */ -private[kinesis] class KinesisRecordProcessor[T](receiver: KinesisReceiver[T], workerId: String) - extends IRecordProcessor with Logging { - - // shardId populated during initialize() - @volatile - private var shardId: String = _ - - /** - * The Kinesis Client Library calls this method during IRecordProcessor initialization. - * - * @param shardId assigned by the KCL to this particular RecordProcessor. - */ - override def initialize(shardId: String) { - this.shardId = shardId - logInfo(s"Initialized workerId $workerId with shardId $shardId") - } - - /** - * This method is called by the KCL when a batch of records is pulled from the Kinesis stream. - * This is the record-processing bridge between the KCL's IRecordProcessor.processRecords() - * and Spark Streaming's Receiver.store(). 
- * - * @param batch list of records from the Kinesis stream shard - * @param checkpointer used to update Kinesis when this batch has been processed/stored - * in the DStream - */ - override def processRecords(batch: List[Record], checkpointer: IRecordProcessorCheckpointer) { - if (!receiver.isStopped()) { - try { - receiver.addRecords(shardId, batch) - logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId") - receiver.setCheckpointer(shardId, checkpointer) - } catch { - case NonFatal(e) => { - /* - * If there is a failure within the batch, the batch will not be checkpointed. - * This will potentially cause records since the last checkpoint to be processed - * more than once. - */ - logError(s"Exception: WorkerId $workerId encountered and exception while storing " + - s" or checkpointing a batch for workerId $workerId and shardId $shardId.", e) - - /* Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor. */ - throw e - } - } - } else { - /* RecordProcessor has been stopped. */ - logInfo(s"Stopped: KinesisReceiver has stopped for workerId $workerId" + - s" and shardId $shardId. No more records will be processed.") - } - } - - /** - * Kinesis Client Library is shutting down this Worker for 1 of 2 reasons: - * 1) the stream is resharding by splitting or merging adjacent shards - * (ShutdownReason.TERMINATE) - * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason - * (ShutdownReason.ZOMBIE) - * - * @param checkpointer used to perform a Kinesis checkpoint for ShutdownReason.TERMINATE - * @param reason for shutdown (ShutdownReason.TERMINATE or ShutdownReason.ZOMBIE) - */ - override def shutdown(checkpointer: IRecordProcessorCheckpointer, reason: ShutdownReason) { - logInfo(s"Shutdown: Shutting down workerId $workerId with reason $reason") - reason match { - /* - * TERMINATE Use Case. Checkpoint. - * Checkpoint to indicate that all records from the shard have been drained and processed. - * It's now OK to read from the new shards that resulted from a resharding event. - */ - case ShutdownReason.TERMINATE => - receiver.removeCheckpointer(shardId, checkpointer) - - /* - * ZOMBIE Use Case or Unknown reason. NoOp. - * No checkpoint because other workers may have taken over and already started processing - * the same records. - * This may lead to records being processed more than once. - */ - case _ => - receiver.removeCheckpointer(shardId, null) // return null so that we don't checkpoint - } - - } -} - -private[kinesis] object KinesisRecordProcessor extends Logging { - /** - * Retry the given amount of times with a random backoff time (millis) less than the - * given maxBackOffMillis - * - * @param expression expression to evalute - * @param numRetriesLeft number of retries left - * @param maxBackOffMillis: max millis between retries - * - * @return evaluation of the given expression - * @throws Unretryable exception, unexpected exception, - * or any exception that persists after numRetriesLeft reaches 0 - */ - @annotation.tailrec - def retryRandom[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = { - util.Try { expression } match { - /* If the function succeeded, evaluate to x. 
*/ - case util.Success(x) => x - /* If the function failed, either retry or throw the exception */ - case util.Failure(e) => e match { - /* Retry: Throttling or other Retryable exception has occurred */ - case _: ThrottlingException | _: KinesisClientLibDependencyException if numRetriesLeft > 1 - => { - val backOffMillis = Random.nextInt(maxBackOffMillis) - Thread.sleep(backOffMillis) - logError(s"Retryable Exception: Random backOffMillis=${backOffMillis}", e) - retryRandom(expression, numRetriesLeft - 1, maxBackOffMillis) - } - /* Throw: Shutdown has been requested by the Kinesis Client Library. */ - case _: ShutdownException => { - logError(s"ShutdownException: Caught shutdown exception, skipping checkpoint.", e) - throw e - } - /* Throw: Non-retryable exception has occurred with the Kinesis Client Library */ - case _: InvalidStateException => { - logError(s"InvalidStateException: Cannot save checkpoint to the DynamoDB table used" + - s" by the Amazon Kinesis Client Library. Table likely doesn't exist.", e) - throw e - } - /* Throw: Unexpected exception has occurred */ - case _ => { - logError(s"Unexpected, non-retryable exception.", e) - throw e - } - } - } - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala deleted file mode 100644 index 0ace453ee9..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.kinesis - -import java.nio.ByteBuffer -import java.util.concurrent.TimeUnit - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.util.{Failure, Random, Success, Try} - -import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} -import com.amazonaws.regions.RegionUtils -import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient -import com.amazonaws.services.dynamodbv2.document.DynamoDB -import com.amazonaws.services.kinesis.AmazonKinesisClient -import com.amazonaws.services.kinesis.model._ - -import org.apache.spark.Logging - -/** - * Shared utility methods for performing Kinesis tests that actually transfer data. - * - * PLEASE KEEP THIS FILE UNDER src/main AS PYTHON TESTS NEED ACCESS TO THIS FILE! 
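
As a concrete illustration of the KinesisRecordProcessor.retryRandom helper above: a minimal sketch of a caller that checkpoints with bounded, randomized backoff. The object name and the 4-attempt / 100 ms values are illustrative assumptions, not taken from this patch, and the code has to live in the same package because the helper is private[kinesis].

    // Hypothetical caller; must sit in this package because
    // KinesisRecordProcessor is private[kinesis].
    package org.apache.spark.streaming.kinesis

    import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer

    private[kinesis] object CheckpointRetrySketch {
      // Retries only on ThrottlingException / KinesisClientLibDependencyException;
      // ShutdownException, InvalidStateException and other failures are rethrown.
      def checkpointWithRetries(checkpointer: IRecordProcessorCheckpointer): Unit = {
        KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100)
      }
    }
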
- */ -private[kinesis] class KinesisTestUtils extends Logging { - - val endpointUrl = KinesisTestUtils.endpointUrl - val regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName() - val streamShardCount = 2 - - private val createStreamTimeoutSeconds = 300 - private val describeStreamPollTimeSeconds = 1 - - @volatile - private var streamCreated = false - - @volatile - private var _streamName: String = _ - - protected lazy val kinesisClient = { - val client = new AmazonKinesisClient(KinesisTestUtils.getAWSCredentials()) - client.setEndpoint(endpointUrl) - client - } - - private lazy val dynamoDB = { - val dynamoDBClient = new AmazonDynamoDBClient(new DefaultAWSCredentialsProviderChain()) - dynamoDBClient.setRegion(RegionUtils.getRegion(regionName)) - new DynamoDB(dynamoDBClient) - } - - protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { - if (!aggregate) { - new SimpleDataGenerator(kinesisClient) - } else { - throw new UnsupportedOperationException("Aggregation is not supported through this code path") - } - } - - def streamName: String = { - require(streamCreated, "Stream not yet created, call createStream() to create one") - _streamName - } - - def createStream(): Unit = { - require(!streamCreated, "Stream already created") - _streamName = findNonExistentStreamName() - - // Create a stream. The number of shards determines the provisioned throughput. - logInfo(s"Creating stream ${_streamName}") - val createStreamRequest = new CreateStreamRequest() - createStreamRequest.setStreamName(_streamName) - createStreamRequest.setShardCount(2) - kinesisClient.createStream(createStreamRequest) - - // The stream is now being created. Wait for it to become active. - waitForStreamToBeActive(_streamName) - streamCreated = true - logInfo(s"Created stream ${_streamName}") - } - - /** - * Push data to Kinesis stream and return a map of - * shardId -> seq of (data, seq number) pushed to corresponding shard - */ - def pushData(testData: Seq[Int], aggregate: Boolean): Map[String, Seq[(Int, String)]] = { - require(streamCreated, "Stream not yet created, call createStream() to create one") - val producer = getProducer(aggregate) - val shardIdToSeqNumbers = producer.sendData(streamName, testData) - logInfo(s"Pushed $testData:\n\t ${shardIdToSeqNumbers.mkString("\n\t")}") - shardIdToSeqNumbers.toMap - } - - /** - * Expose a Python friendly API. 
- */ - def pushData(testData: java.util.List[Int]): Unit = { - pushData(testData.asScala, aggregate = false) - } - - def deleteStream(): Unit = { - try { - if (streamCreated) { - kinesisClient.deleteStream(streamName) - } - } catch { - case e: Exception => - logWarning(s"Could not delete stream $streamName") - } - } - - def deleteDynamoDBTable(tableName: String): Unit = { - try { - val table = dynamoDB.getTable(tableName) - table.delete() - table.waitForDelete() - } catch { - case e: Exception => - logWarning(s"Could not delete DynamoDB table $tableName") - } - } - - private def describeStream(streamNameToDescribe: String): Option[StreamDescription] = { - try { - val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe) - val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription() - Some(desc) - } catch { - case rnfe: ResourceNotFoundException => - None - } - } - - private def findNonExistentStreamName(): String = { - var testStreamName: String = null - do { - Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds)) - testStreamName = s"KinesisTestUtils-${math.abs(Random.nextLong())}" - } while (describeStream(testStreamName).nonEmpty) - testStreamName - } - - private def waitForStreamToBeActive(streamNameToWaitFor: String): Unit = { - val startTime = System.currentTimeMillis() - val endTime = startTime + TimeUnit.SECONDS.toMillis(createStreamTimeoutSeconds) - while (System.currentTimeMillis() < endTime) { - Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds)) - describeStream(streamNameToWaitFor).foreach { description => - val streamStatus = description.getStreamStatus() - logDebug(s"\t- current state: $streamStatus\n") - if ("ACTIVE".equals(streamStatus)) { - return - } - } - } - require(false, s"Stream $streamName never became active") - } -} - -private[kinesis] object KinesisTestUtils { - - val envVarNameForEnablingTests = "ENABLE_KINESIS_TESTS" - val endVarNameForEndpoint = "KINESIS_TEST_ENDPOINT_URL" - val defaultEndpointUrl = "https://kinesis.us-west-2.amazonaws.com" - - lazy val shouldRunTests = { - val isEnvSet = sys.env.get(envVarNameForEnablingTests) == Some("1") - if (isEnvSet) { - // scalastyle:off println - // Print this so that they are easily visible on the console and not hidden in the log4j logs. - println( - s""" - |Kinesis tests that actually send data has been enabled by setting the environment - |variable $envVarNameForEnablingTests to 1. This will create Kinesis Streams and - |DynamoDB tables in AWS. Please be aware that this may incur some AWS costs. - |By default, the tests use the endpoint URL $defaultEndpointUrl to create Kinesis streams. - |To change this endpoint URL to a different region, you can set the environment variable - |$endVarNameForEndpoint to the desired endpoint URL - |(e.g. $endVarNameForEndpoint="https://kinesis.us-west-2.amazonaws.com"). - """.stripMargin) - // scalastyle:on println - } - isEnvSet - } - - lazy val endpointUrl = { - val url = sys.env.getOrElse(endVarNameForEndpoint, defaultEndpointUrl) - // scalastyle:off println - // Print this so that they are easily visible on the console and not hidden in the log4j logs. 
- println(s"Using endpoint URL $url for creating Kinesis streams for tests.") - // scalastyle:on println - url - } - - def isAWSCredentialsPresent: Boolean = { - Try { new DefaultAWSCredentialsProviderChain().getCredentials() }.isSuccess - } - - def getAWSCredentials(): AWSCredentials = { - assert(shouldRunTests, - "Kinesis test not enabled, should not attempt to get AWS credentials") - Try { new DefaultAWSCredentialsProviderChain().getCredentials() } match { - case Success(cred) => cred - case Failure(e) => - throw new Exception( - s""" - |Kinesis tests enabled using environment variable $envVarNameForEnablingTests - |but could not find AWS credentials. Please follow instructions in AWS documentation - |to set the credentials in your system such that the DefaultAWSCredentialsProviderChain - |can find the credentials. - """.stripMargin) - } - } -} - -/** A wrapper interface that will allow us to consolidate the code for synthetic data generation. */ -private[kinesis] trait KinesisDataGenerator { - /** Sends the data to Kinesis and returns the metadata for everything that has been sent. */ - def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] -} - -private[kinesis] class SimpleDataGenerator( - client: AmazonKinesisClient) extends KinesisDataGenerator { - override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = { - val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]() - data.foreach { num => - val str = num.toString - val data = ByteBuffer.wrap(str.getBytes()) - val putRecordRequest = new PutRecordRequest().withStreamName(streamName) - .withData(data) - .withPartitionKey(str) - - val putRecordResult = client.putRecord(putRecordRequest) - val shardId = putRecordResult.getShardId - val seqNumber = putRecordResult.getSequenceNumber() - val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, - new ArrayBuffer[(Int, String)]()) - sentSeqNumbers += ((num, seqNumber)) - } - - shardIdToSeqNumbers.toMap - } -} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala deleted file mode 100644 index 15ac588b82..0000000000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ /dev/null @@ -1,560 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
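
To make the KinesisTestUtils API above concrete, a sketch of the create / push / verify / delete cycle that the test suites later in this patch follow. It only does anything meaningful when ENABLE_KINESIS_TESTS=1 and AWS credentials are resolvable, and it creates a real (billable) Kinesis stream; the object name is a placeholder.

    package org.apache.spark.streaming.kinesis

    // Hypothetical driver for the test utilities (both classes are private[kinesis]).
    private[kinesis] object KinesisTestUtilsSketch {
      def main(args: Array[String]): Unit = {
        // KPLBasedKinesisTestUtils (under src/test) also supports aggregate = true;
        // plain KinesisTestUtils only handles non-aggregated records.
        val testUtils = new KPLBasedKinesisTestUtils()
        testUtils.createStream()
        try {
          // shardId -> Seq of (sent value, sequence number), useful for assertions.
          val shardIdToSeqNumbers = testUtils.pushData(1 to 10, aggregate = false)
          assert(shardIdToSeqNumbers.values.flatten.map(_._1).toSet == (1 to 10).toSet)
        } finally {
          testUtils.deleteStream()
        }
      }
    }
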
- */ -package org.apache.spark.streaming.kinesis - -import scala.reflect.ClassTag - -import com.amazonaws.regions.RegionUtils -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream -import com.amazonaws.services.kinesis.model.Record - -import org.apache.spark.api.java.function.{Function => JFunction} -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.{Duration, StreamingContext} -import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext} -import org.apache.spark.streaming.dstream.ReceiverInputDStream - -object KinesisUtils { - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * - * @param ssc StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - * @param messageHandler A custom message handler that can generate a generic output from a - * Kinesis `Record`, which contains both message data, and metadata. - */ - def createStream[T: ClassTag]( - ssc: StreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel, - messageHandler: Record => T): ReceiverInputDStream[T] = { - val cleanedHandler = ssc.sc.clean(messageHandler) - // Setting scope to override receiver stream's scope of "receiver stream" - ssc.withNamedScope("kinesis stream") { - new KinesisInputDStream[T](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, - cleanedHandler, None) - } - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. 
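
A minimal end-to-end sketch of the overload above, using an illustrative message handler that keeps each record's partition key alongside its UTF-8 payload. The application, stream, endpoint, and region names are placeholders, and the object is assumed to be submitted with spark-submit against the kinesis-asl assembly.

    import java.nio.charset.StandardCharsets

    import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
    import com.amazonaws.services.kinesis.model.Record

    import org.apache.spark.SparkConf
    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kinesis.KinesisUtils

    object KinesisHandlerSketch {
      // Illustrative handler: (partition key, UTF-8 payload) for each Kinesis record.
      def keyAndBody(record: Record): (String, String) = {
        val buf = record.getData()
        val bytes = new Array[Byte](buf.remaining())
        buf.get(bytes)
        (record.getPartitionKey, new String(bytes, StandardCharsets.UTF_8))
      }

      def main(args: Array[String]): Unit = {
        val ssc = new StreamingContext(
          new SparkConf().setAppName("KinesisHandlerSketch"), Seconds(2))
        // Credentials are resolved via the DefaultAWSCredentialsProviderChain on the workers.
        val keyed = KinesisUtils.createStream(
          ssc, "myKinesisApp", "myStream",
          "https://kinesis.us-east-1.amazonaws.com", "us-east-1",
          InitialPositionInStream.LATEST, Seconds(2),
          StorageLevel.MEMORY_AND_DISK_2, keyAndBody _)
        keyed.print()
        ssc.start()
        ssc.awaitTermination()
      }
    }
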
- * - * @param ssc StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - * @param messageHandler A custom message handler that can generate a generic output from a - * Kinesis `Record`, which contains both message data, and metadata. - * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) - * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) - */ - // scalastyle:off - def createStream[T: ClassTag]( - ssc: StreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel, - messageHandler: Record => T, - awsAccessKeyId: String, - awsSecretKey: String): ReceiverInputDStream[T] = { - // scalastyle:on - val cleanedHandler = ssc.sc.clean(messageHandler) - ssc.withNamedScope("kinesis stream") { - new KinesisInputDStream[T](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, - cleanedHandler, Some(SerializableAWSCredentials(awsAccessKeyId, awsSecretKey))) - } - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * - * @param ssc StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. 
- * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - */ - def createStream( - ssc: StreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel): ReceiverInputDStream[Array[Byte]] = { - // Setting scope to override receiver stream's scope of "receiver stream" - ssc.withNamedScope("kinesis stream") { - new KinesisInputDStream[Array[Byte]](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, - defaultMessageHandler, None) - } - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * - * @param ssc StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) - * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) - */ - def createStream( - ssc: StreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel, - awsAccessKeyId: String, - awsSecretKey: String): ReceiverInputDStream[Array[Byte]] = { - ssc.withNamedScope("kinesis stream") { - new KinesisInputDStream[Array[Byte]](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, - defaultMessageHandler, Some(SerializableAWSCredentials(awsAccessKeyId, awsSecretKey))) - } - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: - * - * - The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets AWS credentials. 
- * - The region of the `endpointUrl` will be used for DynamoDB and CloudWatch. - * - The Kinesis application name used by the Kinesis Client Library (KCL) will be the app name - * in [[org.apache.spark.SparkConf]]. - * - * @param ssc StreamingContext object - * @param streamName Kinesis stream name - * @param endpointUrl Endpoint url of Kinesis service - * (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param storageLevel Storage level to use for storing the received objects - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - */ - @deprecated("use other forms of createStream", "1.4.0") - def createStream( - ssc: StreamingContext, - streamName: String, - endpointUrl: String, - checkpointInterval: Duration, - initialPositionInStream: InitialPositionInStream, - storageLevel: StorageLevel - ): ReceiverInputDStream[Array[Byte]] = { - ssc.withNamedScope("kinesis stream") { - new KinesisInputDStream[Array[Byte]](ssc, streamName, endpointUrl, - getRegionByEndpoint(endpointUrl), initialPositionInStream, ssc.sc.appName, - checkpointInterval, storageLevel, defaultMessageHandler, None) - } - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * - * @param jssc Java StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - * @param messageHandler A custom message handler that can generate a generic output from a - * Kinesis `Record`, which contains both message data, and metadata. 
- * @param recordClass Class of the records in DStream - */ - def createStream[T]( - jssc: JavaStreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel, - messageHandler: JFunction[Record, T], - recordClass: Class[T]): JavaReceiverInputDStream[T] = { - implicit val recordCmt: ClassTag[T] = ClassTag(recordClass) - val cleanedHandler = jssc.sparkContext.clean(messageHandler.call(_)) - createStream[T](jssc.ssc, kinesisAppName, streamName, endpointUrl, regionName, - initialPositionInStream, checkpointInterval, storageLevel, cleanedHandler) - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * - * @param jssc Java StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - * @param messageHandler A custom message handler that can generate a generic output from a - * Kinesis `Record`, which contains both message data, and metadata. - * @param recordClass Class of the records in DStream - * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) - * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) - */ - // scalastyle:off - def createStream[T]( - jssc: JavaStreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel, - messageHandler: JFunction[Record, T], - recordClass: Class[T], - awsAccessKeyId: String, - awsSecretKey: String): JavaReceiverInputDStream[T] = { - // scalastyle:on - implicit val recordCmt: ClassTag[T] = ClassTag(recordClass) - val cleanedHandler = jssc.sparkContext.clean(messageHandler.call(_)) - createStream[T](jssc.ssc, kinesisAppName, streamName, endpointUrl, regionName, - initialPositionInStream, checkpointInterval, storageLevel, cleanedHandler, - awsAccessKeyId, awsSecretKey) - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. 
- * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * - * @param jssc Java StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - */ - def createStream( - jssc: JavaStreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel - ): JavaReceiverInputDStream[Array[Byte]] = { - createStream[Array[Byte]](jssc.ssc, kinesisAppName, streamName, endpointUrl, regionName, - initialPositionInStream, checkpointInterval, storageLevel, defaultMessageHandler(_)) - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * - * @param jssc Java StreamingContext object - * @param kinesisAppName Kinesis application name used by the Kinesis Client Library - * (KCL) to update DynamoDB - * @param streamName Kinesis stream name - * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param regionName Name of region used by the Kinesis Client Library (KCL) to update - * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param storageLevel Storage level to use for storing the received objects. - * StorageLevel.MEMORY_AND_DISK_2 is recommended. 
- * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) - * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) - */ - def createStream( - jssc: JavaStreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: InitialPositionInStream, - checkpointInterval: Duration, - storageLevel: StorageLevel, - awsAccessKeyId: String, - awsSecretKey: String): JavaReceiverInputDStream[Array[Byte]] = { - createStream[Array[Byte]](jssc.ssc, kinesisAppName, streamName, endpointUrl, regionName, - initialPositionInStream, checkpointInterval, storageLevel, - defaultMessageHandler(_), awsAccessKeyId, awsSecretKey) - } - - /** - * Create an input stream that pulls messages from a Kinesis stream. - * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - * - * Note: - * - The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets AWS credentials. - * - The region of the `endpointUrl` will be used for DynamoDB and CloudWatch. - * - The Kinesis application name used by the Kinesis Client Library (KCL) will be the app name in - * [[org.apache.spark.SparkConf]]. - * - * @param jssc Java StreamingContext object - * @param streamName Kinesis stream name - * @param endpointUrl Endpoint url of Kinesis service - * (e.g., https://kinesis.us-east-1.amazonaws.com) - * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. - * See the Kinesis Spark Streaming documentation for more - * details on the different types of checkpoints. - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). - * @param storageLevel Storage level to use for storing the received objects - * StorageLevel.MEMORY_AND_DISK_2 is recommended. - */ - @deprecated("use other forms of createStream", "1.4.0") - def createStream( - jssc: JavaStreamingContext, - streamName: String, - endpointUrl: String, - checkpointInterval: Duration, - initialPositionInStream: InitialPositionInStream, - storageLevel: StorageLevel - ): JavaReceiverInputDStream[Array[Byte]] = { - createStream( - jssc.ssc, streamName, endpointUrl, checkpointInterval, initialPositionInStream, storageLevel) - } - - private def getRegionByEndpoint(endpointUrl: String): String = { - RegionUtils.getRegionByEndpoint(endpointUrl).getName() - } - - private def validateRegion(regionName: String): String = { - Option(RegionUtils.getRegion(regionName)).map { _.getName }.getOrElse { - throw new IllegalArgumentException(s"Region name '$regionName' is not valid") - } - } - - private[kinesis] def defaultMessageHandler(record: Record): Array[Byte] = { - if (record == null) return null - val byteBuffer = record.getData() - val byteArray = new Array[Byte](byteBuffer.remaining()) - byteBuffer.get(byteArray) - byteArray - } -} - -/** - * This is a helper class that wraps the methods in KinesisUtils into more Python-friendly class and - * function so that it can be easily instantiated and called from Python's KinesisUtils. 
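
To round out the overloads above, a sketch of the explicit-credential Array[Byte] variant. As the scaladoc warns, keys passed this way are written into DStream checkpoints, so the DefaultAWSCredentialsProviderChain overloads are generally preferable; all names here are placeholders.

    import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.dstream.ReceiverInputDStream
    import org.apache.spark.streaming.kinesis.KinesisUtils

    object ExplicitCredentialSketch {
      // Assumes the key pair is loaded from secure configuration, never hard-coded.
      def rawStream(
          ssc: StreamingContext,
          accessKeyId: String,
          secretKey: String): ReceiverInputDStream[Array[Byte]] = {
        KinesisUtils.createStream(
          ssc, "myKinesisApp", "myStream",
          "https://kinesis.us-east-1.amazonaws.com", "us-east-1",
          InitialPositionInStream.LATEST, Seconds(2),
          StorageLevel.MEMORY_AND_DISK_2, accessKeyId, secretKey)
      }
    }
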
- */ -private class KinesisUtilsPythonHelper { - - def getInitialPositionInStream(initialPositionInStream: Int): InitialPositionInStream = { - initialPositionInStream match { - case 0 => InitialPositionInStream.LATEST - case 1 => InitialPositionInStream.TRIM_HORIZON - case _ => throw new IllegalArgumentException( - "Illegal InitialPositionInStream. Please use " + - "InitialPositionInStream.LATEST or InitialPositionInStream.TRIM_HORIZON") - } - } - - def createStream( - jssc: JavaStreamingContext, - kinesisAppName: String, - streamName: String, - endpointUrl: String, - regionName: String, - initialPositionInStream: Int, - checkpointInterval: Duration, - storageLevel: StorageLevel, - awsAccessKeyId: String, - awsSecretKey: String - ): JavaReceiverInputDStream[Array[Byte]] = { - if (awsAccessKeyId == null && awsSecretKey != null) { - throw new IllegalArgumentException("awsSecretKey is set but awsAccessKeyId is null") - } - if (awsAccessKeyId != null && awsSecretKey == null) { - throw new IllegalArgumentException("awsAccessKeyId is set but awsSecretKey is null") - } - if (awsAccessKeyId == null && awsSecretKey == null) { - KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName, - getInitialPositionInStream(initialPositionInStream), checkpointInterval, storageLevel) - } else { - KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName, - getInitialPositionInStream(initialPositionInStream), checkpointInterval, storageLevel, - awsAccessKeyId, awsSecretKey) - } - } - -} diff --git a/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java b/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java deleted file mode 100644 index 5c2371c543..0000000000 --- a/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.streaming.kinesis; - -import com.amazonaws.services.kinesis.model.Record; -import org.junit.Test; - -import org.apache.spark.api.java.function.Function; -import org.apache.spark.storage.StorageLevel; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.LocalJavaStreamingContext; -import org.apache.spark.streaming.api.java.JavaDStream; - -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; - -/** - * Demonstrate the use of the KinesisUtils Java API - */ -public class JavaKinesisStreamSuite extends LocalJavaStreamingContext { - @Test - public void testKinesisStream() { - // Tests the API, does not actually test data receiving - JavaDStream kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", new Duration(2000), - InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()); - - ssc.stop(); - } - - - private static Function handler = new Function() { - @Override - public String call(Record record) { - return record.getPartitionKey() + "-" + record.getSequenceNumber(); - } - }; - - @Test - public void testCustomHandler() { - // Tests the API, does not actually test data receiving - JavaDStream kinesisStream = KinesisUtils.createStream(ssc, "testApp", "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", "us-west-2", InitialPositionInStream.LATEST, - new Duration(2000), StorageLevel.MEMORY_AND_DISK_2(), handler, String.class); - - ssc.stop(); - } -} diff --git a/extras/kinesis-asl/src/test/resources/log4j.properties b/extras/kinesis-asl/src/test/resources/log4j.properties deleted file mode 100644 index edbecdae92..0000000000 --- a/extras/kinesis-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,27 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set everything to be logged to the file target/unit-tests.log -log4j.rootCategory=INFO, file -log4j.appender.file=org.apache.log4j.FileAppender -log4j.appender.file.append=true -log4j.appender.file.file=target/unit-tests.log -log4j.appender.file.layout=org.apache.log4j.PatternLayout -log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n - -# Ignore messages below warning level from Jetty, because it's a bit verbose -log4j.logger.org.spark-project.jetty=WARN diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala deleted file mode 100644 index fdb270eaad..0000000000 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -import java.nio.ByteBuffer - -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - -import com.amazonaws.services.kinesis.producer.{KinesisProducer => KPLProducer, KinesisProducerConfiguration, UserRecordResult} -import com.google.common.util.concurrent.{FutureCallback, Futures} - -private[kinesis] class KPLBasedKinesisTestUtils extends KinesisTestUtils { - override protected def getProducer(aggregate: Boolean): KinesisDataGenerator = { - if (!aggregate) { - new SimpleDataGenerator(kinesisClient) - } else { - new KPLDataGenerator(regionName) - } - } -} - -/** A wrapper for the KinesisProducer provided in the KPL. 
*/ -private[kinesis] class KPLDataGenerator(regionName: String) extends KinesisDataGenerator { - - private lazy val producer: KPLProducer = { - val conf = new KinesisProducerConfiguration() - .setRecordMaxBufferedTime(1000) - .setMaxConnections(1) - .setRegion(regionName) - .setMetricsLevel("none") - - new KPLProducer(conf) - } - - override def sendData(streamName: String, data: Seq[Int]): Map[String, Seq[(Int, String)]] = { - val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]() - data.foreach { num => - val str = num.toString - val data = ByteBuffer.wrap(str.getBytes()) - val future = producer.addUserRecord(streamName, str, data) - val kinesisCallBack = new FutureCallback[UserRecordResult]() { - override def onFailure(t: Throwable): Unit = {} // do nothing - - override def onSuccess(result: UserRecordResult): Unit = { - val shardId = result.getShardId - val seqNumber = result.getSequenceNumber() - val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId, - new ArrayBuffer[(Int, String)]()) - sentSeqNumbers += ((num, seqNumber)) - } - } - Futures.addCallback(future, kinesisCallBack) - } - producer.flushSync() - shardIdToSeqNumbers.toMap - } -} diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala deleted file mode 100644 index 2555332d22..0000000000 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
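These generators are driven through the Kinesis test utilities; a rough sketch of the create/push/delete flow the suites below follow is shown here. Note the helpers are private[kinesis], so this only compiles inside that package, and it provisions real AWS resources; the wrapper object is purely illustrative.

    package org.apache.spark.streaming.kinesis

    object TestUtilsFlowSketch {                       // hypothetical driver, for illustration only
      def main(args: Array[String]): Unit = {
        val testUtils = new KPLBasedKinesisTestUtils()
        testUtils.createStream()                       // provisions a short-lived test stream
        try {
          // Push 1..8 and get back, per shard, the (value, sequence number) pairs written
          val shardIdToDataAndSeqNumbers = testUtils.pushData(1 to 8, aggregate = true)
          require(shardIdToDataAndSeqNumbers.nonEmpty)
        } finally {
          testUtils.deleteStream()                     // always tear the stream back down
        }
      }
    }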
- */ - -package org.apache.spark.streaming.kinesis - -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException} -import org.apache.spark.storage.{BlockId, BlockManager, StorageLevel, StreamBlockId} - -abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) - extends KinesisFunSuite with BeforeAndAfterEach with LocalSparkContext { - - private val testData = 1 to 8 - - private var testUtils: KinesisTestUtils = null - private var shardIds: Seq[String] = null - private var shardIdToData: Map[String, Seq[Int]] = null - private var shardIdToSeqNumbers: Map[String, Seq[String]] = null - private var shardIdToDataAndSeqNumbers: Map[String, Seq[(Int, String)]] = null - private var shardIdToRange: Map[String, SequenceNumberRange] = null - private var allRanges: Seq[SequenceNumberRange] = null - - private var blockManager: BlockManager = null - - override def beforeAll(): Unit = { - super.beforeAll() - runIfTestsEnabled("Prepare KinesisTestUtils") { - testUtils = new KPLBasedKinesisTestUtils() - testUtils.createStream() - - shardIdToDataAndSeqNumbers = testUtils.pushData(testData, aggregate = aggregateTestData) - require(shardIdToDataAndSeqNumbers.size > 1, "Need data to be sent to multiple shards") - - shardIds = shardIdToDataAndSeqNumbers.keySet.toSeq - shardIdToData = shardIdToDataAndSeqNumbers.mapValues { _.map { _._1 }} - shardIdToSeqNumbers = shardIdToDataAndSeqNumbers.mapValues { _.map { _._2 }} - shardIdToRange = shardIdToSeqNumbers.map { case (shardId, seqNumbers) => - val seqNumRange = SequenceNumberRange( - testUtils.streamName, shardId, seqNumbers.head, seqNumbers.last) - (shardId, seqNumRange) - } - allRanges = shardIdToRange.values.toSeq - } - } - - override def beforeEach(): Unit = { - super.beforeEach() - val conf = new SparkConf().setMaster("local[4]").setAppName("KinesisBackedBlockRDDSuite") - sc = new SparkContext(conf) - blockManager = sc.env.blockManager - } - - override def afterAll(): Unit = { - try { - if (testUtils != null) { - testUtils.deleteStream() - } - } finally { - super.afterAll() - } - } - - testIfEnabled("Basic reading from Kinesis") { - // Verify all data using multiple ranges in a single RDD partition - val receivedData1 = new KinesisBackedBlockRDD[Array[Byte]](sc, testUtils.regionName, - testUtils.endpointUrl, fakeBlockIds(1), - Array(SequenceNumberRanges(allRanges.toArray)) - ).map { bytes => new String(bytes).toInt }.collect() - assert(receivedData1.toSet === testData.toSet) - - // Verify all data using one range in each of the multiple RDD partitions - val receivedData2 = new KinesisBackedBlockRDD[Array[Byte]](sc, testUtils.regionName, - testUtils.endpointUrl, fakeBlockIds(allRanges.size), - allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray - ).map { bytes => new String(bytes).toInt }.collect() - assert(receivedData2.toSet === testData.toSet) - - // Verify ordering within each partition - val receivedData3 = new KinesisBackedBlockRDD[Array[Byte]](sc, testUtils.regionName, - testUtils.endpointUrl, fakeBlockIds(allRanges.size), - allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray - ).map { bytes => new String(bytes).toInt }.collectPartitions() - assert(receivedData3.length === allRanges.size) - for (i <- 0 until allRanges.size) { - assert(receivedData3(i).toSeq === shardIdToData(allRanges(i).shardId)) - } - } - - testIfEnabled("Read data available in both block manager and Kinesis") { - testRDD(numPartitions = 2, numPartitionsInBM = 2, 
numPartitionsInKinesis = 2) - } - - testIfEnabled("Read data available only in block manager, not in Kinesis") { - testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 0) - } - - testIfEnabled("Read data available only in Kinesis, not in block manager") { - testRDD(numPartitions = 2, numPartitionsInBM = 0, numPartitionsInKinesis = 2) - } - - testIfEnabled("Read data available partially in block manager, rest in Kinesis") { - testRDD(numPartitions = 2, numPartitionsInBM = 1, numPartitionsInKinesis = 1) - } - - testIfEnabled("Test isBlockValid skips block fetching from block manager") { - testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 0, - testIsBlockValid = true) - } - - testIfEnabled("Test whether RDD is valid after removing blocks from block anager") { - testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 2, - testBlockRemove = true) - } - - /** - * Test the WriteAheadLogBackedRDD, by writing some partitions of the data to block manager - * and the rest to a write ahead log, and then reading reading it all back using the RDD. - * It can also test if the partitions that were read from the log were again stored in - * block manager. - * - * - * - * @param numPartitions Number of partitions in RDD - * @param numPartitionsInBM Number of partitions to write to the BlockManager. - * Partitions 0 to (numPartitionsInBM-1) will be written to BlockManager - * @param numPartitionsInKinesis Number of partitions to write to the Kinesis. - * Partitions (numPartitions - 1 - numPartitionsInKinesis) to - * (numPartitions - 1) will be written to Kinesis - * @param testIsBlockValid Test whether setting isBlockValid to false skips block fetching - * @param testBlockRemove Test whether calling rdd.removeBlock() makes the RDD still usable with - * reads falling back to the WAL - * Example with numPartitions = 5, numPartitionsInBM = 3, and numPartitionsInWAL = 4 - * - * numPartitionsInBM = 3 - * |------------------| - * | | - * 0 1 2 3 4 - * | | - * |-------------------------| - * numPartitionsInKinesis = 4 - */ - private def testRDD( - numPartitions: Int, - numPartitionsInBM: Int, - numPartitionsInKinesis: Int, - testIsBlockValid: Boolean = false, - testBlockRemove: Boolean = false - ): Unit = { - require(shardIds.size > 1, "Need at least 2 shards to test") - require(numPartitionsInBM <= shardIds.size, - "Number of partitions in BlockManager cannot be more than the Kinesis test shards available") - require(numPartitionsInKinesis <= shardIds.size, - "Number of partitions in Kinesis cannot be more than the Kinesis test shards available") - require(numPartitionsInBM <= numPartitions, - "Number of partitions in BlockManager cannot be more than that in RDD") - require(numPartitionsInKinesis <= numPartitions, - "Number of partitions in Kinesis cannot be more than that in RDD") - - // Put necessary blocks in the block manager - val blockIds = fakeBlockIds(numPartitions) - blockIds.foreach(blockManager.removeBlock(_)) - (0 until numPartitionsInBM).foreach { i => - val blockData = shardIdToData(shardIds(i)).iterator.map { _.toString.getBytes() } - blockManager.putIterator(blockIds(i), blockData, StorageLevel.MEMORY_ONLY) - } - - // Create the necessary ranges to use in the RDD - val fakeRanges = Array.fill(numPartitions - numPartitionsInKinesis)( - SequenceNumberRanges(SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy"))) - val realRanges = Array.tabulate(numPartitionsInKinesis) { i => - val range = shardIdToRange(shardIds(i + 
(numPartitions - numPartitionsInKinesis))) - SequenceNumberRanges(Array(range)) - } - val ranges = (fakeRanges ++ realRanges) - - - // Make sure that the left `numPartitionsInBM` blocks are in block manager, and others are not - require( - blockIds.take(numPartitionsInBM).forall(blockManager.get(_).nonEmpty), - "Expected blocks not in BlockManager" - ) - - require( - blockIds.drop(numPartitionsInBM).forall(blockManager.get(_).isEmpty), - "Unexpected blocks in BlockManager" - ) - - // Make sure that the right sequence `numPartitionsInKinesis` are configured, and others are not - require( - ranges.takeRight(numPartitionsInKinesis).forall { - _.ranges.forall { _.streamName == testUtils.streamName } - }, "Incorrect configuration of RDD, expected ranges not set: " - ) - - require( - ranges.dropRight(numPartitionsInKinesis).forall { - _.ranges.forall { _.streamName != testUtils.streamName } - }, "Incorrect configuration of RDD, unexpected ranges set" - ) - - val rdd = new KinesisBackedBlockRDD[Array[Byte]]( - sc, testUtils.regionName, testUtils.endpointUrl, blockIds, ranges) - val collectedData = rdd.map { bytes => - new String(bytes).toInt - }.collect() - assert(collectedData.toSet === testData.toSet) - - // Verify that the block fetching is skipped when isBlockValid is set to false. - // This is done by using a RDD whose data is only in memory but is set to skip block fetching - // Using that RDD will throw exception, as it skips block fetching even if the blocks are in - // in BlockManager. - if (testIsBlockValid) { - require(numPartitionsInBM === numPartitions, "All partitions must be in BlockManager") - require(numPartitionsInKinesis === 0, "No partitions must be in Kinesis") - val rdd2 = new KinesisBackedBlockRDD[Array[Byte]]( - sc, testUtils.regionName, testUtils.endpointUrl, blockIds.toArray, ranges, - isBlockIdValid = Array.fill(blockIds.length)(false)) - intercept[SparkException] { - rdd2.collect() - } - } - - // Verify that the RDD is not invalid after the blocks are removed and can still read data - // from write ahead log - if (testBlockRemove) { - require(numPartitions === numPartitionsInKinesis, - "All partitions must be in WAL for this test") - require(numPartitionsInBM > 0, "Some partitions must be in BlockManager for this test") - rdd.removeBlocks() - assert(rdd.map { bytes => new String(bytes).toInt }.collect().toSet === testData.toSet) - } - } - - /** Generate fake block ids */ - private def fakeBlockIds(num: Int): Array[BlockId] = { - Array.tabulate(num) { i => new StreamBlockId(0, i) } - } -} - -class WithAggregationKinesisBackedBlockRDDSuite - extends KinesisBackedBlockRDDTests(aggregateTestData = true) - -class WithoutAggregationKinesisBackedBlockRDDSuite - extends KinesisBackedBlockRDDTests(aggregateTestData = false) diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala deleted file mode 100644 index e1499a8220..0000000000 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
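Stripped of the partition bookkeeping in testRDD, the core construction this suite repeats looks roughly like the following; a sketch only, assuming code in the org.apache.spark.streaming.kinesis package with a SparkContext `sc`, plus `regionName` and `endpointUrl` in scope, and placeholder stream, shard, and sequence-number values.

    import org.apache.spark.storage.{BlockId, StreamBlockId}

    // One RDD partition backed purely by a Kinesis sequence-number range
    // (no matching block in the BlockManager, so the data is re-read from Kinesis).
    val range    = SequenceNumberRange("myStream", "shardId-000000000000", "firstSeq", "lastSeq")
    val ranges   = Array(SequenceNumberRanges(Array(range)))
    val blockIds = Array[BlockId](new StreamBlockId(0, 0))

    val rdd  = new KinesisBackedBlockRDD[Array[Byte]](sc, regionName, endpointUrl, blockIds, ranges)
    val ints = rdd.map(bytes => new String(bytes).toInt).collect()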
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.kinesis - -import java.util.concurrent.{ExecutorService, TimeoutException} - -import scala.concurrent.{Await, ExecutionContext, Future} -import scala.concurrent.duration._ -import scala.language.postfixOps - -import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer -import org.mockito.Matchers._ -import org.mockito.Mockito._ -import org.mockito.invocation.InvocationOnMock -import org.mockito.stubbing.Answer -import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester} -import org.scalatest.concurrent.Eventually -import org.scalatest.concurrent.Eventually._ -import org.scalatest.mock.MockitoSugar - -import org.apache.spark.streaming.{Duration, TestSuiteBase} -import org.apache.spark.util.ManualClock - -class KinesisCheckpointerSuite extends TestSuiteBase - with MockitoSugar - with BeforeAndAfterEach - with PrivateMethodTester - with Eventually { - - private val workerId = "dummyWorkerId" - private val shardId = "dummyShardId" - private val seqNum = "123" - private val otherSeqNum = "245" - private val checkpointInterval = Duration(10) - private val someSeqNum = Some(seqNum) - private val someOtherSeqNum = Some(otherSeqNum) - - private var receiverMock: KinesisReceiver[Array[Byte]] = _ - private var checkpointerMock: IRecordProcessorCheckpointer = _ - private var kinesisCheckpointer: KinesisCheckpointer = _ - private var clock: ManualClock = _ - - private val checkpoint = PrivateMethod[Unit]('checkpoint) - - override def beforeEach(): Unit = { - receiverMock = mock[KinesisReceiver[Array[Byte]]] - checkpointerMock = mock[IRecordProcessorCheckpointer] - clock = new ManualClock() - kinesisCheckpointer = new KinesisCheckpointer(receiverMock, checkpointInterval, workerId, clock) - } - - test("checkpoint is not called twice for the same sequence number") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) - kinesisCheckpointer.invokePrivate(checkpoint(shardId, checkpointerMock)) - kinesisCheckpointer.invokePrivate(checkpoint(shardId, checkpointerMock)) - - verify(checkpointerMock, times(1)).checkpoint(anyString()) - } - - test("checkpoint is called after sequence number increases") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)) - .thenReturn(someSeqNum).thenReturn(someOtherSeqNum) - kinesisCheckpointer.invokePrivate(checkpoint(shardId, checkpointerMock)) - kinesisCheckpointer.invokePrivate(checkpoint(shardId, checkpointerMock)) - - verify(checkpointerMock, times(1)).checkpoint(seqNum) - verify(checkpointerMock, times(1)).checkpoint(otherSeqNum) - } - - test("should checkpoint if we have exceeded the checkpoint interval") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)) - .thenReturn(someSeqNum).thenReturn(someOtherSeqNum) - - kinesisCheckpointer.setCheckpointer(shardId, checkpointerMock) - clock.advance(5 * checkpointInterval.milliseconds) - - eventually(timeout(1 second)) { - 
verify(checkpointerMock, times(1)).checkpoint(seqNum) - verify(checkpointerMock, times(1)).checkpoint(otherSeqNum) - } - } - - test("shouldn't checkpoint if we have not exceeded the checkpoint interval") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) - - kinesisCheckpointer.setCheckpointer(shardId, checkpointerMock) - clock.advance(checkpointInterval.milliseconds / 2) - - verify(checkpointerMock, never()).checkpoint(anyString()) - } - - test("should not checkpoint for the same sequence number") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) - - kinesisCheckpointer.setCheckpointer(shardId, checkpointerMock) - - clock.advance(checkpointInterval.milliseconds * 5) - eventually(timeout(1 second)) { - verify(checkpointerMock, atMost(1)).checkpoint(anyString()) - } - } - - test("removing checkpointer checkpoints one last time") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) - - kinesisCheckpointer.removeCheckpointer(shardId, checkpointerMock) - verify(checkpointerMock, times(1)).checkpoint(anyString()) - } - - test("if checkpointing is going on, wait until finished before removing and checkpointing") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)) - .thenReturn(someSeqNum).thenReturn(someOtherSeqNum) - when(checkpointerMock.checkpoint(anyString)).thenAnswer(new Answer[Unit] { - override def answer(invocations: InvocationOnMock): Unit = { - clock.waitTillTime(clock.getTimeMillis() + checkpointInterval.milliseconds / 2) - } - }) - - kinesisCheckpointer.setCheckpointer(shardId, checkpointerMock) - clock.advance(checkpointInterval.milliseconds) - eventually(timeout(1 second)) { - verify(checkpointerMock, times(1)).checkpoint(anyString()) - } - // don't block test thread - val f = Future(kinesisCheckpointer.removeCheckpointer(shardId, checkpointerMock))( - ExecutionContext.global) - - intercept[TimeoutException] { - Await.ready(f, 50 millis) - } - - clock.advance(checkpointInterval.milliseconds / 2) - eventually(timeout(1 second)) { - verify(checkpointerMock, times(2)).checkpoint(anyString()) - } - } -} diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala deleted file mode 100644 index ee428f31d6..0000000000 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.kinesis - -import org.apache.spark.SparkFunSuite - -/** - * Helper class that runs Kinesis real data transfer tests or - * ignores them based on env variable is set or not. 
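Taken together, the tests above pin down the KinesisCheckpointer contract; the condensed lifecycle below reuses the suite's own fields (kinesisCheckpointer, checkpointerMock, shardId, clock, checkpointInterval), so it is illustrative rather than a standalone example.

    // Periodic checkpointing starts when a shard registers its KCL checkpointer...
    kinesisCheckpointer.setCheckpointer(shardId, checkpointerMock)

    // ...fires only after the checkpoint interval has elapsed and the latest sequence
    // number reported by the receiver has actually advanced...
    clock.advance(checkpointInterval.milliseconds)

    // ...and removing the checkpointer triggers one final checkpoint before the shard is dropped.
    kinesisCheckpointer.removeCheckpointer(shardId, checkpointerMock)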
- */ -trait KinesisFunSuite extends SparkFunSuite { - import KinesisTestUtils._ - - /** Run the test if environment variable is set or ignore the test */ - def testIfEnabled(testName: String)(testBody: => Unit) { - if (shouldRunTests) { - test(testName)(testBody) - } else { - ignore(s"$testName [enable by setting env var $envVarNameForEnablingTests=1]")(testBody) - } - } - - /** Run the give body of code only if Kinesis tests are enabled */ - def runIfTestsEnabled(message: String)(body: => Unit): Unit = { - if (shouldRunTests) { - body - } else { - ignore(s"$message [enable by setting env var $envVarNameForEnablingTests=1]")() - } - } -} diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala deleted file mode 100644 index fd15b6ccdc..0000000000 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
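A minimal sketch of how a suite opts into these gated tests follows; the suite name and test bodies are hypothetical, and the pattern simply mirrors the suites elsewhere in this patch.

    class MyKinesisIntegrationSuite extends KinesisFunSuite {   // hypothetical name

      runIfTestsEnabled("Prepare Kinesis resources") {
        // one-time setup that needs real AWS access; reported as ignored unless the
        // enabling environment variable is set
      }

      testIfEnabled("transfers data through a real stream") {
        // test body that requires credentials and incurs Kinesis costs
      }
    }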
- */ -package org.apache.spark.streaming.kinesis - -import java.nio.ByteBuffer -import java.nio.charset.StandardCharsets -import java.util.Arrays - -import com.amazonaws.services.kinesis.clientlibrary.exceptions._ -import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer -import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason -import com.amazonaws.services.kinesis.model.Record -import org.mockito.Matchers._ -import org.mockito.Matchers.{eq => meq} -import org.mockito.Mockito._ -import org.scalatest.{BeforeAndAfter, Matchers} -import org.scalatest.mock.MockitoSugar - -import org.apache.spark.streaming.{Duration, TestSuiteBase} -import org.apache.spark.util.Utils - -/** - * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor - */ -class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAfter - with MockitoSugar { - - val app = "TestKinesisReceiver" - val stream = "mySparkStream" - val endpoint = "endpoint-url" - val workerId = "dummyWorkerId" - val shardId = "dummyShardId" - val seqNum = "dummySeqNum" - val checkpointInterval = Duration(10) - val someSeqNum = Some(seqNum) - - val record1 = new Record() - record1.setData(ByteBuffer.wrap("Spark In Action".getBytes(StandardCharsets.UTF_8))) - val record2 = new Record() - record2.setData(ByteBuffer.wrap("Learning Spark".getBytes(StandardCharsets.UTF_8))) - val batch = Arrays.asList(record1, record2) - - var receiverMock: KinesisReceiver[Array[Byte]] = _ - var checkpointerMock: IRecordProcessorCheckpointer = _ - - override def beforeFunction(): Unit = { - receiverMock = mock[KinesisReceiver[Array[Byte]]] - checkpointerMock = mock[IRecordProcessorCheckpointer] - } - - test("check serializability of SerializableAWSCredentials") { - Utils.deserialize[SerializableAWSCredentials]( - Utils.serialize(new SerializableAWSCredentials("x", "y"))) - } - - test("process records including store and set checkpointer") { - when(receiverMock.isStopped()).thenReturn(false) - - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId) - recordProcessor.initialize(shardId) - recordProcessor.processRecords(batch, checkpointerMock) - - verify(receiverMock, times(1)).isStopped() - verify(receiverMock, times(1)).addRecords(shardId, batch) - verify(receiverMock, times(1)).setCheckpointer(shardId, checkpointerMock) - } - - test("shouldn't store and update checkpointer when receiver is stopped") { - when(receiverMock.isStopped()).thenReturn(true) - - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId) - recordProcessor.processRecords(batch, checkpointerMock) - - verify(receiverMock, times(1)).isStopped() - verify(receiverMock, never).addRecords(anyString, anyListOf(classOf[Record])) - verify(receiverMock, never).setCheckpointer(anyString, meq(checkpointerMock)) - } - - test("shouldn't update checkpointer when exception occurs during store") { - when(receiverMock.isStopped()).thenReturn(false) - when( - receiverMock.addRecords(anyString, anyListOf(classOf[Record])) - ).thenThrow(new RuntimeException()) - - intercept[RuntimeException] { - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId) - recordProcessor.initialize(shardId) - recordProcessor.processRecords(batch, checkpointerMock) - } - - verify(receiverMock, times(1)).isStopped() - verify(receiverMock, times(1)).addRecords(shardId, batch) - verify(receiverMock, never).setCheckpointer(anyString, meq(checkpointerMock)) - } - - test("shutdown should checkpoint 
if the reason is TERMINATE") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) - - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId) - recordProcessor.initialize(shardId) - recordProcessor.shutdown(checkpointerMock, ShutdownReason.TERMINATE) - - verify(receiverMock, times(1)).removeCheckpointer(meq(shardId), meq(checkpointerMock)) - } - - - test("shutdown should not checkpoint if the reason is something other than TERMINATE") { - when(receiverMock.getLatestSeqNumToCheckpoint(shardId)).thenReturn(someSeqNum) - - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId) - recordProcessor.initialize(shardId) - recordProcessor.shutdown(checkpointerMock, ShutdownReason.ZOMBIE) - recordProcessor.shutdown(checkpointerMock, null) - - verify(receiverMock, times(2)).removeCheckpointer(meq(shardId), - meq[IRecordProcessorCheckpointer](null)) - } - - test("retry success on first attempt") { - val expectedIsStopped = false - when(receiverMock.isStopped()).thenReturn(expectedIsStopped) - - val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) - assert(actualVal == expectedIsStopped) - - verify(receiverMock, times(1)).isStopped() - } - - test("retry success on second attempt after a Kinesis throttling exception") { - val expectedIsStopped = false - when(receiverMock.isStopped()) - .thenThrow(new ThrottlingException("error message")) - .thenReturn(expectedIsStopped) - - val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) - assert(actualVal == expectedIsStopped) - - verify(receiverMock, times(2)).isStopped() - } - - test("retry success on second attempt after a Kinesis dependency exception") { - val expectedIsStopped = false - when(receiverMock.isStopped()) - .thenThrow(new KinesisClientLibDependencyException("error message")) - .thenReturn(expectedIsStopped) - - val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) - assert(actualVal == expectedIsStopped) - - verify(receiverMock, times(2)).isStopped() - } - - test("retry failed after a shutdown exception") { - when(checkpointerMock.checkpoint()).thenThrow(new ShutdownException("error message")) - - intercept[ShutdownException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } - - verify(checkpointerMock, times(1)).checkpoint() - } - - test("retry failed after an invalid state exception") { - when(checkpointerMock.checkpoint()).thenThrow(new InvalidStateException("error message")) - - intercept[InvalidStateException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } - - verify(checkpointerMock, times(1)).checkpoint() - } - - test("retry failed after unexpected exception") { - when(checkpointerMock.checkpoint()).thenThrow(new RuntimeException("error message")) - - intercept[RuntimeException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } - - verify(checkpointerMock, times(1)).checkpoint() - } - - test("retry failed after exhausing all retries") { - val expectedErrorMessage = "final try error message" - when(checkpointerMock.checkpoint()) - .thenThrow(new ThrottlingException("error message")) - .thenThrow(new ThrottlingException(expectedErrorMessage)) - - val exception = intercept[RuntimeException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } - exception.getMessage().shouldBe(expectedErrorMessage) - - verify(checkpointerMock, times(2)).checkpoint() - } -} diff 
--git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala deleted file mode 100644 index ca5d13da46..0000000000 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.kinesis - -import scala.collection.mutable -import scala.concurrent.duration._ -import scala.language.postfixOps -import scala.util.Random - -import com.amazonaws.regions.RegionUtils -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream -import com.amazonaws.services.kinesis.model.Record -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} -import org.scalatest.Matchers._ -import org.scalatest.concurrent.Eventually - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.network.util.JavaUtils -import org.apache.spark.rdd.RDD -import org.apache.spark.storage.{StorageLevel, StreamBlockId} -import org.apache.spark.streaming._ -import org.apache.spark.streaming.dstream.ReceiverInputDStream -import org.apache.spark.streaming.kinesis.KinesisTestUtils._ -import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult -import org.apache.spark.streaming.scheduler.ReceivedBlockInfo -import org.apache.spark.util.Utils - -abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFunSuite - with Eventually with BeforeAndAfter with BeforeAndAfterAll { - - // This is the name that KCL will use to save metadata to DynamoDB - private val appName = s"KinesisStreamSuite-${math.abs(Random.nextLong())}" - private val batchDuration = Seconds(1) - - // Dummy parameters for API testing - private val dummyEndpointUrl = defaultEndpointUrl - private val dummyRegionName = RegionUtils.getRegionByEndpoint(dummyEndpointUrl).getName() - private val dummyAWSAccessKey = "dummyAccessKey" - private val dummyAWSSecretKey = "dummySecretKey" - - private var testUtils: KinesisTestUtils = null - private var ssc: StreamingContext = null - private var sc: SparkContext = null - - override def beforeAll(): Unit = { - val conf = new SparkConf() - .setMaster("local[4]") - .setAppName("KinesisStreamSuite") // Setting Spark app name to Kinesis app name - sc = new SparkContext(conf) - - runIfTestsEnabled("Prepare KinesisTestUtils") { - testUtils = new KPLBasedKinesisTestUtils() - testUtils.createStream() - } - } - - override def afterAll(): Unit = { - if (ssc != null) { - ssc.stop() - } - if (sc != null) { - sc.stop() - } - if (testUtils != null) { - // Delete the Kinesis stream as well as the DynamoDB table generated by - // Kinesis Client 
Library when consuming the stream - testUtils.deleteStream() - testUtils.deleteDynamoDBTable(appName) - } - } - - before { - ssc = new StreamingContext(sc, batchDuration) - } - - after { - if (ssc != null) { - ssc.stop(stopSparkContext = false) - ssc = null - } - if (testUtils != null) { - testUtils.deleteDynamoDBTable(appName) - } - } - - test("KinesisUtils API") { - // Tests the API, does not actually test data receiving - val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream", - dummyEndpointUrl, Seconds(2), - InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) - val kinesisStream2 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream", - dummyEndpointUrl, dummyRegionName, - InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2) - val kinesisStream3 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream", - dummyEndpointUrl, dummyRegionName, - InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2, - dummyAWSAccessKey, dummyAWSSecretKey) - } - - test("RDD generation") { - val inputStream = KinesisUtils.createStream(ssc, appName, "dummyStream", - dummyEndpointUrl, dummyRegionName, InitialPositionInStream.LATEST, Seconds(2), - StorageLevel.MEMORY_AND_DISK_2, dummyAWSAccessKey, dummyAWSSecretKey) - assert(inputStream.isInstanceOf[KinesisInputDStream[Array[Byte]]]) - - val kinesisStream = inputStream.asInstanceOf[KinesisInputDStream[Array[Byte]]] - val time = Time(1000) - - // Generate block info data for testing - val seqNumRanges1 = SequenceNumberRanges( - SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy")) - val blockId1 = StreamBlockId(kinesisStream.id, 123) - val blockInfo1 = ReceivedBlockInfo( - 0, None, Some(seqNumRanges1), new BlockManagerBasedStoreResult(blockId1, None)) - - val seqNumRanges2 = SequenceNumberRanges( - SequenceNumberRange("fakeStream", "fakeShardId", "aaa", "bbb")) - val blockId2 = StreamBlockId(kinesisStream.id, 345) - val blockInfo2 = ReceivedBlockInfo( - 0, None, Some(seqNumRanges2), new BlockManagerBasedStoreResult(blockId2, None)) - - // Verify that the generated KinesisBackedBlockRDD has the all the right information - val blockInfos = Seq(blockInfo1, blockInfo2) - val nonEmptyRDD = kinesisStream.createBlockRDD(time, blockInfos) - nonEmptyRDD shouldBe a [KinesisBackedBlockRDD[_]] - val kinesisRDD = nonEmptyRDD.asInstanceOf[KinesisBackedBlockRDD[_]] - assert(kinesisRDD.regionName === dummyRegionName) - assert(kinesisRDD.endpointUrl === dummyEndpointUrl) - assert(kinesisRDD.retryTimeoutMs === batchDuration.milliseconds) - assert(kinesisRDD.awsCredentialsOption === - Some(SerializableAWSCredentials(dummyAWSAccessKey, dummyAWSSecretKey))) - assert(nonEmptyRDD.partitions.size === blockInfos.size) - nonEmptyRDD.partitions.foreach { _ shouldBe a [KinesisBackedBlockRDDPartition] } - val partitions = nonEmptyRDD.partitions.map { - _.asInstanceOf[KinesisBackedBlockRDDPartition] }.toSeq - assert(partitions.map { _.seqNumberRanges } === Seq(seqNumRanges1, seqNumRanges2)) - assert(partitions.map { _.blockId } === Seq(blockId1, blockId2)) - assert(partitions.forall { _.isBlockIdValid === true }) - - // Verify that KinesisBackedBlockRDD is generated even when there are no blocks - val emptyRDD = kinesisStream.createBlockRDD(time, Seq.empty) - emptyRDD shouldBe a [KinesisBackedBlockRDD[Array[Byte]]] - emptyRDD.partitions shouldBe empty - - // Verify that the KinesisBackedBlockRDD has isBlockValid = false when blocks are invalid - blockInfos.foreach { _.setBlockIdInvalid() } - 
kinesisStream.createBlockRDD(time, blockInfos).partitions.foreach { partition => - assert(partition.asInstanceOf[KinesisBackedBlockRDDPartition].isBlockIdValid === false) - } - } - - - /** - * Test the stream by sending data to a Kinesis stream and receiving from it. - * This test is not run by default as it requires AWS credentials that the test - * environment may not have. Even if there is AWS credentials available, the user - * may not want to run these tests to avoid the Kinesis costs. To enable this test, - * you must have AWS credentials available through the default AWS provider chain, - * and you have to set the system environment variable RUN_KINESIS_TESTS=1 . - */ - testIfEnabled("basic operation") { - val awsCredentials = KinesisTestUtils.getAWSCredentials() - val stream = KinesisUtils.createStream(ssc, appName, testUtils.streamName, - testUtils.endpointUrl, testUtils.regionName, InitialPositionInStream.LATEST, - Seconds(10), StorageLevel.MEMORY_ONLY, - awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey) - - val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int] - stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd => - collected ++= rdd.collect() - logInfo("Collected = " + collected.mkString(", ")) - } - ssc.start() - - val testData = 1 to 10 - eventually(timeout(120 seconds), interval(10 second)) { - testUtils.pushData(testData, aggregateTestData) - assert(collected === testData.toSet, "\nData received does not match data sent") - } - ssc.stop(stopSparkContext = false) - } - - testIfEnabled("custom message handling") { - val awsCredentials = KinesisTestUtils.getAWSCredentials() - def addFive(r: Record): Int = JavaUtils.bytesToString(r.getData).toInt + 5 - val stream = KinesisUtils.createStream(ssc, appName, testUtils.streamName, - testUtils.endpointUrl, testUtils.regionName, InitialPositionInStream.LATEST, - Seconds(10), StorageLevel.MEMORY_ONLY, addFive, - awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey) - - stream shouldBe a [ReceiverInputDStream[_]] - - val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int] - stream.foreachRDD { rdd => - collected ++= rdd.collect() - logInfo("Collected = " + collected.mkString(", ")) - } - ssc.start() - - val testData = 1 to 10 - eventually(timeout(120 seconds), interval(10 second)) { - testUtils.pushData(testData, aggregateTestData) - val modData = testData.map(_ + 5) - assert(collected === modData.toSet, "\nData received does not match data sent") - } - ssc.stop(stopSparkContext = false) - } - - testIfEnabled("failure recovery") { - val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) - val checkpointDir = Utils.createTempDir().getAbsolutePath - - ssc = new StreamingContext(sc, Milliseconds(1000)) - ssc.checkpoint(checkpointDir) - - val awsCredentials = KinesisTestUtils.getAWSCredentials() - val collectedData = new mutable.HashMap[Time, (Array[SequenceNumberRanges], Seq[Int])] - - val kinesisStream = KinesisUtils.createStream(ssc, appName, testUtils.streamName, - testUtils.endpointUrl, testUtils.regionName, InitialPositionInStream.LATEST, - Seconds(10), StorageLevel.MEMORY_ONLY, - awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey) - - // Verify that the generated RDDs are KinesisBackedBlockRDDs, and collect the data in each batch - kinesisStream.foreachRDD((rdd: RDD[Array[Byte]], time: Time) => { - val kRdd = rdd.asInstanceOf[KinesisBackedBlockRDD[Array[Byte]]] - val data = rdd.map { bytes => new 
String(bytes).toInt }.collect().toSeq - collectedData.synchronized { - collectedData(time) = (kRdd.arrayOfseqNumberRanges, data) - } - }) - - ssc.remember(Minutes(60)) // remember all the batches so that they are all saved in checkpoint - ssc.start() - - def numBatchesWithData: Int = - collectedData.synchronized { collectedData.count(_._2._2.nonEmpty) } - - def isCheckpointPresent: Boolean = Checkpoint.getCheckpointFiles(checkpointDir).nonEmpty - - // Run until there are at least 10 batches with some data in them - // If this times out because numBatchesWithData is empty, then its likely that foreachRDD - // function failed with exceptions, and nothing got added to `collectedData` - eventually(timeout(2 minutes), interval(1 seconds)) { - testUtils.pushData(1 to 5, aggregateTestData) - assert(isCheckpointPresent && numBatchesWithData > 10) - } - ssc.stop(stopSparkContext = true) // stop the SparkContext so that the blocks are not reused - - // Restart the context from checkpoint and verify whether the - logInfo("Restarting from checkpoint") - ssc = new StreamingContext(checkpointDir) - ssc.start() - val recoveredKinesisStream = ssc.graph.getInputStreams().head - - // Verify that the recomputed RDDs are KinesisBackedBlockRDDs with the same sequence ranges - // and return the same data - collectedData.synchronized { - val times = collectedData.keySet - times.foreach { time => - val (arrayOfSeqNumRanges, data) = collectedData(time) - val rdd = recoveredKinesisStream.getOrCompute(time).get.asInstanceOf[RDD[Array[Byte]]] - rdd shouldBe a[KinesisBackedBlockRDD[_]] - - // Verify the recovered sequence ranges - val kRdd = rdd.asInstanceOf[KinesisBackedBlockRDD[Array[Byte]]] - assert(kRdd.arrayOfseqNumberRanges.size === arrayOfSeqNumRanges.size) - arrayOfSeqNumRanges.zip(kRdd.arrayOfseqNumberRanges).foreach { case (expected, found) => - assert(expected.ranges.toSeq === found.ranges.toSeq) - } - - // Verify the recovered data - assert(rdd.map { bytes => new String(bytes).toInt }.collect().toSeq === data) - } - } - ssc.stop() - } -} - -class WithAggregationKinesisStreamSuite extends KinesisStreamTests(aggregateTestData = true) - -class WithoutAggregationKinesisStreamSuite extends KinesisStreamTests(aggregateTestData = false) -- cgit v1.2.3
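For completeness, the end-to-end receive pattern these integration tests rely on reduces to roughly the following; a sketch assuming the suite's `ssc`, `appName` and `testUtils` fields, valid AWS credentials on the default provider chain, and the ScalaTest Eventually helpers already in scope.

    val stream = KinesisUtils.createStream(ssc, appName, testUtils.streamName,
      testUtils.endpointUrl, testUtils.regionName, InitialPositionInStream.LATEST,
      Seconds(10), StorageLevel.MEMORY_ONLY)

    // Collect everything received so far on the driver, as the "basic operation" test does.
    val collected = new scala.collection.mutable.HashSet[Int]
      with scala.collection.mutable.SynchronizedSet[Int]
    stream.map(bytes => new String(bytes).toInt).foreachRDD { rdd =>
      collected ++= rdd.collect()
    }
    ssc.start()

    // Keep pushing data until everything sent has been observed, or time out.
    eventually(timeout(120 seconds), interval(10 seconds)) {
      testUtils.pushData(1 to 10, aggregate = false)
      assert(collected === (1 to 10).toSet)
    }
    ssc.stop(stopSparkContext = false)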