/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.streaming;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.services.kinesis.AmazonKinesisClient;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream;
import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kinesis.KinesisUtils;

/**
 * Consumes messages from an Amazon Kinesis stream and does wordcount.
 *
 * This example spins up one Kinesis Receiver per shard for the given stream.
 * It then starts pulling from the last checkpointed sequence number of the given stream.
 *
 * Usage: JavaKinesisWordCountASL [app-name] [stream-name] [endpoint-url]
 *   [app-name] is the name of the consumer app, used to track the read data in DynamoDB
 *   [stream-name] is the name of the Kinesis stream (e.g. mySparkStream)
 *   [endpoint-url] is the endpoint of the Kinesis service
 *     (e.g. https://kinesis.us-east-1.amazonaws.com)
 *
 * Example:
 *    # export AWS keys if necessary
 *    $ export AWS_ACCESS_KEY_ID=[your-access-key]
 *    $ export AWS_SECRET_KEY=[your-secret-key]
 *
 *    # run the example
 *    $ SPARK_HOME/bin/run-example streaming.JavaKinesisWordCountASL myAppName mySparkStream \
 *        https://kinesis.us-east-1.amazonaws.com
 *
 * There is a companion helper class called KinesisWordProducerASL which puts dummy data
 * onto the Kinesis stream.
 *
 * This code uses the DefaultAWSCredentialsProviderChain to find credentials
 * in the following order:
 *    Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
 *    Java System Properties - aws.accessKeyId and aws.secretKey
 *    Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
 *    Instance profile credentials - delivered through the Amazon EC2 metadata service
 * For more information, see
 * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html
 *
 * See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
 * the Kinesis Spark Streaming integration.
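 *
 * As an illustration of the credential profiles option above, a minimal
 * ~/.aws/credentials file in the standard shared-credentials format (key values
 * here are placeholders) looks like:
 *
 *    [default]
 *    aws_access_key_id = [your-access-key]
 *    aws_secret_access_key = [your-secret-key]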
 */
public final class JavaKinesisWordCountASL { // needs to be public for access from run-example
  private static final Pattern WORD_SEPARATOR = Pattern.compile(" ");

  public static void main(String[] args) throws Exception {
    // Check that all required args were passed in.
    if (args.length != 3) {
      System.err.println(
          "Usage: JavaKinesisWordCountASL <app-name> <stream-name> <endpoint-url>\n\n" +
          "    <app-name> is the name of the app, used to track the read data in DynamoDB\n" +
          "    <stream-name> is the name of the Kinesis stream\n" +
          "    <endpoint-url> is the endpoint of the Kinesis service\n" +
          "                   (e.g. https://kinesis.us-east-1.amazonaws.com)\n" +
          "Generate data for the Kinesis stream using the example KinesisWordProducerASL.\n" +
          "See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more\n" +
          "details.\n"
      );
      System.exit(1);
    }

    // Set default log4j logging level to WARN to hide Spark logs
    StreamingExamples.setStreamingLogLevels();

    // Populate the appropriate variables from the given args
    String kinesisAppName = args[0];
    String streamName = args[1];
    String endpointUrl = args[2];

    // Create a Kinesis client in order to determine the number of shards for the given stream
    AmazonKinesisClient kinesisClient =
        new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain());
    kinesisClient.setEndpoint(endpointUrl);
    int numShards =
        kinesisClient.describeStream(streamName).getStreamDescription().getShards().size();

    // In this example, we're going to create one Kinesis Receiver/input DStream for each shard.
    // This is not a necessity; if there are fewer receivers/DStreams than the number of shards,
    // then the shards will be automatically distributed among the receivers and each receiver
    // will receive data from multiple shards.
    int numStreams = numShards;

    // Spark Streaming batch interval
    Duration batchInterval = new Duration(2000);

    // Kinesis checkpoint interval.  Same as batchInterval for this example.
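    // ("Checkpoint" here means the Kinesis Client Library persisting the last-read
    // sequence number per shard to DynamoDB, so a restarted app resumes where it left
    // off; it is unrelated to Spark Streaming's own checkpointing.)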
    Duration kinesisCheckpointInterval = batchInterval;

    // Get the region name from the endpoint URL to save Kinesis Client Library metadata in
    // DynamoDB of the same region as the Kinesis stream
    String regionName = KinesisExampleUtils.getRegionNameByEndpoint(endpointUrl);

    // Setup the Spark config and StreamingContext
    SparkConf sparkConfig = new SparkConf().setAppName("JavaKinesisWordCountASL");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);

    // Create the Kinesis DStreams
    List<JavaDStream<byte[]>> streamsList = new ArrayList<>(numStreams);
    for (int i = 0; i < numStreams; i++) {
      streamsList.add(
          KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName,
              InitialPositionInStream.LATEST, kinesisCheckpointInterval,
              StorageLevel.MEMORY_AND_DISK_2())
      );
    }

    // Union all the streams if there is more than 1 stream
    JavaDStream<byte[]> unionStreams;
    if (streamsList.size() > 1) {
      unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
    } else {
      // Otherwise, just use the 1 stream
      unionStreams = streamsList.get(0);
    }

    // Convert each line of Array[Byte] to String, and split into words
    JavaDStream<String> words = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
      @Override
      public Iterator<String> call(byte[] line) {
        String s = new String(line, StandardCharsets.UTF_8);
        return Arrays.asList(WORD_SEPARATOR.split(s)).iterator();
      }
    });

    // Map each word to a (word, 1) tuple so we can reduce by key to count the words
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
        new PairFunction<String, String, Integer>() {
          @Override
          public Tuple2<String, Integer> call(String s) {
            return new Tuple2<>(s, 1);
          }
        }
    ).reduceByKey(
        new Function2<Integer, Integer, Integer>() {
          @Override
          public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
          }
        }
    );

    // Print the first 10 wordCounts
    wordCounts.print();

    // Start the streaming context and await termination
    jssc.start();
    jssc.awaitTermination();
  }
}
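
// For reference, a minimal sketch of the producer side this consumer expects (the
// companion KinesisWordProducerASL does this at scale). The stream name and payload
// below are illustrative placeholders; the calls are the standard AWS SDK v1
// PutRecord API:
//
//   AmazonKinesisClient client =
//       new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain());
//   client.setEndpoint("https://kinesis.us-east-1.amazonaws.com");
//   com.amazonaws.services.kinesis.model.PutRecordRequest request =
//       new com.amazonaws.services.kinesis.model.PutRecordRequest();
//   request.setStreamName("mySparkStream");
//   request.setPartitionKey("partitionKey-1");
//   request.setData(java.nio.ByteBuffer.wrap(
//       "the quick brown fox".getBytes(StandardCharsets.UTF_8)));
//   client.putRecord(request);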