 docs/mllib-statistics.md | 438
 examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java | 70
 examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java | 84
 examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java | 49
 examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java | 53
 examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java | 75
 examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java | 56
 examples/src/main/python/mllib/correlations_example.py | 48
 examples/src/main/python/mllib/hypothesis_testing_example.py | 65
 examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py | 40
 examples/src/main/python/mllib/kernel_density_estimation_example.py | 44
 examples/src/main/python/mllib/stratified_sampling_example.py | 38
 examples/src/main/python/mllib/summary_statistics_example.py | 42
 examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala | 62
 examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala | 80
 examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala | 54
 examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala | 54
 examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala | 53
 examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala | 53
 19 files changed, 1076 insertions(+), 382 deletions(-)
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index b773031bc7..02b81f153b 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -10,24 +10,24 @@ displayTitle: Basic Statistics - spark.mllib
`\[
\newcommand{\R}{\mathbb{R}}
-\newcommand{\E}{\mathbb{E}}
+\newcommand{\E}{\mathbb{E}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\wv}{\mathbf{w}}
\newcommand{\av}{\mathbf{\alpha}}
\newcommand{\bv}{\mathbf{b}}
\newcommand{\N}{\mathbb{N}}
-\newcommand{\id}{\mathbf{I}}
-\newcommand{\ind}{\mathbf{1}}
-\newcommand{\0}{\mathbf{0}}
-\newcommand{\unit}{\mathbf{e}}
-\newcommand{\one}{\mathbf{1}}
+\newcommand{\id}{\mathbf{I}}
+\newcommand{\ind}{\mathbf{1}}
+\newcommand{\0}{\mathbf{0}}
+\newcommand{\unit}{\mathbf{e}}
+\newcommand{\one}{\mathbf{1}}
\newcommand{\zero}{\mathbf{0}}
\]`
-## Summary statistics
+## Summary statistics
-We provide column summary statistics for `RDD[Vector]` through the function `colStats`
+We provide column summary statistics for `RDD[Vector]` through the function `colStats`
available in `Statistics`.
<div class="codetabs">
@@ -40,19 +40,7 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
-
-val observations: RDD[Vector] = ... // an RDD of Vectors
-
-// Compute column summary statistics.
-val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
-println(summary.mean) // a dense vector containing the mean value for each column
-println(summary.variance) // column-wise variance
-println(summary.numNonzeros) // number of nonzeros in each column
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -64,24 +52,7 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
-import org.apache.spark.mllib.stat.Statistics;
-
-JavaSparkContext jsc = ...
-
-JavaRDD<Vector> mat = ... // an RDD of Vectors
-
-// Compute column summary statistics.
-MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
-System.out.println(summary.mean()); // a dense vector containing the mean value for each column
-System.out.println(summary.variance()); // column-wise variance
-System.out.println(summary.numNonzeros()); // number of nonzeros in each column
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -92,20 +63,7 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.stat import Statistics
-
-sc = ... # SparkContext
-
-mat = ... # an RDD of Vectors
-
-# Compute column summary statistics.
-summary = Statistics.colStats(mat)
-print(summary.mean())
-print(summary.variance())
-print(summary.numNonzeros())
-
-{% endhighlight %}
+{% include_example python/mllib/summary_statistics_example.py %}
</div>
</div>
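
The new `SummaryStatisticsExample.scala` referenced above is not reproduced in this diff. A minimal sketch of the `colStats` call, mirroring the Java and Python versions added later in this patch (assumes an existing `SparkContext` named `sc`):

```scala
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD

val observations: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean)        // a dense vector containing the mean value for each column
println(summary.variance)    // column-wise variance
println(summary.numNonzeros) // number of nonzeros in each column
```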
@@ -113,96 +71,38 @@ print(summary.numNonzeros())
## Correlations
Calculating the correlation between two series of data is a common operation in Statistics. In `spark.mllib`
-we provide the flexibility to calculate pairwise correlations among many series. The supported
+we provide the flexibility to calculate pairwise correlations among many series. The supported
correlation methods are currently Pearson's and Spearman's correlation.
-
+
<div class="codetabs">
<div data-lang="scala" markdown="1">
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
-calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.
Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
-{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.linalg._
-import org.apache.spark.mllib.stat.Statistics
-
-val sc: SparkContext = ...
-
-val seriesX: RDD[Double] = ... // a series
-val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX
-
-// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
-// method is not specified, Pearson's method will be used by default.
-val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
-
-val data: RDD[Vector] = ... // note that each Vector is a row and not a column
-
-// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
-// If a method is not specified, Pearson's method will be used by default.
-val correlMatrix: Matrix = Statistics.corr(data, "pearson")
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
</div>
<div data-lang="java" markdown="1">
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` respectively.
Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
-{% highlight java %}
-import org.apache.spark.api.java.JavaDoubleRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.stat.Statistics;
-
-JavaSparkContext jsc = ...
-
-JavaDoubleRDD seriesX = ... // a series
-JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX
-
-// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
-// method is not specified, Pearson's method will be used by default.
-Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
-
-JavaRDD<Vector> data = ... // note that each Vector is a row and not a column
-
-// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
-// If a method is not specified, Pearson's method will be used by default.
-Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
</div>
<div data-lang="python" markdown="1">
-[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to
-calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
+[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to
+calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.stat import Statistics
-
-sc = ... # SparkContext
-
-seriesX = ... # a series
-seriesY = ... # must have the same number of partitions and cardinality as seriesX
-
-# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
-# method is not specified, Pearson's method will be used by default.
-print(Statistics.corr(seriesX, seriesY, method="pearson"))
-
-data = ... # an RDD of Vectors
-# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
-# If a method is not specified, Pearson's method will be used by default.
-print(Statistics.corr(data, method="pearson"))
-
-{% endhighlight %}
+{% include_example python/mllib/correlations_example.py %}
</div>
</div>
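
For reference, Pearson's correlation between two series $x$ and $y$ is the covariance normalized by the standard deviations, and Spearman's correlation is Pearson's correlation computed on the ranks of the observations:

`\[
r = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}
         {\sqrt{\sum_{i=1}^{n}(x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i - \bar{y})^2}}
\]`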
@@ -211,187 +111,76 @@ print(Statistics.corr(data, method="pearson"))
Unlike the other statistics functions, which reside in `spark.mllib`, stratified sampling methods,
`sampleByKey` and `sampleByKeyExact`, can be performed on RDD's of key-value pairs. For stratified
-sampling, the keys can be thought of as a label and the value as a specific attribute. For example
-the key can be man or woman, or document ids, and the respective values can be the list of ages
-of the people in the population or the list of words in the documents. The `sampleByKey` method
-will flip a coin to decide whether an observation will be sampled or not, therefore requires one
-pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant
+sampling, the keys can be thought of as a label and the value as a specific attribute. For example
+the key can be man or woman, or document ids, and the respective values can be the list of ages
+of the people in the population or the list of words in the documents. The `sampleByKey` method
+will flip a coin to decide whether an observation will be sampled or not, therefore requires one
+pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significantly
more resources than the per-stratum simple random sampling used in `sampleByKey`, but will provide
-the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in
+the exact sample size with 99.99% confidence. `sampleByKeyExact` is currently not supported in
python.
<div class="codetabs">
<div data-lang="scala" markdown="1">
[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to
-sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
+sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of
-keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
+keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.
-{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
-import org.apache.spark.rdd.PairRDDFunctions
-
-val sc: SparkContext = ...
-
-val data = ... // an RDD[(K, V)] of any key value pairs
-val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key
-
-// Get an exact sample from each stratum
-val approxSample = data.sampleByKey(withReplacement = false, fractions)
-val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
</div>
<div data-lang="java" markdown="1">
[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to
-sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
+sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of
-keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
+keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.
-{% highlight java %}
-import java.util.Map;
-
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-
-JavaSparkContext jsc = ...
-
-JavaPairRDD<K, V> data = ... // an RDD of any key value pairs
-Map<K, Object> fractions = ... // specify the exact fraction desired from each key
-
-// Get an exact sample from each stratum
-JavaPairRDD<K, V> approxSample = data.sampleByKey(false, fractions);
-JavaPairRDD<K, V> exactSample = data.sampleByKeyExact(false, fractions);
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
</div>
<div data-lang="python" markdown="1">
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to
-sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
-desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
+sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
+desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
set of keys.
*Note:* `sampleByKeyExact()` is currently not supported in Python.
-{% highlight python %}
-
-sc = ... # SparkContext
-
-data = ... # an RDD of any key value pairs
-fractions = ... # specify the exact fraction desired from each key as a dictionary
-
-approxSample = data.sampleByKey(False, fractions);
-
-{% endhighlight %}
+{% include_example python/mllib/stratified_sampling_example.py %}
</div>
</div>
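
The new `StratifiedSamplingExample.scala` is not shown in this diff; a sketch mirroring the Java version added later in this patch (assumes an existing `SparkContext` named `sc`):

```scala
// an RDD[(K, V)] of key-value pairs
val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))

// specify the exact fraction desired from each key
val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

// Get an approximate sample from each stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)
// Get an exact sample from each stratum
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
```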
## Hypothesis testing
-Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
-significant, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
+Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
+significant, that is, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine
-whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
+whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
an input type of `Vector`, whereas the independence test requires a `Matrix` as input.
-`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
+`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
independence tests.
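
For a goodness-of-fit test over $k$ observed frequencies $O_i$ with expected frequencies $E_i$, Pearson's test statistic is

`\[
\chi^2 = \sum_{i=1}^{k} \frac{(O_i - E_i)^2}{E_i},
\]`

compared against a chi-squared distribution with $k-1$ degrees of freedom ($(r-1)(c-1)$ for an $r \times c$ contingency matrix in the independence test).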
<div class="codetabs">
<div data-lang="scala" markdown="1">
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.
-{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.linalg._
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.stat.Statistics._
-
-val sc: SparkContext = ...
-
-val vec: Vector = ... // a vector composed of the frequencies of events
-
-// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-// the test runs against a uniform distribution.
-val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom,
- // test statistic, the method used, and the null hypothesis.
-
-val mat: Matrix = ... // a contingency matrix
-
-// conduct Pearson's independence test on the input contingency matrix
-val independenceTestResult = Statistics.chiSqTest(mat)
-println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...
-
-val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.
-
-// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
-// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-// against the label.
-val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
-var i = 1
-featureTestResults.foreach { result =>
- println(s"Column $i:\n$result")
- i += 1
-} // summary of the test
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
</div>
<div data-lang="java" markdown="1">
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.
Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.stat.Statistics;
-import org.apache.spark.mllib.stat.test.ChiSqTestResult;
-
-JavaSparkContext jsc = ...
-
-Vector vec = ... // a vector composed of the frequencies of events
-
-// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-// the test runs against a uniform distribution.
-ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
-// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
-// and the null hypothesis.
-System.out.println(goodnessOfFitTestResult);
-
-Matrix mat = ... // a contingency matrix
-
-// conduct Pearson's independence test on the input contingency matrix
-ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
-// summary of the test including the p-value, degrees of freedom...
-System.out.println(independenceTestResult);
-
-JavaRDD<LabeledPoint> obs = ... // an RDD of labeled points
-
-// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
-// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-// against the label.
-ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
-int i = 1;
-for (ChiSqTestResult result : featureTestResults) {
- System.out.println("Column " + i + ":");
- System.out.println(result); // summary of the test
- i++;
-}
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -401,50 +190,18 @@ hypothesis tests.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.linalg import Vectors, Matrices
-from pyspark.mllib.regresssion import LabeledPoint
-from pyspark.mllib.stat import Statistics
-
-sc = SparkContext()
-
-vec = Vectors.dense(...) # a vector composed of the frequencies of events
-
-# compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-# the test runs against a uniform distribution.
-goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom,
- # test statistic, the method used, and the null hypothesis.
-
-mat = Matrices.dense(...) # a contingency matrix
-
-# conduct Pearson's independence test on the input contingency matrix
-independenceTestResult = Statistics.chiSqTest(mat)
-print(independenceTestResult) # summary of the test including the p-value, degrees of freedom...
-
-obs = sc.parallelize(...) # LabeledPoint(feature, label) .
-
-# The contingency table is constructed from an RDD of LabeledPoint and used to conduct
-# the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-# against the label.
-featureTestResults = Statistics.chiSqTest(obs)
-
-for i, result in enumerate(featureTestResults):
- print("Column $d:" % (i + 1))
- print(result)
-{% endhighlight %}
+{% include_example python/mllib/hypothesis_testing_example.py %}
</div>
</div>
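
The new `HypothesisTestingExample.scala` is not shown in this diff; a condensed sketch of the same calls, mirroring the Java and Python versions added by this patch (assumes an existing `SparkContext` named `sc`):

```scala
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.Statistics

// a vector composed of the frequencies of events; tested against a uniform
// distribution since no second vector is supplied
val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
println(Statistics.chiSqTest(vec))

// a 3x2 contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), column-major
val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
println(Statistics.chiSqTest(mat)) // Pearson's independence test
```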
Additionally, `spark.mllib` provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov (KS) test
for equality of probability distributions. By providing the name of a theoretical distribution
-(currently solely supported for the normal distribution) and its parameters, or a function to
+(currently solely supported for the normal distribution) and its parameters, or a function to
calculate the cumulative distribution according to a given theoretical distribution, the user can
test the null hypothesis that their sample is drawn from that distribution. In the case that the
user tests against the normal distribution (`distName="norm"`), but does not provide distribution
-parameters, the test initializes to the standard normal distribution and logs an appropriate
+parameters, the test initializes to the standard normal distribution and logs an appropriate
message.
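
The KS statistic being computed is the largest distance between the empirical CDF $F_n$ of the sample and the theoretical CDF $F$:

`\[
D_n = \sup_x \left|F_n(x) - F(x)\right|
\]`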
<div class="codetabs">
@@ -455,21 +212,7 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.stat.Statistics
-
-val data: RDD[Double] = ... // an RDD of sample data
-
-// run a KS test for the sample versus a standard normal distribution
-val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
-println(testResult) // summary of the test including the p-value, test statistic,
- // and null hypothesis
- // if our p-value indicates significance, we can reject the null hypothesis
-
-// perform a KS test using a cumulative distribution function of our making
-val myCDF: Double => Double = ...
-val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -479,23 +222,7 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaDoubleRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-
-import org.apache.spark.mllib.stat.Statistics;
-import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
-
-JavaSparkContext jsc = ...
-JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
-KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
-// summary of the test including the p-value, test statistic,
-// and null hypothesis
-// if our p-value indicates significance, we can reject the null hypothesis
-System.out.println(testResult);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -505,19 +232,7 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.stat import Statistics
-
-parallelData = sc.parallelize([1.0, 2.0, ... ])
-
-# run a KS test for the sample versus a standard normal distribution
-testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
-print(testResult) # summary of the test including the p-value, test statistic,
- # and null hypothesis
- # if our p-value indicates significance, we can reject the null hypothesis
-# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
-# a lambda to calculate the CDF is not made available in the Python API
-{% endhighlight %}
+{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
</div>
</div>
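
The new `HypothesisTestingKolmogorovSmirnovTestExample.scala` is not shown in this diff; a sketch mirroring the Java version added later in this patch, plus the custom-CDF variant from the removed snippet (assumes an existing `SparkContext` named `sc`; the logistic CDF is an arbitrary illustration):

```scala
import org.apache.spark.mllib.stat.Statistics

val data = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD[Double] of sample data

// run a KS test for the sample versus a standard normal distribution
println(Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0))

// run a KS test against a user-supplied CDF (here, the standard logistic CDF)
val myCDF = (x: Double) => 1.0 / (1.0 + math.exp(-x))
println(Statistics.kolmogorovSmirnovTest(data, myCDF))
```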
@@ -651,21 +366,7 @@ to do so.
Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.stat.KernelDensity
-import org.apache.spark.rdd.RDD
-
-val data: RDD[Double] = ... // an RDD of sample data
-
-// Construct the density estimator with the sample data and a standard deviation for the Gaussian
-// kernels
-val kd = new KernelDensity()
- .setSample(data)
- .setBandwidth(3.0)
-
-// Find density estimates for the given values
-val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -675,21 +376,7 @@ to do so.
Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.
-{% highlight java %}
-import org.apache.spark.mllib.stat.KernelDensity;
-import org.apache.spark.rdd.RDD;
-
-RDD<Double> data = ... // an RDD of sample data
-
-// Construct the density estimator with the sample data and a standard deviation for the Gaussian
-// kernels
-KernelDensity kd = new KernelDensity()
- .setSample(data)
- .setBandwidth(3.0);
-
-// Find density estimates for the given values
-double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -699,20 +386,7 @@ to do so.
Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.stat import KernelDensity
-
-data = ... # an RDD of sample data
-
-# Construct the density estimator with the sample data and a standard deviation for the Gaussian
-# kernels
-kd = KernelDensity()
-kd.setSample(data)
-kd.setBandwidth(3.0)
-
-# Find density estimates for the given values
-densities = kd.estimate([-1.0, 2.0, 5.0])
-{% endhighlight %}
+{% include_example python/mllib/kernel_density_estimation_example.py %}
</div>
</div>
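
With bandwidth $h$ (3.0 in these examples) and Gaussian kernels, the estimator evaluates

`\[
\hat{f}_h(x) = \frac{1}{nh}\sum_{i=1}^{n} K\!\left(\frac{x - x_i}{h}\right), \qquad
K(u) = \frac{1}{\sqrt{2\pi}} e^{-u^2/2},
\]`

at each query point $x$, where $x_1, \dots, x_n$ are the sample values.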
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
new file mode 100644
index 0000000000..fd19b43504
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+public class JavaCorrelationsExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+
+ // $example on$
+ JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
+ Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series
+
+ // must have the same number of partitions and cardinality as seriesX
+ JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
+ Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));
+
+ // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
+ // If a method is not specified, Pearson's method will be used by default.
+ Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
+ System.out.println("Correlation is: " + correlation);
+
+ // note that each Vector is a row and not a column
+ JavaRDD<Vector> data = jsc.parallelize(
+ Arrays.asList(
+ Vectors.dense(1.0, 10.0, 100.0),
+ Vectors.dense(2.0, 20.0, 200.0),
+ Vectors.dense(5.0, 33.0, 366.0)
+ )
+ );
+
+ // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+ // If a method is not specified, Pearson's method will be used by default.
+ Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
+ System.out.println(correlMatrix.toString());
+ // $example off$
+
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
new file mode 100644
index 0000000000..b48b95ff1d
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrices;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.ChiSqTestResult;
+// $example off$
+
+public class JavaHypothesisTestingExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+
+ // $example on$
+ // a vector composed of the frequencies of events
+ Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
+
+ // compute the goodness of fit. If a second vector to test against is not supplied
+ // as a parameter, the test runs against a uniform distribution.
+ ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
+ // summary of the test including the p-value, degrees of freedom, test statistic,
+ // the method used, and the null hypothesis.
+ System.out.println(goodnessOfFitTestResult + "\n");
+
+ // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+ Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+ // conduct Pearson's independence test on the input contingency matrix
+ ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
+ // summary of the test including the p-value, degrees of freedom...
+ System.out.println(independenceTestResult + "\n");
+
+ // an RDD of labeled points
+ JavaRDD<LabeledPoint> obs = jsc.parallelize(
+ Arrays.asList(
+ new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+ new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+ new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+ )
+ );
+
+ // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+ // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+ // against the label.
+ ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
+ int i = 1;
+ for (ChiSqTestResult result : featureTestResults) {
+ System.out.println("Column " + i + ":");
+ System.out.println(result + "\n"); // summary of the test
+ i++;
+ }
+ // $example off$
+
+ jsc.stop();
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
new file mode 100644
index 0000000000..fe611c9ae6
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
+// $example off$
+
+public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
+ public static void main(String[] args) {
+
+ SparkConf conf =
+ new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+
+ // $example on$
+ JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
+ KolmogorovSmirnovTestResult testResult =
+ Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
+ // summary of the test including the p-value, test statistic, and null hypothesis
+ // if our p-value indicates significance, we can reject the null hypothesis
+ System.out.println(testResult);
+ // $example off$
+
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
new file mode 100644
index 0000000000..41de0d90ec
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.stat.KernelDensity;
+// $example off$
+
+public class JavaKernelDensityEstimationExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+
+ // $example on$
+ // an RDD of sample data
+ JavaRDD<Double> data = jsc.parallelize(
+ Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));
+
+ // Construct the density estimator with the sample data
+ // and a standard deviation for the Gaussian kernels
+ KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);
+
+ // Find density estimates for the given values
+ double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});
+
+ System.out.println(Arrays.toString(densities));
+ // $example off$
+
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
new file mode 100644
index 0000000000..f5a451019b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import java.util.*;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.function.VoidFunction;
+// $example off$
+
+public class JavaStratifiedSamplingExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+
+ // $example on$
+ List<Tuple2<Integer, Character>> list = new ArrayList<>(
+   Arrays.asList(
+     new Tuple2<>(1, 'a'),
+     new Tuple2<>(1, 'b'),
+     new Tuple2<>(2, 'c'),
+     new Tuple2<>(2, 'd'),
+     new Tuple2<>(2, 'e'),
+     new Tuple2<>(3, 'f')
+   )
+ );
+
+ JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);
+
+ // specify the exact fraction desired from each key (a Map<K, Object>)
+ ImmutableMap<Integer, Object> fractions =
+   ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);
+
+ // Get an approximate sample from each stratum
+ JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
+ // Get an exact sample from each stratum
+ JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
+ // $example off$
+
+ System.out.println("approxSample size is " + approxSample.collect().size());
+ for (Tuple2<Integer, Character> t : approxSample.collect()) {
+ System.out.println(t._1() + " " + t._2());
+ }
+
+ System.out.println("exactSample size is " + exactSample.collect().size());
+ for (Tuple2<Integer, Character> t : exactSample.collect()) {
+ System.out.println(t._1() + " " + t._2());
+ }
+
+ jsc.stop();
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
new file mode 100644
index 0000000000..278706bc8f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+public class JavaSummaryStatisticsExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+
+ // $example on$
+ JavaRDD<Vector> mat = jsc.parallelize(
+ Arrays.asList(
+ Vectors.dense(1.0, 10.0, 100.0),
+ Vectors.dense(2.0, 20.0, 200.0),
+ Vectors.dense(3.0, 30.0, 300.0)
+ )
+ ); // an RDD of Vectors
+
+ // Compute column summary statistics.
+ MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
+ System.out.println(summary.mean()); // a dense vector containing the mean value for each column
+ System.out.println(summary.variance()); // column-wise variance
+ System.out.println(summary.numNonzeros()); // number of nonzeros in each column
+ // $example off$
+
+ jsc.stop();
+ }
+}
diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py
new file mode 100644
index 0000000000..66d18f6e5d
--- /dev/null
+++ b/examples/src/main/python/mllib/correlations_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+import numpy as np
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="CorrelationsExample") # SparkContext
+
+ # $example on$
+ seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series
+ # seriesY must have the same number of partitions and cardinality as seriesX
+ seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])
+
+ # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
+ # If a method is not specified, Pearson's method will be used by default.
+ print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))
+
+ data = sc.parallelize(
+ [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
+ ) # an RDD of Vectors
+
+ # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+ # If a method is not specified, Pearson's method will be used by default.
+ print(Statistics.corr(data, method="pearson"))
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
new file mode 100644
index 0000000000..e566ead0d3
--- /dev/null
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.linalg import Matrices, Vectors
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="HypothesisTestingExample")
+
+ # $example on$
+ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events
+
+ # compute the goodness of fit. If a second vector to test against
+ # is not supplied as a parameter, the test runs against a uniform distribution.
+ goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+
+ # summary of the test including the p-value, degrees of freedom,
+ # test statistic, the method used, and the null hypothesis.
+ print("%s\n" % goodnessOfFitTestResult)
+
+ mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix
+
+ # conduct Pearson's independence test on the input contingency matrix
+ independenceTestResult = Statistics.chiSqTest(mat)
+
+ # summary of the test including the p-value, degrees of freedom,
+ # test statistic, the method used, and the null hypothesis.
+ print("%s\n" % independenceTestResult)
+
+ obs = sc.parallelize(
+ [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
+ LabeledPoint(1.0, [1.0, 2.0, 0.0]),
+ LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
+ ) # LabeledPoint(label, features)
+
+ # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+ # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+ # against the label.
+ featureTestResults = Statistics.chiSqTest(obs)
+
+ for i, result in enumerate(featureTestResults):
+     print("Column %d:\n%s" % (i + 1, result))
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
new file mode 100644
index 0000000000..ef380dee79
--- /dev/null
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")
+
+ # $example on$
+ parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
+
+ # run a KS test for the sample versus a standard normal distribution
+ testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
+ # summary of the test including the p-value, test statistic, and null hypothesis
+ # if our p-value indicates significance, we can reject the null hypothesis
+ # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
+ # a lambda to calculate the CDF is not made available in the Python API
+ print(testResult)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py
new file mode 100644
index 0000000000..3e8f7241a4
--- /dev/null
+++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.stat import KernelDensity
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext
+
+ # $example on$
+ # an RDD of sample data
+ data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])
+
+ # Construct the density estimator with the sample data and a standard deviation for the Gaussian
+ # kernels
+ kd = KernelDensity()
+ kd.setSample(data)
+ kd.setBandwidth(3.0)
+
+ # Find density estimates for the given values
+ densities = kd.estimate([-1.0, 2.0, 5.0])
+ # $example off$
+
+ print(densities)
+
+ sc.stop()
diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py
new file mode 100644
index 0000000000..a13f8f08dd
--- /dev/null
+++ b/examples/src/main/python/mllib/stratified_sampling_example.py
@@ -0,0 +1,38 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext
+
+ # $example on$
+ # an RDD of any key value pairs
+ data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')])
+
+ # specify the exact fraction desired from each key as a dictionary
+ fractions = {1: 0.1, 2: 0.6, 3: 0.3}
+
+ approxSample = data.sampleByKey(False, fractions)
+ # $example off$
+
+ for each in approxSample.collect():
+     print(each)
+
+ sc.stop()
diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py
new file mode 100644
index 0000000000..d55d1a2c2d
--- /dev/null
+++ b/examples/src/main/python/mllib/summary_statistics_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+import numpy as np
+
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext
+
+ # $example on$
+ mat = sc.parallelize(
+ [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
+ ) # an RDD of Vectors
+
+ # Compute column summary statistics.
+ summary = Statistics.colStats(mat)
+ print(summary.mean()) # a dense vector containing the mean value for each column
+ print(summary.variance()) # column-wise variance
+ print(summary.numNonzeros()) # number of nonzeros in each column
+ # $example off$
+
+ sc.stop()
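A minimal local cross-check, assuming the same three rows as in the example: `colStats` reports the unbiased sample variance, which corresponds to NumPy's `ddof=1`.

{% highlight python %}
import numpy as np

local = np.array([[1.0, 10.0, 100.0],
                  [2.0, 20.0, 200.0],
                  [3.0, 30.0, 300.0]])
print(local.mean(axis=0))         # should match summary.mean()
print(local.var(axis=0, ddof=1))  # should match summary.variance()
{% endhighlight %}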
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
new file mode 100644
index 0000000000..1202caf534
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object CorrelationsExample {
+
+ def main(args: Array[String]): Unit = {
+
+ val conf = new SparkConf().setAppName("CorrelationsExample")
+ val sc = new SparkContext(conf)
+
+ // $example on$
+ val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series
+ // must have the same number of partitions and cardinality as seriesX
+ val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555))
+
+    // compute the correlation using Pearson's method. Use "spearman" for Spearman's method. If a
+ // method is not specified, Pearson's method will be used by default.
+ val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
+ println(s"Correlation is: $correlation")
+
+ val data: RDD[Vector] = sc.parallelize(
+ Seq(
+ Vectors.dense(1.0, 10.0, 100.0),
+ Vectors.dense(2.0, 20.0, 200.0),
+ Vectors.dense(5.0, 33.0, 366.0))
+ ) // note that each Vector is a row and not a column
+
+    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's
+    // method. If a method is not specified, Pearson's method will be used by default.
+ val correlMatrix: Matrix = Statistics.corr(data, "pearson")
+ println(correlMatrix.toString)
+ // $example off$
+
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
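For reference, the scalar value printed above is the sample Pearson coefficient

`\[
\rho_{X,Y} = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2}\, \sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}},
\]`

and passing `"spearman"` computes the same quantity on the ranks of the data.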
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
new file mode 100644
index 0000000000..0d391a3637
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object HypothesisTestingExample {
+
+  def main(args: Array[String]): Unit = {
+
+ val conf = new SparkConf().setAppName("HypothesisTestingExample")
+ val sc = new SparkContext(conf)
+
+ // $example on$
+ // a vector composed of the frequencies of events
+ val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
+
+ // compute the goodness of fit. If a second vector to test against is not supplied
+ // as a parameter, the test runs against a uniform distribution.
+ val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+ // summary of the test including the p-value, degrees of freedom, test statistic, the method
+ // used, and the null hypothesis.
+ println(s"$goodnessOfFitTestResult\n")
+
+ // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+ val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
+
+ // conduct Pearson's independence test on the input contingency matrix
+ val independenceTestResult = Statistics.chiSqTest(mat)
+ // summary of the test including the p-value, degrees of freedom
+ println(s"$independenceTestResult\n")
+
+ val obs: RDD[LabeledPoint] =
+ sc.parallelize(
+ Seq(
+ LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+ LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+          LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+        )
+      ) // (feature, label) pairs.
+
+ // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+ // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+ // against the label.
+ val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
+ featureTestResults.zipWithIndex.foreach { case (k, v) =>
+ println("Column " + (v + 1).toString + ":")
+ println(k)
+ } // summary of the test
+ // $example off$
+
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
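All three `chiSqTest` variants above are based on Pearson's statistic, which compares the observed counts `O_i` with the counts `E_i` expected under the null hypothesis:

`\[
\chi^2 = \sum_i \frac{(O_i - E_i)^2}{E_i}
\]`

For the 3 × 2 contingency matrix in the example, the independence test has `(3 - 1)(2 - 1) = 2` degrees of freedom.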
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
new file mode 100644
index 0000000000..840874cf3c
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object HypothesisTestingKolmogorovSmirnovTestExample {
+
+ def main(args: Array[String]): Unit = {
+
+ val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
+ val sc = new SparkContext(conf)
+
+ // $example on$
+ val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data
+
+ // run a KS test for the sample versus a standard normal distribution
+ val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+    // summary of the test including the p-value, test statistic, and null hypothesis. If our
+    // p-value indicates significance, we can reject the null hypothesis.
+ println(testResult)
+ println()
+
+    // perform a KS test using a cumulative distribution function of our own making
+ val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
+ val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
+ println(testResult2)
+ // $example off$
+
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
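Both calls run the 1-sample, 2-sided Kolmogorov-Smirnov test, whose statistic is the largest distance between the empirical CDF of the sample, `F_n`, and the reference CDF `F` (the standard normal in the first call, the user-supplied CDF in the second):

`\[
D_n = \sup_x \left| F_n(x) - F(x) \right|
\]`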
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala
new file mode 100644
index 0000000000..cc5d159b36
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.stat.KernelDensity
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object KernelDensityEstimationExample {
+
+ def main(args: Array[String]): Unit = {
+
+ val conf = new SparkConf().setAppName("KernelDensityEstimationExample")
+ val sc = new SparkContext(conf)
+
+ // $example on$
+ // an RDD of sample data
+ val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9))
+
+ // Construct the density estimator with the sample data and a standard deviation
+ // for the Gaussian kernels
+ val kd = new KernelDensity()
+ .setSample(data)
+ .setBandwidth(3.0)
+
+ // Find density estimates for the given values
+ val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
+ // $example off$
+
+ densities.foreach(println)
+
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
new file mode 100644
index 0000000000..16b074ef60
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StratifiedSamplingExample {
+
+ def main(args: Array[String]): Unit = {
+
+ val conf = new SparkConf().setAppName("StratifiedSamplingExample")
+ val sc = new SparkContext(conf)
+
+ // $example on$
+    // an RDD[(K, V)] of key-value pairs
+ val data = sc.parallelize(
+ Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
+
+ // specify the exact fraction desired from each key
+ val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)
+
+ // Get an approximate sample from each stratum
+ val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions)
+ // Get an exact sample from each stratum
+ val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions)
+ // $example off$
+
+ println("approxSample size is " + approxSample.collect().size.toString)
+ approxSample.collect().foreach(println)
+
+ println("exactSample its size is " + exactSample.collect().size.toString)
+ exactSample.collect().foreach(println)
+
+ sc.stop()
+ }
+}
+// scalastyle:on println
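The trade-off between the two methods: `sampleByKey` needs only one pass over the data but yields strata whose sizes merely approximate the requested fractions, while `sampleByKeyExact` takes additional passes in order to return exactly `ceil(f_k * n_k)` items for each key `k`. For the six pairs above that works out to

`\[
\lceil 0.1 \cdot 2 \rceil = 1, \quad \lceil 0.6 \cdot 3 \rceil = 2, \quad \lceil 0.3 \cdot 1 \rceil = 1,
\]`

i.e. an exact sample of four items in total.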
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
new file mode 100644
index 0000000000..948b443c0a
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
+// $example off$
+
+object SummaryStatisticsExample {
+
+ def main(args: Array[String]): Unit = {
+
+ val conf = new SparkConf().setAppName("SummaryStatisticsExample")
+ val sc = new SparkContext(conf)
+
+ // $example on$
+ val observations = sc.parallelize(
+ Seq(
+ Vectors.dense(1.0, 10.0, 100.0),
+ Vectors.dense(2.0, 20.0, 200.0),
+ Vectors.dense(3.0, 30.0, 300.0)
+ )
+ )
+
+ // Compute column summary statistics.
+ val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
+ println(summary.mean) // a dense vector containing the mean value for each column
+ println(summary.variance) // column-wise variance
+ println(summary.numNonzeros) // number of nonzeros in each column
+ // $example off$
+
+ sc.stop()
+ }
+}
+// scalastyle:on println