Diffstat (limited to 'docs/mllib-statistics.md')
-rw-r--r--  docs/mllib-statistics.md | 34
1 file changed, 34 insertions, 0 deletions
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 6acfc71d7b..2c7c9ed693 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -38,6 +38,8 @@ available in `Statistics`.
which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the
total count.
+Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.
+
{% highlight scala %}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
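For quick reference while reviewing, here is a minimal, self-contained Scala sketch of the `Statistics.colStats` usage this hunk documents; the `sc` SparkContext handle and the toy observations are illustrative assumptions, not part of the patch:
{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

// a small RDD[Vector] of observations (toy data)
val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// compute column-wise summary statistics
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean)        // mean of each column
println(summary.variance)    // variance of each column
println(summary.numNonzeros) // number of nonzeros in each column
{% endhighlight %}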
@@ -60,6 +62,8 @@ println(summary.numNonzeros) // number of nonzeros in each column
which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the
total count.
+Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.
+
{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -86,6 +90,8 @@ System.out.println(summary.numNonzeros()); // number of nonzeros in each column
which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the
total count.
+Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API.
+
{% highlight python %}
from pyspark.mllib.stat import Statistics
@@ -116,6 +122,8 @@ correlation methods are currently Pearson's and Spearman's correlation.
calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.
+Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
+
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg._
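As context for the correlation hunks, a hedged Scala sketch of the two `Statistics.corr` overloads described above (the `sc` handle and the sample series are assumptions for illustration):
{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

// two series of the same length (toy data)
val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))

// correlation between two RDD[Double]s yields a Double ("spearman" is also accepted)
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// correlation over an RDD[Vector] yields the pairwise correlation Matrix
val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0)))
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}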
@@ -144,6 +152,8 @@ val correlMatrix: Matrix = Statistics.corr(data, "pearson")
calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` respectively.
+Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
+
{% highlight java %}
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -173,6 +183,8 @@ Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.
+Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
+
{% highlight python %}
from pyspark.mllib.stat import Statistics
@@ -338,6 +350,8 @@ featureTestResults.foreach { result =>
run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.
+Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.
+
{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -385,6 +399,8 @@ for (ChiSqTestResult result : featureTestResults) {
run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.
+Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
+
{% highlight python %}
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
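The chi-squared hunks above only show the opening imports; here is a minimal Scala sketch of `Statistics.chiSqTest` covering both the goodness-of-fit and independence variants (the observed frequencies and the contingency matrix are made-up illustration data):
{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.Statistics

// goodness-of-fit test on an observed frequency vector
// (a uniform expected distribution is assumed when none is supplied)
val observed = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
val goodnessOfFitResult = Statistics.chiSqTest(observed)
println(goodnessOfFitResult) // statistic, degrees of freedom, p-value, null hypothesis

// independence test on a contingency matrix (column-major, 3 rows x 2 columns)
val contingency = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
val independenceResult = Statistics.chiSqTest(contingency)
println(independenceResult)
{% endhighlight %}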
@@ -437,6 +453,8 @@ message.
run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
and interpret the hypothesis tests.
+Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
+
{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
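A hedged Scala sketch of the Kolmogorov-Smirnov test described above, covering both the named-distribution and user-supplied-CDF forms (the `sc` handle, the sample values, and the stand-in CDF are assumptions):
{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))

// 1-sample, 2-sided KS test against the standard normal distribution N(0, 1)
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult) // test statistic, p-value, null hypothesis

// the same test against a user-supplied CDF (Exponential(1) used as a stand-in)
val myCDF = (x: Double) => 1.0 - math.exp(-x)
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
{% endhighlight %}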
@@ -459,6 +477,8 @@ val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
and interpret the hypothesis tests.
+Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
+
{% highlight java %}
import java.util.Arrays;
@@ -483,6 +503,8 @@ System.out.println(testResult);
run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
and interpret the hypothesis tests.
+Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
+
{% highlight python %}
from pyspark.mllib.stat import Statistics
@@ -513,6 +535,8 @@ methods to generate random double RDDs or vector RDDs.
The following example generates a random double RDD whose values follow the standard normal
distribution `N(0, 1)`, and then maps it to `N(1, 4)`.
+Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API.
+
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.random.RandomRDDs._
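A minimal Scala sketch of the random data generation described above, showing the `N(0, 1)` draw and the map to `N(1, 4)` (the `sc` handle and the RDD size are illustrative assumptions):
{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._

// 1 million i.i.d. samples from N(0, 1), spread across 10 partitions
val u = normalRDD(sc, 1000000L, 10)

// shift and scale to obtain samples from N(1, 4): mean 1, standard deviation 2
val v = u.map(x => 1.0 + 2.0 * x)
{% endhighlight %}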
@@ -533,6 +557,8 @@ methods to generate random double RDDs or vector RDDs.
The following example generates a random double RDD whose values follow the standard normal
distribution `N(0, 1)`, and then maps it to `N(1, 4)`.
+Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs.html) for details on the API.
+
{% highlight java %}
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaDoubleRDD;
@@ -559,6 +585,8 @@ methods to generate random double RDDs or vector RDDs.
The following example generates a random double RDD whose values follow the standard normal
distribution `N(0, 1)`, and then maps it to `N(1, 4)`.
+Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API.
+
{% highlight python %}
from pyspark.mllib.random import RandomRDDs
@@ -589,6 +617,8 @@ mean of PDFs of normal distributions centered around each of the samples.
to compute kernel density estimates from an RDD of samples. The following example demonstrates how
to do so.
+Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API.
+
{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD
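A brief Scala sketch of the `KernelDensity` usage described above (the `sc` handle, the sample values, the bandwidth, and the evaluation points are illustrative assumptions):
{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD

// an RDD of sample points drawn from the distribution to be estimated
val data: RDD[Double] = sc.parallelize(Seq(1.0, 1.5, 3.0, 3.5, 4.0))

// configure the estimator with the sample and a Gaussian kernel bandwidth
val kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0)

// evaluate the estimated probability density at the given points
val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
{% endhighlight %}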
@@ -611,6 +641,8 @@ val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
to compute kernel density estimates from an RDD of samples. The following example demonstrates how
to do so.
+Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.
+
{% highlight java %}
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.rdd.RDD;
@@ -633,6 +665,8 @@ double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
to compute kernel density estimates from an RDD of samples. The following example demonstrates how
to do so.
+Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API.
+
{% highlight python %}
from pyspark.mllib.stat import KernelDensity