Diffstat (limited to 'docs/mllib-statistics.md')
-rw-r--r-- | docs/mllib-statistics.md | 34 |
1 file changed, 34 insertions, 0 deletions
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 6acfc71d7b..2c7c9ed693 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -38,6 +38,8 @@ available in `Statistics`.
 which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as
 the total count.
 
+Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.
+
 {% highlight scala %}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
@@ -60,6 +62,8 @@ println(summary.numNonzeros) // number of nonzeros in each column
 which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as
 the total count.
 
+Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.
+
 {% highlight java %}
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -86,6 +90,8 @@ System.out.println(summary.numNonzeros()); // number of nonzeros in each column
 which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as
 the total count.
 
+Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API.
+
 {% highlight python %}
 from pyspark.mllib.stat import Statistics
 
@@ -116,6 +122,8 @@ correlation methods are currently Pearson's and Spearman's correlation.
 calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
 an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.
 
+Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
+
 {% highlight scala %}
 import org.apache.spark.SparkContext
 import org.apache.spark.mllib.linalg._
@@ -144,6 +152,8 @@ val correlMatrix: Matrix = Statistics.corr(data, "pearson")
 calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
 a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` respectively.
 
+Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
+
 {% highlight java %}
 import org.apache.spark.api.java.JavaDoubleRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -173,6 +183,8 @@ Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
 calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
 an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.
 
+Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
+
 {% highlight python %}
 from pyspark.mllib.stat import Statistics
 
@@ -338,6 +350,8 @@ featureTestResults.foreach { result =>
 run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
 hypothesis tests.
 
+Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.
+
 {% highlight java %}
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
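The hypothesis-testing hunks here only add documentation links; for quick reference, the chi-squared API that the new links describe can be exercised with a minimal Scala sketch like the one below (the observed frequencies are invented for illustration, and with no expected distribution supplied, `chiSqTest` tests against a uniform one):

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.test.ChiSqTestResult

// Goodness-of-fit test: compare an observed frequency vector against the
// default (uniform) expected distribution.
val observed = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
val goodnessOfFitResult: ChiSqTestResult = Statistics.chiSqTest(observed)
println(goodnessOfFitResult) // method, degrees of freedom, statistic, p-value
{% endhighlight %}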
@@ -385,6 +399,8 @@ for (ChiSqTestResult result : featureTestResults) {
 run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
 hypothesis tests.
 
+Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
+
 {% highlight python %}
 from pyspark import SparkContext
 from pyspark.mllib.linalg import Vectors, Matrices
@@ -437,6 +453,8 @@ message.
 run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
 and interpret the hypothesis tests.
 
+Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
+
 {% highlight scala %}
 import org.apache.spark.mllib.stat.Statistics
 
@@ -459,6 +477,8 @@ val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
 run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
 and interpret the hypothesis tests.
 
+Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
+
 {% highlight java %}
 import java.util.Arrays;
 
@@ -483,6 +503,8 @@ System.out.println(testResult);
 run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
 and interpret the hypothesis tests.
 
+Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
+
 {% highlight python %}
 from pyspark.mllib.stat import Statistics
 
@@ -513,6 +535,8 @@ methods to generate random double RDDs or vector RDDs.
 The following example generates a random double RDD, whose values follow the standard normal
 distribution `N(0, 1)`, and then maps it to `N(1, 4)`.
 
+Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API.
+
 {% highlight scala %}
 import org.apache.spark.SparkContext
 import org.apache.spark.mllib.random.RandomRDDs._
@@ -533,6 +557,8 @@ methods to generate random double RDDs or vector RDDs.
 The following example generates a random double RDD, whose values follow the standard normal
 distribution `N(0, 1)`, and then maps it to `N(1, 4)`.
 
+Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs.html) for details on the API.
+
 {% highlight java %}
 import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaDoubleRDD;
@@ -559,6 +585,8 @@ methods to generate random double RDDs or vector RDDs.
 The following example generates a random double RDD, whose values follow the standard normal
 distribution `N(0, 1)`, and then maps it to `N(1, 4)`.
 
+Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API.
+
 {% highlight python %}
 from pyspark.mllib.random import RandomRDDs
 
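The three hunks above annotate the random data generation examples. As a quick reference for the `RandomRDDs` API behind the new links, here is a minimal Scala sketch; it assumes the `sc` that `spark-shell` provides and mirrors the `N(0, 1)` to `N(1, 4)` transformation described in the context lines:

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._

// A random RDD of one million i.i.d. values drawn from N(0, 1), in 10 partitions.
val u = normalRDD(sc, 1000000L, 10)
// Shift and scale to N(1, 4): mean 1, standard deviation 2.
val v = u.map(x => 1.0 + 2.0 * x)
{% endhighlight %}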
@@ -589,6 +617,8 @@ mean of PDFs of normal distributions centered around each of the samples.
 to compute kernel density estimates from an RDD of samples. The following example demonstrates
 how to do so.
 
+Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API.
+
 {% highlight scala %}
 import org.apache.spark.mllib.stat.KernelDensity
 import org.apache.spark.rdd.RDD
@@ -611,6 +641,8 @@ val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
 to compute kernel density estimates from an RDD of samples. The following example demonstrates
 how to do so.
 
+Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.
+
 {% highlight java %}
 import org.apache.spark.mllib.stat.KernelDensity;
 import org.apache.spark.rdd.RDD;
@@ -633,6 +665,8 @@ double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
 to compute kernel density estimates from an RDD of samples. The following example demonstrates
 how to do so.
 
+Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API.
+
 {% highlight python %}
 from pyspark.mllib.stat import KernelDensity
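Finally, as a reference for the kernel density estimation API linked in the last three hunks, a minimal Scala sketch (again assuming `spark-shell`'s `sc`; the sample values and the bandwidth of 3.0 are arbitrary choices for illustration):

{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD

// Sample points whose underlying density we want to estimate.
val data: RDD[Double] = sc.parallelize(Seq(1.0, 1.5, 2.0, 2.2, 2.5, 8.0))

// A Gaussian kernel density estimator over the sample.
val kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0)

// Evaluate the estimated density at the given points.
val densities: Array[Double] = kd.estimate(Array(-1.0, 2.0, 5.0))
{% endhighlight %}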