aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-statistics.md
diff options
context:
space:
mode:
authorjose.cambronero <jose.cambronero@cloudera.com>2015-08-17 19:09:45 -0700
committerXiangrui Meng <meng@databricks.com>2015-08-17 19:09:45 -0700
commitc90c605dc6a876aef3cc204ac15cd65bab9743ad (patch)
tree9935f2ccae880dfe797813383abcf38ed2053093 /docs/mllib-statistics.md
parentf9d1a92aa1bac4494022d78559b871149579e6e8 (diff)
downloadspark-c90c605dc6a876aef3cc204ac15cd65bab9743ad.tar.gz
spark-c90c605dc6a876aef3cc204ac15cd65bab9743ad.tar.bz2
spark-c90c605dc6a876aef3cc204ac15cd65bab9743ad.zip
[SPARK-9902] [MLLIB] Add Java and Python examples to user guide for 1-sample KS test
added doc examples for python. Author: jose.cambronero <jose.cambronero@cloudera.com> Closes #8154 from josepablocam/spark_9902.
Diffstat (limited to 'docs/mllib-statistics.md')
-rw-r--r--docs/mllib-statistics.md51
1 files changed, 47 insertions, 4 deletions
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 80a9d064c0..6acfc71d7b 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -438,22 +438,65 @@ run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstra
and interpret the hypothesis tests.
{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.stat.Statistics._
+import org.apache.spark.mllib.stat.Statistics
val data: RDD[Double] = ... // an RDD of sample data
// run a KS test for the sample versus a standard normal distribution
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult) // summary of the test including the p-value, test statistic,
- // and null hypothesis
- // if our p-value indicates significance, we can reject the null hypothesis
+ // and null hypothesis
+ // if our p-value indicates significance, we can reject the null hypothesis
// perform a KS test using a cumulative distribution function of our making
val myCDF: Double => Double = ...
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
{% endhighlight %}
</div>
+
+<div data-lang="java" markdown="1">
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
+and interpret the hypothesis tests.
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
+
+JavaSparkContext jsc = ...
+JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
+KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
+// summary of the test including the p-value, test statistic,
+// and null hypothesis
+// if our p-value indicates significance, we can reject the null hypothesis
+System.out.println(testResult);
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to
+run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
+and interpret the hypothesis tests.
+
+{% highlight python %}
+from pyspark.mllib.stat import Statistics
+
+parallelData = sc.parallelize([1.0, 2.0, ... ])
+
+# run a KS test for the sample versus a standard normal distribution
+testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
+print(testResult) # summary of the test including the p-value, test statistic,
+ # and null hypothesis
+ # if our p-value indicates significance, we can reject the null hypothesis
+# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
+# a lambda to calculate the CDF is not made available in the Python API
+{% endhighlight %}
+</div>
</div>