aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main/java/org/apache
diff options
context:
space:
mode:
authorXin Ren <iamshrek@126.com>2016-03-24 09:34:54 +0000
committerSean Owen <sowen@cloudera.com>2016-03-24 09:34:54 +0000
commitdd9ca7b9607cb4ade287b646905d92064ac94d6f (patch)
tree07463b657cf83cf714b59076f4ef5e18d6a589be /examples/src/main/java/org/apache
parent048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2 (diff)
downloadspark-dd9ca7b9607cb4ade287b646905d92064ac94d6f.tar.gz
spark-dd9ca7b9607cb4ade287b646905d92064ac94d6f.tar.bz2
spark-dd9ca7b9607cb4ade287b646905d92064ac94d6f.zip
[SPARK-13019][DOCS] fix for scala-2.10 build: Replace example code in mllib-statistics.md using include_example
## What changes were proposed in this pull request? This PR for ticket SPARK-13019 is based on a previous PR (https://github.com/apache/spark/pull/11108). Since that PR (https://github.com/apache/spark/pull/11108) breaks the scala-2.10 build, more work is needed to fix the build errors. What is new in this PR is the addition of the keyword argument for 'fractions': ` val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions)` ` val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions)` I reopened the ticket on JIRA, but I don't know how to reopen a GitHub pull request, so I am submitting a new pull request instead. ## How was this patch tested? Manual build testing on a local machine, with the build based on scala-2.10. Author: Xin Ren <iamshrek@126.com> Closes #11901 from keypointt/SPARK-13019.
Diffstat (limited to 'examples/src/main/java/org/apache')
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java70
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java84
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java49
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java53
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java75
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java56
6 files changed, 387 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
new file mode 100644
index 0000000000..fd19b43504
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+/**
+ * Prints the Pearson correlation between two series of doubles, then the Pearson
+ * correlation matrix computed from a small RDD of vectors.
+ */
+public class JavaCorrelationsExample {
+  public static void main(String[] args) {
+    JavaSparkContext jsc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaCorrelationsExample"));
+
+    // $example on$
+    // first series
+    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));
+
+    // second series; it must have the same number of partitions and cardinality as seriesX
+    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));
+
+    // Correlate the two series with Pearson's method; pass "spearman" for Spearman's method.
+    // Pearson's method is also the default when no method is specified.
+    double corrXY = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
+    System.out.println("Correlation is: " + corrXY);
+
+    // each Vector is a row, not a column
+    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(
+      Vectors.dense(1.0, 10.0, 100.0),
+      Vectors.dense(2.0, 20.0, 200.0),
+      Vectors.dense(5.0, 33.0, 366.0)
+    ));
+
+    // Correlation matrix for the vectors, again with Pearson's method ("spearman" also works).
+    // Pearson's method is also the default when no method is specified.
+    Matrix corrMatrix = Statistics.corr(rows.rdd(), "pearson");
+    System.out.println(corrMatrix);
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
new file mode 100644
index 0000000000..b48b95ff1d
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrices;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.ChiSqTestResult;
+// $example off$
+
+/**
+ * Runs the three {@code Statistics.chiSqTest} variants: goodness of fit on a vector,
+ * independence on a contingency matrix, and per-feature independence on an RDD of
+ * labeled points. Each result is printed to standard output.
+ */
+public class JavaHypothesisTestingExample {
+  public static void main(String[] args) {
+    JavaSparkContext jsc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaHypothesisTestingExample"));
+
+    // $example on$
+    // event frequencies, packed into a vector
+    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
+
+    // Goodness of fit: since no second vector to test against is supplied,
+    // the test runs against a uniform distribution.
+    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
+    // The printed summary includes the p-value, degrees of freedom, test statistic,
+    // the method used, and the null hypothesis.
+    System.out.println(goodnessOfFitTestResult + "\n");
+
+    // contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+    // Pearson's independence test on that contingency matrix
+    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
+    // again a summary with the p-value, degrees of freedom...
+    System.out.println(independenceTestResult + "\n");
+
+    // labeled points, distributed as an RDD
+    JavaRDD<LabeledPoint> obs = jsc.parallelize(Arrays.asList(
+      new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+      new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+      new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+    ));
+
+    // A contingency table is built from the raw (feature, label) pairs and used for the
+    // independence test; the returned array holds one ChiSqTestResult per feature, each
+    // tested against the label.
+    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
+    for (int col = 0; col < featureTestResults.length; col++) {
+      System.out.println("Column " + (col + 1) + ":");
+      System.out.println(featureTestResults[col] + "\n"); // summary of the test
+    }
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
new file mode 100644
index 0000000000..fe611c9ae6
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
+// $example off$
+
+/** Applies {@code Statistics.kolmogorovSmirnovTest} to a small sample and prints the result. */
+public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
+  public static void main(String[] args) {
+    String appName = "JavaHypothesisTestingKolmogorovSmirnovTestExample";
+    JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName(appName));
+
+    // $example on$
+    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
+    // test the sample against the "norm" distribution with parameters 0.0 and 1.0
+    KolmogorovSmirnovTestResult testResult =
+      Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
+    // The printed summary includes the p-value, test statistic, and null hypothesis;
+    // if the p-value indicates significance, we can reject the null hypothesis.
+    System.out.println(testResult);
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
new file mode 100644
index 0000000000..41de0d90ec
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.stat.KernelDensity;
+// $example off$
+
+/** Estimates a density from sample data with {@code KernelDensity} and prints the estimates. */
+public class JavaKernelDensityEstimationExample {
+  public static void main(String[] args) {
+    JavaSparkContext jsc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaKernelDensityEstimationExample"));
+
+    // $example on$
+    // sample data, distributed as an RDD
+    JavaRDD<Double> data = jsc.parallelize(
+      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));
+
+    // estimator over the sample; the bandwidth is the standard deviation of the Gaussian kernels
+    KernelDensity kd =
+      new KernelDensity().setSample(data).setBandwidth(3.0);
+
+    // find density estimates at each of these values
+    double[] points = {-1.0, 2.0, 5.0};
+    double[] densities = kd.estimate(points);
+    System.out.println(Arrays.toString(densities));
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
new file mode 100644
index 0000000000..f5a451019b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import java.util.*;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.function.VoidFunction;
+// $example off$
+
+/** Draws an approximate and an exact stratified sample from a keyed RDD and prints both. */
+public class JavaStratifiedSamplingExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    List<Tuple2<Integer, Character>> list = Arrays.asList(
+      new Tuple2<Integer, Character>(1, 'a'),
+      new Tuple2<Integer, Character>(1, 'b'),
+      new Tuple2<Integer, Character>(2, 'c'),
+      new Tuple2<Integer, Character>(2, 'd'),
+      new Tuple2<Integer, Character>(2, 'e'),
+      new Tuple2<Integer, Character>(3, 'f')
+    );
+    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);
+
+    // specify the exact fraction desired from each key, as a Map<K, Object>
+    ImmutableMap<Integer, Object> fractions =
+      ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);
+
+    // Get an approximate sample from each stratum
+    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
+    // Get an exact sample from each stratum
+    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
+    // $example off$
+
+    // Collect each sample once, so its size and its rows are read from the same result.
+    List<Tuple2<Integer, Character>> approxRows = approxSample.collect();
+    System.out.println("approxSample size is " + approxRows.size());
+    for (Tuple2<Integer, Character> t : approxRows) {
+      System.out.println(t._1() + " " + t._2());
+    }
+
+    List<Tuple2<Integer, Character>> exactRows = exactSample.collect();
+    System.out.println("exactSample size is " + exactRows.size());
+    for (Tuple2<Integer, Character> t : exactRows) {
+      System.out.println(t._1() + " " + t._2());
+    }
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
new file mode 100644
index 0000000000..278706bc8f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+/** Prints the column-wise mean, variance and nonzero count of a small RDD of vectors. */
+public class JavaSummaryStatisticsExample {
+  public static void main(String[] args) {
+    JavaSparkContext jsc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaSummaryStatisticsExample"));
+
+    // $example on$
+    // an RDD of Vectors
+    JavaRDD<Vector> mat = jsc.parallelize(Arrays.asList(
+      Vectors.dense(1.0, 10.0, 100.0),
+      Vectors.dense(2.0, 20.0, 200.0),
+      Vectors.dense(3.0, 30.0, 300.0)
+    ));
+
+    // Compute column summary statistics;
+    // each accessor printed below reports one value per column.
+    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
+    System.out.println(summary.mean()); // dense vector holding the mean of each column
+    System.out.println(summary.variance()); // column-wise variance
+    System.out.println(summary.numNonzeros()); // number of nonzeros in each column
+    // $example off$
+
+    jsc.stop();
+  }
+}