diff options
author | Xin Ren <iamshrek@126.com> | 2016-03-24 09:34:54 +0000 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-03-24 09:34:54 +0000 |
commit | dd9ca7b9607cb4ade287b646905d92064ac94d6f (patch) | |
tree | 07463b657cf83cf714b59076f4ef5e18d6a589be /examples/src/main/java/org/apache | |
parent | 048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2 (diff) | |
download | spark-dd9ca7b9607cb4ade287b646905d92064ac94d6f.tar.gz spark-dd9ca7b9607cb4ade287b646905d92064ac94d6f.tar.bz2 spark-dd9ca7b9607cb4ade287b646905d92064ac94d6f.zip |
[SPARK-13019][DOCS] fix for scala-2.10 build: Replace example code in mllib-statistics.md using include_example
## What changes were proposed in this pull request?
This PR for ticket SPARK-13019 is based on the previous PR (https://github.com/apache/spark/pull/11108).
Since that PR (https://github.com/apache/spark/pull/11108) broke the scala-2.10 build, more work was needed to fix the build errors.
The new change in this PR is passing 'fractions' as a named (keyword) argument:
` val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions)`
` val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions)`
I reopened the ticket on JIRA, but I don't know how to reopen a GitHub pull request, so I am submitting a new pull request instead.
## How was this patch tested?
Manually built and tested on a local machine, using a scala-2.10 build.
Author: Xin Ren <iamshrek@126.com>
Closes #11901 from keypointt/SPARK-13019.
Diffstat (limited to 'examples/src/main/java/org/apache')
6 files changed, 387 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java new file mode 100644 index 0000000000..fd19b43504 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.stat.Statistics; +// $example off$ + +public class JavaCorrelationsExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaDoubleRDD seriesX = jsc.parallelizeDoubles( + Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series + + // must have the same number of partitions and cardinality as seriesX + JavaDoubleRDD seriesY = jsc.parallelizeDoubles( + Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0)); + + // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. + // If a method is not specified, Pearson's method will be used by default. + Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); + System.out.println("Correlation is: " + correlation); + + // note that each Vector is a row and not a column + JavaRDD<Vector> data = jsc.parallelize( + Arrays.asList( + Vectors.dense(1.0, 10.0, 100.0), + Vectors.dense(2.0, 20.0, 200.0), + Vectors.dense(5.0, 33.0, 366.0) + ) + ); + + // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. + // If a method is not specified, Pearson's method will be used by default. 
+ Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); + System.out.println(correlMatrix.toString()); + // $example off$ + + jsc.stop(); + } +} + diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java new file mode 100644 index 0000000000..b48b95ff1d --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.Matrices; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.ChiSqTestResult; +// $example off$ + +public class JavaHypothesisTestingExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + // a vector composed of the frequencies of events + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); + + // compute the goodness of fit. If a second vector to test against is not supplied + // as a parameter, the test runs against a uniform distribution. + ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); + // summary of the test including the p-value, degrees of freedom, test statistic, + // the method used, and the null hypothesis. + System.out.println(goodnessOfFitTestResult + "\n"); + + // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); + + // conduct Pearson's independence test on the input contingency matrix + ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); + // summary of the test including the p-value, degrees of freedom... 
+ System.out.println(independenceTestResult + "\n"); + + // an RDD of labeled points + JavaRDD<LabeledPoint> obs = jsc.parallelize( + Arrays.asList( + new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)), + new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)), + new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) + ) + ); + + // The contingency table is constructed from the raw (feature, label) pairs and used to conduct + // the independence test. Returns an array containing the ChiSquaredTestResult for every feature + // against the label. + ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); + int i = 1; + for (ChiSqTestResult result : featureTestResults) { + System.out.println("Column " + i + ":"); + System.out.println(result + "\n"); // summary of the test + i++; + } + // $example off$ + + jsc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java new file mode 100644 index 0000000000..fe611c9ae6 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; +// $example off$ + +public class JavaHypothesisTestingKolmogorovSmirnovTestExample { + public static void main(String[] args) { + + SparkConf conf = + new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); + KolmogorovSmirnovTestResult testResult = + Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); + // summary of the test including the p-value, test statistic, and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis + System.out.println(testResult); + // $example off$ + + jsc.stop(); + } +} + diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java new file mode 100644 index 0000000000..41de0d90ec --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.stat.KernelDensity; +// $example off$ + +public class JavaKernelDensityEstimationExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + // an RDD of sample data + JavaRDD<Double> data = jsc.parallelize( + Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); + + // Construct the density estimator with the sample data + // and a standard deviation for the Gaussian kernels + KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); + + // Find density estimates for the given values + double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); + + System.out.println(Arrays.toString(densities)); + // $example off$ + + jsc.stop(); + } +} + diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java new file mode 100644 index 0000000000..f5a451019b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import com.google.common.collect.ImmutableMap; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +// $example on$ +import java.util.*; + +import scala.Tuple2; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.function.VoidFunction; +// $example off$ + +public class JavaStratifiedSamplingExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>( + Arrays.<Tuple2<Integer, Character>>asList( + new Tuple2(1, 'a'), + new Tuple2(1, 'b'), + new Tuple2(2, 'c'), + new Tuple2(2, 'd'), + new Tuple2(2, 'e'), + new Tuple2(3, 'f') + ) + ); + + JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); + + // specify the exact fraction desired from each key Map<K, Object> + ImmutableMap<Integer, Object> fractions = + ImmutableMap.of(1, (Object)0.1, 2, (Object) 0.6, 3, (Object) 0.3); + + // Get an approximate sample from each stratum + JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions); + // Get 
an exact sample from each stratum + JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions); + // $example off$ + + System.out.println("approxSample size is " + approxSample.collect().size()); + for (Tuple2<Integer, Character> t : approxSample.collect()) { + System.out.println(t._1() + " " + t._2()); + } + + System.out.println("exactSample size is " + exactSample.collect().size()); + for (Tuple2<Integer, Character> t : exactSample.collect()) { + System.out.println(t._1() + " " + t._2()); + } + + jsc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java new file mode 100644 index 0000000000..278706bc8f --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; +import org.apache.spark.mllib.stat.Statistics; +// $example off$ + +public class JavaSummaryStatisticsExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaRDD<Vector> mat = jsc.parallelize( + Arrays.asList( + Vectors.dense(1.0, 10.0, 100.0), + Vectors.dense(2.0, 20.0, 200.0), + Vectors.dense(3.0, 30.0, 300.0) + ) + ); // an RDD of Vectors + + // Compute column summary statistics. + MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); + System.out.println(summary.mean()); // a dense vector containing the mean value for each column + System.out.println(summary.variance()); // column-wise variance + System.out.println(summary.numNonzeros()); // number of nonzeros in each column + // $example off$ + + jsc.stop(); + } +} |