diff options
Diffstat (limited to 'examples/src')
14 files changed, 847 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java new file mode 100644 index 0000000000..ad44acb4cd --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.VoidFunction; +// $example on$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.feature.ChiSqSelector; +import org.apache.spark.mllib.feature.ChiSqSelectorModel; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +// $example off$ + +public class JavaChiSqSelectorExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(jsc.sc(), + "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache(); + + // Discretize data in 16 equal bins since ChiSqSelector requires categorical features + // Although features are doubles, the ChiSqSelector treats each unique value as a category + JavaRDD<LabeledPoint> discretizedData = points.map( + new Function<LabeledPoint, LabeledPoint>() { + @Override + public LabeledPoint call(LabeledPoint lp) { + final double[] discretizedFeatures = new double[lp.features().size()]; + for (int i = 0; i < lp.features().size(); ++i) { + discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16); + } + return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures)); + } + } + ); + + // Create ChiSqSelector that will select top 50 of 692 features + ChiSqSelector selector = new ChiSqSelector(50); + // Create ChiSqSelector model (selecting features) + final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); + // Filter the top 50 features from each feature vector + JavaRDD<LabeledPoint> filteredData = discretizedData.map( + new Function<LabeledPoint, LabeledPoint>() { + @Override + public LabeledPoint call(LabeledPoint lp) { + return new LabeledPoint(lp.label(), transformer.transform(lp.features())); + } + } + ); + // $example off$ + + System.out.println("filtered data: "); + filteredData.foreach(new VoidFunction<LabeledPoint>() { + @Override + public void call(LabeledPoint labeledPoint) throws Exception { + System.out.println(labeledPoint.toString()); + } + }); + + jsc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java new file mode 100644 index 0000000000..c8ce6ab284 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +// $example on$ +import java.util.Arrays; +// $example off$ + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.feature.ElementwiseProduct; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +// $example off$ +import org.apache.spark.api.java.function.VoidFunction; + +public class JavaElementwiseProductExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + // Create some vector data; also works for sparse vectors + JavaRDD<Vector> data = jsc.parallelize(Arrays.asList( + Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))); + Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); + final ElementwiseProduct transformer = new ElementwiseProduct(transformingVector); + + // Batch transform and per-row transform give the same results: + JavaRDD<Vector> transformedData = transformer.transform(data); + JavaRDD<Vector> transformedData2 = data.map( + new Function<Vector, Vector>() { + @Override + public Vector call(Vector v) { + return transformer.transform(v); + } + } + ); + // $example off$ + + System.out.println("transformedData: "); + transformedData.foreach(new VoidFunction<Vector>() { + @Override + public void call(Vector vector) throws Exception { + System.out.println(vector.toString()); + } + }); + + System.out.println("transformedData2: "); + transformedData2.foreach(new VoidFunction<Vector>() { + @Override + public void call(Vector vector) throws Exception { + System.out.println(vector.toString()); + } + }); + + jsc.stop(); + } +} diff --git a/examples/src/main/python/mllib/elementwise_product_example.py b/examples/src/main/python/mllib/elementwise_product_example.py new file mode 100644 index 0000000000..6d8bf6d42e --- /dev/null +++ b/examples/src/main/python/mllib/elementwise_product_example.py @@ -0,0 +1,51 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import ElementwiseProduct +from pyspark.mllib.linalg import Vectors +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="ElementwiseProductExample") # SparkContext + + # $example on$ + data = sc.textFile("data/mllib/kmeans_data.txt") + parsedData = data.map(lambda x: [float(t) for t in x.split(" ")]) + + # Create weight vector. + transformingVector = Vectors.dense([0.0, 1.0, 2.0]) + transformer = ElementwiseProduct(transformingVector) + + # Batch transform + transformedData = transformer.transform(parsedData) + # Single-row transform + transformedData2 = transformer.transform(parsedData.first()) + # $example off$ + + print("transformedData:") + for each in transformedData.collect(): + print(each) + + print("transformedData2:") + for each in transformedData2.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/normalizer_example.py b/examples/src/main/python/mllib/normalizer_example.py new file mode 100644 index 0000000000..a4e028ca9a --- /dev/null +++ b/examples/src/main/python/mllib/normalizer_example.py @@ -0,0 +1,52 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import Normalizer +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="NormalizerExample") # SparkContext + + # $example on$ + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + labels = data.map(lambda x: x.label) + features = data.map(lambda x: x.features) + + normalizer1 = Normalizer() + normalizer2 = Normalizer(p=float("inf")) + + # Each sample in data1 will be normalized using $L^2$ norm. + data1 = labels.zip(normalizer1.transform(features)) + + # Each sample in data2 will be normalized using $L^\infty$ norm. + data2 = labels.zip(normalizer2.transform(features)) + # $example off$ + + print("data1:") + for each in data1.collect(): + print(each) + + print("data2:") + for each in data2.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py new file mode 100644 index 0000000000..20a77a4708 --- /dev/null +++ b/examples/src/main/python/mllib/standard_scaler_example.py @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import StandardScaler, StandardScalerModel +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="StandardScalerExample") # SparkContext + + # $example on$ + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + label = data.map(lambda x: x.label) + features = data.map(lambda x: x.features) + + scaler1 = StandardScaler().fit(features) + scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) + + # data1 will be unit variance. + data1 = label.zip(scaler1.transform(features)) + + # Without converting the features into dense vectors, transformation with zero mean will raise + # exception on sparse vector. + # data2 will be unit variance and zero mean. + data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) + # $example off$ + + print("data1:") + for each in data1.collect(): + print(each) + + print("data2:") + for each in data2.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py new file mode 100644 index 0000000000..c4d53333a9 --- /dev/null +++ b/examples/src/main/python/mllib/tf_idf_example.py @@ -0,0 +1,57 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import HashingTF, IDF +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="TFIDFExample") # SparkContext + + # $example on$ + # Load documents (one per line). + documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" ")) + + hashingTF = HashingTF() + tf = hashingTF.transform(documents) + + # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: + # First to compute the IDF vector and second to scale the term frequencies by IDF. + tf.cache() + idf = IDF().fit(tf) + tfidf = idf.transform(tf) + + # spark.mllib's IDF implementation provides an option for ignoring terms + # which occur in less than a minimum number of documents. + # In such cases, the IDF for these terms is set to 0. + # This feature can be used by passing the minDocFreq value to the IDF constructor. + idfIgnore = IDF(minDocFreq=2).fit(tf) + tfidfIgnore = idf.transform(tf) + # $example off$ + + print("tfidf:") + for each in tfidf.collect(): + print(each) + + print("tfidfIgnore:") + for each in tfidfIgnore.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/word2vec_example.py b/examples/src/main/python/mllib/word2vec_example.py new file mode 100644 index 0000000000..ad1090c77e --- /dev/null +++ b/examples/src/main/python/mllib/word2vec_example.py @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import Word2Vec +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="Word2VecExample") # SparkContext + + # $example on$ + inp = sc.textFile("data/mllib/sample_lda_data.txt").map(lambda row: row.split(" ")) + + word2vec = Word2Vec() + model = word2vec.fit(inp) + + synonyms = model.findSynonyms('1', 5) + + for word, cosine_distance in synonyms: + print("{}: {}".format(word, cosine_distance)) + # $example off$ + + sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala new file mode 100644 index 0000000000..5e400b7d71 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.ChiSqSelector +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object ChiSqSelectorExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("ChiSqSelectorExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load some data in libsvm format + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + // Discretize data in 16 equal bins since ChiSqSelector requires categorical features + // Even though features are doubles, the ChiSqSelector treats each unique value as a category + val discretizedData = data.map { lp => + LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor })) + } + // Create ChiSqSelector that will select top 50 of 692 features + val selector = new ChiSqSelector(50) + // Create ChiSqSelector model (selecting features) + val transformer = selector.fit(discretizedData) + // Filter the top 50 features from each feature vector + val filteredData = discretizedData.map { lp => + LabeledPoint(lp.label, transformer.transform(lp.features)) + } + // $example off$ + + println("filtered data: ") + filteredData.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala new file mode 100644 index 0000000000..1e4e354319 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.ElementwiseProduct +import org.apache.spark.mllib.linalg.Vectors +// $example off$ + +object ElementwiseProductExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("ElementwiseProductExample") + val sc = new SparkContext(conf) + + // $example on$ + // Create some vector data; also works for sparse vectors + val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) + + val transformingVector = Vectors.dense(0.0, 1.0, 2.0) + val transformer = new ElementwiseProduct(transformingVector) + + // Batch transform and per-row transform give the same results: + val transformedData = transformer.transform(data) + val transformedData2 = data.map(x => transformer.transform(x)) + // $example off$ + + println("transformedData: ") + transformedData.foreach(x => println(x)) + + println("transformedData2: ") + transformedData2.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala new file mode 100644 index 0000000000..b3a9604c2b --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.Normalizer +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object NormalizerExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("NormalizerExample") + val sc = new SparkContext(conf) + + // $example on$ + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + + val normalizer1 = new Normalizer() + val normalizer2 = new Normalizer(p = Double.PositiveInfinity) + + // Each sample in data1 will be normalized using $L^2$ norm. + val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) + + // Each sample in data2 will be normalized using $L^\infty$ norm. + val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) + // $example off$ + + println("data1: ") + data1.foreach(x => println(x)) + + println("data2: ") + data2.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala new file mode 100644 index 0000000000..f7a8136953 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.PCA +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} +// $example off$ + +object PCAExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("PCAExample") + val sc = new SparkContext(conf) + + // $example on$ + val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => + val parts = line.split(',') + LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) + }.cache() + + val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) + val training = splits(0).cache() + val test = splits(1) + + val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) + val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) + val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) + + val numIterations = 100 + val model = LinearRegressionWithSGD.train(training, numIterations) + val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) + + val valuesAndPreds = test.map { point => + val score = model.predict(point.features) + (score, point.label) + } + + val valuesAndPreds_pca = test_pca.map { point => + val score = model_pca.predict(point.features) + (score, point.label) + } + + val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() + val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() + + println("Mean Squared Error = " + MSE) + println("PCA Mean Squared Error = " + MSE_pca) + // $example off$ + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala new file mode 100644 index 0000000000..fc0aa1b7f0 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object StandardScalerExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("StandardScalerExample") + val sc = new SparkContext(conf) + + // $example on$ + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + + val scaler1 = new StandardScaler().fit(data.map(x => x.features)) + val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) + // scaler3 is an identical model to scaler2, and will produce identical transformations + val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) + + // data1 will be unit variance. + val data1 = data.map(x => (x.label, scaler1.transform(x.features))) + + // Without converting the features into dense vectors, transformation with zero mean will raise + // exception on sparse vector. + // data2 will be unit variance and zero mean. + val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) + // $example off$ + + println("data1: ") + data1.foreach(x => println(x)) + + println("data2: ") + data2.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala new file mode 100644 index 0000000000..a5bdcd8f2e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.{HashingTF, IDF} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.rdd.RDD +// $example off$ + +object TFIDFExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("TFIDFExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load documents (one per line). + val documents: RDD[Seq[String]] = sc.textFile("data/mllib/kmeans_data.txt") + .map(_.split(" ").toSeq) + + val hashingTF = new HashingTF() + val tf: RDD[Vector] = hashingTF.transform(documents) + + // While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: + // First to compute the IDF vector and second to scale the term frequencies by IDF. + tf.cache() + val idf = new IDF().fit(tf) + val tfidf: RDD[Vector] = idf.transform(tf) + + // spark.mllib IDF implementation provides an option for ignoring terms which occur in less than + // a minimum number of documents. In such cases, the IDF for these terms is set to 0. + // This feature can be used by passing the minDocFreq value to the IDF constructor. + val idfIgnore = new IDF(minDocFreq = 2).fit(tf) + val tfidfIgnore: RDD[Vector] = idfIgnore.transform(tf) + // $example off$ + + println("tfidf: ") + tfidf.foreach(x => println(x)) + + println("tfidfIgnore: ") + tfidfIgnore.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala new file mode 100644 index 0000000000..ea794c700a --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} +// $example off$ + +object Word2VecExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("Word2VecExample") + val sc = new SparkContext(conf) + + // $example on$ + val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq) + + val word2vec = new Word2Vec() + + val model = word2vec.fit(input) + + val synonyms = model.findSynonyms("1", 5) + + for((synonym, cosineSimilarity) <- synonyms) { + println(s"$synonym $cosineSimilarity") + } + + // Save and load model + model.save(sc, "myModelPath") + val sameModel = Word2VecModel.load(sc, "myModelPath") + // $example off$ + + sc.stop() + } +} +// scalastyle:on println |