diff options
15 files changed, 861 insertions, 361 deletions
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 7796bac697..7a97285032 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -58,46 +58,7 @@ Each record could be an iterable of strings or other types. Refer to the [`HashingTF` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.HashingTF) for details on the API. - -{% highlight scala %} -import org.apache.spark.rdd.RDD -import org.apache.spark.SparkContext -import org.apache.spark.mllib.feature.HashingTF -import org.apache.spark.mllib.linalg.Vector - -val sc: SparkContext = ... - -// Load documents (one per line). -val documents: RDD[Seq[String]] = sc.textFile("...").map(_.split(" ").toSeq) - -val hashingTF = new HashingTF() -val tf: RDD[Vector] = hashingTF.transform(documents) -{% endhighlight %} - -While applying `HashingTF` only needs a single pass to the data, applying `IDF` needs two passes: -first to compute the IDF vector and second to scale the term frequencies by IDF. - -{% highlight scala %} -import org.apache.spark.mllib.feature.IDF - -// ... continue from the previous example -tf.cache() -val idf = new IDF().fit(tf) -val tfidf: RDD[Vector] = idf.transform(tf) -{% endhighlight %} - -`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a -minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature -can be used by passing the `minDocFreq` value to the IDF constructor. - -{% highlight scala %} -import org.apache.spark.mllib.feature.IDF - -// ... continue from the previous example -tf.cache() -val idf = new IDF(minDocFreq = 2).fit(tf) -val tfidf: RDD[Vector] = idf.transform(tf) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/TFIDFExample.scala %} </div> <div data-lang="python" markdown="1"> @@ -109,41 +70,7 @@ Each record could be an iterable of strings or other types. Refer to the [`HashingTF` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.HashingTF) for details on the API. -{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.feature import HashingTF - -sc = SparkContext() - -# Load documents (one per line). -documents = sc.textFile("...").map(lambda line: line.split(" ")) - -hashingTF = HashingTF() -tf = hashingTF.transform(documents) -{% endhighlight %} - -While applying `HashingTF` only needs a single pass to the data, applying `IDF` needs two passes: -first to compute the IDF vector and second to scale the term frequencies by IDF. - -{% highlight python %} -from pyspark.mllib.feature import IDF - -# ... continue from the previous example -tf.cache() -idf = IDF().fit(tf) -tfidf = idf.transform(tf) -{% endhighlight %} - -`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a -minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature -can be used by passing the `minDocFreq` value to the IDF constructor. - -{% highlight python %} -# ... continue from the previous example -tf.cache() -idf = IDF(minDocFreq=2).fit(tf) -tfidf = idf.transform(tf) -{% endhighlight %} +{% include_example python/mllib/tf_idf_example.py %} </div> </div> @@ -192,47 +119,12 @@ Here we assume the extracted file is `text8` and in same directory as you run th <div data-lang="scala" markdown="1"> Refer to the [`Word2Vec` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.Word2Vec) for details on the API. -{% highlight scala %} -import org.apache.spark._ -import org.apache.spark.rdd._ -import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} - -val input = sc.textFile("text8").map(line => line.split(" ").toSeq) - -val word2vec = new Word2Vec() - -val model = word2vec.fit(input) - -val synonyms = model.findSynonyms("china", 40) - -for((synonym, cosineSimilarity) <- synonyms) { - println(s"$synonym $cosineSimilarity") -} - -// Save and load model -model.save(sc, "myModelPath") -val sameModel = Word2VecModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/Word2VecExample.scala %} </div> <div data-lang="python" markdown="1"> Refer to the [`Word2Vec` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.Word2Vec) for more details on the API. -{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.feature import Word2Vec - -sc = SparkContext(appName='Word2Vec') -inp = sc.textFile("text8_lines").map(lambda row: row.split(" ")) - -word2vec = Word2Vec() -model = word2vec.fit(inp) - -synonyms = model.findSynonyms('china', 40) - -for word, cosine_distance in synonyms: - print("{}: {}".format(word, cosine_distance)) -{% endhighlight %} +{% include_example python/mllib/word2vec_example.py %} </div> </div> @@ -277,55 +169,13 @@ so that the new features have unit standard deviation and/or zero mean. <div data-lang="scala" markdown="1"> Refer to the [`StandardScaler` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.feature.StandardScaler -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.util.MLUtils - -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") - -val scaler1 = new StandardScaler().fit(data.map(x => x.features)) -val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) -// scaler3 is an identical model to scaler2, and will produce identical transformations -val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) - -// data1 will be unit variance. -val data1 = data.map(x => (x.label, scaler1.transform(x.features))) - -// Without converting the features into dense vectors, transformation with zero mean will raise -// exception on sparse vector. -// data2 will be unit variance and zero mean. -val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/StandardScalerExample.scala %} </div> <div data-lang="python" markdown="1"> Refer to the [`StandardScaler` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.StandardScaler) for more details on the API. -{% highlight python %} -from pyspark.mllib.util import MLUtils -from pyspark.mllib.linalg import Vectors -from pyspark.mllib.feature import StandardScaler - -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -label = data.map(lambda x: x.label) -features = data.map(lambda x: x.features) - -scaler1 = StandardScaler().fit(features) -scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) -# scaler3 is an identical model to scaler2, and will produce identical transformations -scaler3 = StandardScalerModel(scaler2.std, scaler2.mean) - - -# data1 will be unit variance. -data1 = label.zip(scaler1.transform(features)) - -# Without converting the features into dense vectors, transformation with zero mean will raise -# exception on sparse vector. -# data2 will be unit variance and zero mean. -data2 = label.zip(scaler1.transform(features.map(lambda x: Vectors.dense(x.toArray())))) -{% endhighlight %} +{% include_example python/mllib/standard_scaler_example.py %} </div> </div> @@ -355,46 +205,13 @@ with $L^2$ norm, and $L^\infty$ norm. <div data-lang="scala" markdown="1"> Refer to the [`Normalizer` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.Normalizer) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.feature.Normalizer -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.util.MLUtils - -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") - -val normalizer1 = new Normalizer() -val normalizer2 = new Normalizer(p = Double.PositiveInfinity) - -// Each sample in data1 will be normalized using $L^2$ norm. -val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) - -// Each sample in data2 will be normalized using $L^\infty$ norm. -val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/NormalizerExample.scala %} </div> <div data-lang="python" markdown="1"> Refer to the [`Normalizer` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.Normalizer) for more details on the API. -{% highlight python %} -from pyspark.mllib.util import MLUtils -from pyspark.mllib.linalg import Vectors -from pyspark.mllib.feature import Normalizer - -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -labels = data.map(lambda x: x.label) -features = data.map(lambda x: x.features) - -normalizer1 = Normalizer() -normalizer2 = Normalizer(p=float("inf")) - -# Each sample in data1 will be normalized using $L^2$ norm. -data1 = labels.zip(normalizer1.transform(features)) - -# Each sample in data2 will be normalized using $L^\infty$ norm. -data2 = labels.zip(normalizer2.transform(features)) -{% endhighlight %} +{% include_example python/mllib/normalizer_example.py %} </div> </div> @@ -435,29 +252,7 @@ The following example shows the basic use of ChiSqSelector. The data set used ha Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.mllib.feature.ChiSqSelector - -// Load some data in libsvm format -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -// Discretize data in 16 equal bins since ChiSqSelector requires categorical features -// Even though features are doubles, the ChiSqSelector treats each unique value as a category -val discretizedData = data.map { lp => - LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor } ) ) -} -// Create ChiSqSelector that will select top 50 of 692 features -val selector = new ChiSqSelector(50) -// Create ChiSqSelector model (selecting features) -val transformer = selector.fit(discretizedData) -// Filter the top 50 features from each feature vector -val filteredData = discretizedData.map { lp => - LabeledPoint(lp.label, transformer.transform(lp.features)) -} -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala %} </div> <div data-lang="java" markdown="1"> @@ -465,52 +260,7 @@ val filteredData = discretizedData.map { lp => Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html) for details on the API. -{% highlight java %} -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.feature.ChiSqSelector; -import org.apache.spark.mllib.feature.ChiSqSelectorModel; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; - -SparkConf sparkConf = new SparkConf().setAppName("JavaChiSqSelector"); -JavaSparkContext sc = new JavaSparkContext(sparkConf); -JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(sc.sc(), - "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache(); - -// Discretize data in 16 equal bins since ChiSqSelector requires categorical features -// Even though features are doubles, the ChiSqSelector treats each unique value as a category -JavaRDD<LabeledPoint> discretizedData = points.map( - new Function<LabeledPoint, LabeledPoint>() { - @Override - public LabeledPoint call(LabeledPoint lp) { - final double[] discretizedFeatures = new double[lp.features().size()]; - for (int i = 0; i < lp.features().size(); ++i) { - discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16); - } - return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures)); - } - }); - -// Create ChiSqSelector that will select top 50 of 692 features -ChiSqSelector selector = new ChiSqSelector(50); -// Create ChiSqSelector model (selecting features) -final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); -// Filter the top 50 features from each feature vector -JavaRDD<LabeledPoint> filteredData = discretizedData.map( - new Function<LabeledPoint, LabeledPoint>() { - @Override - public LabeledPoint call(LabeledPoint lp) { - return new LabeledPoint(lp.label(), transformer.transform(lp.features())); - } - } -); - -sc.stop(); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %} </div> </div> @@ -554,78 +304,19 @@ This example below demonstrates how to transform vectors using a transforming ve Refer to the [`ElementwiseProduct` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.feature.ElementwiseProduct -import org.apache.spark.mllib.linalg.Vectors - -// Create some vector data; also works for sparse vectors -val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) - -val transformingVector = Vectors.dense(0.0, 1.0, 2.0) -val transformer = new ElementwiseProduct(transformingVector) - -// Batch transform and per-row transform give the same results: -val transformedData = transformer.transform(data) -val transformedData2 = data.map(x => transformer.transform(x)) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala %} </div> <div data-lang="java" markdown="1"> Refer to the [`ElementwiseProduct` Java docs](api/java/org/apache/spark/mllib/feature/ElementwiseProduct.html) for details on the API. -{% highlight java %} -import java.util.Arrays; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.feature.ElementwiseProduct; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; - -// Create some vector data; also works for sparse vectors -JavaRDD<Vector> data = sc.parallelize(Arrays.asList( - Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))); -Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); -ElementwiseProduct transformer = new ElementwiseProduct(transformingVector); - -// Batch transform and per-row transform give the same results: -JavaRDD<Vector> transformedData = transformer.transform(data); -JavaRDD<Vector> transformedData2 = data.map( - new Function<Vector, Vector>() { - @Override - public Vector call(Vector v) { - return transformer.transform(v); - } - } -); - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java %} </div> <div data-lang="python" markdown="1"> Refer to the [`ElementwiseProduct` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ElementwiseProduct) for more details on the API. -{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.linalg import Vectors -from pyspark.mllib.feature import ElementwiseProduct - -# Load and parse the data -sc = SparkContext() -data = sc.textFile("data/mllib/kmeans_data.txt") -parsedData = data.map(lambda x: [float(t) for t in x.split(" ")]) - -# Create weight vector. -transformingVector = Vectors.dense([0.0, 1.0, 2.0]) -transformer = ElementwiseProduct(transformingVector) - -# Batch transform -transformedData = transformer.transform(parsedData) -# Single-row transform -transformedData2 = transformer.transform(parsedData.first()) - -{% endhighlight %} +{% include_example python/mllib/elementwise_product_example.py %} </div> </div> @@ -645,44 +336,6 @@ for calculation a [Linear Regression]((mllib-linear-methods.html)) <div data-lang="scala" markdown="1"> Refer to the [`PCA` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.PCA) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.feature.PCA - -val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => - val parts = line.split(',') - LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) -}.cache() - -val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) -val training = splits(0).cache() -val test = splits(1) - -val pca = new PCA(training.first().features.size/2).fit(data.map(_.features)) -val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) -val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) - -val numIterations = 100 -val model = LinearRegressionWithSGD.train(training, numIterations) -val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) - -val valuesAndPreds = test.map { point => - val score = model.predict(point.features) - (score, point.label) -} - -val valuesAndPreds_pca = test_pca.map { point => - val score = model_pca.predict(point.features) - (score, point.label) -} - -val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean() -val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean() - -println("Mean Squared Error = " + MSE) -println("PCA Mean Squared Error = " + MSE_pca) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/PCAExample.scala %} </div> </div> diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java new file mode 100644 index 0000000000..ad44acb4cd --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.VoidFunction; +// $example on$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.feature.ChiSqSelector; +import org.apache.spark.mllib.feature.ChiSqSelectorModel; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +// $example off$ + +public class JavaChiSqSelectorExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(jsc.sc(), + "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache(); + + // Discretize data in 16 equal bins since ChiSqSelector requires categorical features + // Although features are doubles, the ChiSqSelector treats each unique value as a category + JavaRDD<LabeledPoint> discretizedData = points.map( + new Function<LabeledPoint, LabeledPoint>() { + @Override + public LabeledPoint call(LabeledPoint lp) { + final double[] discretizedFeatures = new double[lp.features().size()]; + for (int i = 0; i < lp.features().size(); ++i) { + discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16); + } + return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures)); + } + } + ); + + // Create ChiSqSelector that will select top 50 of 692 features + ChiSqSelector selector = new ChiSqSelector(50); + // Create ChiSqSelector model (selecting features) + final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); + // Filter the top 50 features from each feature vector + JavaRDD<LabeledPoint> filteredData = discretizedData.map( + new Function<LabeledPoint, LabeledPoint>() { + @Override + public LabeledPoint call(LabeledPoint lp) { + return new LabeledPoint(lp.label(), transformer.transform(lp.features())); + } + } + ); + // $example off$ + + System.out.println("filtered data: "); + filteredData.foreach(new VoidFunction<LabeledPoint>() { + @Override + public void call(LabeledPoint labeledPoint) throws Exception { + System.out.println(labeledPoint.toString()); + } + }); + + jsc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java new file mode 100644 index 0000000000..c8ce6ab284 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +// $example on$ +import java.util.Arrays; +// $example off$ + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.feature.ElementwiseProduct; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +// $example off$ +import org.apache.spark.api.java.function.VoidFunction; + +public class JavaElementwiseProductExample { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + // Create some vector data; also works for sparse vectors + JavaRDD<Vector> data = jsc.parallelize(Arrays.asList( + Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))); + Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); + final ElementwiseProduct transformer = new ElementwiseProduct(transformingVector); + + // Batch transform and per-row transform give the same results: + JavaRDD<Vector> transformedData = transformer.transform(data); + JavaRDD<Vector> transformedData2 = data.map( + new Function<Vector, Vector>() { + @Override + public Vector call(Vector v) { + return transformer.transform(v); + } + } + ); + // $example off$ + + System.out.println("transformedData: "); + transformedData.foreach(new VoidFunction<Vector>() { + @Override + public void call(Vector vector) throws Exception { + System.out.println(vector.toString()); + } + }); + + System.out.println("transformedData2: "); + transformedData2.foreach(new VoidFunction<Vector>() { + @Override + public void call(Vector vector) throws Exception { + System.out.println(vector.toString()); + } + }); + + jsc.stop(); + } +} diff --git a/examples/src/main/python/mllib/elementwise_product_example.py b/examples/src/main/python/mllib/elementwise_product_example.py new file mode 100644 index 0000000000..6d8bf6d42e --- /dev/null +++ b/examples/src/main/python/mllib/elementwise_product_example.py @@ -0,0 +1,51 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import ElementwiseProduct +from pyspark.mllib.linalg import Vectors +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="ElementwiseProductExample") # SparkContext + + # $example on$ + data = sc.textFile("data/mllib/kmeans_data.txt") + parsedData = data.map(lambda x: [float(t) for t in x.split(" ")]) + + # Create weight vector. + transformingVector = Vectors.dense([0.0, 1.0, 2.0]) + transformer = ElementwiseProduct(transformingVector) + + # Batch transform + transformedData = transformer.transform(parsedData) + # Single-row transform + transformedData2 = transformer.transform(parsedData.first()) + # $example off$ + + print("transformedData:") + for each in transformedData.collect(): + print(each) + + print("transformedData2:") + for each in transformedData2.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/normalizer_example.py b/examples/src/main/python/mllib/normalizer_example.py new file mode 100644 index 0000000000..a4e028ca9a --- /dev/null +++ b/examples/src/main/python/mllib/normalizer_example.py @@ -0,0 +1,52 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import Normalizer +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="NormalizerExample") # SparkContext + + # $example on$ + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + labels = data.map(lambda x: x.label) + features = data.map(lambda x: x.features) + + normalizer1 = Normalizer() + normalizer2 = Normalizer(p=float("inf")) + + # Each sample in data1 will be normalized using $L^2$ norm. + data1 = labels.zip(normalizer1.transform(features)) + + # Each sample in data2 will be normalized using $L^\infty$ norm. + data2 = labels.zip(normalizer2.transform(features)) + # $example off$ + + print("data1:") + for each in data1.collect(): + print(each) + + print("data2:") + for each in data2.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py new file mode 100644 index 0000000000..20a77a4708 --- /dev/null +++ b/examples/src/main/python/mllib/standard_scaler_example.py @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import StandardScaler, StandardScalerModel +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="StandardScalerExample") # SparkContext + + # $example on$ + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + label = data.map(lambda x: x.label) + features = data.map(lambda x: x.features) + + scaler1 = StandardScaler().fit(features) + scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) + + # data1 will be unit variance. + data1 = label.zip(scaler1.transform(features)) + + # Without converting the features into dense vectors, transformation with zero mean will raise + # exception on sparse vector. + # data2 will be unit variance and zero mean. + data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) + # $example off$ + + print("data1:") + for each in data1.collect(): + print(each) + + print("data2:") + for each in data2.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py new file mode 100644 index 0000000000..c4d53333a9 --- /dev/null +++ b/examples/src/main/python/mllib/tf_idf_example.py @@ -0,0 +1,57 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import HashingTF, IDF +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="TFIDFExample") # SparkContext + + # $example on$ + # Load documents (one per line). + documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" ")) + + hashingTF = HashingTF() + tf = hashingTF.transform(documents) + + # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: + # First to compute the IDF vector and second to scale the term frequencies by IDF. + tf.cache() + idf = IDF().fit(tf) + tfidf = idf.transform(tf) + + # spark.mllib's IDF implementation provides an option for ignoring terms + # which occur in less than a minimum number of documents. + # In such cases, the IDF for these terms is set to 0. + # This feature can be used by passing the minDocFreq value to the IDF constructor. + idfIgnore = IDF(minDocFreq=2).fit(tf) + tfidfIgnore = idf.transform(tf) + # $example off$ + + print("tfidf:") + for each in tfidf.collect(): + print(each) + + print("tfidfIgnore:") + for each in tfidfIgnore.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/word2vec_example.py b/examples/src/main/python/mllib/word2vec_example.py new file mode 100644 index 0000000000..ad1090c77e --- /dev/null +++ b/examples/src/main/python/mllib/word2vec_example.py @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.feature import Word2Vec +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="Word2VecExample") # SparkContext + + # $example on$ + inp = sc.textFile("data/mllib/sample_lda_data.txt").map(lambda row: row.split(" ")) + + word2vec = Word2Vec() + model = word2vec.fit(inp) + + synonyms = model.findSynonyms('1', 5) + + for word, cosine_distance in synonyms: + print("{}: {}".format(word, cosine_distance)) + # $example off$ + + sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala new file mode 100644 index 0000000000..5e400b7d71 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.ChiSqSelector +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object ChiSqSelectorExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("ChiSqSelectorExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load some data in libsvm format + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + // Discretize data in 16 equal bins since ChiSqSelector requires categorical features + // Even though features are doubles, the ChiSqSelector treats each unique value as a category + val discretizedData = data.map { lp => + LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor })) + } + // Create ChiSqSelector that will select top 50 of 692 features + val selector = new ChiSqSelector(50) + // Create ChiSqSelector model (selecting features) + val transformer = selector.fit(discretizedData) + // Filter the top 50 features from each feature vector + val filteredData = discretizedData.map { lp => + LabeledPoint(lp.label, transformer.transform(lp.features)) + } + // $example off$ + + println("filtered data: ") + filteredData.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala new file mode 100644 index 0000000000..1e4e354319 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.ElementwiseProduct +import org.apache.spark.mllib.linalg.Vectors +// $example off$ + +object ElementwiseProductExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("ElementwiseProductExample") + val sc = new SparkContext(conf) + + // $example on$ + // Create some vector data; also works for sparse vectors + val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) + + val transformingVector = Vectors.dense(0.0, 1.0, 2.0) + val transformer = new ElementwiseProduct(transformingVector) + + // Batch transform and per-row transform give the same results: + val transformedData = transformer.transform(data) + val transformedData2 = data.map(x => transformer.transform(x)) + // $example off$ + + println("transformedData: ") + transformedData.foreach(x => println(x)) + + println("transformedData2: ") + transformedData2.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala new file mode 100644 index 0000000000..b3a9604c2b --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.Normalizer +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object NormalizerExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("NormalizerExample") + val sc = new SparkContext(conf) + + // $example on$ + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + + val normalizer1 = new Normalizer() + val normalizer2 = new Normalizer(p = Double.PositiveInfinity) + + // Each sample in data1 will be normalized using $L^2$ norm. + val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) + + // Each sample in data2 will be normalized using $L^\infty$ norm. + val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) + // $example off$ + + println("data1: ") + data1.foreach(x => println(x)) + + println("data2: ") + data2.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala new file mode 100644 index 0000000000..f7a8136953 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.PCA +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} +// $example off$ + +object PCAExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("PCAExample") + val sc = new SparkContext(conf) + + // $example on$ + val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => + val parts = line.split(',') + LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) + }.cache() + + val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) + val training = splits(0).cache() + val test = splits(1) + + val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) + val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) + val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) + + val numIterations = 100 + val model = LinearRegressionWithSGD.train(training, numIterations) + val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) + + val valuesAndPreds = test.map { point => + val score = model.predict(point.features) + (score, point.label) + } + + val valuesAndPreds_pca = test_pca.map { point => + val score = model_pca.predict(point.features) + (score, point.label) + } + + val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() + val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() + + println("Mean Squared Error = " + MSE) + println("PCA Mean Squared Error = " + MSE_pca) + // $example off$ + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala new file mode 100644 index 0000000000..fc0aa1b7f0 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object StandardScalerExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("StandardScalerExample") + val sc = new SparkContext(conf) + + // $example on$ + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + + val scaler1 = new StandardScaler().fit(data.map(x => x.features)) + val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) + // scaler3 is an identical model to scaler2, and will produce identical transformations + val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) + + // data1 will be unit variance. + val data1 = data.map(x => (x.label, scaler1.transform(x.features))) + + // Without converting the features into dense vectors, transformation with zero mean will raise + // exception on sparse vector. + // data2 will be unit variance and zero mean. + val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) + // $example off$ + + println("data1: ") + data1.foreach(x => println(x)) + + println("data2: ") + data2.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala new file mode 100644 index 0000000000..a5bdcd8f2e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.{HashingTF, IDF} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.rdd.RDD +// $example off$ + +object TFIDFExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("TFIDFExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load documents (one per line). + val documents: RDD[Seq[String]] = sc.textFile("data/mllib/kmeans_data.txt") + .map(_.split(" ").toSeq) + + val hashingTF = new HashingTF() + val tf: RDD[Vector] = hashingTF.transform(documents) + + // While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: + // First to compute the IDF vector and second to scale the term frequencies by IDF. + tf.cache() + val idf = new IDF().fit(tf) + val tfidf: RDD[Vector] = idf.transform(tf) + + // spark.mllib IDF implementation provides an option for ignoring terms which occur in less than + // a minimum number of documents. In such cases, the IDF for these terms is set to 0. + // This feature can be used by passing the minDocFreq value to the IDF constructor. + val idfIgnore = new IDF(minDocFreq = 2).fit(tf) + val tfidfIgnore: RDD[Vector] = idfIgnore.transform(tf) + // $example off$ + + println("tfidf: ") + tfidf.foreach(x => println(x)) + + println("tfidfIgnore: ") + tfidfIgnore.foreach(x => println(x)) + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala new file mode 100644 index 0000000000..ea794c700a --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +// $example on$ +import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} +// $example off$ + +object Word2VecExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("Word2VecExample") + val sc = new SparkContext(conf) + + // $example on$ + val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq) + + val word2vec = new Word2Vec() + + val model = word2vec.fit(input) + + val synonyms = model.findSynonyms("1", 5) + + for((synonym, cosineSimilarity) <- synonyms) { + println(s"$synonym $cosineSimilarity") + } + + // Save and load model + model.save(sc, "myModelPath") + val sameModel = Word2VecModel.load(sc, "myModelPath") + // $example off$ + + sc.stop() + } +} +// scalastyle:on println |