Diffstat (limited to 'docs/mllib-feature-extraction.md')
-rw-r--r-- | docs/mllib-feature-extraction.md | 375
1 files changed, 14 insertions, 361 deletions
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 7796bac697..7a97285032 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -58,46 +58,7 @@ Each record could be an iterable of strings or other types.
 Refer to the [`HashingTF` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.HashingTF) for details on the API.
-
-{% highlight scala %}
-import org.apache.spark.rdd.RDD
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.feature.HashingTF
-import org.apache.spark.mllib.linalg.Vector
-
-val sc: SparkContext = ...
-
-// Load documents (one per line).
-val documents: RDD[Seq[String]] = sc.textFile("...").map(_.split(" ").toSeq)
-
-val hashingTF = new HashingTF()
-val tf: RDD[Vector] = hashingTF.transform(documents)
-{% endhighlight %}
-
-While applying `HashingTF` only needs a single pass to the data, applying `IDF` needs two passes:
-first to compute the IDF vector and second to scale the term frequencies by IDF.
-
-{% highlight scala %}
-import org.apache.spark.mllib.feature.IDF
-
-// ... continue from the previous example
-tf.cache()
-val idf = new IDF().fit(tf)
-val tfidf: RDD[Vector] = idf.transform(tf)
-{% endhighlight %}
-
-`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a
-minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
-can be used by passing the `minDocFreq` value to the IDF constructor.
-
-{% highlight scala %}
-import org.apache.spark.mllib.feature.IDF
-
-// ... continue from the previous example
-tf.cache()
-val idf = new IDF(minDocFreq = 2).fit(tf)
-val tfidf: RDD[Vector] = idf.transform(tf)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/TFIDFExample.scala %}
 </div>
 <div data-lang="python" markdown="1">
@@ -109,41 +70,7 @@ Each record could be an iterable of strings or other types.
 Refer to the [`HashingTF` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.HashingTF) for details on the API.
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.feature import HashingTF
-
-sc = SparkContext()
-
-# Load documents (one per line).
-documents = sc.textFile("...").map(lambda line: line.split(" "))
-
-hashingTF = HashingTF()
-tf = hashingTF.transform(documents)
-{% endhighlight %}
-
-While applying `HashingTF` only needs a single pass to the data, applying `IDF` needs two passes:
-first to compute the IDF vector and second to scale the term frequencies by IDF.
-
-{% highlight python %}
-from pyspark.mllib.feature import IDF
-
-# ... continue from the previous example
-tf.cache()
-idf = IDF().fit(tf)
-tfidf = idf.transform(tf)
-{% endhighlight %}
-
-`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a
-minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
-can be used by passing the `minDocFreq` value to the IDF constructor.
-
-{% highlight python %}
-# ... continue from the previous example
-tf.cache()
-idf = IDF(minDocFreq=2).fit(tf)
-tfidf = idf.transform(tf)
-{% endhighlight %}
+{% include_example python/mllib/tf_idf_example.py %}
 </div>
 </div>
@@ -192,47 +119,12 @@ Here we assume the extracted file is `text8` and in same directory as you run th
 <div data-lang="scala" markdown="1">
 Refer to the [`Word2Vec` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.Word2Vec) for details on the API.
-{% highlight scala %}
-import org.apache.spark._
-import org.apache.spark.rdd._
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
-
-val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
-
-val word2vec = new Word2Vec()
-
-val model = word2vec.fit(input)
-
-val synonyms = model.findSynonyms("china", 40)
-
-for((synonym, cosineSimilarity) <- synonyms) {
-  println(s"$synonym $cosineSimilarity")
-}
-
-// Save and load model
-model.save(sc, "myModelPath")
-val sameModel = Word2VecModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/Word2VecExample.scala %}
 </div>
 <div data-lang="python" markdown="1">
 Refer to the [`Word2Vec` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.Word2Vec) for more details on the API.
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.feature import Word2Vec
-
-sc = SparkContext(appName='Word2Vec')
-inp = sc.textFile("text8_lines").map(lambda row: row.split(" "))
-
-word2vec = Word2Vec()
-model = word2vec.fit(inp)
-
-synonyms = model.findSynonyms('china', 40)
-
-for word, cosine_distance in synonyms:
-    print("{}: {}".format(word, cosine_distance))
-{% endhighlight %}
+{% include_example python/mllib/word2vec_example.py %}
 </div>
 </div>
@@ -277,55 +169,13 @@ so that the new features have unit standard deviation and/or zero mean.
 <div data-lang="scala" markdown="1">
 Refer to the [`StandardScaler` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler) for details on the API.
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.StandardScaler
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.util.MLUtils
-
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-
-val scaler1 = new StandardScaler().fit(data.map(x => x.features))
-val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
-// scaler3 is an identical model to scaler2, and will produce identical transformations
-val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
-
-// data1 will be unit variance.
-val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
-
-// Without converting the features into dense vectors, transformation with zero mean will raise
-// exception on sparse vector.
-// data2 will be unit variance and zero mean.
-val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/StandardScalerExample.scala %}
 </div>
 <div data-lang="python" markdown="1">
 Refer to the [`StandardScaler` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.StandardScaler) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.util import MLUtils
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import StandardScaler
-
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-label = data.map(lambda x: x.label)
-features = data.map(lambda x: x.features)
-
-scaler1 = StandardScaler().fit(features)
-scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
-# scaler3 is an identical model to scaler2, and will produce identical transformations
-scaler3 = StandardScalerModel(scaler2.std, scaler2.mean)
-
-
-# data1 will be unit variance.
-data1 = label.zip(scaler1.transform(features))
-
-# Without converting the features into dense vectors, transformation with zero mean will raise
-# exception on sparse vector.
-# data2 will be unit variance and zero mean.
-data2 = label.zip(scaler1.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
-{% endhighlight %}
+{% include_example python/mllib/standard_scaler_example.py %}
 </div>
 </div>
@@ -355,46 +205,13 @@ with $L^2$ norm, and $L^\infty$ norm.
 <div data-lang="scala" markdown="1">
 Refer to the [`Normalizer` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.Normalizer) for details on the API.
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.Normalizer
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.util.MLUtils
-
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-
-val normalizer1 = new Normalizer()
-val normalizer2 = new Normalizer(p = Double.PositiveInfinity)
-
-// Each sample in data1 will be normalized using $L^2$ norm.
-val data1 = data.map(x => (x.label, normalizer1.transform(x.features)))
-
-// Each sample in data2 will be normalized using $L^\infty$ norm.
-val data2 = data.map(x => (x.label, normalizer2.transform(x.features)))
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/NormalizerExample.scala %}
 </div>
 <div data-lang="python" markdown="1">
 Refer to the [`Normalizer` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.Normalizer) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.util import MLUtils
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import Normalizer
-
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-labels = data.map(lambda x: x.label)
-features = data.map(lambda x: x.features)
-
-normalizer1 = Normalizer()
-normalizer2 = Normalizer(p=float("inf"))
-
-# Each sample in data1 will be normalized using $L^2$ norm.
-data1 = labels.zip(normalizer1.transform(features))
-
-# Each sample in data2 will be normalized using $L^\infty$ norm.
-data2 = labels.zip(normalizer2.transform(features))
-{% endhighlight %}
+{% include_example python/mllib/normalizer_example.py %}
 </div>
 </div>
@@ -435,29 +252,7 @@ The following example shows the basic use of ChiSqSelector. The data set used ha
 Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) for details on the API.
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.mllib.feature.ChiSqSelector
-
-// Load some data in libsvm format
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-// Discretize data in 16 equal bins since ChiSqSelector requires categorical features
-// Even though features are doubles, the ChiSqSelector treats each unique value as a category
-val discretizedData = data.map { lp =>
-  LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor } ) )
-}
-// Create ChiSqSelector that will select top 50 of 692 features
-val selector = new ChiSqSelector(50)
-// Create ChiSqSelector model (selecting features)
-val transformer = selector.fit(discretizedData)
-// Filter the top 50 features from each feature vector
-val filteredData = discretizedData.map { lp =>
-  LabeledPoint(lp.label, transformer.transform(lp.features))
-}
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala %}
 </div>
 <div data-lang="java" markdown="1">
@@ -465,52 +260,7 @@ val filteredData = discretizedData.map { lp =>
 Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html) for details on the API.
-{% highlight java %}
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.feature.ChiSqSelector;
-import org.apache.spark.mllib.feature.ChiSqSelectorModel;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-
-SparkConf sparkConf = new SparkConf().setAppName("JavaChiSqSelector");
-JavaSparkContext sc = new JavaSparkContext(sparkConf);
-JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(sc.sc(),
-    "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache();
-
-// Discretize data in 16 equal bins since ChiSqSelector requires categorical features
-// Even though features are doubles, the ChiSqSelector treats each unique value as a category
-JavaRDD<LabeledPoint> discretizedData = points.map(
-  new Function<LabeledPoint, LabeledPoint>() {
-    @Override
-    public LabeledPoint call(LabeledPoint lp) {
-      final double[] discretizedFeatures = new double[lp.features().size()];
-      for (int i = 0; i < lp.features().size(); ++i) {
-        discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16);
-      }
-      return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures));
-    }
-  });
-
-// Create ChiSqSelector that will select top 50 of 692 features
-ChiSqSelector selector = new ChiSqSelector(50);
-// Create ChiSqSelector model (selecting features)
-final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd());
-// Filter the top 50 features from each feature vector
-JavaRDD<LabeledPoint> filteredData = discretizedData.map(
-  new Function<LabeledPoint, LabeledPoint>() {
-    @Override
-    public LabeledPoint call(LabeledPoint lp) {
-      return new LabeledPoint(lp.label(), transformer.transform(lp.features()));
-    }
-  }
-);
-
-sc.stop();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %}
 </div>
 </div>
@@ -554,78 +304,19 @@ This example below demonstrates how to transform vectors using a transforming ve
 Refer to
 the [`ElementwiseProduct` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct)
 for details on the API.
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.ElementwiseProduct
-import org.apache.spark.mllib.linalg.Vectors
-
-// Create some vector data; also works for sparse vectors
-val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)))
-
-val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
-val transformer = new ElementwiseProduct(transformingVector)
-
-// Batch transform and per-row transform give the same results:
-val transformedData = transformer.transform(data)
-val transformedData2 = data.map(x => transformer.transform(x))
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala %}
 </div>
 <div data-lang="java" markdown="1">
 Refer to the [`ElementwiseProduct` Java docs](api/java/org/apache/spark/mllib/feature/ElementwiseProduct.html) for details on the API.
-{% highlight java %}
-import java.util.Arrays;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.feature.ElementwiseProduct;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
-
-// Create some vector data; also works for sparse vectors
-JavaRDD<Vector> data = sc.parallelize(Arrays.asList(
-  Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)));
-Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
-ElementwiseProduct transformer = new ElementwiseProduct(transformingVector);
-
-// Batch transform and per-row transform give the same results:
-JavaRDD<Vector> transformedData = transformer.transform(data);
-JavaRDD<Vector> transformedData2 = data.map(
-  new Function<Vector, Vector>() {
-    @Override
-    public Vector call(Vector v) {
-      return transformer.transform(v);
-    }
-  }
-);
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java %}
 </div>
 <div data-lang="python" markdown="1">
 Refer to the [`ElementwiseProduct` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ElementwiseProduct) for more details on the API.
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import ElementwiseProduct
-
-# Load and parse the data
-sc = SparkContext()
-data = sc.textFile("data/mllib/kmeans_data.txt")
-parsedData = data.map(lambda x: [float(t) for t in x.split(" ")])
-
-# Create weight vector.
-transformingVector = Vectors.dense([0.0, 1.0, 2.0])
-transformer = ElementwiseProduct(transformingVector)
-
-# Batch transform
-transformedData = transformer.transform(parsedData)
-# Single-row transform
-transformedData2 = transformer.transform(parsedData.first())
-
-{% endhighlight %}
+{% include_example python/mllib/elementwise_product_example.py %}
 </div>
 </div>
@@ -645,44 +336,6 @@ for calculation a [Linear Regression]((mllib-linear-methods.html))
 <div data-lang="scala" markdown="1">
 Refer to the [`PCA` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.PCA) for details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.regression.LinearRegressionWithSGD
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.feature.PCA
-
-val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
-  val parts = line.split(',')
-  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
-}.cache()
-
-val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
-val training = splits(0).cache()
-val test = splits(1)
-
-val pca = new PCA(training.first().features.size/2).fit(data.map(_.features))
-val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
-val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
-
-val numIterations = 100
-val model = LinearRegressionWithSGD.train(training, numIterations)
-val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)
-
-val valuesAndPreds = test.map { point =>
-  val score = model.predict(point.features)
-  (score, point.label)
-}
-
-val valuesAndPreds_pca = test_pca.map { point =>
-  val score = model_pca.predict(point.features)
-  (score, point.label)
-}
-
-val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
-val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean()
-
-println("Mean Squared Error = " + MSE)
-println("PCA Mean Squared Error = " + MSE_pca)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/PCAExample.scala %}
 </div>
 </div>
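For reference, the new `include_example` tags pull their code from standalone, compiled files under the `examples/` module instead of snippets pasted into the markdown, which helps keep the docs in sync with code that actually builds. Below is a minimal sketch of what the Scala TF-IDF example might look like, assembled from the inline snippet removed above; the object name, `main` scaffolding, and input path are assumptions for illustration, not the contents of the actual `TFIDFExample.scala` file.

{% highlight scala %}
// Hypothetical sketch, reconstructed from the removed inline TF-IDF snippet.
// The real TFIDFExample.scala in the Spark examples module may differ.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

object TFIDFExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TFIDFExample")
    val sc = new SparkContext(conf)

    // Load documents, one per line; the path is a placeholder for your own corpus.
    val documents: RDD[Seq[String]] = sc.textFile("data/mllib/documents.txt")
      .map(_.split(" ").toSeq)

    // HashingTF needs only a single pass over the data.
    val hashingTF = new HashingTF()
    val tf: RDD[Vector] = hashingTF.transform(documents)

    // IDF needs two passes: one to compute the IDF vector, one to scale the term frequencies.
    tf.cache()
    val idf = new IDF().fit(tf)
    val tfidf: RDD[Vector] = idf.transform(tf)

    // Optionally ignore terms appearing in fewer than minDocFreq documents (their IDF is set to 0).
    val idfIgnore = new IDF(minDocFreq = 2).fit(tf)
    val tfidfIgnore: RDD[Vector] = idfIgnore.transform(tf)

    sc.stop()
  }
}
{% endhighlight %}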