path: root/docs/ml-features.md
author      Xusen Yin <yinxusen@gmail.com>    2015-10-26 21:17:53 -0700
committer   Xiangrui Meng <meng@databricks.com>    2015-10-26 21:17:53 -0700
commit      943d4fa204a827ca8ecc39d9cf04e86890ee9840 (patch)
tree        f50ba101226bb1f0c0e11a3cffe00aa4daa166f1 /docs/ml-features.md
parent      a150e6c1b03b64a35855b8074b2fe077a6081a34 (diff)
[SPARK-11289][DOC] Substitute code examples in ML features extractors with include_example
mengxr https://issues.apache.org/jira/browse/SPARK-11289 I made some changes to the ML feature extractors, i.e. TF-IDF, Word2Vec, and CountVectorizer. I added new example code under spark/examples; I hope that is the right place for those examples. Author: Xusen Yin <yinxusen@gmail.com> Closes #9266 from yinxusen/SPARK-11289.
Diffstat (limited to 'docs/ml-features.md')
-rw-r--r--  docs/ml-features.md  217
1 file changed, 8 insertions, 209 deletions
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 44a9882939..142afac2f3 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -37,23 +37,7 @@ In the following code segment, we start with a set of sentences. We split each
Refer to the [HashingTF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and
the [IDF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.IDF) for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
-
-val sentenceData = sqlContext.createDataFrame(Seq(
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
-)).toDF("label", "sentence")
-val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsData = tokenizer.transform(sentenceData)
-val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
-val featurizedData = hashingTF.transform(wordsData)
-val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
-val idfModel = idf.fit(featurizedData)
-val rescaledData = idfModel.transform(featurizedData)
-rescaledData.select("features", "label").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/TfIdfExample.scala %}
</div>
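The `include_example` tag above points at a standalone program that is not part of this diff (the diffstat is limited to `docs/ml-features.md`); as far as I understand Spark's docs build, the Jekyll plugin embeds only the region between `// $example on$` and `// $example off$` markers in that file. Below is a minimal sketch of what `TfIdfExample.scala` might look like, assuming it simply wraps the removed inline snippet in a runnable object; the object name comes from the include path, and everything outside the markers is an assumption.

{% highlight scala %}
// Sketch only: the real TfIdfExample.scala is not shown in this diff; the boilerplate
// outside the $example on$/$example off$ markers is assumed.
package org.apache.spark.examples.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$

object TfIdfExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TfIdfExample"))
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "sentence")
    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)
    // Hash each bag of words into a fixed-size term-frequency vector.
    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    // Rescale the raw term frequencies by inverse document frequency.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
{% endhighlight %}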
<div data-lang="java" markdown="1">
@@ -61,49 +45,7 @@ rescaledData.select("features", "label").take(3).foreach(println)
Refer to the [HashingTF Java docs](api/java/org/apache/spark/ml/feature/HashingTF.html) and the
[IDF Java docs](api/java/org/apache/spark/ml/feature/IDF.html) for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.HashingTF;
-import org.apache.spark.ml.feature.IDF;
-import org.apache.spark.ml.feature.IDFModel;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(0.0, "Hi I heard about Spark"),
-  RowFactory.create(0.0, "I wish Java could use case classes"),
-  RowFactory.create(1.0, "Logistic regression models are neat")
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
- new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
-});
-DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
-Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
-DataFrame wordsData = tokenizer.transform(sentenceData);
-int numFeatures = 20;
-HashingTF hashingTF = new HashingTF()
- .setInputCol("words")
- .setOutputCol("rawFeatures")
- .setNumFeatures(numFeatures);
-DataFrame featurizedData = hashingTF.transform(wordsData);
-IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
-IDFModel idfModel = idf.fit(featurizedData);
-DataFrame rescaledData = idfModel.transform(featurizedData);
-for (Row r : rescaledData.select("features", "label").take(3)) {
- Vector features = r.getAs(0);
- Double label = r.getDouble(1);
- System.out.println(features);
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaTfIdfExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -111,24 +53,7 @@ for (Row r : rescaledData.select("features", "label").take(3)) {
Refer to the [HashingTF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF) and
the [IDF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.IDF) for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import HashingTF, IDF, Tokenizer
-
-sentenceData = sqlContext.createDataFrame([
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
-], ["label", "sentence"])
-tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
-wordsData = tokenizer.transform(sentenceData)
-hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
-featurizedData = hashingTF.transform(wordsData)
-idf = IDF(inputCol="rawFeatures", outputCol="features")
-idfModel = idf.fit(featurizedData)
-rescaledData = idfModel.transform(featurizedData)
-for features_label in rescaledData.select("features", "label").take(3):
- print(features_label)
-{% endhighlight %}
+{% include_example python/ml/tf_idf_example.py %}
</div>
</div>
@@ -149,26 +74,7 @@ In the following code segment, we start with a set of documents, each of which i
Refer to the [Word2Vec Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.Word2Vec
-
-// Input data: Each row is a bag of words from a sentence or document.
-val documentDF = sqlContext.createDataFrame(Seq(
- "Hi I heard about Spark".split(" "),
- "I wish Java could use case classes".split(" "),
- "Logistic regression models are neat".split(" ")
-).map(Tuple1.apply)).toDF("text")
-
-// Learn a mapping from words to Vectors.
-val word2Vec = new Word2Vec()
- .setInputCol("text")
- .setOutputCol("result")
- .setVectorSize(3)
- .setMinCount(0)
-val model = word2Vec.fit(documentDF)
-val result = model.transform(documentDF)
-result.select("result").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/Word2VecExample.scala %}
</div>
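Likewise, the referenced `Word2VecExample.scala` is outside this diff. A sketch under the same assumptions, wrapping the removed Word2Vec snippet between the `$example on$`/`$example off$` markers:

{% highlight scala %}
// Sketch only: the real Word2VecExample.scala is not shown in this diff; the surrounding
// boilerplate is assumed.
package org.apache.spark.examples.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
// $example on$
import org.apache.spark.ml.feature.Word2Vec
// $example off$

object Word2VecExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Word2VecExample"))
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
{% endhighlight %}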
<div data-lang="java" markdown="1">
@@ -176,43 +82,7 @@ result.select("result").take(3).foreach(println)
Refer to the [Word2Vec Java docs](api/java/org/apache/spark/ml/feature/Word2Vec.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.ml.feature.Word2Vec;
-import org.apache.spark.ml.feature.Word2VecModel;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.*;
-
-JavaSparkContext jsc = ...
-SQLContext sqlContext = ...
-
-// Input data: Each row is a bag of words from a sentence or document.
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
- RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
- RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
-});
-DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
-
-// Learn a mapping from words to Vectors.
-Word2Vec word2Vec = new Word2Vec()
- .setInputCol("text")
- .setOutputCol("result")
- .setVectorSize(3)
- .setMinCount(0);
-Word2VecModel model = word2Vec.fit(documentDF);
-DataFrame result = model.transform(documentDF);
-for (Row r: result.select("result").take(3)) {
- System.out.println(r);
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaWord2VecExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -220,22 +90,7 @@ for (Row r: result.select("result").take(3)) {
Refer to the [Word2Vec Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import Word2Vec
-
-# Input data: Each row is a bag of words from a sentence or document.
-documentDF = sqlContext.createDataFrame([
- ("Hi I heard about Spark".split(" "), ),
- ("I wish Java could use case classes".split(" "), ),
- ("Logistic regression models are neat".split(" "), )
-], ["text"])
-# Learn a mapping from words to Vectors.
-word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
-model = word2Vec.fit(documentDF)
-result = model.transform(documentDF)
-for feature in result.select("result").take(3):
- print(feature)
-{% endhighlight %}
+{% include_example python/ml/word2vec_example.py %}
</div>
</div>
@@ -283,30 +138,7 @@ Refer to the [CountVectorizer Scala docs](api/scala/index.html#org.apache.spark.
and the [CountVectorizerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.CountVectorizer
-import org.apache.spark.ml.feature.CountVectorizerModel
-
-val df = sqlContext.createDataFrame(Seq(
- (0, Array("a", "b", "c")),
- (1, Array("a", "b", "b", "c", "a"))
-)).toDF("id", "words")
-
-// fit a CountVectorizerModel from the corpus
-val cvModel: CountVectorizerModel = new CountVectorizer()
- .setInputCol("words")
- .setOutputCol("features")
- .setVocabSize(3)
-  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
- .fit(df)
-
-// alternatively, define CountVectorizerModel with a-priori vocabulary
-val cvm = new CountVectorizerModel(Array("a", "b", "c"))
- .setInputCol("words")
- .setOutputCol("features")
-
-cvModel.transform(df).select("features").show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/CountVectorizerExample.scala %}
</div>
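`CountVectorizerExample.scala` is also not shown in this diff. A sketch under the same assumptions, with `CountVectorizerModel` imported from `org.apache.spark.ml.feature`:

{% highlight scala %}
// Sketch only: the real CountVectorizerExample.scala is not shown in this diff; the
// surrounding boilerplate is assumed.
package org.apache.spark.examples.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$

object CountVectorizerExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CountVectorizerExample"))
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
{% endhighlight %}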
<div data-lang="java" markdown="1">
@@ -315,40 +147,7 @@ Refer to the [CountVectorizer Java docs](api/java/org/apache/spark/ml/feature/Co
and the [CountVectorizerModel Java docs](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.CountVectorizer;
-import org.apache.spark.ml.feature.CountVectorizerModel;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-
-// Input data: Each row is a bag of words from a sentence or document.
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(Arrays.asList("a", "b", "c")),
- RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
-));
-StructType schema = new StructType(new StructField [] {
- new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
-});
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-
-// fit a CountVectorizerModel from the corpus
-CountVectorizerModel cvModel = new CountVectorizer()
- .setInputCol("text")
- .setOutputCol("feature")
- .setVocabSize(3)
-  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
- .fit(df);
-
-// alternatively, define CountVectorizerModel with a-priori vocabulary
-CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
- .setInputCol("text")
- .setOutputCol("feature");
-
-cvModel.transform(df).show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java %}
</div>
</div>