[SPARK-11289][DOC] Substitute code examples in ML features extractors with include_example

mengxr https://issues.apache.org/jira/browse/SPARK-11289 I made some changes to the ML feature extractors, i.e. TF-IDF, Word2Vec, and CountVectorizer. I added new example code under spark/examples; I hope that is the right place for those examples. Author: Xusen Yin <yinxusen@gmail.com> Closes #9266 from yinxusen/SPARK-11289.
author: Xusen Yin <yinxusen@gmail.com> 2015-10-26 21:17:53 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-10-26 21:17:53 -0700
commit: 943d4fa204a827ca8ecc39d9cf04e86890ee9840 (patch)
tree: f50ba101226bb1f0c0e11a3cffe00aa4daa166f1 /examples
parent: a150e6c1b03b64a35855b8074b2fe077a6081a34 (diff)
download: spark-943d4fa204a827ca8ecc39d9cf04e86890ee9840.tar.gz
spark-943d4fa204a827ca8ecc39d9cf04e86890ee9840.tar.bz2
spark-943d4fa204a827ca8ecc39d9cf04e86890ee9840.zip
8 files changed, 472 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
new file mode 100644
index 0000000000..ac33adb652
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.CountVectorizer;
+import org.apache.spark.ml.feature.CountVectorizerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaCountVectorizerExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // Input data: Each row is a bag of words from a sentence or document.
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(Arrays.asList("a", "b", "c")),
+      RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
+    ));
+    StructType schema = new StructType(new StructField[]{
+      new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+    });
+    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+    // Fit a CountVectorizerModel from the corpus.
+    CountVectorizerModel cvModel = new CountVectorizer()
+      .setInputCol("text").setOutputCol("feature")
+      .setVocabSize(3)
+      .setMinDF(2)
+      .fit(df);
+
+    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary (unused below).
+    CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
+      .setInputCol("text").setOutputCol("feature");
+
+    // Transform the corpus with the fitted model and print the result.
+    cvModel.transform(df).show();
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
new file mode 100644
index 0000000000..a41a5ec9bf
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.HashingTF;
+import org.apache.spark.ml.feature.IDF;
+import org.apache.spark.ml.feature.IDFModel;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaTfIdfExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaTfIdfExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // Labels are doubles so each Row matches the DoubleType "label" column declared below.
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0.0, "Hi I heard about Spark"),
+      RowFactory.create(0.0, "I wish Java could use case classes"),
+      RowFactory.create(1.0, "Logistic regression models are neat")
+    ));
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+    });
+    DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
+    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+    DataFrame wordsData = tokenizer.transform(sentenceData);
+    int numFeatures = 20;
+    HashingTF hashingTF = new HashingTF()
+      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(numFeatures);
+    DataFrame featurizedData = hashingTF.transform(wordsData);
+    // Fit IDF on the raw term frequencies, then rescale them into the "features" column.
+    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
+    IDFModel idfModel = idf.fit(featurizedData);
+    DataFrame rescaledData = idfModel.transform(featurizedData);
+    for (Row r : rescaledData.select("features", "label").take(3)) {
+      Vector features = r.getAs(0);
+      double label = r.getDouble(1);
+      System.out.println(features);
+      System.out.println(label);
+    }
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
new file mode 100644
index 0000000000..d472375ca9
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.Word2Vec;
+import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaWord2VecExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // Input data: Each row is a bag of words from a sentence or document.
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
+      RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
+      RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
+    ));
+    StructType schema = new StructType(new StructField[]{
+      new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+    });
+    DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
+
+    // Learn a mapping from words to Vectors.
+    Word2Vec word2Vec = new Word2Vec()
+      .setInputCol("text").setOutputCol("result")
+      .setVectorSize(3).setMinCount(0);
+    Word2VecModel model = word2Vec.fit(documentDF);
+    DataFrame result = model.transform(documentDF);
+    // Print the "result" column of up to three rows.
+    for (Row r : result.select("result").take(3)) {
+      System.out.println(r);
+    }
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
new file mode 100644
index 0000000000..c92313378e
--- /dev/null
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -0,0 +1,47 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.ml.feature import HashingTF, IDF, Tokenizer
+# $example off$
+from pyspark.sql import SQLContext
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="TfIdfExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    sentenceData = sqlContext.createDataFrame([
+        (0, "Hi I heard about Spark"),
+        (0, "I wish Java could use case classes"),
+        (1, "Logistic regression models are neat")
+    ], ["label", "sentence"])
+    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+    wordsData = tokenizer.transform(sentenceData)
+    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
+    featurizedData = hashingTF.transform(wordsData)
+    idf = IDF(inputCol="rawFeatures", outputCol="features")
+    idfModel = idf.fit(featurizedData)
+    rescaledData = idfModel.transform(featurizedData)
+    for features_label in rescaledData.select("features", "label").take(3):
+        print(features_label)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
new file mode 100644
index 0000000000..53c77feb10
--- /dev/null
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Word2Vec
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="Word2VecExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # Input data: Each row is a bag of words from a sentence or document.
+    documentDF = sqlContext.createDataFrame([
+        ("Hi I heard about Spark".split(" "), ),
+        ("I wish Java could use case classes".split(" "), ),
+        ("Logistic regression models are neat".split(" "), )
+    ], ["text"])
+    # Learn a mapping from words to Vectors.
+    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
+    model = word2Vec.fit(documentDF)
+    result = model.transform(documentDF)
+    for feature in result.select("result").take(3):
+        print(feature)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
new file mode 100644
index 0000000000..ba916f66c4
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+
+object CountVectorizerExample {
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("CountVectorizerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val df = sqlContext.createDataFrame(Seq(
+      (0, Array("a", "b", "c")),
+      (1, Array("a", "b", "b", "c", "a"))
+    )).toDF("id", "words")
+
+    // Fit a CountVectorizerModel from the corpus.
+    val cvModel: CountVectorizerModel = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setVocabSize(3)
+      .setMinDF(2)
+      .fit(df)
+
+    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary (unused below).
+    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
+      .setInputCol("words")
+      .setOutputCol("features")
+
+    cvModel.transform(df).select("features").show()
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
new file mode 100644
index 0000000000..40c33e4e7d
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TfIdfExample {
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("TfIdfExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val sentenceData = sqlContext.createDataFrame(Seq(
+      (0, "Hi I heard about Spark"),
+      (0, "I wish Java could use case classes"),
+      (1, "Logistic regression models are neat")
+    )).toDF("label", "sentence")
+    // Tokenize, hash words into 20 term-frequency buckets, then rescale with a fitted IDF model.
+    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
+    val wordsData = tokenizer.transform(sentenceData)
+    val hashingTF = new HashingTF()
+      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
+    val featurizedData = hashingTF.transform(wordsData)
+    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
+    val rescaledData = idf.fit(featurizedData).transform(featurizedData)
+    rescaledData.select("features", "label").take(3).foreach(println)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
new file mode 100644
index 0000000000..631ab4c8ef
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Word2Vec
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object Word2VecExample {
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("Word2Vec example")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    // Input data: Each row is a bag of words from a sentence or document.
+    val documentDF = sqlContext.createDataFrame(Seq(
+      "Hi I heard about Spark".split(" "),
+      "I wish Java could use case classes".split(" "),
+      "Logistic regression models are neat".split(" ")
+    ).map(Tuple1.apply)).toDF("text")
+
+    // Learn a mapping from words to Vectors.
+    val word2Vec = new Word2Vec()
+      .setInputCol("text").setOutputCol("result")
+      .setVectorSize(3).setMinCount(0)
+    val model = word2Vec.fit(documentDF)
+    val result = model.transform(documentDF)
+    result.select("result").take(3).foreach(println)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
author	Xusen Yin <yinxusen@gmail.com>	2015-10-26 21:17:53 -0700
committer	Xiangrui Meng <meng@databricks.com>	2015-10-26 21:17:53 -0700
commit	943d4fa204a827ca8ecc39d9cf04e86890ee9840 (patch)
tree	f50ba101226bb1f0c0e11a3cffe00aa4daa166f1 /examples
parent	a150e6c1b03b64a35855b8074b2fe077a6081a34 (diff)
download	spark-943d4fa204a827ca8ecc39d9cf04e86890ee9840.tar.gz spark-943d4fa204a827ca8ecc39d9cf04e86890ee9840.tar.bz2 spark-943d4fa204a827ca8ecc39d9cf04e86890ee9840.zip