aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFeynman Liang <fliang@databricks.com>2015-08-27 16:10:37 -0700
committerXiangrui Meng <meng@databricks.com>2015-08-27 16:10:37 -0700
commit5bfe9e1111d9862084586549a7dc79476f67bab9 (patch)
tree1fca589260438d9eb39fc57d724739f9e4b264ac
parentc94ecdfc5b3c0fe6c38a170dc2af9259354dc9e3 (diff)
downloadspark-5bfe9e1111d9862084586549a7dc79476f67bab9.tar.gz
spark-5bfe9e1111d9862084586549a7dc79476f67bab9.tar.bz2
spark-5bfe9e1111d9862084586549a7dc79476f67bab9.zip
[SPARK-9680] [MLLIB] [DOC] StopWordsRemovers user guide and Java compatibility test
* Adds user guide for ml.feature.StopWordsRemovers, ran code examples on my machine * Cleans up scaladocs for public methods * Adds test for Java compatibility * Follow up Python user guide code example is tracked by SPARK-10249 Author: Feynman Liang <fliang@databricks.com> Closes #8436 from feynmanliang/SPARK-10230.
-rw-r--r--docs/ml-features.md102
-rw-r--r--mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java72
2 files changed, 171 insertions, 3 deletions
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 62de483898..89a9bad570 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -306,15 +306,111 @@ regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern=
</div>
</div>
+## StopWordsRemover
+[Stop words](https://en.wikipedia.org/wiki/Stop_words) are words which
+should be excluded from the input, typically because the words appear
+frequently and don't carry much meaning.
+
+`StopWordsRemover` takes as input a sequence of strings (e.g. the output
+of a [Tokenizer](ml-features.html#tokenizer)) and drops all the stop
+words from the input sequences. The list of stop words is specified by
+the `stopWords` parameter. We provide [a list of stop
+words](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) by
+default, accessible by calling `getStopWords` on a newly instantiated
+`StopWordsRemover` instance.
-## $n$-gram
+**Examples**
-An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams.
+Assume that we have the following DataFrame with columns `id` and `raw`:
-`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than `n` strings, no output is produced.
+~~~~
+ id | raw
+----|----------
+ 0 | [I, saw, the, red, baloon]
+ 1 | [Mary, had, a, little, lamb]
+~~~~
+
+Applying `StopWordsRemover` with `raw` as the input column and `filtered` as the output
+column, we should get the following:
+
+~~~~
+ id | raw | filtered
+----|-----------------------------|--------------------
+ 0 | [I, saw, the, red, baloon] | [saw, red, baloon]
+ 1 | [Mary, had, a, little, lamb]|[Mary, little, lamb]
+~~~~
+
+In `filtered`, the stop words "I", "the", "had", and "a" have been
+filtered out.
<div class="codetabs">
+
<div data-lang="scala" markdown="1">
+
+[`StopWordsRemover`](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover)
+takes an input column name, an output column name, a list of stop words,
+and a boolean indicating if the matches should be case sensitive (false
+by default).
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.StopWordsRemover
+
+val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+val dataSet = sqlContext.createDataFrame(Seq(
+ (0, Seq("I", "saw", "the", "red", "baloon")),
+ (1, Seq("Mary", "had", "a", "little", "lamb"))
+)).toDF("id", "raw")
+
+remover.transform(dataSet).show()
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+[`StopWordsRemover`](api/java/org/apache/spark/ml/feature/StopWordsRemover.html)
+takes an input column name, an output column name, a list of stop words,
+and a boolean indicating if the matches should be case sensitive (false
+by default).
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StopWordsRemover;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+StopWordsRemover remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered");
+
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+});
+DataFrame dataset = jsql.createDataFrame(rdd, schema);
+
+remover.transform(dataset).show();
+{% endhighlight %}
+</div>
+</div>
+
+## $n$-gram
+
+An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams.
+
+`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer)). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than `n` strings, no output is produced.
+
<div class="codetabs">
<div data-lang="scala" markdown="1">
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java
new file mode 100644
index 0000000000..76cdd0fae8
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+
+public class JavaStopWordsRemoverSuite { // Java-API compatibility smoke test: configures and applies StopWordsRemover from Java code.
+
+ private transient JavaSparkContext jsc; // created in setUp(), stopped and cleared in tearDown()
+ private transient SQLContext jsql; // built on top of jsc in setUp(); used to create the input DataFrame
+
+ @Before
+ public void setUp() { // Creates the Spark contexts used by the test.
+ jsc = new JavaSparkContext("local", "JavaStopWordsRemoverSuite"); // arguments: master "local", app name = suite name
+ jsql = new SQLContext(jsc);
+ }
+
+ @After
+ public void tearDown() { // Stops the Spark context created in setUp().
+ jsc.stop();
+ jsc = null; // only jsc is cleared here; jsql keeps its reference until setUp() reassigns it
+ }
+
+ @Test
+ public void javaCompatibilityTest() { // Runs the remover over a two-row DataFrame; makes no assertions on the output.
+ StopWordsRemover remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered");
+
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList( // two rows, each holding a single list-of-strings value
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+ ));
+ StructType schema = new StructType(new StructField[] { // single column "raw": array of strings, nullable flag false
+ new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+ });
+ DataFrame dataset = jsql.createDataFrame(rdd, schema);
+
+ remover.transform(dataset).collect(); // result is discarded; the test passes as long as this call chain does not throw
+ }
+}