[SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover

## What changes were proposed in this pull request? This PR continues the work from #11871 with the following changes: * load English stopwords as default * convert stopwords to list in Python * update some tests and doc ## How was this patch tested? Unit tests. Closes #11871 cc: burakkose srowen Author: Burak Köse <burakks41@gmail.com> Author: Xiangrui Meng <meng@databricks.com> Author: Burak KOSE <burakks41@gmail.com> Closes #12843 from mengxr/SPARK-14050.
author: Burak Köse <burakks41@gmail.com> 2016-05-06 13:58:12 -0700
committer: Xiangrui Meng <meng@databricks.com> 2016-05-06 13:58:12 -0700
commit: e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch)
tree: ea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9 /mllib/src/test
parent: 5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff)
download: spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip
1 files changed, 55 insertions, 2 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 3505befdf8..8e7e000fbc 100644..100755
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -44,6 +44,24 @@ class StopWordsRemoverSuite
       .setOutputCol("filtered")
     val dataSet = sqlContext.createDataFrame(Seq(
       (Seq("test", "test"), Seq("test", "test")),
+      (Seq("a", "b", "c", "d"), Seq("b", "c")),
+      (Seq("a", "the", "an"), Seq()),
+      (Seq("A", "The", "AN"), Seq()),
+      (Seq(null), Seq(null)),
+      (Seq(), Seq())
+    )).toDF("raw", "expected")
+
+    testStopWordsRemover(remover, dataSet)
+  }
+
+  test("StopWordsRemover with particular stop words list") {
+    val stopWords = Array("test", "a", "an", "the")
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+      .setStopWords(stopWords)
+    val dataSet = sqlContext.createDataFrame(Seq(
+      (Seq("test", "test"), Seq()),
       (Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
       (Seq("a", "the", "an"), Seq()),
       (Seq("A", "The", "AN"), Seq()),
@@ -67,13 +85,48 @@ class StopWordsRemoverSuite
     testStopWordsRemover(remover, dataSet)
   }
 
-  test("StopWordsRemover with additional words") {
-    val stopWords = StopWords.English ++ Array("python", "scala")
+  test("default stop words of supported languages are not empty") {
+    StopWordsRemover.supportedLanguages.foreach { lang =>
+      assert(StopWordsRemover.loadDefaultStopWords(lang).nonEmpty,
+        s"The default stop words of $lang cannot be empty.")
+    }
+  }
+
+  test("StopWordsRemover with language selection") {
+    val stopWords = StopWordsRemover.loadDefaultStopWords("turkish")
     val remover = new StopWordsRemover()
       .setInputCol("raw")
       .setOutputCol("filtered")
       .setStopWords(stopWords)
     val dataSet = sqlContext.createDataFrame(Seq(
+      (Seq("acaba", "ama", "biri"), Seq()),
+      (Seq("hep", "her", "scala"), Seq("scala"))
+    )).toDF("raw", "expected")
+
+    testStopWordsRemover(remover, dataSet)
+  }
+
+  test("StopWordsRemover with ignored words") {
+    val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet -- Set("a")
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+      .setStopWords(stopWords.toArray)
+    val dataSet = sqlContext.createDataFrame(Seq(
+      (Seq("python", "scala", "a"), Seq("python", "scala", "a")),
+      (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift"))
+    )).toDF("raw", "expected")
+
+    testStopWordsRemover(remover, dataSet)
+  }
+
+  test("StopWordsRemover with additional words") {
+    val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet ++ Set("python", "scala")
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+      .setStopWords(stopWords.toArray)
+    val dataSet = sqlContext.createDataFrame(Seq(
       (Seq("python", "scala", "a"), Seq()),
       (Seq("Python", "Scala", "swift"), Seq("swift"))
     )).toDF("raw", "expected")
author	Burak Köse <burakks41@gmail.com>	2016-05-06 13:58:12 -0700
committer	Xiangrui Meng <meng@databricks.com>	2016-05-06 13:58:12 -0700
commit	e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch)
tree	ea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9 /mllib/src/test
parent	5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff)
download	spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2 spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip