aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
author Burak Köse <burakks41@gmail.com> 2016-05-06 13:58:12 -0700
committer Xiangrui Meng <meng@databricks.com> 2016-05-06 13:58:12 -0700
commit e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch)
tree ea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9 /mllib/src/test
parent 5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff)
download spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip
[SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
## What changes were proposed in this pull request? This PR continues the work from #11871 with the following changes: * load English stopwords as default * convert stopwords to list in Python * update some tests and doc ## How was this patch tested? Unit tests. Closes #11871 cc: burakkose srowen Author: Burak Köse <burakks41@gmail.com> Author: Xiangrui Meng <meng@databricks.com> Author: Burak KOSE <burakks41@gmail.com> Closes #12843 from mengxr/SPARK-14050.
Diffstat (limited to 'mllib/src/test')
-rwxr-xr-x[-rw-r--r--] mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala 57
1 files changed, 55 insertions, 2 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 3505befdf8..8e7e000fbc 100644..100755
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -44,6 +44,24 @@ class StopWordsRemoverSuite
.setOutputCol("filtered")
val dataSet = sqlContext.createDataFrame(Seq(
(Seq("test", "test"), Seq("test", "test")),
+ (Seq("a", "b", "c", "d"), Seq("b", "c")),
+ (Seq("a", "the", "an"), Seq()),
+ (Seq("A", "The", "AN"), Seq()),
+ (Seq(null), Seq(null)),
+ (Seq(), Seq())
+ )).toDF("raw", "expected")
+
+ testStopWordsRemover(remover, dataSet)
+ }
+
+ test("StopWordsRemover with particular stop words list") {
+ val stopWords = Array("test", "a", "an", "the")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords)
+ val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("test", "test"), Seq()),
(Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
(Seq("a", "the", "an"), Seq()),
(Seq("A", "The", "AN"), Seq()),
@@ -67,13 +85,48 @@ class StopWordsRemoverSuite
testStopWordsRemover(remover, dataSet)
}
- test("StopWordsRemover with additional words") {
- val stopWords = StopWords.English ++ Array("python", "scala")
+ test("default stop words of supported languages are not empty") {
+ StopWordsRemover.supportedLanguages.foreach { lang =>
+ assert(StopWordsRemover.loadDefaultStopWords(lang).nonEmpty,
+ s"The default stop words of $lang cannot be empty.")
+ }
+ }
+
+ test("StopWordsRemover with language selection") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("turkish")
val remover = new StopWordsRemover()
.setInputCol("raw")
.setOutputCol("filtered")
.setStopWords(stopWords)
val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("acaba", "ama", "biri"), Seq()),
+ (Seq("hep", "her", "scala"), Seq("scala"))
+ )).toDF("raw", "expected")
+
+ testStopWordsRemover(remover, dataSet)
+ }
+
+ test("StopWordsRemover with ignored words") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet -- Set("a")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords.toArray)
+ val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("python", "scala", "a"), Seq("python", "scala", "a")),
+ (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift"))
+ )).toDF("raw", "expected")
+
+ testStopWordsRemover(remover, dataSet)
+ }
+
+ test("StopWordsRemover with additional words") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet ++ Set("python", "scala")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords.toArray)
+ val dataSet = sqlContext.createDataFrame(Seq(
(Seq("python", "scala", "a"), Seq()),
(Seq("Python", "Scala", "swift"), Seq("swift"))
)).toDF("raw", "expected")