aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/ml/tests.py
diff options
context:
space:
mode:
authorBurak Köse <burakks41@gmail.com>2016-05-06 13:58:12 -0700
committerXiangrui Meng <meng@databricks.com>2016-05-06 13:58:12 -0700
commite20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch)
treeea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9 /python/pyspark/ml/tests.py
parent5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff)
downloadspark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip
[SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
## What changes were proposed in this pull request? This PR continues the work from #11871 with the following changes: * load English stopwords as default * convert stopwords to a list in Python * update some tests and doc ## How was this patch tested? Unit tests. Closes #11871 cc: burakkose srowen Author: Burak Köse <burakks41@gmail.com> Author: Xiangrui Meng <meng@databricks.com> Author: Burak KOSE <burakks41@gmail.com> Closes #12843 from mengxr/SPARK-14050.
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rwxr-xr-x[-rw-r--r--]python/pyspark/ml/tests.py7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 78ec96af8a..ad1631fb5b 100644..100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -417,6 +417,13 @@ class FeatureTests(PySparkTestCase):
self.assertEqual(stopWordRemover.getStopWords(), stopwords)
transformedDF = stopWordRemover.transform(dataset)
self.assertEqual(transformedDF.head().output, ["a"])
+ # with language selection
+ stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
+ dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
+ stopWordRemover.setStopWords(stopwords)
+ self.assertEqual(stopWordRemover.getStopWords(), stopwords)
+ transformedDF = stopWordRemover.transform(dataset)
+ self.assertEqual(transformedDF.head().output, [])
def test_count_vectorizer_with_binary(self):
sqlContext = SQLContext(self.sc)