diff options
author | Burak Köse <burakks41@gmail.com> | 2016-05-06 13:58:12 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-05-06 13:58:12 -0700 |
commit | e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch) | |
tree | ea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9 /python/pyspark/ml/tests.py | |
parent | 5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff) | |
download | spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2 spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip |
[SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
## What changes were proposed in this pull request?
This PR continues the work from #11871 with the following changes:
* load English stopwords as default
* convert stopwords to list in Python
* update some tests and doc
## How was this patch tested?
Unit tests.
Closes #11871
cc: burakkose srowen
Author: Burak Köse <burakks41@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>
Author: Burak KOSE <burakks41@gmail.com>
Closes #12843 from mengxr/SPARK-14050.
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rwxr-xr-x[-rw-r--r--] | python/pyspark/ml/tests.py | 7 |
1 files changed, 7 insertions, 0 deletions
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 78ec96af8a..ad1631fb5b 100644..100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -417,6 +417,13 @@ class FeatureTests(PySparkTestCase):
         self.assertEqual(stopWordRemover.getStopWords(), stopwords)
         transformedDF = stopWordRemover.transform(dataset)
         self.assertEqual(transformedDF.head().output, ["a"])
+        # with language selection
+        stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
+        dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
+        stopWordRemover.setStopWords(stopwords)
+        self.assertEqual(stopWordRemover.getStopWords(), stopwords)
+        transformedDF = stopWordRemover.transform(dataset)
+        self.assertEqual(transformedDF.head().output, [])
 
     def test_count_vectorizer_with_binary(self):
         sqlContext = SQLContext(self.sc)