From e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 Mon Sep 17 00:00:00 2001
From: Burak Köse
Date: Fri, 6 May 2016 13:58:12 -0700
Subject: [SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What changes were proposed in this pull request?

This PR continues the work from #11871 with the following changes:
* load English stopwords as default
* convert stopwords to a list in Python
* update some tests and docs

## How was this patch tested?

Unit tests.

Closes #11871

cc: burakkose srowen

Author: Burak Köse
Author: Xiangrui Meng
Author: Burak KOSE

Closes #12843 from mengxr/SPARK-14050.
---
 python/pyspark/ml/tests.py | 7 +++++++
 1 file changed, 7 insertions(+)
 mode change 100644 => 100755 python/pyspark/ml/tests.py

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
old mode 100644
new mode 100755
index 78ec96af8a..ad1631fb5b
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -417,6 +417,13 @@ class FeatureTests(PySparkTestCase):
         self.assertEqual(stopWordRemover.getStopWords(), stopwords)
         transformedDF = stopWordRemover.transform(dataset)
         self.assertEqual(transformedDF.head().output, ["a"])
+        # with language selection
+        stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
+        dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
+        stopWordRemover.setStopWords(stopwords)
+        self.assertEqual(stopWordRemover.getStopWords(), stopwords)
+        transformedDF = stopWordRemover.transform(dataset)
+        self.assertEqual(transformedDF.head().output, [])
 
     def test_count_vectorizer_with_binary(self):
         sqlContext = SQLContext(self.sc)
--
cgit v1.2.3
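
The test above exercises the language-selection path of `StopWordsRemover.loadDefaultStopWords`. Below is a minimal, self-contained PySpark sketch (not part of the patch) showing how the feature described in the PR is meant to be used; the `SparkContext`/`SQLContext` setup and the sample sentences are illustrative assumptions, not taken from the patch.

```python
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import StopWordsRemover

# Illustrative setup; in the test suite these come from the test fixture instead.
sc = SparkContext(appName="StopWordsRemoverSketch")
sqlContext = SQLContext(sc)

# English stop words are loaded by default after this PR.
remover = StopWordsRemover(inputCol="input", outputCol="output")
df_en = sqlContext.createDataFrame([Row(input=["a", "panda", "eats", "bamboo"])])
print(remover.transform(df_en).head().output)  # expected: ['panda', 'eats', 'bamboo']

# Switch to another language's default list via loadDefaultStopWords.
turkish = StopWordsRemover.loadDefaultStopWords("turkish")
remover.setStopWords(turkish)
df_tr = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
print(remover.transform(df_tr).head().output)  # expected: []
```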