From e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 Mon Sep 17 00:00:00 2001
From: Burak Köse
Date: Fri, 6 May 2016 13:58:12 -0700
Subject: [SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What changes were proposed in this pull request?

This PR continues the work from #11871 with the following changes:
* load English stopwords as default
* convert stopwords to a list in Python
* update some tests and docs

## How was this patch tested?

Unit tests.

Closes #11871

cc: burakkose srowen

Author: Burak Köse
Author: Xiangrui Meng
Author: Burak KOSE

Closes #12843 from mengxr/SPARK-14050.
---
 python/pyspark/ml/tests.py | 7 +++++++
 1 file changed, 7 insertions(+)
 mode change 100644 => 100755 python/pyspark/ml/tests.py

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
old mode 100644
new mode 100755
index 78ec96af8a..ad1631fb5b
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -417,6 +417,13 @@ class FeatureTests(PySparkTestCase):
         self.assertEqual(stopWordRemover.getStopWords(), stopwords)
         transformedDF = stopWordRemover.transform(dataset)
         self.assertEqual(transformedDF.head().output, ["a"])
+        # with language selection
+        stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
+        dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
+        stopWordRemover.setStopWords(stopwords)
+        self.assertEqual(stopWordRemover.getStopWords(), stopwords)
+        transformedDF = stopWordRemover.transform(dataset)
+        self.assertEqual(transformedDF.head().output, [])
 
     def test_count_vectorizer_with_binary(self):
         sqlContext = SQLContext(self.sc)
--
cgit v1.2.3
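
The test above exercises the language-selection path of `StopWordsRemover.loadDefaultStopWords`. Below is a minimal, self-contained PySpark sketch (not part of the patch) showing how the feature described in the PR is meant to be used; the `SparkContext`/`SQLContext` setup and the sample sentences are illustrative assumptions, not taken from the patch.

```python
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import StopWordsRemover

# Illustrative setup; in the test suite these come from the test fixture instead.
sc = SparkContext(appName="StopWordsRemoverSketch")
sqlContext = SQLContext(sc)

# English stop words are loaded by default after this PR.
remover = StopWordsRemover(inputCol="input", outputCol="output")
df_en = sqlContext.createDataFrame([Row(input=["a", "panda", "eats", "bamboo"])])
print(remover.transform(df_en).head().output)  # expected: ['panda', 'eats', 'bamboo']

# Switch to another language's default list via loadDefaultStopWords.
turkish = StopWordsRemover.loadDefaultStopWords("turkish")
remover.setStopWords(turkish)
df_tr = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
print(remover.transform(df_tr).head().output)  # expected: []
```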