diff options
author | Burak Köse <burakks41@gmail.com> | 2016-05-06 13:58:12 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-05-06 13:58:12 -0700 |
commit | e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch) | |
tree | ea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9 /python/pyspark/ml/feature.py | |
parent | 5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff) | |
download | spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2 spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip |
[SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
## What changes were proposed in this pull request?
This PR continues the work from #11871 with the following changes:
* load English stopwords as default
* convert stopwords to list in Python
* update some tests and doc
## How was this patch tested?
Unit tests.
Closes #11871
cc: burakkose srowen
Author: Burak Köse <burakks41@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>
Author: Burak KOSE <burakks41@gmail.com>
Closes #12843 from mengxr/SPARK-14050.
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rwxr-xr-x[-rw-r--r--] | python/pyspark/ml/feature.py | 38 |
1 files changed, 22 insertions, 16 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index f21e3062ef..d2989fa4cd 100644..100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1738,28 +1738,23 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl "comparison over the stop words", typeConverter=TypeConverters.toBoolean) @keyword_only - def __init__(self, inputCol=None, outputCol=None, stopWords=None, - caseSensitive=False): + def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False): """ - __init__(self, inputCol=None, outputCol=None, stopWords=None,\ - caseSensitive=false) + __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false) """ super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords - defaultStopWords = list(stopWordsObj.English()) - self._setDefault(stopWords=defaultStopWords, caseSensitive=False) + self._setDefault(stopWords=StopWordsRemover.loadDefaultStopWords("english"), + caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, stopWords=None, - caseSensitive=False): + def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False): """ - setParams(self, inputCol="input", outputCol="output", stopWords=None,\ - caseSensitive=false) + setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false) Sets params for this StopWordRemover. """ kwargs = self.setParams._input_kwargs @@ -1768,31 +1763,42 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl @since("1.6.0") def setStopWords(self, value): """ - Specify the stopwords to be filtered. + Sets the value of :py:attr:`stopWords`. 
""" return self._set(stopWords=value) @since("1.6.0") def getStopWords(self): """ - Get the stopwords. + Gets the value of :py:attr:`stopWords` or its default value. """ return self.getOrDefault(self.stopWords) @since("1.6.0") def setCaseSensitive(self, value): """ - Set whether to do a case sensitive comparison over the stop words + Sets the value of :py:attr:`caseSensitive`. """ return self._set(caseSensitive=value) @since("1.6.0") def getCaseSensitive(self): """ - Get whether to do a case sensitive comparison over the stop words. + Gets the value of :py:attr:`caseSensitive` or its default value. """ return self.getOrDefault(self.caseSensitive) + @staticmethod + @since("2.0.0") + def loadDefaultStopWords(language): + """ + Loads the default stop words for the given language. + Supported languages: danish, dutch, english, finnish, french, german, hungarian, + italian, norwegian, portuguese, russian, spanish, swedish, turkish + """ + stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover + return list(stopWordsObj.loadDefaultStopWords(language)) + @inherit_doc @ignore_unicode_prefix @@ -1843,7 +1849,7 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java @since("1.3.0") def setParams(self, inputCol=None, outputCol=None): """ - setParams(self, inputCol="input", outputCol="output") + setParams(self, inputCol=None, outputCol=None) Sets params for this Tokenizer. """ kwargs = self.setParams._input_kwargs |