author     Xiangrui Meng <meng@databricks.com>   2015-05-21 17:59:03 -0700
committer  Xiangrui Meng <meng@databricks.com>   2015-05-21 17:59:03 -0700
commit     f5db4b416c922db7a8f1b0c098b4f08647106231 (patch)
tree       97ddc7467b77b169324bba8b0ec93c5d76005ca9 /python/pyspark/ml/feature.py
parent     17791a58159b3e4619d0367f54a4c5332342658b (diff)
[SPARK-7794] [MLLIB] update RegexTokenizer default settings
The previous default is `{gaps: false, pattern: "\\p{L}+|[^\\p{L}\\s]+"}`. The default pattern is hard to understand. This PR changes the default to `{gaps: true, pattern: "\\s+"}`.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6330 from mengxr/SPARK-7794 and squashes the following commits:

5ee7cde [Xiangrui Meng] update RegexTokenizer default settings
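To make the change concrete: with `gaps: true` the pattern describes the separators between tokens, while with `gaps: false` it describes the tokens themselves. The following is a minimal plain-Python sketch of the two modes, not part of the commit; Python's `re` stands in for the Java regex dialect Spark actually uses, and since `re` has no `\p{L}`, an ASCII-only approximation of the old pattern is used for illustration:

    import re

    text = "a b  c"

    # New default (gaps: true, pattern: "\\s+"): the regex matches the
    # gaps between tokens, so the text is split on runs of whitespace.
    print(re.split(r"\s+", text))                 # ['a', 'b', 'c']

    # Old default (gaps: false): the regex matches the tokens themselves.
    # Java's "\\p{L}+|[^\\p{L}\\s]+" reads "a run of letters, or a run of
    # non-letter, non-whitespace characters". [A-Za-z]+|[^A-Za-z\s]+ is an
    # ASCII-only stand-in, since Python's re lacks \p{L}.
    print(re.findall(r"[A-Za-z]+|[^A-Za-z\s]+", "don't, stop!"))
    # ['don', "'", 't', ',', 'stop', '!']

The split mode is what most users expect of a default tokenizer; the match mode keeps punctuation as separate tokens, which is why the old default was hard to explain.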
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rw-r--r--  python/pyspark/ml/feature.py  40
1 file changed, 19 insertions, 21 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 5511dceb70..b0479d9b07 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -446,23 +446,25 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
 @ignore_unicode_prefix
 class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     """
-    A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
-    or using it to split the text (set matching to false). Optional parameters also allow filtering
-    tokens using a minimal length.
+    A regex based tokenizer that extracts tokens either by using the
+    provided regex pattern (in Java dialect) to split the text
+    (default) or repeatedly matching the regex (if gaps is false).
+    Optional parameters also allow filtering tokens using a minimal
+    length.
     It returns an array of strings that can be empty.
 
-    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+    >>> df = sqlContext.createDataFrame([("a b  c",)], ["text"])
     >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> # Change a parameter.
     >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Temporarily modify a parameter.
     >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Must use keyword arguments to specify params.
     >>> reTokenizer.setParams("text")
     Traceback (most recent call last):
@@ -472,31 +474,27 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
 
     # a placeholder to make it appear in the generated doc
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
-    gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens")
-    pattern = Param(Params._dummy(), "pattern", "regex pattern used for tokenizing")
+    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
+    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
 
     @keyword_only
-    def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                 inputCol=None, outputCol=None):
+    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                 inputCol=None, outputCol=None)
+        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         """
         super(RegexTokenizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
         self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
-        self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens")
-        self.pattern = Param(self, "pattern", "regex pattern used for tokenizing")
-        self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
+        self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
+        self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
+        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                  inputCol=None, outputCol=None):
+    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                  inputCol="input", outputCol="output")
+        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         Sets params for this RegexTokenizer.
         """
         kwargs = self.setParams._input_kwargs
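
For a sanity check of the new defaults in PySpark itself, here is a minimal sketch in the style of the doctests above (assuming a live `sqlContext`; the expected `Row` outputs follow from the params rather than from a captured run):

    >>> from pyspark.ml.feature import RegexTokenizer
    >>> df = sqlContext.createDataFrame([("a b  c",)], ["text"])
    >>> # New defaults (gaps=True, pattern="\\s+"): split on whitespace runs.
    >>> RegexTokenizer(inputCol="text", outputCol="words").transform(df).head()
    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
    >>> # The old behavior remains available by setting the params explicitly.
    >>> old = RegexTokenizer(inputCol="text", outputCol="words", gaps=False,
    ...                      pattern="\\p{L}+|[^\\p{L}\\s]+")
    >>> old.transform(sqlContext.createDataFrame([("don't stop",)], ["text"])).head()
    Row(text=u"don't stop", words=[u'don', u"'", u't', u'stop'])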