aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-01-15 15:54:19 -0800
committerJoseph K. Bradley <joseph@databricks.com>2016-01-15 15:54:19 -0800
commit5f843781e3e7581c61b7e235d4041d85e8e48c7e (patch)
tree2c24d74d55531bff30dc2978b7b1f0491ad64773 /python
parent7cd7f2202547224593517b392f56e49e4c94cabc (diff)
downloadspark-5f843781e3e7581c61b7e235d4041d85e8e48c7e.tar.gz
spark-5f843781e3e7581c61b7e235d4041d85e8e48c7e.tar.bz2
spark-5f843781e3e7581c61b7e235d4041d85e8e48c7e.zip
[SPARK-11925][ML][PYSPARK] Add PySpark missing methods for ml.feature during Spark 1.6 QA
Add PySpark missing methods and params for ml.feature: * ```RegexTokenizer``` should support setting ```toLowercase```. * ```MinMaxScalerModel``` should support output ```originalMin``` and ```originalMax```. * ```PCAModel``` should support output ```pc```. Author: Yanbo Liang <ybliang8@gmail.com> Closes #9908 from yanboliang/spark-11925.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/ml/feature.py72
1 file changed, 62 insertions, 10 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b02d41b52a..141ec3492a 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -606,6 +606,10 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
>>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
>>> model = mmScaler.fit(df)
+ >>> model.originalMin
+ DenseVector([0.0])
+ >>> model.originalMax
+ DenseVector([2.0])
>>> model.transform(df).show()
+-----+------+
| a|scaled|
@@ -688,6 +692,22 @@ class MinMaxScalerModel(JavaModel):
.. versionadded:: 1.6.0
"""
+ @property
+ @since("2.0.0")
+ def originalMin(self):
+ """
+ Min value for each original column during fitting.
+ """
+ return self._call_java("originalMin")
+
+ @property
+ @since("2.0.0")
+ def originalMax(self):
+ """
+ Max value for each original column during fitting.
+ """
+ return self._call_java("originalMax")
+
@inherit_doc
@ignore_unicode_prefix
@@ -984,18 +1004,18 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
length.
It returns an array of strings that can be empty.
- >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+ >>> df = sqlContext.createDataFrame([("A B c",)], ["text"])
>>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
>>> reTokenizer.transform(df).head()
- Row(text=u'a b c', words=[u'a', u'b', u'c'])
+ Row(text=u'A B c', words=[u'a', u'b', u'c'])
>>> # Change a parameter.
>>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
- Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+ Row(text=u'A B c', tokens=[u'a', u'b', u'c'])
>>> # Temporarily modify a parameter.
>>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
- Row(text=u'a b c', words=[u'a', u'b', u'c'])
+ Row(text=u'A B c', words=[u'a', u'b', u'c'])
>>> reTokenizer.transform(df).head()
- Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+ Row(text=u'A B c', tokens=[u'a', u'b', u'c'])
>>> # Must use keyword arguments to specify params.
>>> reTokenizer.setParams("text")
Traceback (most recent call last):
@@ -1009,26 +1029,34 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
+ toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " +
+ "lowercase before tokenizing")
@keyword_only
- def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
+ def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None,
+ outputCol=None, toLowercase=True):
"""
- __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
+ __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \
+ outputCol=None, toLowercase=True)
"""
super(RegexTokenizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
- self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
+ self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " +
+ "lowercase before tokenizing")
+ self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("1.4.0")
- def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
+ def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None,
+ outputCol=None, toLowercase=True):
"""
- setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
+ setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \
+ outputCol=None, toLowercase=True)
Sets params for this RegexTokenizer.
"""
kwargs = self.setParams._input_kwargs
@@ -1079,6 +1107,21 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
return self.getOrDefault(self.pattern)
+ @since("2.0.0")
+ def setToLowercase(self, value):
+ """
+ Sets the value of :py:attr:`toLowercase`.
+ """
+ self._paramMap[self.toLowercase] = value
+ return self
+
+ @since("2.0.0")
+ def getToLowercase(self):
+ """
+ Gets the value of toLowercase or its default value.
+ """
+ return self.getOrDefault(self.toLowercase)
+
@inherit_doc
class SQLTransformer(JavaTransformer):
@@ -2000,6 +2043,15 @@ class PCAModel(JavaModel):
.. versionadded:: 1.5.0
"""
+ @property
+ @since("2.0.0")
+ def pc(self):
+ """
+ Returns a principal components Matrix.
+ Each column is one principal component.
+ """
+ return self._call_java("pc")
+
@inherit_doc
class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):