From 5f843781e3e7581c61b7e235d4041d85e8e48c7e Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 15 Jan 2016 15:54:19 -0800
Subject: [SPARK-11925][ML][PYSPARK] Add PySpark missing methods for ml.feature
 during Spark 1.6 QA

Add PySpark missing methods and params for ml.feature:
* ```RegexTokenizer``` should support setting ```toLowercase```.
* ```MinMaxScalerModel``` should support output ```originalMin``` and ```originalMax```.
* ```PCAModel``` should support output ```pc```.

Author: Yanbo Liang

Closes #9908 from yanboliang/spark-11925.
---
 python/pyspark/ml/feature.py | 72 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 62 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b02d41b52a..141ec3492a 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -606,6 +606,10 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
     >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
     >>> model = mmScaler.fit(df)
+    >>> model.originalMin
+    DenseVector([0.0])
+    >>> model.originalMax
+    DenseVector([2.0])
     >>> model.transform(df).show()
     +-----+------+
     |    a|scaled|
@@ -688,6 +692,22 @@ class MinMaxScalerModel(JavaModel):
     .. versionadded:: 1.6.0
     """
 
+    @property
+    @since("2.0.0")
+    def originalMin(self):
+        """
+        Min value for each original column during fitting.
+        """
+        return self._call_java("originalMin")
+
+    @property
+    @since("2.0.0")
+    def originalMax(self):
+        """
+        Max value for each original column during fitting.
+        """
+        return self._call_java("originalMax")
+
 
 @inherit_doc
 @ignore_unicode_prefix
@@ -984,18 +1004,18 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     length.
     It returns an array of strings that can be empty.
 
-    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+    >>> df = sqlContext.createDataFrame([("A B c",)], ["text"])
     >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'A B c', words=[u'a', u'b', u'c'])
     >>> # Change a parameter.
     >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'A B c', tokens=[u'a', u'b', u'c'])
    >>> # Temporarily modify a parameter.
     >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'A B c', words=[u'a', u'b', u'c'])
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'A B c', tokens=[u'a', u'b', u'c'])
     >>> # Must use keyword arguments to specify params.
     >>> reTokenizer.setParams("text")
     Traceback (most recent call last):
@@ -1009,26 +1029,34 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
     gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
     pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
+    toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " +
+                        "lowercase before tokenizing")
 
     @keyword_only
-    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
+    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None,
+                 outputCol=None, toLowercase=True):
         """
-        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
+        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \
+                 outputCol=None, toLowercase=True)
         """
         super(RegexTokenizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
         self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
         self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
         self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
-        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
+        self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " +
+                                 "lowercase before tokenizing")
+        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.4.0")
-    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
+    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None,
+                  outputCol=None, toLowercase=True):
         """
-        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
+        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \
+                  outputCol=None, toLowercase=True)
         Sets params for this RegexTokenizer.
         """
         kwargs = self.setParams._input_kwargs
@@ -1079,6 +1107,21 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
         """
         return self.getOrDefault(self.pattern)
 
+    @since("2.0.0")
+    def setToLowercase(self, value):
+        """
+        Sets the value of :py:attr:`toLowercase`.
+        """
+        self._paramMap[self.toLowercase] = value
+        return self
+
+    @since("2.0.0")
+    def getToLowercase(self):
+        """
+        Gets the value of toLowercase or its default value.
+        """
+        return self.getOrDefault(self.toLowercase)
+
 
 @inherit_doc
 class SQLTransformer(JavaTransformer):
@@ -2000,6 +2043,15 @@ class PCAModel(JavaModel):
     .. versionadded:: 1.5.0
     """
 
+    @property
+    @since("2.0.0")
+    def pc(self):
+        """
+        Returns a principal components Matrix.
+        Each column is one principal component.
+        """
+        return self._call_java("pc")
+
 
 @inherit_doc
 class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
--
cgit v1.2.3
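A quick sanity-check sketch (not part of the patch) of the API surface this change adds; it assumes a PySpark 1.6-era shell where `sc` and `sqlContext` already exist, and the sample data is purely illustrative:

    # Sketch only, not part of the patch. Assumes `sqlContext` is available
    # (e.g. the PySpark shell) and uses made-up sample data.
    from pyspark.ml.feature import MinMaxScaler, PCA, RegexTokenizer
    from pyspark.mllib.linalg import Vectors

    # RegexTokenizer: the new toLowercase param (default True); set it to
    # False to keep the original character case of the tokens.
    tok = RegexTokenizer(inputCol="text", outputCol="words", toLowercase=False)
    print(tok.transform(sqlContext.createDataFrame([("A B c",)], ["text"])).head())

    # MinMaxScalerModel: originalMin/originalMax expose the per-column
    # extrema observed during fitting.
    df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
    mm = MinMaxScaler(inputCol="a", outputCol="scaled").fit(df)
    print(mm.originalMin, mm.originalMax)

    # PCAModel: pc returns the principal components matrix, one component per column.
    pca_df = sqlContext.createDataFrame([(Vectors.dense([1.0, 2.0]),),
                                         (Vectors.dense([3.0, 1.0]),),
                                         (Vectors.dense([5.0, 7.0]),)], ["features"])
    print(PCA(k=1, inputCol="features", outputCol="pca_features").fit(pca_df).pc)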