From eb917291ca1a2d68ca0639cb4b1464a546603eba Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 26 Jan 2016 15:53:48 -0800
Subject: [SPARK-10509][PYSPARK] Reduce excessive param boilerplate code

The current Python ML params require cut-and-pasting each param's setup and description between the class body and the ```__init__``` method. Remove this potential source of errors and simplify the use of custom params by adding a ```_copy_new_parent``` method to ```Param```, so the declarations no longer have to be cut and pasted (at different indentation levels, no less).

Author: Holden Karau

Closes #10216 from holdenk/SPARK-10509-excessive-param-boiler-plate-code.
---
 python/pyspark/ml/feature.py | 98 +-------------------------------------------
 1 file changed, 1 insertion(+), 97 deletions(-)

(limited to 'python/pyspark/ml/feature.py')

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 32f324685a..22081233b0 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -57,7 +57,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc threshold = Param(Params._dummy(), "threshold", "threshold in binary classification prediction, in range [0, 1]") @@ -68,8 +67,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(Binarizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid) - self.threshold = Param(self, "threshold", - "threshold in binary classification prediction, in range [0, 1]") self._setDefault(threshold=0.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -125,7 +122,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.3.0 """ - # a placeholder to make it appear in the generated doc splits = \ Param(Params._dummy(), "splits", "Split points for mapping continuous features into buckets. With n+1 splits, " + @@ -142,19 +138,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(Bucketizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) - #: param for Splitting points for mapping continuous features into buckets. With n+1 splits, - # there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) - # except the last bucket, which also includes y. The splits should be strictly increasing. - # Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, - # values outside the splits specified will be treated as errors. - self.splits = \ - Param(self, "splits", - "Split points for mapping continuous features into buckets. With n+1 splits, " + - "there are n buckets. A bucket defined by splits x,y holds values in the " + - "range [x,y) except the last bucket, which also includes y. The splits " + - "should be strictly increasing. Values at -inf, inf must be explicitly " + - "provided to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -210,7 +193,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc minTF = Param( Params._dummy(), "minTF", "Filter to ignore rare words in" + " a document.
For each document, terms with frequency/count less than the given" + @@ -235,22 +217,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): super(CountVectorizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", self.uid) - self.minTF = Param( - self, "minTF", "Filter to ignore rare words in" + - " a document. For each document, terms with frequency/count less than the given" + - " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + - " times the term must appear in the document); if this is a double in [0,1), then " + - "this specifies a fraction (out of the document's token count). Note that the " + - "parameter is only used in transform of CountVectorizerModel and does not affect" + - "fitting. Default 1.0") - self.minDF = Param( - self, "minDF", "Specifies the minimum number of" + - " different documents a term must appear in to be included in the vocabulary." + - " If this is an integer >= 1, this specifies the number of documents the term must" + - " appear in; if this is a double in [0,1), then this specifies the fraction of " + - "documents. Default 1.0") - self.vocabSize = Param( - self, "vocabSize", "max size of the vocabulary. Default 1 << 18.") self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -359,7 +325,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + "default False.") @@ -370,8 +335,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): """ super(DCT, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid) - self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " + - "default False.") self._setDefault(inverse=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -423,7 +386,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " + "it must be MLlib Vector type.") @@ -435,8 +397,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): super(ElementwiseProduct, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct", self.uid) - self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " + - "it must be MLlib Vector type.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -531,7 +491,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc minDocFreq = Param(Params._dummy(), "minDocFreq", "minimum of documents in which a term should appear for filtering") @@ -542,8 +501,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): """ super(IDF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self.minDocFreq = Param(self, "minDocFreq", - "minimum of documents in which a term should appear for filtering") self._setDefault(minDocFreq=0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -623,7 +580,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): .. 
versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc min = Param(Params._dummy(), "min", "Lower bound of the output feature range") max = Param(Params._dummy(), "max", "Upper bound of the output feature range") @@ -634,8 +590,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): """ super(MinMaxScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self.min = Param(self, "min", "Lower bound of the output feature range") - self.max = Param(self, "max", "Upper bound of the output feature range") self._setDefault(min=0.0, max=1.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -745,7 +699,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)") @keyword_only @@ -755,7 +708,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): """ super(NGram, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid) - self.n = Param(self, "n", "number of elements per n-gram (>=1)") self._setDefault(n=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -808,7 +760,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc p = Param(Params._dummy(), "p", "the p norm value.") @keyword_only @@ -818,7 +769,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(Normalizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid) - self.p = Param(self, "p", "the p norm value.") self._setDefault(p=2.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -887,7 +837,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category") @keyword_only @@ -897,7 +846,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): """ super(OneHotEncoder, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid) - self.dropLast = Param(self, "dropLast", "whether to drop the last category") self._setDefault(dropLast=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -950,7 +898,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)") @keyword_only @@ -961,7 +908,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): super(PolynomialExpansion, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.PolynomialExpansion", self.uid) - self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)") self._setDefault(degree=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1107,7 +1053,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)") gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing") @@ -1123,11 +1068,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(RegexTokenizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid) - self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)") - self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens") - self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing") - self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " + - "lowercase before tokenizing") self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1223,7 +1163,6 @@ class SQLTransformer(JavaTransformer): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc statement = Param(Params._dummy(), "statement", "SQL statement") @keyword_only @@ -1233,7 +1172,6 @@ class SQLTransformer(JavaTransformer): """ super(SQLTransformer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid) - self.statement = Param(self, "statement", "SQL statement") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1285,7 +1223,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc withMean = Param(Params._dummy(), "withMean", "Center data with mean") withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation") @@ -1296,8 +1233,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): """ super(StandardScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self.withMean = Param(self, "withMean", "Center data with mean") - self.withStd = Param(self, "withStd", "Scale to unit standard deviation") self._setDefault(withMean=False, withStd=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1453,7 +1388,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make the labels show up in generated doc labels = Param(Params._dummy(), "labels", "Optional array of labels specifying index-string mapping." + " If not provided or if empty, then metadata from inputCol is used instead.") @@ -1466,9 +1400,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) - self.labels = Param(self, "labels", - "Optional array of labels specifying index-string mapping. If not" + - " provided or if empty, then metadata from inputCol is used instead.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1507,7 +1438,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.6.0 """ - # a placeholder to make the stopwords show up in generated doc + stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + "comparison over the stop words") @@ -1522,9 +1453,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self.stopWords = Param(self, "stopWords", "The words to be filtered out") - self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + - "sensitive comparison over the stop words") stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords defaultStopWords = stopWordsObj.English() self._setDefault(stopWords=defaultStopWords) @@ -1727,7 +1655,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc maxCategories = Param(Params._dummy(), "maxCategories", "Threshold for the number of values a categorical feature can take " + "(>= 2). If a feature is found to have > maxCategories values, then " + @@ -1740,10 +1667,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): """ super(VectorIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self.maxCategories = Param(self, "maxCategories", - "Threshold for the number of values a categorical feature " + - "can take (>= 2). If a feature is found to have " + - "> maxCategories values, then it is declared continuous.") self._setDefault(maxCategories=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1832,7 +1755,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc indices = Param(Params._dummy(), "indices", "An array of indices to select features from " + "a vector column. There can be no overlap with names.") names = Param(Params._dummy(), "names", "An array of feature names to select features from " + @@ -1847,12 +1769,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): """ super(VectorSlicer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) - self.indices = Param(self, "indices", "An array of indices to select features from " + - "a vector column. There can be no overlap with names.") - self.names = Param(self, "names", "An array of feature names to select features from " + - "a vector column. These names must be specified by ML " + - "org.apache.spark.ml.attribute.Attribute. There can be no overlap " + - "with indices.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1932,7 +1848,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has .. 
versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc vectorSize = Param(Params._dummy(), "vectorSize", "the dimension of codes after transforming from words") numPartitions = Param(Params._dummy(), "numPartitions", @@ -1950,13 +1865,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self.vectorSize = Param(self, "vectorSize", - "the dimension of codes after transforming from words") - self.numPartitions = Param(self, "numPartitions", - "number of partitions for sentences of words") - self.minCount = Param(self, "minCount", - "the minimum number of times a token must appear to be included " + - "in the word2vec model's vocabulary") self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None) kwargs = self.__init__._input_kwargs @@ -2075,7 +1983,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc k = Param(Params._dummy(), "k", "the number of principal components") @keyword_only @@ -2085,7 +1992,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): """ super(PCA, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid) - self.k = Param(self, "k", "the number of principal components") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -2185,7 +2091,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc formula = Param(Params._dummy(), "formula", "R model formula") @keyword_only @@ -2195,7 +2100,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self.formula = Param(self, "formula", "R model formula") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) -- cgit v1.2.3
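For context, below is a minimal, self-contained sketch of the pattern this patch enables: each param is declared once at class level with a dummy parent, and the base class re-binds it to every instance via ```_copy_new_parent```. The classes here (```Param```, ```Params```, and the toy ```Binarizer```) are simplified stand-ins written for illustration only, not the actual pyspark.ml implementation (the real one lives in ```pyspark/ml/param``` and, for example, tracks parents by uid); only the overall idea is taken from the commit.

```python
import copy


class Param(object):
    """Simplified stand-in for pyspark.ml.param.Param (illustration only)."""

    def __init__(self, parent, name, doc):
        self.parent = parent
        self.name = name
        self.doc = doc

    def _copy_new_parent(self, parent):
        """Copy this class-level (dummy-parented) param and bind it to `parent`."""
        if self.parent is Params._DUMMY:
            param = copy.copy(self)
            param.parent = parent
            return param
        raise ValueError("Cannot copy from non-dummy parent %s." % self.parent)


class Params(object):
    """Simplified base class that re-binds class-level params per instance."""

    _DUMMY = object()  # shared placeholder parent for class-level declarations

    @classmethod
    def _dummy(cls):
        return cls._DUMMY

    def __init__(self):
        # Copy every class-level Param onto this instance, so subclasses no
        # longer need a duplicate Param(...) assignment inside __init__.
        for name in dir(type(self)):
            attr = getattr(type(self), name, None)
            if isinstance(attr, Param):
                setattr(self, name, attr._copy_new_parent(self))


class Binarizer(Params):
    """The param is declared exactly once, at class level."""

    threshold = Param(Params._dummy(), "threshold",
                      "threshold in binary classification prediction, in range [0, 1]")
    # Before this patch, the same description had to be repeated inside
    # __init__ as: self.threshold = Param(self, "threshold", "threshold in ...")


if __name__ == "__main__":
    b = Binarizer()
    print(b.threshold.parent is b)                      # True: instance-bound copy
    print(b.threshold.doc == Binarizer.threshold.doc)   # True: doc declared once
```

Note that the ```_copy_new_parent``` helper itself is added under ```pyspark/ml/param```, which is not part of this diff; the hunks above (limited to ```feature.py```) only strip the now-redundant per-```__init__``` param assignments.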