From eb917291ca1a2d68ca0639cb4b1464a546603eba Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 26 Jan 2016 15:53:48 -0800
Subject: [SPARK-10509][PYSPARK] Reduce excessive param boilerplate code

The current Python ML params require cut-and-pasting each param's setup and description between the class body and the ```__init__``` method. Remove this potential source of errors and simplify the use of custom params by adding a ```_copy_new_parent``` method to ```Param```, so the declarations no longer have to be cut and pasted (at different indentation levels, no less).

Author: Holden Karau

Closes #10216 from holdenk/SPARK-10509-excessive-param-boiler-plate-code.
---
 python/pyspark/ml/feature.py | 98 +-------------------------------------------
 1 file changed, 1 insertion(+), 97 deletions(-)

(limited to 'python/pyspark/ml/feature.py')

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 32f324685a..22081233b0 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -57,7 +57,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc threshold = Param(Params._dummy(), "threshold", "threshold in binary classification prediction, in range [0, 1]") @@ -68,8 +67,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(Binarizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid) - self.threshold = Param(self, "threshold", - "threshold in binary classification prediction, in range [0, 1]") self._setDefault(threshold=0.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -125,7 +122,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.3.0 """ - # a placeholder to make it appear in the generated doc splits = \ Param(Params._dummy(), "splits", "Split points for mapping continuous features into buckets. With n+1 splits, " + @@ -142,19 +138,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(Bucketizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) - #: param for Splitting points for mapping continuous features into buckets. With n+1 splits, - # there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) - # except the last bucket, which also includes y. The splits should be strictly increasing. - # Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, - # values outside the splits specified will be treated as errors. - self.splits = \ - Param(self, "splits", - "Split points for mapping continuous features into buckets. With n+1 splits, " + - "there are n buckets. A bucket defined by splits x,y holds values in the " + - "range [x,y) except the last bucket, which also includes y. The splits " + - "should be strictly increasing. Values at -inf, inf must be explicitly " + - "provided to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -210,7 +193,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc minTF = Param( Params._dummy(), "minTF", "Filter to ignore rare words in" + " a document.
For each document, terms with frequency/count less than the given" + @@ -235,22 +217,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): super(CountVectorizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", self.uid) - self.minTF = Param( - self, "minTF", "Filter to ignore rare words in" + - " a document. For each document, terms with frequency/count less than the given" + - " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + - " times the term must appear in the document); if this is a double in [0,1), then " + - "this specifies a fraction (out of the document's token count). Note that the " + - "parameter is only used in transform of CountVectorizerModel and does not affect" + - "fitting. Default 1.0") - self.minDF = Param( - self, "minDF", "Specifies the minimum number of" + - " different documents a term must appear in to be included in the vocabulary." + - " If this is an integer >= 1, this specifies the number of documents the term must" + - " appear in; if this is a double in [0,1), then this specifies the fraction of " + - "documents. Default 1.0") - self.vocabSize = Param( - self, "vocabSize", "max size of the vocabulary. Default 1 << 18.") self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -359,7 +325,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + "default False.") @@ -370,8 +335,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): """ super(DCT, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid) - self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " + - "default False.") self._setDefault(inverse=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -423,7 +386,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " + "it must be MLlib Vector type.") @@ -435,8 +397,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): super(ElementwiseProduct, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct", self.uid) - self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " + - "it must be MLlib Vector type.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -531,7 +491,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc minDocFreq = Param(Params._dummy(), "minDocFreq", "minimum of documents in which a term should appear for filtering") @@ -542,8 +501,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): """ super(IDF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self.minDocFreq = Param(self, "minDocFreq", - "minimum of documents in which a term should appear for filtering") self._setDefault(minDocFreq=0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -623,7 +580,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): .. 
versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc min = Param(Params._dummy(), "min", "Lower bound of the output feature range") max = Param(Params._dummy(), "max", "Upper bound of the output feature range") @@ -634,8 +590,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): """ super(MinMaxScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self.min = Param(self, "min", "Lower bound of the output feature range") - self.max = Param(self, "max", "Upper bound of the output feature range") self._setDefault(min=0.0, max=1.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -745,7 +699,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)") @keyword_only @@ -755,7 +708,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): """ super(NGram, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid) - self.n = Param(self, "n", "number of elements per n-gram (>=1)") self._setDefault(n=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -808,7 +760,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc p = Param(Params._dummy(), "p", "the p norm value.") @keyword_only @@ -818,7 +769,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(Normalizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid) - self.p = Param(self, "p", "the p norm value.") self._setDefault(p=2.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -887,7 +837,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category") @keyword_only @@ -897,7 +846,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): """ super(OneHotEncoder, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid) - self.dropLast = Param(self, "dropLast", "whether to drop the last category") self._setDefault(dropLast=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -950,7 +898,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)") @keyword_only @@ -961,7 +908,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): super(PolynomialExpansion, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.PolynomialExpansion", self.uid) - self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)") self._setDefault(degree=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1107,7 +1053,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)") gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing") @@ -1123,11 +1068,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): """ super(RegexTokenizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid) - self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)") - self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens") - self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing") - self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " + - "lowercase before tokenizing") self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1223,7 +1163,6 @@ class SQLTransformer(JavaTransformer): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc statement = Param(Params._dummy(), "statement", "SQL statement") @keyword_only @@ -1233,7 +1172,6 @@ class SQLTransformer(JavaTransformer): """ super(SQLTransformer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid) - self.statement = Param(self, "statement", "SQL statement") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1285,7 +1223,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc withMean = Param(Params._dummy(), "withMean", "Center data with mean") withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation") @@ -1296,8 +1233,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): """ super(StandardScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self.withMean = Param(self, "withMean", "Center data with mean") - self.withStd = Param(self, "withStd", "Scale to unit standard deviation") self._setDefault(withMean=False, withStd=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1453,7 +1388,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make the labels show up in generated doc labels = Param(Params._dummy(), "labels", "Optional array of labels specifying index-string mapping." + " If not provided or if empty, then metadata from inputCol is used instead.") @@ -1466,9 +1400,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) - self.labels = Param(self, "labels", - "Optional array of labels specifying index-string mapping. If not" + - " provided or if empty, then metadata from inputCol is used instead.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1507,7 +1438,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.6.0 """ - # a placeholder to make the stopwords show up in generated doc + stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + "comparison over the stop words") @@ -1522,9 +1453,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self.stopWords = Param(self, "stopWords", "The words to be filtered out") - self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + - "sensitive comparison over the stop words") stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords defaultStopWords = stopWordsObj.English() self._setDefault(stopWords=defaultStopWords) @@ -1727,7 +1655,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc maxCategories = Param(Params._dummy(), "maxCategories", "Threshold for the number of values a categorical feature can take " + "(>= 2). If a feature is found to have > maxCategories values, then " + @@ -1740,10 +1667,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): """ super(VectorIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self.maxCategories = Param(self, "maxCategories", - "Threshold for the number of values a categorical feature " + - "can take (>= 2). If a feature is found to have " + - "> maxCategories values, then it is declared continuous.") self._setDefault(maxCategories=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1832,7 +1755,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc indices = Param(Params._dummy(), "indices", "An array of indices to select features from " + "a vector column. There can be no overlap with names.") names = Param(Params._dummy(), "names", "An array of feature names to select features from " + @@ -1847,12 +1769,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): """ super(VectorSlicer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) - self.indices = Param(self, "indices", "An array of indices to select features from " + - "a vector column. There can be no overlap with names.") - self.names = Param(self, "names", "An array of feature names to select features from " + - "a vector column. These names must be specified by ML " + - "org.apache.spark.ml.attribute.Attribute. There can be no overlap " + - "with indices.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1932,7 +1848,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has .. 
versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc vectorSize = Param(Params._dummy(), "vectorSize", "the dimension of codes after transforming from words") numPartitions = Param(Params._dummy(), "numPartitions", @@ -1950,13 +1865,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self.vectorSize = Param(self, "vectorSize", - "the dimension of codes after transforming from words") - self.numPartitions = Param(self, "numPartitions", - "number of partitions for sentences of words") - self.minCount = Param(self, "minCount", - "the minimum number of times a token must appear to be included " + - "in the word2vec model's vocabulary") self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None) kwargs = self.__init__._input_kwargs @@ -2075,7 +1983,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc k = Param(Params._dummy(), "k", "the number of principal components") @keyword_only @@ -2085,7 +1992,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): """ super(PCA, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid) - self.k = Param(self, "k", "the number of principal components") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -2185,7 +2091,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc formula = Param(Params._dummy(), "formula", "R model formula") @keyword_only @@ -2195,7 +2100,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self.formula = Param(self, "formula", "R model formula") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) -- cgit v1.2.3
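For context, below is a minimal, self-contained sketch of the pattern this patch enables: each param is declared once at class level with a dummy parent, and the base class re-binds it to every instance via ```_copy_new_parent```. The classes here (```Param```, ```Params```, and the toy ```Binarizer```) are simplified stand-ins written for illustration only, not the actual pyspark.ml implementation (the real one lives in ```pyspark/ml/param``` and, for example, tracks parents by uid); only the overall idea is taken from the commit.

```python
import copy


class Param(object):
    """Simplified stand-in for pyspark.ml.param.Param (illustration only)."""

    def __init__(self, parent, name, doc):
        self.parent = parent
        self.name = name
        self.doc = doc

    def _copy_new_parent(self, parent):
        """Copy this class-level (dummy-parented) param and bind it to `parent`."""
        if self.parent is Params._DUMMY:
            param = copy.copy(self)
            param.parent = parent
            return param
        raise ValueError("Cannot copy from non-dummy parent %s." % self.parent)


class Params(object):
    """Simplified base class that re-binds class-level params per instance."""

    _DUMMY = object()  # shared placeholder parent for class-level declarations

    @classmethod
    def _dummy(cls):
        return cls._DUMMY

    def __init__(self):
        # Copy every class-level Param onto this instance, so subclasses no
        # longer need a duplicate Param(...) assignment inside __init__.
        for name in dir(type(self)):
            attr = getattr(type(self), name, None)
            if isinstance(attr, Param):
                setattr(self, name, attr._copy_new_parent(self))


class Binarizer(Params):
    """The param is declared exactly once, at class level."""

    threshold = Param(Params._dummy(), "threshold",
                      "threshold in binary classification prediction, in range [0, 1]")
    # Before this patch, the same description had to be repeated inside
    # __init__ as: self.threshold = Param(self, "threshold", "threshold in ...")


if __name__ == "__main__":
    b = Binarizer()
    print(b.threshold.parent is b)                      # True: instance-bound copy
    print(b.threshold.doc == Binarizer.threshold.doc)   # True: doc declared once
```

Note that the ```_copy_new_parent``` helper itself is added under ```pyspark/ml/param```, which is not part of this diff; the hunks above (limited to ```feature.py```) only strip the now-redundant per-```__init__``` param assignments.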