path: root/python/pyspark/ml/feature.py
author     Holden Karau <holden@us.ibm.com>           2016-01-26 15:53:48 -0800
committer  Joseph K. Bradley <joseph@databricks.com>  2016-01-26 15:53:48 -0800
commit     eb917291ca1a2d68ca0639cb4b1464a546603eba (patch)
tree       380dcaa33273baa68beaf089387bd498d5ee88e8 /python/pyspark/ml/feature.py
parent     19fdb21afbf0eae4483cf6d4ef32daffd1994b89 (diff)
[SPARK-10509][PYSPARK] Reduce excessive param boiler plate code
The current Python ML params require cut-and-pasting each param's setup and description between the class body and the `__init__` method. Remove this source of errors, and simplify the use of custom params, by adding a `_copy_new_parent` method to Param so the duplicated definitions (often re-pasted at different indentation levels) are no longer needed.

Author: Holden Karau <holden@us.ibm.com>

Closes #10216 from holdenk/SPARK-10509-excessive-param-boiler-plate-code.
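The mechanism behind the removals in the diff below is roughly the following (a minimal, self-contained sketch, not the actual pyspark.ml implementation: the simplified `Param`, `Params`, and `Binarizer` classes here are illustrative stand-ins): each param is declared exactly once as a class attribute with a dummy parent, and at construction time it is copied onto the instance, with the instance as its new parent, so `__init__` no longer has to repeat the definition.

```python
# Minimal sketch of the _copy_new_parent idea (illustrative stand-ins only;
# the real pyspark.ml.param classes carry more machinery).

class Param(object):
    """Simplified stand-in for pyspark.ml.param.Param."""

    def __init__(self, parent, name, doc):
        self.parent = parent
        self.name = name
        self.doc = doc

    def _copy_new_parent(self, parent):
        # Return a copy of this (class-level, dummy-parented) param that is
        # owned by a concrete Params instance instead.
        return Param(parent, self.name, self.doc)


class Params(object):
    """Simplified stand-in for pyspark.ml.param.Params."""

    def __init__(self):
        # Copy every Param declared on the class onto this instance, bound to
        # this instance, so subclasses no longer redefine them in __init__.
        for name in dir(type(self)):
            attr = getattr(type(self), name, None)
            if isinstance(attr, Param):
                setattr(self, name, attr._copy_new_parent(self))


class Binarizer(Params):
    # Declared exactly once; no duplicated Param(...) call inside __init__.
    threshold = Param(None, "threshold",
                      "threshold in binary classification prediction, in range [0, 1]")


b = Binarizer()
print(b.threshold.doc)  # same doc string, now parented to the instance
```

This is why every hunk below deletes only the instance-level `self.<param> = Param(self, ...)` assignments while keeping the class-level declarations and the `_setDefault(...)` calls intact.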
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rw-r--r--  python/pyspark/ml/feature.py | 98
1 file changed, 1 insertion(+), 97 deletions(-)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 32f324685a..22081233b0 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -57,7 +57,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
threshold = Param(Params._dummy(), "threshold",
"threshold in binary classification prediction, in range [0, 1]")
@@ -68,8 +67,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(Binarizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid)
- self.threshold = Param(self, "threshold",
- "threshold in binary classification prediction, in range [0, 1]")
self._setDefault(threshold=0.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -125,7 +122,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.3.0
"""
- # a placeholder to make it appear in the generated doc
splits = \
Param(Params._dummy(), "splits",
"Split points for mapping continuous features into buckets. With n+1 splits, " +
@@ -142,19 +138,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(Bucketizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
- #: param for Splitting points for mapping continuous features into buckets. With n+1 splits,
- # there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
- # except the last bucket, which also includes y. The splits should be strictly increasing.
- # Values at -inf, inf must be explicitly provided to cover all Double values; otherwise,
- # values outside the splits specified will be treated as errors.
- self.splits = \
- Param(self, "splits",
- "Split points for mapping continuous features into buckets. With n+1 splits, " +
- "there are n buckets. A bucket defined by splits x,y holds values in the " +
- "range [x,y) except the last bucket, which also includes y. The splits " +
- "should be strictly increasing. Values at -inf, inf must be explicitly " +
- "provided to cover all Double values; otherwise, values outside the splits " +
- "specified will be treated as errors.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -210,7 +193,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
minTF = Param(
Params._dummy(), "minTF", "Filter to ignore rare words in" +
" a document. For each document, terms with frequency/count less than the given" +
@@ -235,22 +217,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
super(CountVectorizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
self.uid)
- self.minTF = Param(
- self, "minTF", "Filter to ignore rare words in" +
- " a document. For each document, terms with frequency/count less than the given" +
- " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" +
- " times the term must appear in the document); if this is a double in [0,1), then " +
- "this specifies a fraction (out of the document's token count). Note that the " +
- "parameter is only used in transform of CountVectorizerModel and does not affect" +
- "fitting. Default 1.0")
- self.minDF = Param(
- self, "minDF", "Specifies the minimum number of" +
- " different documents a term must appear in to be included in the vocabulary." +
- " If this is an integer >= 1, this specifies the number of documents the term must" +
- " appear in; if this is a double in [0,1), then this specifies the fraction of " +
- "documents. Default 1.0")
- self.vocabSize = Param(
- self, "vocabSize", "max size of the vocabulary. Default 1 << 18.")
self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -359,7 +325,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
"default False.")
@@ -370,8 +335,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(DCT, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
- self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " +
- "default False.")
self._setDefault(inverse=False)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -423,7 +386,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " +
"it must be MLlib Vector type.")
@@ -435,8 +397,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
super(ElementwiseProduct, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct",
self.uid)
- self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " +
- "it must be MLlib Vector type.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -531,7 +491,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
minDocFreq = Param(Params._dummy(), "minDocFreq",
"minimum of documents in which a term should appear for filtering")
@@ -542,8 +501,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(IDF, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
- self.minDocFreq = Param(self, "minDocFreq",
- "minimum of documents in which a term should appear for filtering")
self._setDefault(minDocFreq=0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -623,7 +580,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
@@ -634,8 +590,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(MinMaxScaler, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
- self.min = Param(self, "min", "Lower bound of the output feature range")
- self.max = Param(self, "max", "Upper bound of the output feature range")
self._setDefault(min=0.0, max=1.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -745,7 +699,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")
@keyword_only
@@ -755,7 +708,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(NGram, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid)
- self.n = Param(self, "n", "number of elements per n-gram (>=1)")
self._setDefault(n=2)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -808,7 +760,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
p = Param(Params._dummy(), "p", "the p norm value.")
@keyword_only
@@ -818,7 +769,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(Normalizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid)
- self.p = Param(self, "p", "the p norm value.")
self._setDefault(p=2.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -887,7 +837,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category")
@keyword_only
@@ -897,7 +846,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(OneHotEncoder, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
- self.dropLast = Param(self, "dropLast", "whether to drop the last category")
self._setDefault(dropLast=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -950,7 +898,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")
@keyword_only
@@ -961,7 +908,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
super(PolynomialExpansion, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.feature.PolynomialExpansion", self.uid)
- self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)")
self._setDefault(degree=2)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1107,7 +1053,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
@@ -1123,11 +1068,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(RegexTokenizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
- self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
- self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
- self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
- self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " +
- "lowercase before tokenizing")
self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1223,7 +1163,6 @@ class SQLTransformer(JavaTransformer):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
statement = Param(Params._dummy(), "statement", "SQL statement")
@keyword_only
@@ -1233,7 +1172,6 @@ class SQLTransformer(JavaTransformer):
"""
super(SQLTransformer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
- self.statement = Param(self, "statement", "SQL statement")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1285,7 +1223,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
withMean = Param(Params._dummy(), "withMean", "Center data with mean")
withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")
@@ -1296,8 +1233,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(StandardScaler, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid)
- self.withMean = Param(self, "withMean", "Center data with mean")
- self.withStd = Param(self, "withStd", "Scale to unit standard deviation")
self._setDefault(withMean=False, withStd=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1453,7 +1388,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make the labels show up in generated doc
labels = Param(Params._dummy(), "labels",
"Optional array of labels specifying index-string mapping." +
" If not provided or if empty, then metadata from inputCol is used instead.")
@@ -1466,9 +1400,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
super(IndexToString, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString",
self.uid)
- self.labels = Param(self, "labels",
- "Optional array of labels specifying index-string mapping. If not" +
- " provided or if empty, then metadata from inputCol is used instead.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1507,7 +1438,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make the stopwords show up in generated doc
+
stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out")
caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
"comparison over the stop words")
@@ -1522,9 +1453,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
super(StopWordsRemover, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover",
self.uid)
- self.stopWords = Param(self, "stopWords", "The words to be filtered out")
- self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " +
- "sensitive comparison over the stop words")
stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords
defaultStopWords = stopWordsObj.English()
self._setDefault(stopWords=defaultStopWords)
@@ -1727,7 +1655,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
maxCategories = Param(Params._dummy(), "maxCategories",
"Threshold for the number of values a categorical feature can take " +
"(>= 2). If a feature is found to have > maxCategories values, then " +
@@ -1740,10 +1667,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(VectorIndexer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
- self.maxCategories = Param(self, "maxCategories",
- "Threshold for the number of values a categorical feature " +
- "can take (>= 2). If a feature is found to have " +
- "> maxCategories values, then it is declared continuous.")
self._setDefault(maxCategories=20)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1832,7 +1755,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
"a vector column. There can be no overlap with names.")
names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
@@ -1847,12 +1769,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(VectorSlicer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
- self.indices = Param(self, "indices", "An array of indices to select features from " +
- "a vector column. There can be no overlap with names.")
- self.names = Param(self, "names", "An array of feature names to select features from " +
- "a vector column. These names must be specified by ML " +
- "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
- "with indices.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1932,7 +1848,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
vectorSize = Param(Params._dummy(), "vectorSize",
"the dimension of codes after transforming from words")
numPartitions = Param(Params._dummy(), "numPartitions",
@@ -1950,13 +1865,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
"""
super(Word2Vec, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
- self.vectorSize = Param(self, "vectorSize",
- "the dimension of codes after transforming from words")
- self.numPartitions = Param(self, "numPartitions",
- "number of partitions for sentences of words")
- self.minCount = Param(self, "minCount",
- "the minimum number of times a token must appear to be included " +
- "in the word2vec model's vocabulary")
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
seed=None)
kwargs = self.__init__._input_kwargs
@@ -2075,7 +1983,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
k = Param(Params._dummy(), "k", "the number of principal components")
@keyword_only
@@ -2085,7 +1992,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(PCA, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
- self.k = Param(self, "k", "the number of principal components")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -2185,7 +2091,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
formula = Param(Params._dummy(), "formula", "R model formula")
@keyword_only
@@ -2195,7 +2100,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
"""
super(RFormula, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid)
- self.formula = Param(self, "formula", "R model formula")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)