author     Holden Karau <holden@us.ibm.com>            2016-01-26 15:53:48 -0800
committer  Joseph K. Bradley <joseph@databricks.com>   2016-01-26 15:53:48 -0800
commit     eb917291ca1a2d68ca0639cb4b1464a546603eba (patch)
tree       380dcaa33273baa68beaf089387bd498d5ee88e8 /python/pyspark
parent     19fdb21afbf0eae4483cf6d4ef32daffd1994b89 (diff)
download   spark-eb917291ca1a2d68ca0639cb4b1464a546603eba.tar.gz
           spark-eb917291ca1a2d68ca0639cb4b1464a546603eba.tar.bz2
           spark-eb917291ca1a2d68ca0639cb4b1464a546603eba.zip
[SPARK-10509][PYSPARK] Reduce excessive param boilerplate code
The current Python ML params require cut-and-pasting each param's setup and description between the class body and the ```__init__``` method. Removing that duplication eliminates a likely source of errors and simplifies writing custom params: a ```_copy_new_parent``` method is added to ```Param```, and ```Params.__init__``` now copies every class-level param onto the new instance, so the definition no longer has to be repeated (at a different indentation level, no less).

Author: Holden Karau <holden@us.ibm.com>

Closes #10216 from holdenk/SPARK-10509-excessive-param-boiler-plate-code.
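For illustration, this is what defining a param looks like once the patch is in place. This is a minimal sketch, not part of the patch itself; ```MyParams``` and its ```threshold``` param are hypothetical:

```python
from pyspark.ml.param import Param, Params


class MyParams(Params):
    # Declared once, at class level, with a dummy parent. After this patch,
    # Params.__init__ copies every class-level Param onto the instance via
    # _copy_new_parent, so no duplicate Param(...) is needed in __init__.
    threshold = Param(Params._dummy(), "threshold",
                      "decision threshold, in range [0, 1]")

    def __init__(self):
        super(MyParams, self).__init__()
        self._setDefault(threshold=0.5)


p = MyParams()
print(p.threshold.parent == p.uid)  # True: the instance gets its own bound copy
```

Previously the same ```Param(...)``` call and its description had to be repeated inside ```__init__``` with ```self``` as the parent, which is exactly the duplication the hunks below delete.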
Diffstat (limited to 'python/pyspark')
-rw-r--r--  python/pyspark/ml/classification.py                  32
-rw-r--r--  python/pyspark/ml/clustering.py                        7
-rw-r--r--  python/pyspark/ml/evaluation.py                       12
-rw-r--r--  python/pyspark/ml/feature.py                          98
-rw-r--r--  python/pyspark/ml/param/__init__.py                   22
-rw-r--r--  python/pyspark/ml/param/_shared_params_code_gen.py    17
-rw-r--r--  python/pyspark/ml/param/shared.py                     81
-rw-r--r--  python/pyspark/ml/pipeline.py                          4
-rw-r--r--  python/pyspark/ml/recommendation.py                   11
-rw-r--r--  python/pyspark/ml/regression.py                       46
-rw-r--r--  python/pyspark/ml/tests.py                            12
-rw-r--r--  python/pyspark/ml/tuning.py                           18
12 files changed, 43 insertions, 317 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 265c6a14f1..3179fb30ab 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -72,7 +72,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
.. versionadded:: 1.3.0
"""
- # a placeholder to make it appear in the generated doc
threshold = Param(Params._dummy(), "threshold",
"Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.")
@@ -92,10 +91,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
super(LogisticRegression, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LogisticRegression", self.uid)
- #: param for threshold in binary classification, in range [0, 1].
- self.threshold = Param(self, "threshold",
- "Threshold in binary classification prediction, in range [0, 1]." +
- " If threshold and thresholds are both set, they must match.")
self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -232,7 +227,6 @@ class TreeClassifierParams(object):
"""
supportedImpurities = ["entropy", "gini"]
- # a placeholder to make it appear in the generated doc
impurity = Param(Params._dummy(), "impurity",
"Criterion used for information gain calculation (case-insensitive). " +
"Supported options: " +
@@ -240,10 +234,6 @@ class TreeClassifierParams(object):
def __init__(self):
super(TreeClassifierParams, self).__init__()
- #: param for Criterion used for information gain calculation (case-insensitive).
- self.impurity = Param(self, "impurity", "Criterion used for information " +
- "gain calculation (case-insensitive). Supported options: " +
- ", ".join(self.supportedImpurities))
@since("1.6.0")
def setImpurity(self, value):
@@ -485,7 +475,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
lossType = Param(Params._dummy(), "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
@@ -504,10 +493,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
super(GBTClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.GBTClassifier", self.uid)
- #: param for Loss function which GBT tries to minimize (case-insensitive).
- self.lossType = Param(self, "lossType",
- "Loss function which GBT tries to minimize (case-insensitive). " +
- "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
lossType="logistic", maxIter=20, stepSize=0.1)
@@ -597,7 +582,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
"default is 1.0")
modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
@@ -615,13 +599,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
super(NaiveBayes, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.NaiveBayes", self.uid)
- #: param for the smoothing parameter.
- self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
- "default is 1.0")
- #: param for the model type.
- self.modelType = Param(self, "modelType", "The model type which is a string " +
- "(case-sensitive). Supported options: multinomial (default) " +
- "and bernoulli.")
self._setDefault(smoothing=1.0, modelType="multinomial")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -734,7 +711,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
"E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
"neurons and output layer of 10 neurons, default is [1, 1].")
@@ -753,14 +729,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
super(MultilayerPerceptronClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
- self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " +
- "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " +
- "100 neurons and output layer of 10 neurons, default is [1, 1].")
- self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
- "matrices. Data is stacked within partitions. If block size is " +
- "more than remaining data in a partition then it is adjusted to " +
- "the size of this data. Recommended size is between 10 and 1000, " +
- "default is 128.")
self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 9189c02220..60d1c9aaec 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -73,7 +73,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
k = Param(Params._dummy(), "k", "number of clusters to create")
initMode = Param(Params._dummy(), "initMode",
"the initialization algorithm. This can be either \"random\" to " +
@@ -90,12 +89,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
"""
super(KMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
- self.k = Param(self, "k", "number of clusters to create")
- self.initMode = Param(self, "initMode",
- "the initialization algorithm. This can be either \"random\" to " +
- "choose random points as initial cluster centers, or \"k-means||\" " +
- "to use a parallel variant of k-means++")
- self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode")
self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 6ff68abd8f..c9b95b3bf4 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -124,7 +124,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation (areaUnderROC|areaUnderPR)")
@@ -138,9 +137,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
super(BinaryClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
- #: param for metric name in evaluation (areaUnderROC|areaUnderPR)
- self.metricName = Param(self, "metricName",
- "metric name in evaluation (areaUnderROC|areaUnderPR)")
self._setDefault(rawPredictionCol="rawPrediction", labelCol="label",
metricName="areaUnderROC")
kwargs = self.__init__._input_kwargs
@@ -210,9 +206,6 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
super(RegressionEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
- #: param for metric name in evaluation (mse|rmse|r2|mae)
- self.metricName = Param(self, "metricName",
- "metric name in evaluation (mse|rmse|r2|mae)")
self._setDefault(predictionCol="prediction", labelCol="label",
metricName="rmse")
kwargs = self.__init__._input_kwargs
@@ -265,7 +258,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation "
"(f1|precision|recall|weightedPrecision|weightedRecall)")
@@ -280,10 +272,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
super(MulticlassClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
- # param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall)
- self.metricName = Param(self, "metricName",
- "metric name in evaluation"
- " (f1|precision|recall|weightedPrecision|weightedRecall)")
self._setDefault(predictionCol="prediction", labelCol="label",
metricName="f1")
kwargs = self.__init__._input_kwargs
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 32f324685a..22081233b0 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -57,7 +57,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
threshold = Param(Params._dummy(), "threshold",
"threshold in binary classification prediction, in range [0, 1]")
@@ -68,8 +67,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(Binarizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid)
- self.threshold = Param(self, "threshold",
- "threshold in binary classification prediction, in range [0, 1]")
self._setDefault(threshold=0.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -125,7 +122,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.3.0
"""
- # a placeholder to make it appear in the generated doc
splits = \
Param(Params._dummy(), "splits",
"Split points for mapping continuous features into buckets. With n+1 splits, " +
@@ -142,19 +138,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(Bucketizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
- #: param for Splitting points for mapping continuous features into buckets. With n+1 splits,
- # there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
- # except the last bucket, which also includes y. The splits should be strictly increasing.
- # Values at -inf, inf must be explicitly provided to cover all Double values; otherwise,
- # values outside the splits specified will be treated as errors.
- self.splits = \
- Param(self, "splits",
- "Split points for mapping continuous features into buckets. With n+1 splits, " +
- "there are n buckets. A bucket defined by splits x,y holds values in the " +
- "range [x,y) except the last bucket, which also includes y. The splits " +
- "should be strictly increasing. Values at -inf, inf must be explicitly " +
- "provided to cover all Double values; otherwise, values outside the splits " +
- "specified will be treated as errors.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -210,7 +193,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
minTF = Param(
Params._dummy(), "minTF", "Filter to ignore rare words in" +
" a document. For each document, terms with frequency/count less than the given" +
@@ -235,22 +217,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
super(CountVectorizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
self.uid)
- self.minTF = Param(
- self, "minTF", "Filter to ignore rare words in" +
- " a document. For each document, terms with frequency/count less than the given" +
- " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" +
- " times the term must appear in the document); if this is a double in [0,1), then " +
- "this specifies a fraction (out of the document's token count). Note that the " +
- "parameter is only used in transform of CountVectorizerModel and does not affect" +
- "fitting. Default 1.0")
- self.minDF = Param(
- self, "minDF", "Specifies the minimum number of" +
- " different documents a term must appear in to be included in the vocabulary." +
- " If this is an integer >= 1, this specifies the number of documents the term must" +
- " appear in; if this is a double in [0,1), then this specifies the fraction of " +
- "documents. Default 1.0")
- self.vocabSize = Param(
- self, "vocabSize", "max size of the vocabulary. Default 1 << 18.")
self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -359,7 +325,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
"default False.")
@@ -370,8 +335,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(DCT, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
- self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " +
- "default False.")
self._setDefault(inverse=False)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -423,7 +386,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " +
"it must be MLlib Vector type.")
@@ -435,8 +397,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
super(ElementwiseProduct, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct",
self.uid)
- self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " +
- "it must be MLlib Vector type.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -531,7 +491,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
minDocFreq = Param(Params._dummy(), "minDocFreq",
"minimum of documents in which a term should appear for filtering")
@@ -542,8 +501,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(IDF, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
- self.minDocFreq = Param(self, "minDocFreq",
- "minimum of documents in which a term should appear for filtering")
self._setDefault(minDocFreq=0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -623,7 +580,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
@@ -634,8 +590,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(MinMaxScaler, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
- self.min = Param(self, "min", "Lower bound of the output feature range")
- self.max = Param(self, "max", "Upper bound of the output feature range")
self._setDefault(min=0.0, max=1.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -745,7 +699,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")
@keyword_only
@@ -755,7 +708,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(NGram, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid)
- self.n = Param(self, "n", "number of elements per n-gram (>=1)")
self._setDefault(n=2)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -808,7 +760,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
p = Param(Params._dummy(), "p", "the p norm value.")
@keyword_only
@@ -818,7 +769,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(Normalizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid)
- self.p = Param(self, "p", "the p norm value.")
self._setDefault(p=2.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -887,7 +837,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category")
@keyword_only
@@ -897,7 +846,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(OneHotEncoder, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
- self.dropLast = Param(self, "dropLast", "whether to drop the last category")
self._setDefault(dropLast=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -950,7 +898,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")
@keyword_only
@@ -961,7 +908,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
super(PolynomialExpansion, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.feature.PolynomialExpansion", self.uid)
- self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)")
self._setDefault(degree=2)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1107,7 +1053,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
@@ -1123,11 +1068,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(RegexTokenizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
- self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
- self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
- self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
- self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " +
- "lowercase before tokenizing")
self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1223,7 +1163,6 @@ class SQLTransformer(JavaTransformer):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
statement = Param(Params._dummy(), "statement", "SQL statement")
@keyword_only
@@ -1233,7 +1172,6 @@ class SQLTransformer(JavaTransformer):
"""
super(SQLTransformer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
- self.statement = Param(self, "statement", "SQL statement")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1285,7 +1223,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
withMean = Param(Params._dummy(), "withMean", "Center data with mean")
withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")
@@ -1296,8 +1233,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(StandardScaler, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid)
- self.withMean = Param(self, "withMean", "Center data with mean")
- self.withStd = Param(self, "withStd", "Scale to unit standard deviation")
self._setDefault(withMean=False, withStd=True)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1453,7 +1388,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make the labels show up in generated doc
labels = Param(Params._dummy(), "labels",
"Optional array of labels specifying index-string mapping." +
" If not provided or if empty, then metadata from inputCol is used instead.")
@@ -1466,9 +1400,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
super(IndexToString, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString",
self.uid)
- self.labels = Param(self, "labels",
- "Optional array of labels specifying index-string mapping. If not" +
- " provided or if empty, then metadata from inputCol is used instead.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1507,7 +1438,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make the stopwords show up in generated doc
+
stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out")
caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
"comparison over the stop words")
@@ -1522,9 +1453,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
super(StopWordsRemover, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover",
self.uid)
- self.stopWords = Param(self, "stopWords", "The words to be filtered out")
- self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " +
- "sensitive comparison over the stop words")
stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords
defaultStopWords = stopWordsObj.English()
self._setDefault(stopWords=defaultStopWords)
@@ -1727,7 +1655,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
maxCategories = Param(Params._dummy(), "maxCategories",
"Threshold for the number of values a categorical feature can take " +
"(>= 2). If a feature is found to have > maxCategories values, then " +
@@ -1740,10 +1667,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(VectorIndexer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
- self.maxCategories = Param(self, "maxCategories",
- "Threshold for the number of values a categorical feature " +
- "can take (>= 2). If a feature is found to have " +
- "> maxCategories values, then it is declared continuous.")
self._setDefault(maxCategories=20)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1832,7 +1755,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
"a vector column. There can be no overlap with names.")
names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
@@ -1847,12 +1769,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
"""
super(VectorSlicer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
- self.indices = Param(self, "indices", "An array of indices to select features from " +
- "a vector column. There can be no overlap with names.")
- self.names = Param(self, "names", "An array of feature names to select features from " +
- "a vector column. These names must be specified by ML " +
- "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
- "with indices.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -1932,7 +1848,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
vectorSize = Param(Params._dummy(), "vectorSize",
"the dimension of codes after transforming from words")
numPartitions = Param(Params._dummy(), "numPartitions",
@@ -1950,13 +1865,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
"""
super(Word2Vec, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
- self.vectorSize = Param(self, "vectorSize",
- "the dimension of codes after transforming from words")
- self.numPartitions = Param(self, "numPartitions",
- "number of partitions for sentences of words")
- self.minCount = Param(self, "minCount",
- "the minimum number of times a token must appear to be included " +
- "in the word2vec model's vocabulary")
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
seed=None)
kwargs = self.__init__._input_kwargs
@@ -2075,7 +1983,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
k = Param(Params._dummy(), "k", "the number of principal components")
@keyword_only
@@ -2085,7 +1992,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol):
"""
super(PCA, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
- self.k = Param(self, "k", "the number of principal components")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -2185,7 +2091,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
.. versionadded:: 1.5.0
"""
- # a placeholder to make it appear in the generated doc
formula = Param(Params._dummy(), "formula", "R model formula")
@keyword_only
@@ -2195,7 +2100,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
"""
super(RFormula, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid)
- self.formula = Param(self, "formula", "R model formula")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 92ce96aa3c..3da36d32c5 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -40,6 +40,15 @@ class Param(object):
self.doc = str(doc)
self.expectedType = expectedType
+ def _copy_new_parent(self, parent):
+ """Copy the current param to a new parent, must be a dummy param."""
+ if self.parent == "undefined":
+ param = copy.copy(self)
+ param.parent = parent.uid
+ return param
+ else:
+ raise ValueError("Cannot copy from non-dummy parent %s." % parent)
+
def __str__(self):
return str(self.parent) + "__" + self.name
@@ -77,6 +86,19 @@ class Params(Identifiable):
#: value returned by :py:func:`params`
self._params = None
+ # Copy the params from the class to the object
+ self._copy_params()
+
+ def _copy_params(self):
+ """
+ Copy all params defined on the class to current object.
+ """
+ cls = type(self)
+ src_name_attrs = [(x, getattr(cls, x)) for x in dir(cls)]
+ src_params = list(filter(lambda nameAttr: isinstance(nameAttr[1], Param), src_name_attrs))
+ for name, param in src_params:
+ setattr(self, name, param._copy_new_parent(self))
+
@property
@since("1.3.0")
def params(self):
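The hunk above is the heart of the change: a class-level param is created with ```Params._dummy()``` (so its parent is the placeholder uid "undefined"), and ```_copy_params``` copies it and re-parents the copy to the new object's uid at construction time. A minimal sketch of the resulting behaviour, using the ```HasMaxIter``` mixin from this patch (the ```MaxIterDemo``` subclass is hypothetical, for illustration only):

```python
from pyspark.ml.param.shared import HasMaxIter


class MaxIterDemo(HasMaxIter):  # hypothetical subclass, for illustration only
    pass


demo = MaxIterDemo()
# The class-level Param keeps its dummy parent ...
print(HasMaxIter.maxIter.parent)        # 'undefined'
# ... while the copy made by _copy_params is bound to this instance's uid.
print(demo.maxIter.parent == demo.uid)  # True
# Copying a param that is already bound to a real parent is rejected.
try:
    demo.maxIter._copy_new_parent(demo)
except ValueError:
    print("cannot re-copy an instance-bound param")
```

The new ```test_copy_new_parent``` test added in ```python/pyspark/ml/tests.py``` below exercises the same behaviour.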
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 82855bc4c7..5e297b8214 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -50,13 +50,11 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType):
Mixin for param $name: $doc
"""
- # a placeholder to make it appear in the generated doc
$name = Param(Params._dummy(), "$name", "$doc", $expectedType)
def __init__(self):
- super(Has$Name, self).__init__()
- #: param for $doc
- self.$name = Param(self, "$name", "$doc", $expectedType)'''
+ super(Has$Name, self).__init__()'''
+
if defaultValueStr is not None:
template += '''
self._setDefault($name=$defaultValueStr)'''
@@ -171,22 +169,17 @@ if __name__ == "__main__":
Mixin for Decision Tree parameters.
"""
- # a placeholder to make it appear in the generated doc
$dummyPlaceHolders
def __init__(self):
- super(DecisionTreeParams, self).__init__()
- $realParams'''
+ super(DecisionTreeParams, self).__init__()'''
dtParamMethods = ""
dummyPlaceholders = ""
- realParams = ""
paramTemplate = """$name = Param($owner, "$name", "$doc")"""
for name, doc in decisionTreeParams:
variable = paramTemplate.replace("$name", name).replace("$doc", doc)
dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n "
- realParams += "#: param for " + doc + "\n "
- realParams += "self." + variable.replace("$owner", "self") + "\n "
dtParamMethods += _gen_param_code(name, doc, None) + "\n"
- code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders)
- .replace("$realParams", realParams) + dtParamMethods)
+ code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" +
+ dtParamMethods)
print("\n\n\n".join(code))
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 23f9431484..db4a8a54d4 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -25,13 +25,10 @@ class HasMaxIter(Params):
Mixin for param maxIter: max number of iterations (>= 0).
"""
- # a placeholder to make it appear in the generated doc
maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", int)
def __init__(self):
super(HasMaxIter, self).__init__()
- #: param for max number of iterations (>= 0).
- self.maxIter = Param(self, "maxIter", "max number of iterations (>= 0).", int)
def setMaxIter(self, value):
"""
@@ -52,13 +49,10 @@ class HasRegParam(Params):
Mixin for param regParam: regularization parameter (>= 0).
"""
- # a placeholder to make it appear in the generated doc
regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", float)
def __init__(self):
super(HasRegParam, self).__init__()
- #: param for regularization parameter (>= 0).
- self.regParam = Param(self, "regParam", "regularization parameter (>= 0).", float)
def setRegParam(self, value):
"""
@@ -79,13 +73,10 @@ class HasFeaturesCol(Params):
Mixin for param featuresCol: features column name.
"""
- # a placeholder to make it appear in the generated doc
featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", str)
def __init__(self):
super(HasFeaturesCol, self).__init__()
- #: param for features column name.
- self.featuresCol = Param(self, "featuresCol", "features column name.", str)
self._setDefault(featuresCol='features')
def setFeaturesCol(self, value):
@@ -107,13 +98,10 @@ class HasLabelCol(Params):
Mixin for param labelCol: label column name.
"""
- # a placeholder to make it appear in the generated doc
labelCol = Param(Params._dummy(), "labelCol", "label column name.", str)
def __init__(self):
super(HasLabelCol, self).__init__()
- #: param for label column name.
- self.labelCol = Param(self, "labelCol", "label column name.", str)
self._setDefault(labelCol='label')
def setLabelCol(self, value):
@@ -135,13 +123,10 @@ class HasPredictionCol(Params):
Mixin for param predictionCol: prediction column name.
"""
- # a placeholder to make it appear in the generated doc
predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", str)
def __init__(self):
super(HasPredictionCol, self).__init__()
- #: param for prediction column name.
- self.predictionCol = Param(self, "predictionCol", "prediction column name.", str)
self._setDefault(predictionCol='prediction')
def setPredictionCol(self, value):
@@ -163,13 +148,10 @@ class HasProbabilityCol(Params):
Mixin for param probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
"""
- # a placeholder to make it appear in the generated doc
probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str)
def __init__(self):
super(HasProbabilityCol, self).__init__()
- #: param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
- self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str)
self._setDefault(probabilityCol='probability')
def setProbabilityCol(self, value):
@@ -191,13 +173,10 @@ class HasRawPredictionCol(Params):
Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) column name.
"""
- # a placeholder to make it appear in the generated doc
rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str)
def __init__(self):
super(HasRawPredictionCol, self).__init__()
- #: param for raw prediction (a.k.a. confidence) column name.
- self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str)
self._setDefault(rawPredictionCol='rawPrediction')
def setRawPredictionCol(self, value):
@@ -219,13 +198,10 @@ class HasInputCol(Params):
Mixin for param inputCol: input column name.
"""
- # a placeholder to make it appear in the generated doc
inputCol = Param(Params._dummy(), "inputCol", "input column name.", str)
def __init__(self):
super(HasInputCol, self).__init__()
- #: param for input column name.
- self.inputCol = Param(self, "inputCol", "input column name.", str)
def setInputCol(self, value):
"""
@@ -246,13 +222,10 @@ class HasInputCols(Params):
Mixin for param inputCols: input column names.
"""
- # a placeholder to make it appear in the generated doc
inputCols = Param(Params._dummy(), "inputCols", "input column names.", None)
def __init__(self):
super(HasInputCols, self).__init__()
- #: param for input column names.
- self.inputCols = Param(self, "inputCols", "input column names.", None)
def setInputCols(self, value):
"""
@@ -273,13 +246,10 @@ class HasOutputCol(Params):
Mixin for param outputCol: output column name.
"""
- # a placeholder to make it appear in the generated doc
outputCol = Param(Params._dummy(), "outputCol", "output column name.", str)
def __init__(self):
super(HasOutputCol, self).__init__()
- #: param for output column name.
- self.outputCol = Param(self, "outputCol", "output column name.", str)
self._setDefault(outputCol=self.uid + '__output')
def setOutputCol(self, value):
@@ -301,13 +271,10 @@ class HasNumFeatures(Params):
Mixin for param numFeatures: number of features.
"""
- # a placeholder to make it appear in the generated doc
numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", int)
def __init__(self):
super(HasNumFeatures, self).__init__()
- #: param for number of features.
- self.numFeatures = Param(self, "numFeatures", "number of features.", int)
def setNumFeatures(self, value):
"""
@@ -328,13 +295,10 @@ class HasCheckpointInterval(Params):
Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
"""
- # a placeholder to make it appear in the generated doc
checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int)
def __init__(self):
super(HasCheckpointInterval, self).__init__()
- #: param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
- self.checkpointInterval = Param(self, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int)
def setCheckpointInterval(self, value):
"""
@@ -355,13 +319,10 @@ class HasSeed(Params):
Mixin for param seed: random seed.
"""
- # a placeholder to make it appear in the generated doc
seed = Param(Params._dummy(), "seed", "random seed.", int)
def __init__(self):
super(HasSeed, self).__init__()
- #: param for random seed.
- self.seed = Param(self, "seed", "random seed.", int)
self._setDefault(seed=hash(type(self).__name__))
def setSeed(self, value):
@@ -383,13 +344,10 @@ class HasTol(Params):
Mixin for param tol: the convergence tolerance for iterative algorithms.
"""
- # a placeholder to make it appear in the generated doc
tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", float)
def __init__(self):
super(HasTol, self).__init__()
- #: param for the convergence tolerance for iterative algorithms.
- self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms.", float)
def setTol(self, value):
"""
@@ -410,13 +368,10 @@ class HasStepSize(Params):
Mixin for param stepSize: Step size to be used for each iteration of optimization.
"""
- # a placeholder to make it appear in the generated doc
stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", float)
def __init__(self):
super(HasStepSize, self).__init__()
- #: param for Step size to be used for each iteration of optimization.
- self.stepSize = Param(self, "stepSize", "Step size to be used for each iteration of optimization.", float)
def setStepSize(self, value):
"""
@@ -437,13 +392,10 @@ class HasHandleInvalid(Params):
Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.
"""
- # a placeholder to make it appear in the generated doc
handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str)
def __init__(self):
super(HasHandleInvalid, self).__init__()
- #: param for how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.
- self.handleInvalid = Param(self, "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str)
def setHandleInvalid(self, value):
"""
@@ -464,13 +416,10 @@ class HasElasticNetParam(Params):
Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
"""
- # a placeholder to make it appear in the generated doc
elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float)
def __init__(self):
super(HasElasticNetParam, self).__init__()
- #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
- self.elasticNetParam = Param(self, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float)
self._setDefault(elasticNetParam=0.0)
def setElasticNetParam(self, value):
@@ -492,13 +441,10 @@ class HasFitIntercept(Params):
Mixin for param fitIntercept: whether to fit an intercept term.
"""
- # a placeholder to make it appear in the generated doc
fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", bool)
def __init__(self):
super(HasFitIntercept, self).__init__()
- #: param for whether to fit an intercept term.
- self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.", bool)
self._setDefault(fitIntercept=True)
def setFitIntercept(self, value):
@@ -520,13 +466,10 @@ class HasStandardization(Params):
Mixin for param standardization: whether to standardize the training features before fitting the model.
"""
- # a placeholder to make it appear in the generated doc
standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", bool)
def __init__(self):
super(HasStandardization, self).__init__()
- #: param for whether to standardize the training features before fitting the model.
- self.standardization = Param(self, "standardization", "whether to standardize the training features before fitting the model.", bool)
self._setDefault(standardization=True)
def setStandardization(self, value):
@@ -548,13 +491,10 @@ class HasThresholds(Params):
Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.
"""
- # a placeholder to make it appear in the generated doc
thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None)
def __init__(self):
super(HasThresholds, self).__init__()
- #: param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.
- self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None)
def setThresholds(self, value):
"""
@@ -575,13 +515,10 @@ class HasWeightCol(Params):
Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0.
"""
- # a placeholder to make it appear in the generated doc
weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str)
def __init__(self):
super(HasWeightCol, self).__init__()
- #: param for weight column name. If this is not set or empty, we treat all instance weights as 1.0.
- self.weightCol = Param(self, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str)
def setWeightCol(self, value):
"""
@@ -602,13 +539,10 @@ class HasSolver(Params):
Mixin for param solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.
"""
- # a placeholder to make it appear in the generated doc
solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str)
def __init__(self):
super(HasSolver, self).__init__()
- #: param for the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.
- self.solver = Param(self, "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str)
self._setDefault(solver='auto')
def setSolver(self, value):
@@ -630,7 +564,6 @@ class DecisionTreeParams(Params):
Mixin for Decision Tree parameters.
"""
- # a placeholder to make it appear in the generated doc
maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
@@ -641,19 +574,7 @@ class DecisionTreeParams(Params):
def __init__(self):
super(DecisionTreeParams, self).__init__()
- #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
- self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
- #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.
- self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
- #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
- self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
- #: param for Minimum information gain for a split to be considered at a tree node.
- self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
- #: param for Maximum memory in MB allocated to histogram aggregation.
- self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
- #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.
- self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
-
+
def setMaxDepth(self, value):
"""
Sets the value of :py:attr:`maxDepth`.
diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py
index 9f5f6ac8fa..661074ca96 100644
--- a/python/pyspark/ml/pipeline.py
+++ b/python/pyspark/ml/pipeline.py
@@ -149,6 +149,8 @@ class Pipeline(Estimator):
.. versionadded:: 1.3.0
"""
+ stages = Param(Params._dummy(), "stages", "pipeline stages")
+
@keyword_only
def __init__(self, stages=None):
"""
@@ -157,8 +159,6 @@ class Pipeline(Estimator):
if stages is None:
stages = []
super(Pipeline, self).__init__()
- #: Param for pipeline stages.
- self.stages = Param(self, "stages", "pipeline stages")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index b44c66f73c..08180a2f25 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -85,7 +85,6 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
rank = Param(Params._dummy(), "rank", "rank of the factorization")
numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks")
numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks")
@@ -108,16 +107,6 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
"""
super(ALS, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid)
- self.rank = Param(self, "rank", "rank of the factorization")
- self.numUserBlocks = Param(self, "numUserBlocks", "number of user blocks")
- self.numItemBlocks = Param(self, "numItemBlocks", "number of item blocks")
- self.implicitPrefs = Param(self, "implicitPrefs", "whether to use implicit preference")
- self.alpha = Param(self, "alpha", "alpha for implicit preference")
- self.userCol = Param(self, "userCol", "column name for user ids")
- self.itemCol = Param(self, "itemCol", "column name for item ids")
- self.ratingCol = Param(self, "ratingCol", "column name for ratings")
- self.nonnegative = Param(self, "nonnegative",
- "whether to use nonnegative constraint for least squares")
self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
ratingCol="rating", nonnegative=False, checkpointInterval=10)
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 401bac0223..74a2248ed0 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -162,7 +162,6 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
DenseVector([0.0, 1.0])
"""
- # a placeholder to make it appear in the generated doc
isotonic = \
Param(Params._dummy(), "isotonic",
"whether the output sequence should be isotonic/increasing (true) or" +
@@ -181,14 +180,6 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
super(IsotonicRegression, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.regression.IsotonicRegression", self.uid)
- self.isotonic = \
- Param(self, "isotonic",
- "whether the output sequence should be isotonic/increasing (true) or" +
- "antitonic/decreasing (false).")
- self.featureIndex = \
- Param(self, "featureIndex",
- "The index of the feature if featuresCol is a vector column, no effect " +
- "otherwise.")
self._setDefault(isotonic=True, featureIndex=0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -262,15 +253,11 @@ class TreeEnsembleParams(DecisionTreeParams):
Mixin for Decision Tree-based ensemble algorithms parameters.
"""
- # a placeholder to make it appear in the generated doc
subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " +
"used for learning each decision tree, in range (0, 1].")
def __init__(self):
super(TreeEnsembleParams, self).__init__()
- #: param for Fraction of the training data, in range (0, 1].
- self.subsamplingRate = Param(self, "subsamplingRate", "Fraction of the training data " +
- "used for learning each decision tree, in range (0, 1].")
@since("1.4.0")
def setSubsamplingRate(self, value):
@@ -294,7 +281,6 @@ class TreeRegressorParams(Params):
"""
supportedImpurities = ["variance"]
- # a placeholder to make it appear in the generated doc
impurity = Param(Params._dummy(), "impurity",
"Criterion used for information gain calculation (case-insensitive). " +
"Supported options: " +
@@ -302,10 +288,6 @@ class TreeRegressorParams(Params):
def __init__(self):
super(TreeRegressorParams, self).__init__()
- #: param for Criterion used for information gain calculation (case-insensitive).
- self.impurity = Param(self, "impurity", "Criterion used for information " +
- "gain calculation (case-insensitive). Supported options: " +
- ", ".join(self.supportedImpurities))
@since("1.4.0")
def setImpurity(self, value):
@@ -329,7 +311,6 @@ class RandomForestParams(TreeEnsembleParams):
"""
supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"]
- # a placeholder to make it appear in the generated doc
numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).")
featureSubsetStrategy = \
Param(Params._dummy(), "featureSubsetStrategy",
@@ -338,13 +319,6 @@ class RandomForestParams(TreeEnsembleParams):
def __init__(self):
super(RandomForestParams, self).__init__()
- #: param for Number of trees to train (>= 1).
- self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1).")
- #: param for The number of features to consider for splits at each tree node.
- self.featureSubsetStrategy = \
- Param(self, "featureSubsetStrategy",
- "The number of features to consider for splits at each tree node. Supported " +
- "options: " + ", ".join(self.supportedFeatureSubsetStrategies))
@since("1.4.0")
def setNumTrees(self, value):
@@ -609,7 +583,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
lossType = Param(Params._dummy(), "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
@@ -627,10 +600,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
"""
super(GBTRegressor, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid)
- #: param for Loss function which GBT tries to minimize (case-insensitive).
- self.lossType = Param(self, "lossType",
- "Loss function which GBT tries to minimize (case-insensitive). " +
- "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1)
@@ -713,7 +682,6 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
.. versionadded:: 1.6.0
"""
- # a placeholder to make it appear in the generated doc
censorCol = Param(Params._dummy(), "censorCol",
"censor column name. The value of this column could be 0 or 1. " +
"If the value is 1, it means the event has occurred i.e. " +
@@ -739,20 +707,6 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
super(AFTSurvivalRegression, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid)
- #: Param for censor column name
- self.censorCol = Param(self, "censorCol",
- "censor column name. The value of this column could be 0 or 1. " +
- "If the value is 1, it means the event has occurred i.e. " +
- "uncensored; otherwise censored.")
- #: Param for quantile probabilities array
- self.quantileProbabilities = \
- Param(self, "quantileProbabilities",
- "quantile probabilities array. Values of the quantile probabilities array " +
- "should be in the range (0, 1) and the array should be non-empty.")
- #: Param for quantiles column name
- self.quantilesCol = Param(self, "quantilesCol",
- "quantiles column name. This column will output quantiles of " +
- "corresponding quantileProbabilities if it is set.")
self._setDefault(censorCol="censor",
quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
kwargs = self.__init__._input_kwargs
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 9ea639dc4f..c45a159c46 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -185,6 +185,18 @@ class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
class ParamTests(PySparkTestCase):
+ def test_copy_new_parent(self):
+ testParams = TestParams()
+ # Copying an instantiated param should fail
+ with self.assertRaises(ValueError):
+ testParams.maxIter._copy_new_parent(testParams)
+ # Copying a dummy param should succeed
+ TestParams.maxIter._copy_new_parent(testParams)
+ maxIter = testParams.maxIter
+ self.assertEqual(maxIter.name, "maxIter")
+ self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
+ self.assertTrue(maxIter.parent == testParams.uid)
+
def test_param(self):
testParams = TestParams()
maxIter = testParams.maxIter
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 08f8db57f4..0cbe97f1d8 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -115,18 +115,11 @@ class CrossValidator(Estimator, HasSeed):
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
estimator = Param(Params._dummy(), "estimator", "estimator to be cross-validated")
-
- # a placeholder to make it appear in the generated doc
estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps", "estimator param maps")
-
- # a placeholder to make it appear in the generated doc
evaluator = Param(
Params._dummy(), "evaluator",
"evaluator used to select hyper-parameters that maximize the cross-validated metric")
-
- # a placeholder to make it appear in the generated doc
numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation")
@keyword_only
@@ -137,17 +130,6 @@ class CrossValidator(Estimator, HasSeed):
seed=None)
"""
super(CrossValidator, self).__init__()
- #: param for estimator to be cross-validated
- self.estimator = Param(self, "estimator", "estimator to be cross-validated")
- #: param for estimator param maps
- self.estimatorParamMaps = Param(self, "estimatorParamMaps", "estimator param maps")
- #: param for the evaluator used to select hyper-parameters that
- #: maximize the cross-validated metric
- self.evaluator = Param(
- self, "evaluator",
- "evaluator used to select hyper-parameters that maximize the cross-validated metric")
- #: param for number of folds for cross validation
- self.numFolds = Param(self, "numFolds", "number of folds for cross validation")
self._setDefault(numFolds=3)
kwargs = self.__init__._input_kwargs
self._set(**kwargs)
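None of this changes the user-facing API: estimators are constructed and configured exactly as before, and their params are simply re-parented copies of the class-level declarations. A small smoke test (a hypothetical script, assuming a local SparkContext is available, since the ML estimators are JVM-backed):

```python
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression

sc = SparkContext("local[1]", "param-boilerplate-demo")  # JVM needed for ML estimators

lr = LogisticRegression(maxIter=10, regParam=0.01, threshold=0.6)
print(lr.getThreshold())              # 0.6
print(lr.threshold.parent == lr.uid)  # True: param copied from the class declaration

sc.stop()
```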