From a6428292f78fd594f41a4a7bf254d40268f46305 Mon Sep 17 00:00:00 2001
From: Xusen Yin
Date: Sun, 1 May 2016 12:29:01 -0700
Subject: [SPARK-14931][ML][PYTHON] Mismatched default values between pipelines
 in Spark and PySpark - update

## What changes were proposed in this pull request?

This PR is an update for [https://github.com/apache/spark/pull/12738] which:

* Adds a generic unit test for JavaParams wrappers in pyspark.ml for checking default Param values vs. the defaults on the Scala side
* Various fixes for bugs found
  * This includes changing classes taking weightCol to treat unset and empty String Param values the same way.

Defaults changed:

* Scala
  * LogisticRegression: weightCol defaults to not set (instead of empty string)
  * StringIndexer: labels default to not set (instead of empty array)
  * GeneralizedLinearRegression:
    * maxIter always defaults to 25 (simpler than defaulting to 25 for a particular solver)
    * weightCol defaults to not set (instead of empty string)
  * LinearRegression: weightCol defaults to not set (instead of empty string)
* Python
  * MultilayerPerceptron: layers default to not set (instead of [1,1])
  * ChiSqSelector: numTopFeatures defaults to 50 (instead of not set)

## How was this patch tested?

Generic unit test. Manually tested that unit test by changing defaults and verifying that it broke the test.

Author: Joseph K. Bradley
Author: yinxusen

Closes #12816 from jkbradley/yinxusen-SPARK-14931.
---
 python/pyspark/ml/classification.py | 13 ++++------
 python/pyspark/ml/feature.py        |  1 +
 python/pyspark/ml/regression.py     |  9 ++++---
 python/pyspark/ml/tests.py          | 48 +++++++++++++++++++++++++++++++++++++
 python/pyspark/ml/wrapper.py        |  3 ++-
 5 files changed, 62 insertions(+), 12 deletions(-)
 (limited to 'python')

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f616c7fbec..4331f73b73 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1056,7 +1056,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
     layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                    "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
-                   "neurons and output layer of 10 neurons, default is [1, 1].",
+                   "neurons and output layer of 10 neurons.",
                    typeConverter=TypeConverters.toListInt)
     blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " +
                       "matrices. Data is stacked within partitions. If block size is more than " +
@@ -1069,12 +1069,12 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                 maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
         """
         super(MultilayerPerceptronClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
-        self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
+        self._setDefault(maxIter=100, tol=1E-4, blockSize=128)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

@@ -1084,14 +1084,11 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
                   maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                  maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
         Sets params for MultilayerPerceptronClassifier.
         """
         kwargs = self.setParams._input_kwargs
-        if layers is None:
-            return self._set(**kwargs).setLayers([1, 1])
-        else:
-            return self._set(**kwargs)
+        return self._set(**kwargs)

     def _create_model(self, java_model):
         return MultilayerPerceptronClassificationModel(java_model)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 1b059a7199..b95d288198 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2617,6 +2617,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
         """
         super(ChiSqSelector, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
+        self._setDefault(numTopFeatures=50)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index d490953f79..0f08f9b973 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -1080,7 +1080,8 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                  fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor",
-                 quantileProbabilities=None, quantilesCol=None):
+                 quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]),
+                 quantilesCol=None):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \
@@ -1091,7 +1092,8 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid)
         self._setDefault(censorCol="censor",
-                         quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
+                         quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
+                         maxIter=100, tol=1E-6)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

@@ -1099,7 +1101,8 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     @keyword_only
     @since("1.6.0")
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                   fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor",
-                  quantileProbabilities=None, quantilesCol=None):
+                  quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]),
+                  quantilesCol=None):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                   fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index d5dd6d43c2..78ec96af8a 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -41,6 +41,7 @@ else:
 from shutil import rmtree
 import tempfile
 import numpy as np
+import inspect

 from pyspark import keyword_only
 from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer
@@ -54,6 +55,7 @@ from pyspark.ml.recommendation import ALS
 from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
 from pyspark.ml.tuning import *
 from pyspark.ml.wrapper import JavaParams
+from pyspark.mllib.common import _java2py
 from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
 from pyspark.sql import DataFrame, SQLContext, Row
 from pyspark.sql.functions import rand
@@ -1026,6 +1028,52 @@ class ALSTest(PySparkTestCase):
         self.assertEqual(als._java_obj.getFinalStorageLevel(), "DISK_ONLY")


+class DefaultValuesTests(PySparkTestCase):
+    """
+    Test :py:class:`JavaParams` classes to see if their default Param values match
+    those in their Scala counterparts.
+    """
+
+    def check_params(self, py_stage):
+        if not hasattr(py_stage, "_to_java"):
+            return
+        java_stage = py_stage._to_java()
+        if java_stage is None:
+            return
+        for p in py_stage.params:
+            java_param = java_stage.getParam(p.name)
+            py_has_default = py_stage.hasDefault(p)
+            java_has_default = java_stage.hasDefault(java_param)
+            self.assertEqual(py_has_default, java_has_default,
+                             "Default value mismatch of param %s for Params %s"
+                             % (p.name, str(py_stage)))
+            if py_has_default:
+                if p.name == "seed":
+                    return  # Random seeds between Spark and PySpark are different
+                java_default =\
+                    _java2py(self.sc, java_stage.clear(java_param).getOrDefault(java_param))
+                py_stage._clear(p)
+                py_default = py_stage.getOrDefault(p)
+                self.assertEqual(java_default, py_default,
+                                 "Java default %s != python default %s of param %s for Params %s"
+                                 % (str(java_default), str(py_default), p.name, str(py_stage)))
+
+    def test_java_params(self):
+        import pyspark.ml.feature
+        import pyspark.ml.classification
+        import pyspark.ml.clustering
+        import pyspark.ml.pipeline
+        import pyspark.ml.recommendation
+        import pyspark.ml.regression
+        modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
+                   pyspark.ml.pipeline, pyspark.ml.recommendation, pyspark.ml.regression]
+        for module in modules:
+            for name, cls in inspect.getmembers(module, inspect.isclass):
+                if not name.endswith('Model') and issubclass(cls, JavaParams)\
+                        and not inspect.isabstract(cls):
+                    self.check_params(cls())
+
+
 if __name__ == "__main__":
     from pyspark.ml.tests import *
     if xmlrunner:

diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index fef626c7fa..fef0040faf 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -110,7 +110,8 @@ class JavaParams(JavaWrapper, Params):
         for param in self.params:
             if self._java_obj.hasParam(param.name):
                 java_param = self._java_obj.getParam(param.name)
-                if self._java_obj.isDefined(java_param):
+                # SPARK-14931: Only check set params back to avoid default params mismatch.
+                if self._java_obj.isSet(java_param):
                     value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                     self._set(**{param.name: value})
--
cgit v1.2.3
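Editor's note: the two sketches below are illustrations added for this write-up, not code from the commit. First, a minimal look at how the MultilayerPerceptronClassifier default change is visible from user code, assuming a PySpark build that includes this patch; the `local[2]` master and app name are arbitrary.

```python
from pyspark import SparkContext
from pyspark.ml.classification import MultilayerPerceptronClassifier

# A Java-backed estimator needs an active SparkContext/JVM to be instantiated.
sc = SparkContext("local[2]", "mlp-defaults-demo")

mlp = MultilayerPerceptronClassifier()
print(mlp.hasDefault(mlp.maxIter))   # True  -- maxIter still defaults to 100
print(mlp.hasDefault(mlp.layers))    # False -- no implicit [1, 1] default anymore

# layers must now be set explicitly before fitting, e.g. 4 inputs,
# one hidden layer with 5 neurons, and 3 output classes.
mlp = mlp.setLayers([4, 5, 3])
print(mlp.getLayers())               # [4, 5, 3]

sc.stop()
```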
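Second, the wrapper.py change hinges on the difference between a param that merely has a default (`isDefined`) and one the caller explicitly set (`isSet`). The patched `_transfer_params_from_java` checks `isSet` on the Java object; the Python-side `Params` API exposes the same distinction, which this sketch (again an editorial illustration) uses to show why copying back only explicitly set values keeps Java-side defaults from overwriting Python-side ones.

```python
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression

sc = SparkContext("local[2]", "isset-vs-isdefined-demo")

lr = LogisticRegression()
# maxIter has a default (100), so it is "defined" but not "set".
print(lr.isDefined(lr.maxIter))  # True
print(lr.isSet(lr.maxIter))      # False

# After an explicit setter call the param counts as "set" as well.
lr.setMaxIter(5)
print(lr.isSet(lr.maxIter))      # True

# _transfer_params_from_java now copies back only params the Java side
# reports as set, so differing Java defaults no longer clobber Python defaults.
sc.stop()
```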