From a6428292f78fd594f41a4a7bf254d40268f46305 Mon Sep 17 00:00:00 2001
From: Xusen Yin
Date: Sun, 1 May 2016 12:29:01 -0700
Subject: [SPARK-14931][ML][PYTHON] Mismatched default values between pipelines in Spark and PySpark - update

## What changes were proposed in this pull request?

This PR is an update for [https://github.com/apache/spark/pull/12738] which:
* Adds a generic unit test for JavaParams wrappers in pyspark.ml for checking default Param values vs. the defaults in the Scala side
* Various fixes for bugs found
* This includes changing classes taking weightCol to treat unset and empty String Param values the same way.

Defaults changed:
* Scala
  * LogisticRegression: weightCol defaults to not set (instead of empty string)
  * StringIndexer: labels default to not set (instead of empty array)
  * GeneralizedLinearRegression:
    * maxIter always defaults to 25 (simpler than defaulting to 25 for a particular solver)
    * weightCol defaults to not set (instead of empty string)
  * LinearRegression: weightCol defaults to not set (instead of empty string)
* Python
  * MultilayerPerceptron: layers default to not set (instead of [1, 1])
  * ChiSqSelector: numTopFeatures defaults to 50 (instead of not set)

## How was this patch tested?

Generic unit test. Manually tested that unit test by changing defaults and verifying that this broke the test.

Author: Joseph K. Bradley
Author: yinxusen

Closes #12816 from jkbradley/yinxusen-SPARK-14931.
---
 python/pyspark/ml/classification.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f616c7fbec..4331f73b73 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1056,7 +1056,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
 
     layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                    "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
-                   "neurons and output layer of 10 neurons, default is [1, 1].",
+                   "neurons and output layer of 10 neurons.",
                    typeConverter=TypeConverters.toListInt)
     blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " +
                       "matrices. Data is stacked within partitions. If block size is more than " +
@@ -1069,12 +1069,12 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                 maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
         """
         super(MultilayerPerceptronClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
-        self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
+        self._setDefault(maxIter=100, tol=1E-4, blockSize=128)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -1084,14 +1084,11 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
                   maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                  maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128)
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
         Sets params for MultilayerPerceptronClassifier.
         """
         kwargs = self.setParams._input_kwargs
-        if layers is None:
-            return self._set(**kwargs).setLayers([1, 1])
-        else:
-            return self._set(**kwargs)
+        return self._set(**kwargs)
 
     def _create_model(self, java_model):
         return MultilayerPerceptronClassificationModel(java_model)
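For illustration, the sketch below shows the general shape of the default-value check described above: a JavaParams wrapper is walked Param by Param and the Python side is compared against the wrapped Java object. This is only a rough sketch, not the test added by this PR; the helper name `check_defaults_against_java` is hypothetical, and comparing the default *values* (rather than just their presence) would additionally require converting the Java values back to Python, which is omitted here.

```python
# Hypothetical sketch of a generic default-value check for a JavaParams
# wrapper; NOT the test added by this PR. Needs an active SparkSession so
# that the underlying Java wrapper object exists.
from pyspark.sql import SparkSession
from pyspark.ml.classification import MultilayerPerceptronClassifier

spark = SparkSession.builder.master("local[1]").appName("default-check").getOrCreate()


def check_defaults_against_java(py_stage):
    """Check that the Python wrapper and the wrapped Java object agree on
    which Params have default values (value comparison is omitted here)."""
    java_stage = py_stage._java_obj
    for p in py_stage.params:
        java_param = java_stage.getParam(p.name)
        py_has_default = py_stage.hasDefault(p)
        java_has_default = java_stage.hasDefault(java_param)
        # Both sides should agree on whether a default exists at all; after
        # this patch, for example, neither side defines a default for the
        # MultilayerPerceptronClassifier `layers` Param.
        assert py_has_default == java_has_default, \
            "default mismatch for Param '%s'" % p.name
        if py_has_default:
            print(p.name, "->", py_stage.getOrDefault(p))


check_defaults_against_java(MultilayerPerceptronClassifier())
spark.stop()
```

Intentionally changing one of the Python `_setDefault` calls (e.g. restoring `layers=[1, 1]`) should make a check of this shape fail, which matches how the patch was manually verified.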