diff options
author | Joseph K. Bradley <joseph@databricks.com> | 2016-04-18 17:15:12 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2016-04-18 17:15:12 -0700 |
commit | d29e429eeb7bea3b49cfb9227d64a609f3c11531 (patch) | |
tree | b1a42a7cb23a8abc8234a2111cd4967e2882735a | |
parent | 9bfb35da1ec40af005cd9e4dac61a5c70f3e3d17 (diff) | |
download | spark-d29e429eeb7bea3b49cfb9227d64a609f3c11531.tar.gz spark-d29e429eeb7bea3b49cfb9227d64a609f3c11531.tar.bz2 spark-d29e429eeb7bea3b49cfb9227d64a609f3c11531.zip |
[SPARK-14714][ML][PYTHON] Fixed issues with non-kwarg typeConverter arg for Param constructor
## What changes were proposed in this pull request?
PySpark Param constructors need to pass the typeConverter argument by name, partly to make sure it is not mistaken for the expectedType arg and partly because we will remove the expectedType arg in 2.1. In several places, this is not being done correctly.
This PR changes all usages in pyspark/ml/ to keyword args.
## How was this patch tested?
Existing unit tests. I will not test type conversion for every Param unless we really think it necessary.
Also, if you start the PySpark shell and import classes (e.g., pyspark.ml.feature.StandardScaler), then you no longer get this warning:
```
/Users/josephkb/spark/python/pyspark/ml/param/__init__.py:58: UserWarning: expectedType is deprecated and will be removed in 2.1. Use typeConverter instead, as a keyword argument.
"Use typeConverter instead, as a keyword argument.")
```
That warning came from the typeConverter argument being passed as the expectedType arg by mistake.
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #12480 from jkbradley/typeconverter-fix.
-rw-r--r-- | python/pyspark/ml/clustering.py | 3 | ||||
-rw-r--r-- | python/pyspark/ml/feature.py | 17 | ||||
-rw-r--r-- | python/pyspark/ml/recommendation.py | 12 |
3 files changed, 19 insertions, 13 deletions
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 64c4bf1b92..05aa2dfe74 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -92,7 +92,8 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol initMode = Param(Params._dummy(), "initMode", "the initialization algorithm. This can be either \"random\" to " + "choose random points as initial cluster centers, or \"k-means||\" " + - "to use a parallel variant of k-means++", TypeConverters.toString) + "to use a parallel variant of k-means++", + typeConverter=TypeConverters.toString) initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode", typeConverter=TypeConverters.toInt) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 49a78ede37..4310f154b5 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1317,9 +1317,9 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, typeConverter=TypeConverters.toInt) gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing", - TypeConverters.toString) + typeConverter=TypeConverters.toString) toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " + - "lowercase before tokenizing", TypeConverters.toBoolean) + "lowercase before tokenizing", typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, @@ -1430,7 +1430,8 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): .. 
versionadded:: 1.6.0 """ - statement = Param(Params._dummy(), "statement", "SQL statement", TypeConverters.toString) + statement = Param(Params._dummy(), "statement", "SQL statement", + typeConverter=TypeConverters.toString) @keyword_only def __init__(self, statement=None): @@ -1504,9 +1505,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J .. versionadded:: 1.4.0 """ - withMean = Param(Params._dummy(), "withMean", "Center data with mean", TypeConverters.toBoolean) + withMean = Param(Params._dummy(), "withMean", "Center data with mean", + typeConverter=TypeConverters.toBoolean) withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", - TypeConverters.toBoolean) + typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): @@ -1754,7 +1756,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out", typeConverter=TypeConverters.toListString) caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + - "comparison over the stop words", TypeConverters.toBoolean) + "comparison over the stop words", typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, inputCol=None, outputCol=None, stopWords=None, @@ -2510,7 +2512,8 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM .. 
versionadded:: 1.5.0 """ - formula = Param(Params._dummy(), "formula", "R model formula", TypeConverters.toString) + formula = Param(Params._dummy(), "formula", "R model formula", + typeConverter=TypeConverters.toString) @keyword_only def __init__(self, formula=None, featuresCol="features", labelCol="label"): diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 9c38f2431b..9d7f22a66f 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -107,16 +107,18 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks", typeConverter=TypeConverters.toInt) implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use implicit preference", - TypeConverters.toBoolean) + typeConverter=TypeConverters.toBoolean) alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference", typeConverter=TypeConverters.toFloat) - userCol = Param(Params._dummy(), "userCol", "column name for user ids", TypeConverters.toString) - itemCol = Param(Params._dummy(), "itemCol", "column name for item ids", TypeConverters.toString) + userCol = Param(Params._dummy(), "userCol", "column name for user ids", + typeConverter=TypeConverters.toString) + itemCol = Param(Params._dummy(), "itemCol", "column name for item ids", + typeConverter=TypeConverters.toString) ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings", - TypeConverters.toString) + typeConverter=TypeConverters.toString) nonnegative = Param(Params._dummy(), "nonnegative", "whether to use nonnegative constraint for least squares", - TypeConverters.toBoolean) + typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, |