aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/ml/classification.py
diff options
context:
space:
mode:
authorBryan Cutler <cutlerb@gmail.com>2017-03-03 16:43:45 -0800
committerJoseph K. Bradley <joseph@databricks.com>2017-03-03 16:43:45 -0800
commit44281ca81d4eda02b627ba21841108438b7d1c27 (patch)
tree4125cfa2e8dd98e247ae7240d88f3845ce871734 /python/pyspark/ml/classification.py
parent2a7921a813ecd847fd933ffef10edc64684e9df7 (diff)
downloadspark-44281ca81d4eda02b627ba21841108438b7d1c27.tar.gz
spark-44281ca81d4eda02b627ba21841108438b7d1c27.tar.bz2
spark-44281ca81d4eda02b627ba21841108438b7d1c27.zip
[SPARK-19348][PYTHON] PySpark keyword_only decorator is not thread-safe
## What changes were proposed in this pull request? The `keyword_only` decorator in PySpark is not thread-safe. It writes kwargs to a static class variable in the decorator, which is then retrieved later in the class method as `_input_kwargs`. If multiple threads are constructing the same class with different kwargs, it becomes a race condition to read from the static class variable before it's overwritten. See [SPARK-19348](https://issues.apache.org/jira/browse/SPARK-19348) for reproduction code. This change will write the kwargs to a member variable so that multiple threads can operate on separate instances without the race condition. It does not protect against multiple threads operating on a single instance, but that is better left to the user to synchronize. ## How was this patch tested? Added new unit tests for using the keyword_only decorator and a regression test that verifies `_input_kwargs` can be overwritten from different class instances. Author: Bryan Cutler <cutlerb@gmail.com> Closes #16782 from BryanCutler/pyspark-keyword_only-threadsafe-SPARK-19348.
Diffstat (limited to 'python/pyspark/ml/classification.py')
-rw-r--r--python/pyspark/ml/classification.py32
1 file changed, 16 insertions, 16 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index ac40fceaf8..b4fc357e42 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -124,7 +124,7 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha
"org.apache.spark.ml.classification.LinearSVC", self.uid)
self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True,
standardization=True, threshold=0.0, aggregationDepth=2)
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@@ -140,7 +140,7 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha
aggregationDepth=2):
Sets params for Linear SVM Classifier.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _create_model(self, java_model):
@@ -266,7 +266,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LogisticRegression", self.uid)
self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto")
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
self._checkThresholdConsistency()
@@ -286,7 +286,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
self._set(**kwargs)
self._checkThresholdConsistency()
return self
@@ -760,7 +760,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
impurity="gini")
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@@ -778,7 +778,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
seed=None)
Sets params for the DecisionTreeClassifier.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _create_model(self, java_model):
@@ -890,7 +890,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
impurity="gini", numTrees=20, featureSubsetStrategy="auto",
subsamplingRate=1.0)
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@@ -908,7 +908,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0)
Sets params for linear classification.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _create_model(self, java_model):
@@ -1031,7 +1031,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0)
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@@ -1047,7 +1047,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)
Sets params for Gradient Boosted Tree Classification.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _create_model(self, java_model):
@@ -1174,7 +1174,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.NaiveBayes", self.uid)
self._setDefault(smoothing=1.0, modelType="multinomial")
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@@ -1188,7 +1188,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
modelType="multinomial", thresholds=None, weightCol=None)
Sets params for Naive Bayes.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _create_model(self, java_model):
@@ -1329,7 +1329,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs")
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@@ -1343,7 +1343,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
solver="l-bfgs", initialWeights=None)
Sets params for MultilayerPerceptronClassifier.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _create_model(self, java_model):
@@ -1519,7 +1519,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
classifier=None)
"""
super(OneVsRest, self).__init__()
- kwargs = self.__init__._input_kwargs
+ kwargs = self._input_kwargs
self._set(**kwargs)
@keyword_only
@@ -1529,7 +1529,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
setParams(self, featuresCol=None, labelCol=None, predictionCol=None, classifier=None):
Sets params for OneVsRest.
"""
- kwargs = self.setParams._input_kwargs
+ kwargs = self._input_kwargs
return self._set(**kwargs)
def _fit(self, dataset):