aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author: lewuathe <lewuathe@me.com> 2015-03-20 17:18:18 -0400
committer: Xiangrui Meng <meng@databricks.com> 2015-03-20 17:18:18 -0400
commit: 257cde7c363efb3317bfb5c13975cca9154894e2 (patch)
tree: 420d26c15072875caef30987209539debaf8888f /python
parent: 11e025956be3818c00effef0d650734f8feeb436 (diff)
download: spark-257cde7c363efb3317bfb5c13975cca9154894e2.tar.gz
spark-257cde7c363efb3317bfb5c13975cca9154894e2.tar.bz2
spark-257cde7c363efb3317bfb5c13975cca9154894e2.zip
[SPARK-6421][MLLIB] _regression_train_wrapper does not test initialWeights correctly
Weight parameters must be initialized correctly even when a numpy array is passed as the initial weights.

Author: lewuathe <lewuathe@me.com>

Closes #5101 from Lewuathe/SPARK-6421 and squashes the following commits:

7795201 [lewuathe] Fix lint-python errors
21d4fe3 [lewuathe] Fix init logic of weights
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/regression.py 3
-rw-r--r-- python/pyspark/mllib/tests.py 7
2 files changed, 9 insertions, 1 deletion
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 015a786011..414a0ada80 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -163,7 +163,8 @@ def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
first = data.first()
if not isinstance(first, LabeledPoint):
raise ValueError("data should be an RDD of LabeledPoint, but got %s" % first)
- initial_weights = initial_weights or [0.0] * len(data.first().features)
+ if initial_weights is None:
+ initial_weights = [0.0] * len(data.first().features)
weights, intercept = train_func(data, _convert_to_vector(initial_weights))
return modelClass(weights, intercept)
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 5328d99b69..155019638f 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -323,6 +323,13 @@ class ListTests(PySparkTestCase):
self.assertTrue(gbt_model.predict(features[2]) <= 0)
self.assertTrue(gbt_model.predict(features[3]) > 0)
+ try:
+ LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
+ LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
+ RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
+ except ValueError:
+ self.fail()
+
class StatTests(PySparkTestCase):
# SPARK-4023