about summary refs log tree commit diff
path: root/python/pyspark/ml/tests.py
diff options
context:
space:
mode:
author: Bryan Cutler <cutlerb@gmail.com> 2016-04-06 12:07:47 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2016-04-06 12:07:47 -0700
commit: 9c6556c5f8ab013b36312db4bf02c4c6d965a535 (patch)
tree: e4200c088c376f26f27de4f3a96c99006dd99b20 /python/pyspark/ml/tests.py
parent: bb1fa5b2182f384cb711fc2be45b0f1a8c466ed6 (diff)
downloadspark-9c6556c5f8ab013b36312db4bf02c4c6d965a535.tar.gz
spark-9c6556c5f8ab013b36312db4bf02c4c6d965a535.tar.bz2
spark-9c6556c5f8ab013b36312db4bf02c4c6d965a535.zip
[SPARK-13430][PYSPARK][ML] Python API for training summaries of linear and logistic regression
## What changes were proposed in this pull request?

Adding a Python API for the training summaries of LogisticRegression and LinearRegression in PySpark ML.

## How was this patch tested?

Added unit tests to exercise the API calls for the summary classes. Also, manually verified that values are as expected and match those from Scala directly.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #11621 from BryanCutler/pyspark-ml-summary-SPARK-13430.
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rw-r--r-- python/pyspark/ml/tests.py | 87
1 file changed, 79 insertions(+), 8 deletions(-)
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index e3f873e3a7..2dcd5eeb52 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -239,6 +239,17 @@ class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
return self._set(**kwargs)
+class HasThrowableProperty(Params):
+
+ def __init__(self):
+ super(HasThrowableProperty, self).__init__()
+ self.p = Param(self, "none", "empty param")
+
+ @property
+ def test_property(self):
+ raise RuntimeError("Test property to raise error when invoked")
+
+
class ParamTests(PySparkTestCase):
def test_copy_new_parent(self):
@@ -749,15 +760,75 @@ class PersistenceTest(PySparkTestCase):
pass
-class HasThrowableProperty(Params):
-
- def __init__(self):
- super(HasThrowableProperty, self).__init__()
- self.p = Param(self, "none", "empty param")
+class TrainingSummaryTest(PySparkTestCase):
- @property
- def test_property(self):
- raise RuntimeError("Test property to raise error when invoked")
+ def test_linear_regression_summary(self):
+ from pyspark.mllib.linalg import Vectors
+ sqlContext = SQLContext(self.sc)
+ df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
+ (0.0, 2.0, Vectors.sparse(1, [], []))],
+ ["label", "weight", "features"])
+ lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
+ fitIntercept=False)
+ model = lr.fit(df)
+ self.assertTrue(model.hasSummary)
+ s = model.summary
+ # test that api is callable and returns expected types
+ self.assertGreater(s.totalIterations, 0)
+ self.assertTrue(isinstance(s.predictions, DataFrame))
+ self.assertEqual(s.predictionCol, "prediction")
+ self.assertEqual(s.labelCol, "label")
+ self.assertEqual(s.featuresCol, "features")
+ objHist = s.objectiveHistory
+ self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
+ self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
+ self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
+ self.assertAlmostEqual(s.meanSquaredError, 0.0)
+ self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
+ self.assertAlmostEqual(s.r2, 1.0, 2)
+ self.assertTrue(isinstance(s.residuals, DataFrame))
+ self.assertEqual(s.numInstances, 2)
+ devResiduals = s.devianceResiduals
+ self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
+ coefStdErr = s.coefficientStandardErrors
+ self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
+ tValues = s.tValues
+ self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
+ pValues = s.pValues
+ self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
+ # test evaluation (with training dataset) produces a summary with same values
+ # one check is enough to verify a summary is returned, Scala version runs full test
+ sameSummary = model.evaluate(df)
+ self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
+
+ def test_logistic_regression_summary(self):
+ from pyspark.mllib.linalg import Vectors
+ sqlContext = SQLContext(self.sc)
+ df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
+ (0.0, 2.0, Vectors.sparse(1, [], []))],
+ ["label", "weight", "features"])
+ lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
+ model = lr.fit(df)
+ self.assertTrue(model.hasSummary)
+ s = model.summary
+ # test that api is callable and returns expected types
+ self.assertTrue(isinstance(s.predictions, DataFrame))
+ self.assertEqual(s.probabilityCol, "probability")
+ self.assertEqual(s.labelCol, "label")
+ self.assertEqual(s.featuresCol, "features")
+ objHist = s.objectiveHistory
+ self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
+ self.assertGreater(s.totalIterations, 0)
+ self.assertTrue(isinstance(s.roc, DataFrame))
+ self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
+ self.assertTrue(isinstance(s.pr, DataFrame))
+ self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
+ self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
+ self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
+ # test evaluation (with training dataset) produces a summary with same values
+ # one check is enough to verify a summary is returned, Scala version runs full test
+ sameSummary = model.evaluate(df)
+ self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
if __name__ == "__main__":