From 71ad945bbbdd154eae852cd7f841e98f7a83e8d4 Mon Sep 17 00:00:00 2001 From: z001qdp Date: Fri, 15 Jul 2016 12:30:22 +0100 Subject: [SPARK-16426][MLLIB] Fix bug that caused NaNs in IsotonicRegression ## What changes were proposed in this pull request? Fixed a bug that caused `NaN`s in `IsotonicRegression`. The problem occurs when training rows with the same feature value but different labels end up on different partitions. This patch changes a `sortBy` call to a `partitionBy(RangePartitioner)` followed by a `mapPartitions(sortBy)` in order to ensure that all rows with the same feature value end up on the same partition. ## How was this patch tested? Added a unit test. Author: z001qdp Closes #14140 from neggert/SPARK-16426-isotonic-nan. --- .../spark/mllib/regression/IsotonicRegressionSuite.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mllib/src/test') diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala index ea4f286575..94da626d92 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala @@ -176,6 +176,17 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w assert(model.predictions === Array(1, 2, 2)) } + test("SPARK-16426 isotonic regression with duplicate features that produce NaNs") { + val trainRDD = sc.parallelize(Seq[(Double, Double, Double)]((2, 1, 1), (1, 1, 1), (0, 2, 1), + (1, 2, 1), (0.5, 3, 1), (0, 3, 1)), + 2) + + val model = new IsotonicRegression().run(trainRDD) + + assert(model.boundaries === Array(1.0, 3.0)) + assert(model.predictions === Array(0.75, 0.75)) + } + test("isotonic regression prediction") { val model = runIsotonicRegression(Seq(1, 2, 7, 1, 2), true) -- cgit v1.2.3