aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--data/mllib/sample_isotonic_regression_data.txt100
-rw-r--r--data/mllib/sample_isotonic_regression_libsvm_data.txt100
-rw-r--r--docs/ml-classification-regression.md70
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java62
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java19
-rw-r--r--examples/src/main/python/ml/isotonic_regression_example.py54
-rw-r--r--examples/src/main/python/mllib/isotonic_regression_example.py11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala62
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala9
9 files changed, 373 insertions, 114 deletions
diff --git a/data/mllib/sample_isotonic_regression_data.txt b/data/mllib/sample_isotonic_regression_data.txt
deleted file mode 100644
index d257b509d4..0000000000
--- a/data/mllib/sample_isotonic_regression_data.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-0.24579296,0.01
-0.28505864,0.02
-0.31208567,0.03
-0.35900051,0.04
-0.35747068,0.05
-0.16675166,0.06
-0.17491076,0.07
-0.04181540,0.08
-0.04793473,0.09
-0.03926568,0.10
-0.12952575,0.11
-0.00000000,0.12
-0.01376849,0.13
-0.13105558,0.14
-0.08873024,0.15
-0.12595614,0.16
-0.15247323,0.17
-0.25956145,0.18
-0.20040796,0.19
-0.19581846,0.20
-0.15757267,0.21
-0.13717491,0.22
-0.19020908,0.23
-0.19581846,0.24
-0.20091790,0.25
-0.16879143,0.26
-0.18510964,0.27
-0.20040796,0.28
-0.29576747,0.29
-0.43396226,0.30
-0.53391127,0.31
-0.52116267,0.32
-0.48546660,0.33
-0.49209587,0.34
-0.54156043,0.35
-0.59765426,0.36
-0.56144824,0.37
-0.58592555,0.38
-0.52983172,0.39
-0.50178480,0.40
-0.52626211,0.41
-0.58286588,0.42
-0.64660887,0.43
-0.68077511,0.44
-0.74298827,0.45
-0.64864865,0.46
-0.67261601,0.47
-0.65782764,0.48
-0.69811321,0.49
-0.63029067,0.50
-0.61601224,0.51
-0.63233044,0.52
-0.65323814,0.53
-0.65323814,0.54
-0.67363590,0.55
-0.67006629,0.56
-0.51555329,0.57
-0.50892402,0.58
-0.33299337,0.59
-0.36206017,0.60
-0.43090260,0.61
-0.45996940,0.62
-0.56348802,0.63
-0.54920959,0.64
-0.48393677,0.65
-0.48495665,0.66
-0.46965834,0.67
-0.45181030,0.68
-0.45843957,0.69
-0.47118817,0.70
-0.51555329,0.71
-0.58031617,0.72
-0.55481897,0.73
-0.56297807,0.74
-0.56603774,0.75
-0.57929628,0.76
-0.64762876,0.77
-0.66241713,0.78
-0.69301377,0.79
-0.65119837,0.80
-0.68332483,0.81
-0.66598674,0.82
-0.73890872,0.83
-0.73992861,0.84
-0.84242733,0.85
-0.91330954,0.86
-0.88016318,0.87
-0.90719021,0.88
-0.93115757,0.89
-0.93115757,0.90
-0.91942886,0.91
-0.92911780,0.92
-0.95665477,0.93
-0.95002550,0.94
-0.96940337,0.95
-1.00000000,0.96
-0.89801122,0.97
-0.90311066,0.98
-0.90362060,0.99
-0.83477817,1.0 \ No newline at end of file
diff --git a/data/mllib/sample_isotonic_regression_libsvm_data.txt b/data/mllib/sample_isotonic_regression_libsvm_data.txt
new file mode 100644
index 0000000000..f39fe0269c
--- /dev/null
+++ b/data/mllib/sample_isotonic_regression_libsvm_data.txt
@@ -0,0 +1,100 @@
+0.24579296 1:0.01
+0.28505864 1:0.02
+0.31208567 1:0.03
+0.35900051 1:0.04
+0.35747068 1:0.05
+0.16675166 1:0.06
+0.17491076 1:0.07
+0.04181540 1:0.08
+0.04793473 1:0.09
+0.03926568 1:0.10
+0.12952575 1:0.11
+0.00000000 1:0.12
+0.01376849 1:0.13
+0.13105558 1:0.14
+0.08873024 1:0.15
+0.12595614 1:0.16
+0.15247323 1:0.17
+0.25956145 1:0.18
+0.20040796 1:0.19
+0.19581846 1:0.20
+0.15757267 1:0.21
+0.13717491 1:0.22
+0.19020908 1:0.23
+0.19581846 1:0.24
+0.20091790 1:0.25
+0.16879143 1:0.26
+0.18510964 1:0.27
+0.20040796 1:0.28
+0.29576747 1:0.29
+0.43396226 1:0.30
+0.53391127 1:0.31
+0.52116267 1:0.32
+0.48546660 1:0.33
+0.49209587 1:0.34
+0.54156043 1:0.35
+0.59765426 1:0.36
+0.56144824 1:0.37
+0.58592555 1:0.38
+0.52983172 1:0.39
+0.50178480 1:0.40
+0.52626211 1:0.41
+0.58286588 1:0.42
+0.64660887 1:0.43
+0.68077511 1:0.44
+0.74298827 1:0.45
+0.64864865 1:0.46
+0.67261601 1:0.47
+0.65782764 1:0.48
+0.69811321 1:0.49
+0.63029067 1:0.50
+0.61601224 1:0.51
+0.63233044 1:0.52
+0.65323814 1:0.53
+0.65323814 1:0.54
+0.67363590 1:0.55
+0.67006629 1:0.56
+0.51555329 1:0.57
+0.50892402 1:0.58
+0.33299337 1:0.59
+0.36206017 1:0.60
+0.43090260 1:0.61
+0.45996940 1:0.62
+0.56348802 1:0.63
+0.54920959 1:0.64
+0.48393677 1:0.65
+0.48495665 1:0.66
+0.46965834 1:0.67
+0.45181030 1:0.68
+0.45843957 1:0.69
+0.47118817 1:0.70
+0.51555329 1:0.71
+0.58031617 1:0.72
+0.55481897 1:0.73
+0.56297807 1:0.74
+0.56603774 1:0.75
+0.57929628 1:0.76
+0.64762876 1:0.77
+0.66241713 1:0.78
+0.69301377 1:0.79
+0.65119837 1:0.80
+0.68332483 1:0.81
+0.66598674 1:0.82
+0.73890872 1:0.83
+0.73992861 1:0.84
+0.84242733 1:0.85
+0.91330954 1:0.86
+0.88016318 1:0.87
+0.90719021 1:0.88
+0.93115757 1:0.89
+0.93115757 1:0.90
+0.91942886 1:0.91
+0.92911780 1:0.92
+0.95665477 1:0.93
+0.95002550 1:0.94
+0.96940337 1:0.95
+1.00000000 1:0.96
+0.89801122 1:0.97
+0.90311066 1:0.98
+0.90362060 1:0.99
+0.83477817 1:1.0 \ No newline at end of file
diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md
index d7e5521cbc..3d6106b532 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -691,6 +691,76 @@ The implementation matches the result from R's survival function
</div>
+## Isotonic regression
+[Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression)
+belongs to the family of regression algorithms. Formally, isotonic regression is a problem where,
+given a finite set of real numbers `$Y = \{y_1, y_2, ..., y_n\}$` representing observed responses
+and `$X = \{x_1, x_2, ..., x_n\}$` the unknown response values to be fitted,
+we seek a function that minimises
+
+`\begin{equation}
+ f(x) = \sum_{i=1}^n w_i (y_i - x_i)^2
+\end{equation}`
+
+with respect to the complete order, subject to
+`$x_1\le x_2\le ...\le x_n$`, where `$w_i$` are positive weights.
+The resulting function is called isotonic regression and it is unique.
+It can be viewed as a least squares problem under an order restriction.
+Essentially isotonic regression is a
+[monotonic function](http://en.wikipedia.org/wiki/Monotonic_function)
+best fitting the original data points.
+
+We implement a
+[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111)
+which uses an approach to
+[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10).
+The training input is a DataFrame which contains three columns:
+label, features and weight. Additionally, the IsotonicRegression algorithm has one
+optional parameter called $isotonic$, which defaults to true.
+This argument specifies if the isotonic regression is
+isotonic (monotonically increasing) or antitonic (monotonically decreasing).
+
+Training returns an IsotonicRegressionModel that can be used to predict
+labels for both known and unknown features. The result of isotonic regression
+is treated as a piecewise linear function. The rules for prediction therefore are:
+
+* If the prediction input exactly matches a training feature
+ then the associated prediction is returned. In case there are multiple predictions with the same
+ feature then one of them is returned. Which one is undefined
+ (same as java.util.Arrays.binarySearch).
+* If the prediction input is lower or higher than all training features
+ then the prediction with the lowest or highest feature is returned respectively.
+ In case there are multiple predictions with the same feature
+ then the lowest or highest is returned respectively.
+* If the prediction input falls between two training features then the prediction is treated
+ as a piecewise linear function and the interpolated value is calculated from the
+ predictions of the two closest features. In case there are multiple values
+ with the same feature then the same rules as in the previous point are used.
+
+### Examples
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [`IsotonicRegression` Scala docs](api/scala/index.html#org.apache.spark.ml.regression.IsotonicRegression) for details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala %}
+</div>
+<div data-lang="java" markdown="1">
+
+Refer to the [`IsotonicRegression` Java docs](api/java/org/apache/spark/ml/regression/IsotonicRegression.html) for details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java %}
+</div>
+<div data-lang="python" markdown="1">
+
+Refer to the [`IsotonicRegression` Python docs](api/python/pyspark.ml.html#pyspark.ml.regression.IsotonicRegression) for details on the API.
+
+{% include_example python/ml/isotonic_regression_example.py %}
+</div>
+</div>
+
+
# Decision trees
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
new file mode 100644
index 0000000000..0ec17b0471
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.examples.ml;
+
+// $example on$
+
+import org.apache.spark.ml.regression.IsotonicRegression;
+import org.apache.spark.ml.regression.IsotonicRegressionModel;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+// $example off$
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * An example demonstrating IsotonicRegression.
+ * Run with
+ * <pre>
+ * bin/run-example ml.JavaIsotonicRegressionExample
+ * </pre>
+ */
+public class JavaIsotonicRegressionExample {
+
+  /**
+   * Entry point: fits an isotonic regression model on the bundled LIBSVM sample data,
+   * prints the fitted boundaries and predictions, and shows the transformed dataset.
+   *
+   * @param args command-line arguments (not read by this example)
+   */
+  public static void main(String[] args) {
+    // Obtain the SparkSession used by this example.
+    SparkSession spark = SparkSession.builder()
+      .appName("JavaIsotonicRegressionExample")
+      .getOrCreate();
+
+    // $example on$
+    // Read the sample data in LIBSVM format.
+    String dataPath = "data/mllib/sample_isotonic_regression_libsvm_data.txt";
+    Dataset<Row> dataset = spark.read().format("libsvm").load(dataPath);
+
+    // Fit an isotonic regression model without overriding any parameter.
+    IsotonicRegressionModel model = new IsotonicRegression().fit(dataset);
+
+    System.out.println("Boundaries in increasing order: " + model.boundaries());
+    System.out.println("Predictions associated with the boundaries: " + model.predictions());
+
+    // Apply the fitted model to the same dataset and display the result.
+    model.transform(dataset).show();
+    // $example off$
+
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java
index c6361a3729..a30b5f1f73 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java
@@ -17,6 +17,7 @@
package org.apache.spark.examples.mllib;
// $example on$
+
import scala.Tuple2;
import scala.Tuple3;
import org.apache.spark.api.java.function.Function;
@@ -27,6 +28,8 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.regression.IsotonicRegression;
import org.apache.spark.mllib.regression.IsotonicRegressionModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
// $example off$
import org.apache.spark.SparkConf;
@@ -35,27 +38,29 @@ public class JavaIsotonicRegressionExample {
SparkConf sparkConf = new SparkConf().setAppName("JavaIsotonicRegressionExample");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
// $example on$
- JavaRDD<String> data = jsc.textFile("data/mllib/sample_isotonic_regression_data.txt");
+ JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(
+ jsc.sc(), "data/mllib/sample_isotonic_regression_libsvm_data.txt").toJavaRDD();
// Create label, feature, weight tuples from input data with weight set to default value 1.0.
JavaRDD<Tuple3<Double, Double, Double>> parsedData = data.map(
- new Function<String, Tuple3<Double, Double, Double>>() {
- public Tuple3<Double, Double, Double> call(String line) {
- String[] parts = line.split(",");
- return new Tuple3<>(new Double(parts[0]), new Double(parts[1]), 1.0);
+ new Function<LabeledPoint, Tuple3<Double, Double, Double>>() {
+ public Tuple3<Double, Double, Double> call(LabeledPoint point) {
+ return new Tuple3<>(new Double(point.label()),
+ new Double(point.features().apply(0)), 1.0);
}
}
);
// Split data into training (60%) and test (40%) sets.
JavaRDD<Tuple3<Double, Double, Double>>[] splits =
- parsedData.randomSplit(new double[]{0.6, 0.4}, 11L);
+ parsedData.randomSplit(new double[]{0.6, 0.4}, 11L);
JavaRDD<Tuple3<Double, Double, Double>> training = splits[0];
JavaRDD<Tuple3<Double, Double, Double>> test = splits[1];
// Create isotonic regression model from training data.
// Isotonic parameter defaults to true so it is only shown for demonstration
- final IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(training);
+ final IsotonicRegressionModel model =
+ new IsotonicRegression().setIsotonic(true).run(training);
// Create tuples of predicted and real labels.
JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair(
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
new file mode 100644
index 0000000000..1e61bd8eff
--- /dev/null
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -0,0 +1,54 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Isotonic Regression Example.
+"""
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel
+# $example off$
+from pyspark.sql import SparkSession
+
+"""
+An example demonstrating isotonic regression.
+Run with:
+ bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
+"""
+if __name__ == "__main__":
+
+ spark = SparkSession\
+ .builder\
+ .appName("PythonIsotonicRegressionExample")\
+ .getOrCreate()
+
+ # $example on$
+ # Loads data.
+ dataset = spark.read.format("libsvm")\
+ .load("data/mllib/sample_isotonic_regression_libsvm_data.txt")
+
+ # Trains an isotonic regression model.
+ model = IsotonicRegression().fit(dataset)
+ print("Boundaries in increasing order: " + str(model.boundaries))
+ print("Predictions associated with the boundaries: " + str(model.predictions))
+
+ # Makes predictions.
+ model.transform(dataset).show()
+ # $example off$
+
+ spark.stop()
diff --git a/examples/src/main/python/mllib/isotonic_regression_example.py b/examples/src/main/python/mllib/isotonic_regression_example.py
index 89dc9f4b66..33d618ab48 100644
--- a/examples/src/main/python/mllib/isotonic_regression_example.py
+++ b/examples/src/main/python/mllib/isotonic_regression_example.py
@@ -23,7 +23,8 @@ from __future__ import print_function
from pyspark import SparkContext
# $example on$
import math
-from pyspark.mllib.regression import IsotonicRegression, IsotonicRegressionModel
+from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel
+from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -31,10 +32,14 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonIsotonicRegressionExample")
# $example on$
- data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt")
+ # Load and parse the data
+ def parsePoint(labeledData):
+ return (labeledData.label, labeledData.features[0], 1.0)
+
+ data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt")
# Create label, feature, weight tuples from input data with weight set to default value 1.0.
- parsedData = data.map(lambda line: tuple([float(x) for x in line.split(',')]) + (1.0,))
+ parsedData = data.map(parsePoint)
# Split data into training (60%) and test (40%) sets.
training, test = parsedData.randomSplit([0.6, 0.4], 11)
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
new file mode 100644
index 0000000000..7c5d3f2341
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.regression.IsotonicRegression
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An example demonstrating Isotonic Regression.
+ * Run with
+ * {{{
+ * bin/run-example ml.IsotonicRegressionExample
+ * }}}
+ */
+object IsotonicRegressionExample {
+
+  /**
+   * Fits an isotonic regression model on the bundled LIBSVM sample data, prints the
+   * fitted boundaries and their predictions, and shows the transformed dataset.
+   *
+   * @param args command-line arguments (not read by this example)
+   */
+  def main(args: Array[String]): Unit = {
+
+    // Creates a SparkSession.
+    // The application name is a literal: `this.getClass.getSimpleName` on a Scala
+    // `object` returns the module class name "IsotonicRegressionExample$", which
+    // would register the app with a stray trailing '$'.
+    val spark = SparkSession
+      .builder
+      .appName("IsotonicRegressionExample")
+      .getOrCreate()
+
+    // $example on$
+    // Loads data.
+    val dataset = spark.read.format("libsvm")
+      .load("data/mllib/sample_isotonic_regression_libsvm_data.txt")
+
+    // Trains an isotonic regression model.
+    val ir = new IsotonicRegression()
+    val model = ir.fit(dataset)
+
+    println(s"Boundaries in increasing order: ${model.boundaries}")
+    println(s"Predictions associated with the boundaries: ${model.predictions}")
+
+    // Makes predictions.
+    model.transform(dataset).show()
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala
index c4336639d7..e5dea129c1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala
@@ -21,6 +21,7 @@ package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}
+import org.apache.spark.mllib.util.MLUtils
// $example off$
object IsotonicRegressionExample {
@@ -30,12 +31,12 @@ object IsotonicRegressionExample {
val conf = new SparkConf().setAppName("IsotonicRegressionExample")
val sc = new SparkContext(conf)
// $example on$
- val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt")
+ val data = MLUtils.loadLibSVMFile(sc,
+ "data/mllib/sample_isotonic_regression_libsvm_data.txt").cache()
// Create label, feature, weight tuples from input data with weight set to default value 1.0.
- val parsedData = data.map { line =>
- val parts = line.split(',').map(_.toDouble)
- (parts(0), parts(1), 1.0)
+ val parsedData = data.map { labeledPoint =>
+ (labeledPoint.label, labeledPoint.features(0), 1.0)
}
// Split data into training (60%) and test (40%) sets.