From 654087194d232221dfb64ba646c8a8e12649f961 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Tue, 13 Aug 2013 11:43:49 -0700
Subject: Change SVM to use {0,1} labels.

Also add a data validation check to make sure classification labels are
always 0 or 1 and add an appropriate test case.
---
 .../mllib/classification/LogisticRegression.scala | 16 +++++----
 .../scala/spark/mllib/classification/SVM.scala    | 21 ++++++++---
 .../scala/spark/mllib/optimization/Gradient.scala | 12 ++++---
 .../regression/GeneralizedLinearAlgorithm.scala   |  9 ++++-
 .../scala/spark/mllib/util/DataValidators.scala   | 42 ++++++++++++++++++++++
 .../scala/spark/mllib/util/SVMDataGenerator.scala |  9 ++---
 .../spark/mllib/classification/SVMSuite.scala     | 33 +++++++++++++----
 7 files changed, 116 insertions(+), 26 deletions(-)
 create mode 100644 mllib/src/main/scala/spark/mllib/util/DataValidators.scala

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 30ee0ab0ff..24f9f4e76b 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -17,12 +17,13 @@
 package spark.mllib.classification
 
+import scala.math.round
+
 import spark.{Logging, RDD, SparkContext}
 import spark.mllib.optimization._
 import spark.mllib.regression._
 import spark.mllib.util.MLUtils
-
-import scala.math.round
+import spark.mllib.util.DataValidators
 
 import org.jblas.DoubleMatrix
 
@@ -59,10 +60,13 @@ class LogisticRegressionWithSGD private (
 
   val gradient = new LogisticGradient()
   val updater = new SimpleUpdater()
-  val optimizer = new GradientDescent(gradient, updater).setStepSize(stepSize)
-    .setNumIterations(numIterations)
-    .setRegParam(regParam)
-    .setMiniBatchFraction(miniBatchFraction)
+  override val optimizer = new GradientDescent(gradient, updater)
+    .setStepSize(stepSize)
+    .setNumIterations(numIterations)
+    .setRegParam(regParam)
+    .setMiniBatchFraction(miniBatchFraction)
+  override val validateFuncs = List(DataValidators.classificationLabels)
+
   /**
    * Construct a LogisticRegression object with default parameters
    */
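For readers following the optimizer change above: the fluent setters come straight from the diff, but the concrete values below are hypothetical, not MLlib's defaults. A minimal configuration sketch:

    import spark.mllib.optimization.{GradientDescent, LogisticGradient, SimpleUpdater}

    // Sketch of the fluent GradientDescent configuration used above.
    // The numbers are made up for illustration.
    val optimizer = new GradientDescent(new LogisticGradient(), new SimpleUpdater())
      .setStepSize(1.0)            // SGD step size
      .setNumIterations(100)       // number of SGD iterations
      .setRegParam(0.0)            // regularization parameter (applied by the updater)
      .setMiniBatchFraction(1.0)   // fraction of the data sampled per iteration
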
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index f799cb2829..d2b50f4987 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -18,10 +18,12 @@ package spark.mllib.classification
 
 import scala.math.signum
+
 import spark.{Logging, RDD, SparkContext}
 import spark.mllib.optimization._
 import spark.mllib.regression._
 import spark.mllib.util.MLUtils
+import spark.mllib.util.DataValidators
 
 import org.jblas.DoubleMatrix
 
@@ -45,6 +47,7 @@ class SVMModel(
 
 /**
  * Train an SVM using Stochastic Gradient Descent.
+ * NOTE: Labels used in SVM should be {0, 1}
  */
 class SVMWithSGD private (
     var stepSize: Double,
@@ -56,10 +59,14 @@ class SVMWithSGD private (
     var numIterations: Int,
     var regParam: Double,
     var miniBatchFraction: Double,
     var addIntercept: Boolean)
   extends GeneralizedLinearAlgorithm[SVMModel] with Serializable {
 
   val gradient = new HingeGradient()
   val updater = new SquaredL2Updater()
-  val optimizer = new GradientDescent(gradient, updater).setStepSize(stepSize)
-    .setNumIterations(numIterations)
-    .setRegParam(regParam)
-    .setMiniBatchFraction(miniBatchFraction)
+  override val optimizer = new GradientDescent(gradient, updater)
+    .setStepSize(stepSize)
+    .setNumIterations(numIterations)
+    .setRegParam(regParam)
+    .setMiniBatchFraction(miniBatchFraction)
+
+  override val validateFuncs = List(DataValidators.classificationLabels)
+
   /**
    * Construct a SVM object with default parameters
    */
@@ -71,7 +78,7 @@ class SVMWithSGD private (
 }
 
 /**
- * Top-level methods for calling SVM.
+ * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}
  */
 object SVMWithSGD {
@@ -80,6 +87,7 @@ object SVMWithSGD {
    * of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in
    * gradient descent are initialized using the initial weights provided.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -106,6 +114,7 @@ object SVMWithSGD {
    * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
    * of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -128,6 +137,7 @@ object SVMWithSGD {
    * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
    * of iterations of gradient descent using the specified step size. We use the entire data set to
    * update the gradient in each iteration.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param stepSize Step size to be used for each iteration of Gradient Descent.
@@ -149,6 +159,7 @@ object SVMWithSGD {
    * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
    * of iterations of gradient descent using a step size of 1.0. We use the entire data set to
    * update the gradient in each iteration.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
index e72b8b3a92..58bfe3f37b 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -77,16 +77,20 @@ class SquaredGradient extends Gradient {
 
 /**
  * Compute gradient and loss for a Hinge loss function.
+ * NOTE: This assumes that the labels are {0,1}
  */
 class HingeGradient extends Gradient {
-  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
+  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
 
     val dotProduct = data.dot(weights)
+    val labelScaled = 2*label - 1.0
 
-    if (1.0 > label * dotProduct)
-      (data.mul(-label), 1.0 - label * dotProduct)
+    // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
+    // Therefore the gradient is -(2y - 1)*x
+    if (1.0 > labelScaled * dotProduct)
+      (data.mul(-labelScaled), 1.0 - labelScaled * dotProduct)
     else
-      (DoubleMatrix.zeros(1,weights.length), 0.0)
+      (DoubleMatrix.zeros(1, weights.length), 0.0)
   }
 }
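To make the label-scaling trick concrete: for y in {0, 1}, the expression 2y - 1 maps the label to {-1, +1}, so the usual hinge loss applies unchanged. A small worked example (plain jblas, no Spark; the weight vector is hypothetical):

    import org.jblas.DoubleMatrix

    // Recomputes the hinge loss from the patch for a single point.
    val weights = new DoubleMatrix(1, 2, 0.5, -1.0) // hypothetical w
    val x = new DoubleMatrix(1, 2, 1.0, 2.0)        // one data point
    val label = 0.0                                 // a {0, 1} label
    val labelScaled = 2 * label - 1.0               // -1.0
    val margin = labelScaled * x.dot(weights)       // -1.0 * (0.5 - 2.0) = 1.5
    val loss = math.max(0.0, 1.0 - margin)          // 0.0: correctly classified
    println("margin = " + margin + ", loss = " + loss)
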
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 4ecafff08b..55edb3def5 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -17,7 +17,7 @@
 
 package spark.mllib.regression
 
-import spark.{Logging, RDD}
+import spark.{Logging, RDD, SparkException}
 import spark.mllib.optimization._
 
 import org.jblas.DoubleMatrix
@@ -83,6 +83,8 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   extends Logging with Serializable {
 
+  protected val validateFuncs: Seq[RDD[LabeledPoint] => Boolean] = List()
+
   val optimizer: Optimizer
 
   /**
@@ -116,6 +118,11 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
    */
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
+    // Check the data properties before running the optimizer
+    if (!validateFuncs.forall(func => func(input))) {
+      throw new SparkException("Input validation failed.")
+    }
+
     // Add an extra variable consisting of all 1.0's for the intercept.
     val data = if (addIntercept) {
       input.map(labeledPoint => (labeledPoint.label, Array(1.0, labeledPoint.features:_*)))
diff --git a/mllib/src/main/scala/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/spark/mllib/util/DataValidators.scala
new file mode 100644
index 0000000000..57553accf1
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/util/DataValidators.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.util
+
+import spark.{RDD, Logging}
+import spark.mllib.regression.LabeledPoint
+
+/**
+ * A collection of methods used to validate data before applying ML algorithms.
+ */
+object DataValidators extends Logging {
+
+  /**
+   * Function to check if labels used for classification are either zero or one.
+   *
+   * @param data - input data set that needs to be checked
+   *
+   * @return True if labels are all zero or one, false otherwise.
+   */
+  val classificationLabels: RDD[LabeledPoint] => Boolean = { data =>
+    val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
+    if (numInvalid != 0) {
+      logError("Classification labels should be 0 or 1. Found " + numInvalid + " invalid labels")
+    }
+    numInvalid == 0
+  }
+}
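A sketch of how the new validator behaves on its own, assuming a local SparkContext; the data is made up:

    import spark.SparkContext
    import spark.mllib.regression.LabeledPoint
    import spark.mllib.util.DataValidators

    val sc = new SparkContext("local", "ValidatorExample")
    val good = sc.parallelize(Seq(
      LabeledPoint(0.0, Array(1.0, 2.0)),
      LabeledPoint(1.0, Array(3.0, 4.0))))
    // Shift labels to {-1.0, 0.0} so one of them is invalid.
    val bad = good.map(lp => LabeledPoint(lp.label - 1.0, lp.features))
    println(DataValidators.classificationLabels(good)) // true
    println(DataValidators.classificationLabels(bad))  // false, after logging an error
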
diff --git a/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
index e02bd190f6..eff456cad6 100644
--- a/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
@@ -1,7 +1,6 @@
 package spark.mllib.util
 
 import scala.util.Random
-import scala.math.signum
 
 import spark.{RDD, SparkContext}
 
@@ -30,8 +29,8 @@ object SVMDataGenerator {
     val sc = new SparkContext(sparkMaster, "SVMGenerator")
 
     val globalRnd = new Random(94720)
-    val trueWeights = new DoubleMatrix(1, nfeatures+1,
-      Array.fill[Double](nfeatures + 1) { globalRnd.nextGaussian() }:_*)
+    val trueWeights = new DoubleMatrix(1, nfeatures + 1,
+      Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian()):_*)
 
     val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
       val rnd = new Random(42 + idx)
@@ -39,11 +38,13 @@ object SVMDataGenerator {
       val x = Array.fill[Double](nfeatures) {
         rnd.nextDouble() * 2.0 - 1.0
       }
-      val y = signum((new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1)
+      val yD = (new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1
+      val y = if (yD < 0) 0.0 else 1.0
       LabeledPoint(y, x)
     }
 
     MLUtils.saveLabeledData(data, outputPath)
+    sc.stop()
   }
 }
diff --git a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
index 04f631d80f..f392efa405 100644
--- a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
@@ -50,11 +50,9 @@ object SVMSuite {
     val x = Array.fill[Array[Double]](nPoints)(
       Array.fill[Double](weights.length)(rnd.nextGaussian()))
     val y = x.map { xi =>
-      signum(
-        (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
-        intercept +
-        0.1 * rnd.nextGaussian()
-      ).toInt
+      val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
+        intercept + 0.01 * rnd.nextGaussian()
+      if (yD < 0) 0.0 else 1.0
     }
     y.zip(x).map(p => LabeledPoint(p._1, p._2))
   }
@@ -100,7 +98,7 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
     val model = svm.run(testRDD)
 
     val validationData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 17)
-    val validationRDD = sc.parallelize(validationData,2)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
@@ -139,4 +137,27 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
     // Test prediction on Array.
     validatePrediction(validationData.map(row => model.predict(row.features)), validationData)
   }
+
+  test("SVM with invalid labels") {
+    val nPoints = 10000
+
+    val A = 2.0
+    val B = -1.5
+    val C = 1.0
+
+    val testData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 42)
+    val testRDD = sc.parallelize(testData, 2)
+
+    val testRDDInvalid = testRDD.map { lp =>
+      if (lp.label == 0.0) {
+        LabeledPoint(-1.0, lp.features)
+      } else {
+        lp
+      }
+    }
+
+    intercept[spark.SparkException] {
+      val model = SVMWithSGD.train(testRDDInvalid, 100)
+    }
+  }
 }
--
cgit v1.2.3
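Taken together, this first patch makes training fail fast on old-style {-1, 1} data. A sketch of the failure mode the new test pins down, with made-up data and a local SparkContext assumed:

    import spark.SparkContext
    import spark.mllib.classification.SVMWithSGD
    import spark.mllib.regression.LabeledPoint

    val sc = new SparkContext("local", "InvalidLabelExample")
    val oldStyle = sc.parallelize(Seq(
      LabeledPoint(-1.0, Array(1.0, 2.0)),  // -1.0 is no longer accepted
      LabeledPoint(1.0, Array(-1.0, 0.5))))
    try {
      SVMWithSGD.train(oldStyle, 100)
    } catch {
      case e: spark.SparkException =>
        println("rejected: " + e.getMessage) // "Input validation failed."
    }
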
From 0ab6ff4c3252e7cb9ea573e09d9188da1fcb87cc Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Tue, 13 Aug 2013 13:57:06 -0700
Subject: Fix SVM model and unit test to work with {0,1}.

Also rename validateFuncs to validators.
---
 .../scala/spark/mllib/classification/LogisticRegression.scala |  2 +-
 mllib/src/main/scala/spark/mllib/classification/SVM.scala     |  5 +++--
 mllib/src/main/scala/spark/mllib/optimization/Gradient.scala  |  8 +++++---
 .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala   |  4 ++--
 .../src/test/scala/spark/mllib/classification/SVMSuite.scala  | 11 +++++++----
 5 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 24f9f4e76b..7f0b1ba841 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -65,7 +65,7 @@ class LogisticRegressionWithSGD private (
     .setNumIterations(numIterations)
     .setRegParam(regParam)
     .setMiniBatchFraction(miniBatchFraction)
-  override val validateFuncs = List(DataValidators.classificationLabels)
+  override val validators = List(DataValidators.classificationLabels)
 
   /**
    * Construct a LogisticRegression object with default parameters
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index d2b50f4987..b680d81e86 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -41,7 +41,8 @@ class SVMModel(
 
   override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double) = {
-    signum(dataMatrix.dot(weightMatrix) + intercept)
+    val margin = dataMatrix.dot(weightMatrix) + intercept
+    if (margin < 0) 0.0 else 1.0
   }
 }
 
@@ -65,7 +66,7 @@ class SVMWithSGD private (
     .setRegParam(regParam)
     .setMiniBatchFraction(miniBatchFraction)
 
-  override val validateFuncs = List(DataValidators.classificationLabels)
+  override val validators = List(DataValidators.classificationLabels)
 
   /**
    * Construct a SVM object with default parameters
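The model fix above replaces signum, which returns -1, 0 or 1, with a 0/1 threshold on the raw margin. A worked instance with hypothetical weights:

    import org.jblas.DoubleMatrix

    val weights = new DoubleMatrix(1, 2, 0.8, -0.2) // hypothetical model weights
    val point = new DoubleMatrix(1, 2, 1.0, 3.0)    // features of one example
    val intercept = 0.1
    val margin = point.dot(weights) + intercept     // 0.8 - 0.6 + 0.1 = 0.3
    val prediction = if (margin < 0) 0.0 else 1.0   // 1.0 under the new rule
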
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
index 58bfe3f37b..05568f55af 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -84,13 +84,15 @@ class HingeGradient extends Gradient {
       (DoubleMatrix, Double) = {
 
     val dotProduct = data.dot(weights)
-    val labelScaled = 2*label - 1.0
 
     // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
     // Therefore the gradient is -(2y - 1)*x
-    if (1.0 > labelScaled * dotProduct)
-      (data.mul(-labelScaled), 1.0 - labelScaled * dotProduct)
-    else
+    val labelScaled = 2 * label - 1.0
+
+    if (1.0 > labelScaled * dotProduct) {
+      (data.mul(-labelScaled), 1.0 - labelScaled * dotProduct)
+    } else {
       (DoubleMatrix.zeros(1, weights.length), 0.0)
+    }
   }
 }
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 55edb3def5..03f991df39 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -83,7 +83,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   extends Logging with Serializable {
 
-  protected val validateFuncs: Seq[RDD[LabeledPoint] => Boolean] = List()
+  protected val validators: Seq[RDD[LabeledPoint] => Boolean] = List()
 
   val optimizer: Optimizer
 
@@ -119,7 +119,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Check the data properties before running the optimizer
-    if (!validateFuncs.forall(func => func(input))) {
+    if (!validators.forall(func => func(input))) {
       throw new SparkException("Input validation failed.")
     }
diff --git a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
index f392efa405..8fa9e4639b 100644
--- a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
@@ -48,7 +48,7 @@ object SVMSuite {
     val rnd = new Random(seed)
     val weightsMat = new DoubleMatrix(1, weights.length, weights:_*)
     val x = Array.fill[Array[Double]](nPoints)(
-      Array.fill[Double](weights.length)(rnd.nextGaussian()))
+      Array.fill[Double](weights.length)(rnd.nextDouble() * 2.0 - 1.0))
     val y = x.map { xi =>
       val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
         intercept + 0.01 * rnd.nextGaussian()
@@ -83,7 +83,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
   test("SVM using local random SGD") {
     val nPoints = 10000
 
-    val A = 2.0
+    // NOTE: Intercept should be small for generating equal 0s and 1s
+    val A = 0.01
     val B = -1.5
     val C = 1.0
 
@@ -110,7 +111,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
   test("SVM local random SGD with initial weights") {
     val nPoints = 10000
 
-    val A = 2.0
+    // NOTE: Intercept should be small for generating equal 0s and 1s
+    val A = 0.01
     val B = -1.5
     val C = 1.0
 
@@ -141,7 +143,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
   test("SVM with invalid labels") {
     val nPoints = 10000
 
-    val A = 2.0
+    // NOTE: Intercept should be small for generating equal 0s and 1s
+    val A = 0.01
     val B = -1.5
     val C = 1.0
--
cgit v1.2.3
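With the gradient, model, and tests all on the {0, 1} convention, end-to-end usage looks roughly like this (toy data, local mode assumed):

    import spark.SparkContext
    import spark.mllib.classification.SVMWithSGD
    import spark.mllib.regression.LabeledPoint

    val sc = new SparkContext("local", "SVMExample")
    val training = sc.parallelize(Seq(
      LabeledPoint(1.0, Array(2.0, 1.0)),
      LabeledPoint(0.0, Array(-2.0, -1.5)),
      LabeledPoint(1.0, Array(1.5, 0.5)),
      LabeledPoint(0.0, Array(-1.0, -0.5))))
    val model = SVMWithSGD.train(training, 100)
    // predict now returns 0.0 or 1.0, never -1.0
    val prediction = model.predict(Array(1.0, 1.0))
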
From c874625354de7117da9586cfbbe919bb6801a932 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Tue, 13 Aug 2013 16:55:53 -0700
Subject: Specify label format in LogisticRegression.
---
 .../main/scala/spark/mllib/classification/LogisticRegression.scala | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 7f0b1ba841..474ca6e97c 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -48,6 +48,7 @@ class LogisticRegressionModel(
 
 /**
  * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ * NOTE: Labels used in Logistic Regression should be {0, 1}
  */
 class LogisticRegressionWithSGD private (
@@ -79,6 +80,7 @@ class LogisticRegressionWithSGD private (
 
 /**
  * Top-level methods for calling Logistic Regression.
+ * NOTE: Labels used in Logistic Regression should be {0, 1}
  */
 object LogisticRegressionWithSGD {
   // NOTE(shivaram): We use multiple train methods instead of default arguments to support
@@ -89,6 +91,7 @@ object LogisticRegressionWithSGD {
    * number of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in
    * gradient descent are initialized using the initial weights provided.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -113,6 +116,7 @@ object LogisticRegressionWithSGD {
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
    * number of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -135,6 +139,7 @@ object LogisticRegressionWithSGD {
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
    * number of iterations of gradient descent using the specified step size. We use the entire data
    * set to update the gradient in each iteration.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param stepSize Step size to be used for each iteration of Gradient Descent.
@@ -155,6 +160,7 @@ object LogisticRegressionWithSGD {
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
    * number of iterations of gradient descent using a step size of 1.0. We use the entire data set
    * to update the gradient in each iteration.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
--
cgit v1.2.3
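Since both classifiers now document the {0, 1} requirement, callers holding {-1, 1} data must remap it before training. One possible sketch, assuming an existing RDD named oldData:

    import spark.mllib.classification.LogisticRegressionWithSGD
    import spark.mllib.regression.LabeledPoint

    // oldData: RDD[LabeledPoint] with {-1, 1} labels is assumed to exist.
    val remapped = oldData.map { lp =>
      val label = if (lp.label == -1.0) 0.0 else lp.label
      LabeledPoint(label, lp.features)
    }
    val model = LogisticRegressionWithSGD.train(remapped, 100)
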
From dc06b528790c69b2e6de85cba84266fea81dd4f4 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Sun, 25 Aug 2013 23:14:35 -0700
Subject: Add an option to turn off data validation, test it.

Also move addIntercept to default to true, making it similar to the
validateData option.
---
 .../spark/mllib/classification/LogisticRegression.scala |  9 ++++-----
 mllib/src/main/scala/spark/mllib/classification/SVM.scala |  9 ++++-----
 .../mllib/regression/GeneralizedLinearAlgorithm.scala   | 16 +++++++++++++---
 mllib/src/main/scala/spark/mllib/regression/Lasso.scala |  9 ++++-----
 .../test/scala/spark/mllib/classification/SVMSuite.scala |  3 +++
 5 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 474ca6e97c..482e4a6745 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -54,8 +54,7 @@ class LogisticRegressionWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,
-    var miniBatchFraction: Double,
-    var addIntercept: Boolean)
+    var miniBatchFraction: Double)
   extends GeneralizedLinearAlgorithm[LogisticRegressionModel]
   with Serializable {
 
@@ -71,7 +70,7 @@ class LogisticRegressionWithSGD private (
   /**
    * Construct a LogisticRegression object with default parameters
    */
-  def this() = this(1.0, 100, 0.0, 1.0, true)
+  def this() = this(1.0, 100, 0.0, 1.0)
 
   def createModel(weights: Array[Double], intercept: Double) = {
     new LogisticRegressionModel(weights, intercept)
@@ -108,7 +107,7 @@ object LogisticRegressionWithSGD {
       initialWeights: Array[Double])
     : LogisticRegressionModel =
   {
-    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction, true).run(
+    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction).run(
       input, initialWeights)
   }
 
@@ -131,7 +130,7 @@ object LogisticRegressionWithSGD {
       miniBatchFraction: Double)
     : LogisticRegressionModel =
   {
-    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction, true).run(
+    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction).run(
       input)
   }
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index b680d81e86..69393cd7b0 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -54,8 +54,7 @@ class SVMWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,
-    var miniBatchFraction: Double,
-    var addIntercept: Boolean)
+    var miniBatchFraction: Double)
   extends GeneralizedLinearAlgorithm[SVMModel] with Serializable {
 
   val gradient = new HingeGradient()
@@ -71,7 +70,7 @@ class SVMWithSGD private (
   /**
    * Construct a SVM object with default parameters
    */
-  def this() = this(1.0, 100, 1.0, 1.0, true)
+  def this() = this(1.0, 100, 1.0, 1.0)
 
   def createModel(weights: Array[Double], intercept: Double) = {
     new SVMModel(weights, intercept)
@@ -107,7 +106,7 @@ object SVMWithSGD {
       initialWeights: Array[Double])
     : SVMModel =
   {
-    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input,
+    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input,
       initialWeights)
   }
 
@@ -131,7 +130,7 @@ object SVMWithSGD {
       miniBatchFraction: Double)
     : SVMModel =
   {
-    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input)
+    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input)
   }
 
 /**
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 03f991df39..d164d415d6 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -87,13 +87,15 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
 
   val optimizer: Optimizer
 
+  protected var addIntercept: Boolean = true
+
+  protected var validateData: Boolean = true
+
   /**
    * Create a model given the weights and intercept
    */
   protected def createModel(weights: Array[Double], intercept: Double): M
 
-  protected var addIntercept: Boolean
-
   /**
    * Set if the algorithm should add an intercept. Default true.
    */
@@ -102,6 +104,14 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     this
   }
 
+  /**
+   * Set if the algorithm should validate data before training. Default true.
+   */
+  def setValidateData(validateData: Boolean): this.type = {
+    this.validateData = validateData
+    this
+  }
+
   /**
    * Run the algorithm with the configured parameters on an input
    * RDD of LabeledPoint entries.
@@ -119,7 +129,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Check the data properties before running the optimizer
-    if (!validators.forall(func => func(input))) {
+    if (validateData && !validators.forall(func => func(input))) {
       throw new SparkException("Input validation failed.")
     }
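The new setter follows the same builder pattern as setIntercept. A usage sketch, assuming an RDD named data whose labels are already known to be clean:

    import spark.mllib.classification.SVMWithSGD

    // Skip the label scan when the input is pre-validated upstream.
    val svm = new SVMWithSGD().setValidateData(false)
    val model = svm.run(data) // data: RDD[LabeledPoint] is assumed
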
diff --git a/mllib/src/main/scala/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
index 6bbc990a5a..89f791e85a 100644
--- a/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
@@ -48,8 +48,7 @@ class LassoWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,
-    var miniBatchFraction: Double,
-    var addIntercept: Boolean)
+    var miniBatchFraction: Double)
   extends GeneralizedLinearAlgorithm[LassoModel]
   with Serializable {
 
@@ -63,7 +62,7 @@ class LassoWithSGD private (
   /**
    * Construct a Lasso object with default parameters
    */
-  def this() = this(1.0, 100, 1.0, 1.0, true)
+  def this() = this(1.0, 100, 1.0, 1.0)
 
   def createModel(weights: Array[Double], intercept: Double) = {
     new LassoModel(weights, intercept)
@@ -98,7 +97,7 @@ object LassoWithSGD {
       initialWeights: Array[Double])
     : LassoModel =
   {
-    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input,
+    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input,
      initialWeights)
   }
 
@@ -121,7 +120,7 @@ object LassoWithSGD {
      miniBatchFraction: Double)
    : LassoModel =
   {
-    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input)
+    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input)
   }
 
 /**
diff --git a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
index 8fa9e4639b..894ae458ad 100644
--- a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
@@ -162,5 +162,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
     intercept[spark.SparkException] {
       val model = SVMWithSGD.train(testRDDInvalid, 100)
     }
+
+    // Turning off data validation should not throw an exception
+    val noValidationModel = new SVMWithSGD().setValidateData(false).run(testRDDInvalid)
   }
 }
--
cgit v1.2.3