From 654087194d232221dfb64ba646c8a8e12649f961 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Tue, 13 Aug 2013 11:43:49 -0700
Subject: Change SVM to use {0,1} labels.

Also add a data validation check to make sure classification labels are
always 0 or 1 and add an appropriate test case.
---
 .../mllib/classification/LogisticRegression.scala | 16 +++++----
 .../scala/spark/mllib/classification/SVM.scala    | 21 ++++++++---
 .../scala/spark/mllib/optimization/Gradient.scala | 12 ++++---
 .../regression/GeneralizedLinearAlgorithm.scala   |  9 ++++-
 .../scala/spark/mllib/util/DataValidators.scala   | 42 ++++++++++++++++++++++
 .../scala/spark/mllib/util/SVMDataGenerator.scala |  9 ++---
 .../spark/mllib/classification/SVMSuite.scala     | 33 +++++++++++++----
 7 files changed, 116 insertions(+), 26 deletions(-)
 create mode 100644 mllib/src/main/scala/spark/mllib/util/DataValidators.scala

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 30ee0ab0ff..24f9f4e76b 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -17,12 +17,13 @@
 package spark.mllib.classification
 
+import scala.math.round
+
 import spark.{Logging, RDD, SparkContext}
 import spark.mllib.optimization._
 import spark.mllib.regression._
 import spark.mllib.util.MLUtils
-
-import scala.math.round
+import spark.mllib.util.DataValidators
 
 import org.jblas.DoubleMatrix
 
@@ -59,10 +60,13 @@ class LogisticRegressionWithSGD private (
 
   val gradient = new LogisticGradient()
   val updater = new SimpleUpdater()
-  val optimizer = new GradientDescent(gradient, updater).setStepSize(stepSize)
-    .setNumIterations(numIterations)
-    .setRegParam(regParam)
-    .setMiniBatchFraction(miniBatchFraction)
+  override val optimizer = new GradientDescent(gradient, updater)
+    .setStepSize(stepSize)
+    .setNumIterations(numIterations)
+    .setRegParam(regParam)
+    .setMiniBatchFraction(miniBatchFraction)
+  override val validateFuncs = List(DataValidators.classificationLabels)
+
   /**
    * Construct a LogisticRegression object with default parameters
    */
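For readers following the optimizer change above: the fluent setters come straight from the diff, but the concrete values below are hypothetical, not MLlib's defaults. A minimal configuration sketch:

    import spark.mllib.optimization.{GradientDescent, LogisticGradient, SimpleUpdater}

    // Sketch of the fluent GradientDescent configuration used above.
    // The numbers are made up for illustration.
    val optimizer = new GradientDescent(new LogisticGradient(), new SimpleUpdater())
      .setStepSize(1.0)            // SGD step size
      .setNumIterations(100)       // number of SGD iterations
      .setRegParam(0.0)            // regularization parameter (applied by the updater)
      .setMiniBatchFraction(1.0)   // fraction of the data sampled per iteration
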
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index f799cb2829..d2b50f4987 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -18,10 +18,12 @@ package spark.mllib.classification
 
 import scala.math.signum
+
 import spark.{Logging, RDD, SparkContext}
 import spark.mllib.optimization._
 import spark.mllib.regression._
 import spark.mllib.util.MLUtils
+import spark.mllib.util.DataValidators
 
 import org.jblas.DoubleMatrix
 
@@ -45,6 +47,7 @@ class SVMModel(
 
 /**
  * Train an SVM using Stochastic Gradient Descent.
+ * NOTE: Labels used in SVM should be {0, 1}
  */
 class SVMWithSGD private (
     var stepSize: Double,
@@ -56,10 +59,14 @@ class SVMWithSGD private (
     var numIterations: Int,
     var regParam: Double,
     var miniBatchFraction: Double,
     var addIntercept: Boolean)
   extends GeneralizedLinearAlgorithm[SVMModel] with Serializable {
 
   val gradient = new HingeGradient()
   val updater = new SquaredL2Updater()
-  val optimizer = new GradientDescent(gradient, updater).setStepSize(stepSize)
-    .setNumIterations(numIterations)
-    .setRegParam(regParam)
-    .setMiniBatchFraction(miniBatchFraction)
+  override val optimizer = new GradientDescent(gradient, updater)
+    .setStepSize(stepSize)
+    .setNumIterations(numIterations)
+    .setRegParam(regParam)
+    .setMiniBatchFraction(miniBatchFraction)
+
+  override val validateFuncs = List(DataValidators.classificationLabels)
+
   /**
    * Construct a SVM object with default parameters
    */
@@ -71,7 +78,7 @@ class SVMWithSGD private (
 }
 
 /**
- * Top-level methods for calling SVM.
+ * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}
  */
 object SVMWithSGD {
@@ -80,6 +87,7 @@ object SVMWithSGD {
    * of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in
    * gradient descent are initialized using the initial weights provided.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -106,6 +114,7 @@ object SVMWithSGD {
    * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
    * of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -128,6 +137,7 @@ object SVMWithSGD {
    * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
    * of iterations of gradient descent using the specified step size. We use the entire data set to
    * update the gradient in each iteration.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param stepSize Step size to be used for each iteration of Gradient Descent.
@@ -149,6 +159,7 @@ object SVMWithSGD {
    * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
    * of iterations of gradient descent using a step size of 1.0. We use the entire data set to
    * update the gradient in each iteration.
+   * NOTE: Labels used in SVM should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
index e72b8b3a92..58bfe3f37b 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -77,16 +77,20 @@ class SquaredGradient extends Gradient {
 
 /**
  * Compute gradient and loss for a Hinge loss function.
+ * NOTE: This assumes that the labels are {0,1}
  */
 class HingeGradient extends Gradient {
-  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
+  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
 
     val dotProduct = data.dot(weights)
+    val labelScaled = 2*label - 1.0
 
-    if (1.0 > label * dotProduct)
-      (data.mul(-label), 1.0 - label * dotProduct)
+    // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
+    // Therefore the gradient is -(2y - 1)*x
+    if (1.0 > labelScaled * dotProduct)
+      (data.mul(-labelScaled), 1.0 - labelScaled * dotProduct)
     else
-      (DoubleMatrix.zeros(1,weights.length), 0.0)
+      (DoubleMatrix.zeros(1, weights.length), 0.0)
   }
 }
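To make the label-scaling trick concrete: for y in {0, 1}, the expression 2y - 1 maps the label to {-1, +1}, so the usual hinge loss applies unchanged. A small worked example (plain jblas, no Spark; the weight vector is hypothetical):

    import org.jblas.DoubleMatrix

    // Recomputes the hinge loss from the patch for a single point.
    val weights = new DoubleMatrix(1, 2, 0.5, -1.0) // hypothetical w
    val x = new DoubleMatrix(1, 2, 1.0, 2.0)        // one data point
    val label = 0.0                                 // a {0, 1} label
    val labelScaled = 2 * label - 1.0               // -1.0
    val margin = labelScaled * x.dot(weights)       // -1.0 * (0.5 - 2.0) = 1.5
    val loss = math.max(0.0, 1.0 - margin)          // 0.0: correctly classified
    println("margin = " + margin + ", loss = " + loss)
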
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 4ecafff08b..55edb3def5 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -17,7 +17,7 @@
 
 package spark.mllib.regression
 
-import spark.{Logging, RDD}
+import spark.{Logging, RDD, SparkException}
 import spark.mllib.optimization._
 
 import org.jblas.DoubleMatrix
@@ -83,6 +83,8 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   extends Logging with Serializable {
 
+  protected val validateFuncs: Seq[RDD[LabeledPoint] => Boolean] = List()
+
   val optimizer: Optimizer
 
   /**
@@ -116,6 +118,11 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
    */
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
+    // Check the data properties before running the optimizer
+    if (!validateFuncs.forall(func => func(input))) {
+      throw new SparkException("Input validation failed.")
+    }
+
     // Add an extra variable consisting of all 1.0's for the intercept.
     val data = if (addIntercept) {
       input.map(labeledPoint => (labeledPoint.label, Array(1.0, labeledPoint.features:_*)))
diff --git a/mllib/src/main/scala/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/spark/mllib/util/DataValidators.scala
new file mode 100644
index 0000000000..57553accf1
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/util/DataValidators.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.util
+
+import spark.{RDD, Logging}
+import spark.mllib.regression.LabeledPoint
+
+/**
+ * A collection of methods used to validate data before applying ML algorithms.
+ */
+object DataValidators extends Logging {
+
+  /**
+   * Function to check if labels used for classification are either zero or one.
+   *
+   * @param data - input data set that needs to be checked
+   *
+   * @return True if labels are all zero or one, false otherwise.
+   */
+  val classificationLabels: RDD[LabeledPoint] => Boolean = { data =>
+    val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
+    if (numInvalid != 0) {
+      logError("Classification labels should be 0 or 1. Found " + numInvalid + " invalid labels")
+    }
+    numInvalid == 0
+  }
+}
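A sketch of how the new validator behaves on its own, assuming a local SparkContext; the data is made up:

    import spark.SparkContext
    import spark.mllib.regression.LabeledPoint
    import spark.mllib.util.DataValidators

    val sc = new SparkContext("local", "ValidatorExample")
    val good = sc.parallelize(Seq(
      LabeledPoint(0.0, Array(1.0, 2.0)),
      LabeledPoint(1.0, Array(3.0, 4.0))))
    // Shift labels to {-1.0, 0.0} so one of them is invalid.
    val bad = good.map(lp => LabeledPoint(lp.label - 1.0, lp.features))
    println(DataValidators.classificationLabels(good)) // true
    println(DataValidators.classificationLabels(bad))  // false, after logging an error
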
diff --git a/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
index e02bd190f6..eff456cad6 100644
--- a/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
@@ -1,7 +1,6 @@
 package spark.mllib.util
 
 import scala.util.Random
-import scala.math.signum
 
 import spark.{RDD, SparkContext}
 
@@ -30,8 +29,8 @@ object SVMDataGenerator {
     val sc = new SparkContext(sparkMaster, "SVMGenerator")
 
     val globalRnd = new Random(94720)
-    val trueWeights = new DoubleMatrix(1, nfeatures+1,
-      Array.fill[Double](nfeatures + 1) { globalRnd.nextGaussian() }:_*)
+    val trueWeights = new DoubleMatrix(1, nfeatures + 1,
+      Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian()):_*)
 
     val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
       val rnd = new Random(42 + idx)
@@ -39,11 +38,13 @@ object SVMDataGenerator {
       val x = Array.fill[Double](nfeatures) {
         rnd.nextDouble() * 2.0 - 1.0
       }
-      val y = signum((new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1)
+      val yD = (new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1
+      val y = if (yD < 0) 0.0 else 1.0
       LabeledPoint(y, x)
     }
 
     MLUtils.saveLabeledData(data, outputPath)
+    sc.stop()
   }
 }
diff --git a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
index 04f631d80f..f392efa405 100644
--- a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
@@ -50,11 +50,9 @@ object SVMSuite {
     val x = Array.fill[Array[Double]](nPoints)(
       Array.fill[Double](weights.length)(rnd.nextGaussian()))
     val y = x.map { xi =>
-      signum(
-        (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
-        intercept +
-        0.1 * rnd.nextGaussian()
-      ).toInt
+      val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
+        intercept + 0.01 * rnd.nextGaussian()
+      if (yD < 0) 0.0 else 1.0
     }
     y.zip(x).map(p => LabeledPoint(p._1, p._2))
   }
@@ -100,7 +98,7 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
     val model = svm.run(testRDD)
 
     val validationData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 17)
-    val validationRDD = sc.parallelize(validationData,2)
+    val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
@@ -139,4 +137,27 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
     // Test prediction on Array.
     validatePrediction(validationData.map(row => model.predict(row.features)), validationData)
   }
+
+  test("SVM with invalid labels") {
+    val nPoints = 10000
+
+    val A = 2.0
+    val B = -1.5
+    val C = 1.0
+
+    val testData = SVMSuite.generateSVMInput(A, Array[Double](B,C), nPoints, 42)
+    val testRDD = sc.parallelize(testData, 2)
+
+    val testRDDInvalid = testRDD.map { lp =>
+      if (lp.label == 0.0) {
+        LabeledPoint(-1.0, lp.features)
+      } else {
+        lp
+      }
+    }
+
+    intercept[spark.SparkException] {
+      val model = SVMWithSGD.train(testRDDInvalid, 100)
+    }
+  }
 }
--
cgit v1.2.3
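Taken together, this first patch makes training fail fast on old-style {-1, 1} data. A sketch of the failure mode the new test pins down, with made-up data and a local SparkContext assumed:

    import spark.SparkContext
    import spark.mllib.classification.SVMWithSGD
    import spark.mllib.regression.LabeledPoint

    val sc = new SparkContext("local", "InvalidLabelExample")
    val oldStyle = sc.parallelize(Seq(
      LabeledPoint(-1.0, Array(1.0, 2.0)),  // -1.0 is no longer accepted
      LabeledPoint(1.0, Array(-1.0, 0.5))))
    try {
      SVMWithSGD.train(oldStyle, 100)
    } catch {
      case e: spark.SparkException =>
        println("rejected: " + e.getMessage) // "Input validation failed."
    }
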
From 0ab6ff4c3252e7cb9ea573e09d9188da1fcb87cc Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Tue, 13 Aug 2013 13:57:06 -0700
Subject: Fix SVM model and unit test to work with {0,1}.

Also rename validateFuncs to validators.
---
 .../scala/spark/mllib/classification/LogisticRegression.scala |  2 +-
 mllib/src/main/scala/spark/mllib/classification/SVM.scala     |  5 +++--
 mllib/src/main/scala/spark/mllib/optimization/Gradient.scala  |  8 +++++---
 .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala   |  4 ++--
 .../src/test/scala/spark/mllib/classification/SVMSuite.scala  | 11 +++++++----
 5 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 24f9f4e76b..7f0b1ba841 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -65,7 +65,7 @@ class LogisticRegressionWithSGD private (
     .setNumIterations(numIterations)
     .setRegParam(regParam)
     .setMiniBatchFraction(miniBatchFraction)
-  override val validateFuncs = List(DataValidators.classificationLabels)
+  override val validators = List(DataValidators.classificationLabels)
 
   /**
    * Construct a LogisticRegression object with default parameters
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index d2b50f4987..b680d81e86 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -41,7 +41,8 @@ class SVMModel(
 
   override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double) = {
-    signum(dataMatrix.dot(weightMatrix) + intercept)
+    val margin = dataMatrix.dot(weightMatrix) + intercept
+    if (margin < 0) 0.0 else 1.0
   }
 }
 
@@ -65,7 +66,7 @@ class SVMWithSGD private (
     .setRegParam(regParam)
     .setMiniBatchFraction(miniBatchFraction)
 
-  override val validateFuncs = List(DataValidators.classificationLabels)
+  override val validators = List(DataValidators.classificationLabels)
 
   /**
    * Construct a SVM object with default parameters
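The model fix above replaces signum, which returns -1, 0 or 1, with a 0/1 threshold on the raw margin. A worked instance with hypothetical weights:

    import org.jblas.DoubleMatrix

    val weights = new DoubleMatrix(1, 2, 0.8, -0.2) // hypothetical model weights
    val point = new DoubleMatrix(1, 2, 1.0, 3.0)    // features of one example
    val intercept = 0.1
    val margin = point.dot(weights) + intercept     // 0.8 - 0.6 + 0.1 = 0.3
    val prediction = if (margin < 0) 0.0 else 1.0   // 1.0 under the new rule
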
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
index 58bfe3f37b..05568f55af 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -84,13 +84,15 @@ class HingeGradient extends Gradient {
       (DoubleMatrix, Double) = {
 
     val dotProduct = data.dot(weights)
-    val labelScaled = 2*label - 1.0
 
     // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
     // Therefore the gradient is -(2y - 1)*x
-    if (1.0 > labelScaled * dotProduct)
-      (data.mul(-labelScaled), 1.0 - labelScaled * dotProduct)
-    else
+    val labelScaled = 2 * label - 1.0
+
+    if (1.0 > labelScaled * dotProduct) {
+      (data.mul(-labelScaled), 1.0 - labelScaled * dotProduct)
+    } else {
       (DoubleMatrix.zeros(1, weights.length), 0.0)
+    }
   }
 }
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 55edb3def5..03f991df39 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -83,7 +83,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   extends Logging with Serializable {
 
-  protected val validateFuncs: Seq[RDD[LabeledPoint] => Boolean] = List()
+  protected val validators: Seq[RDD[LabeledPoint] => Boolean] = List()
 
   val optimizer: Optimizer
 
@@ -119,7 +119,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Check the data properties before running the optimizer
-    if (!validateFuncs.forall(func => func(input))) {
+    if (!validators.forall(func => func(input))) {
       throw new SparkException("Input validation failed.")
     }
diff --git a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
index f392efa405..8fa9e4639b 100644
--- a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
@@ -48,7 +48,7 @@ object SVMSuite {
     val rnd = new Random(seed)
     val weightsMat = new DoubleMatrix(1, weights.length, weights:_*)
     val x = Array.fill[Array[Double]](nPoints)(
-      Array.fill[Double](weights.length)(rnd.nextGaussian()))
+      Array.fill[Double](weights.length)(rnd.nextDouble() * 2.0 - 1.0))
     val y = x.map { xi =>
       val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
         intercept + 0.01 * rnd.nextGaussian()
@@ -83,7 +83,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
   test("SVM using local random SGD") {
     val nPoints = 10000
 
-    val A = 2.0
+    // NOTE: Intercept should be small for generating equal 0s and 1s
+    val A = 0.01
     val B = -1.5
     val C = 1.0
 
@@ -110,7 +111,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
   test("SVM local random SGD with initial weights") {
     val nPoints = 10000
 
-    val A = 2.0
+    // NOTE: Intercept should be small for generating equal 0s and 1s
+    val A = 0.01
     val B = -1.5
     val C = 1.0
 
@@ -141,7 +143,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
   test("SVM with invalid labels") {
     val nPoints = 10000
 
-    val A = 2.0
+    // NOTE: Intercept should be small for generating equal 0s and 1s
+    val A = 0.01
     val B = -1.5
     val C = 1.0
--
cgit v1.2.3
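With the gradient, model, and tests all on the {0, 1} convention, end-to-end usage looks roughly like this (toy data, local mode assumed):

    import spark.SparkContext
    import spark.mllib.classification.SVMWithSGD
    import spark.mllib.regression.LabeledPoint

    val sc = new SparkContext("local", "SVMExample")
    val training = sc.parallelize(Seq(
      LabeledPoint(1.0, Array(2.0, 1.0)),
      LabeledPoint(0.0, Array(-2.0, -1.5)),
      LabeledPoint(1.0, Array(1.5, 0.5)),
      LabeledPoint(0.0, Array(-1.0, -0.5))))
    val model = SVMWithSGD.train(training, 100)
    // predict now returns 0.0 or 1.0, never -1.0
    val prediction = model.predict(Array(1.0, 1.0))
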
From c874625354de7117da9586cfbbe919bb6801a932 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Tue, 13 Aug 2013 16:55:53 -0700
Subject: Specify label format in LogisticRegression.
---
 .../main/scala/spark/mllib/classification/LogisticRegression.scala | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 7f0b1ba841..474ca6e97c 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -48,6 +48,7 @@ class LogisticRegressionModel(
 
 /**
  * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ * NOTE: Labels used in Logistic Regression should be {0, 1}
  */
 class LogisticRegressionWithSGD private (
@@ -79,6 +80,7 @@ class LogisticRegressionWithSGD private (
 
 /**
  * Top-level methods for calling Logistic Regression.
+ * NOTE: Labels used in Logistic Regression should be {0, 1}
  */
 object LogisticRegressionWithSGD {
   // NOTE(shivaram): We use multiple train methods instead of default arguments to support
@@ -89,6 +91,7 @@ object LogisticRegressionWithSGD {
    * number of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in
    * gradient descent are initialized using the initial weights provided.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -113,6 +116,7 @@ object LogisticRegressionWithSGD {
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
    * number of iterations of gradient descent using the specified step size. Each iteration uses
    * `miniBatchFraction` fraction of the data to calculate the gradient.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
@@ -135,6 +139,7 @@ object LogisticRegressionWithSGD {
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
    * number of iterations of gradient descent using the specified step size. We use the entire data
    * set to update the gradient in each iteration.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param stepSize Step size to be used for each iteration of Gradient Descent.
@@ -155,6 +160,7 @@ object LogisticRegressionWithSGD {
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
    * number of iterations of gradient descent using a step size of 1.0. We use the entire data set
    * to update the gradient in each iteration.
+   * NOTE: Labels used in Logistic Regression should be {0, 1}
    *
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
--
cgit v1.2.3
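Since both classifiers now document the {0, 1} requirement, callers holding {-1, 1} data must remap it before training. One possible sketch, assuming an existing RDD named oldData:

    import spark.mllib.classification.LogisticRegressionWithSGD
    import spark.mllib.regression.LabeledPoint

    // oldData: RDD[LabeledPoint] with {-1, 1} labels is assumed to exist.
    val remapped = oldData.map { lp =>
      val label = if (lp.label == -1.0) 0.0 else lp.label
      LabeledPoint(label, lp.features)
    }
    val model = LogisticRegressionWithSGD.train(remapped, 100)
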
From dc06b528790c69b2e6de85cba84266fea81dd4f4 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman
Date: Sun, 25 Aug 2013 23:14:35 -0700
Subject: Add an option to turn off data validation, test it.

Also move addIntercept to default to true, making it similar to the
validateData option.
---
 .../spark/mllib/classification/LogisticRegression.scala |  9 ++++-----
 mllib/src/main/scala/spark/mllib/classification/SVM.scala |  9 ++++-----
 .../mllib/regression/GeneralizedLinearAlgorithm.scala   | 16 +++++++++++++---
 mllib/src/main/scala/spark/mllib/regression/Lasso.scala |  9 ++++-----
 .../test/scala/spark/mllib/classification/SVMSuite.scala |  3 +++
 5 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 474ca6e97c..482e4a6745 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -54,8 +54,7 @@ class LogisticRegressionWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,
-    var miniBatchFraction: Double,
-    var addIntercept: Boolean)
+    var miniBatchFraction: Double)
   extends GeneralizedLinearAlgorithm[LogisticRegressionModel]
   with Serializable {
 
@@ -71,7 +70,7 @@ class LogisticRegressionWithSGD private (
   /**
    * Construct a LogisticRegression object with default parameters
    */
-  def this() = this(1.0, 100, 0.0, 1.0, true)
+  def this() = this(1.0, 100, 0.0, 1.0)
 
   def createModel(weights: Array[Double], intercept: Double) = {
     new LogisticRegressionModel(weights, intercept)
@@ -108,7 +107,7 @@ object LogisticRegressionWithSGD {
       initialWeights: Array[Double])
     : LogisticRegressionModel =
   {
-    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction, true).run(
+    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction).run(
       input, initialWeights)
   }
 
@@ -131,7 +130,7 @@ object LogisticRegressionWithSGD {
       miniBatchFraction: Double)
     : LogisticRegressionModel =
   {
-    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction, true).run(
+    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction).run(
       input)
   }
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index b680d81e86..69393cd7b0 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -54,8 +54,7 @@ class SVMWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,
-    var miniBatchFraction: Double,
-    var addIntercept: Boolean)
+    var miniBatchFraction: Double)
   extends GeneralizedLinearAlgorithm[SVMModel] with Serializable {
 
   val gradient = new HingeGradient()
@@ -71,7 +70,7 @@ class SVMWithSGD private (
   /**
    * Construct a SVM object with default parameters
    */
-  def this() = this(1.0, 100, 1.0, 1.0, true)
+  def this() = this(1.0, 100, 1.0, 1.0)
 
   def createModel(weights: Array[Double], intercept: Double) = {
     new SVMModel(weights, intercept)
@@ -107,7 +106,7 @@ object SVMWithSGD {
       initialWeights: Array[Double])
     : SVMModel =
   {
-    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input,
+    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input,
       initialWeights)
   }
 
@@ -131,7 +130,7 @@ object SVMWithSGD {
       miniBatchFraction: Double)
     : SVMModel =
   {
-    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input)
+    new SVMWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input)
   }
 
 /**
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 03f991df39..d164d415d6 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -87,13 +87,15 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
 
   val optimizer: Optimizer
 
+  protected var addIntercept: Boolean = true
+
+  protected var validateData: Boolean = true
+
   /**
    * Create a model given the weights and intercept
    */
   protected def createModel(weights: Array[Double], intercept: Double): M
 
-  protected var addIntercept: Boolean
-
   /**
    * Set if the algorithm should add an intercept. Default true.
    */
@@ -102,6 +104,14 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     this
   }
 
+  /**
+   * Set if the algorithm should validate data before training. Default true.
+   */
+  def setValidateData(validateData: Boolean): this.type = {
+    this.validateData = validateData
+    this
+  }
+
   /**
    * Run the algorithm with the configured parameters on an input
    * RDD of LabeledPoint entries.
@@ -119,7 +129,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Check the data properties before running the optimizer
-    if (!validators.forall(func => func(input))) {
+    if (validateData && !validators.forall(func => func(input))) {
       throw new SparkException("Input validation failed.")
     }
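The new setter follows the same builder pattern as setIntercept. A usage sketch, assuming an RDD named data whose labels are already known to be clean:

    import spark.mllib.classification.SVMWithSGD

    // Skip the label scan when the input is pre-validated upstream.
    val svm = new SVMWithSGD().setValidateData(false)
    val model = svm.run(data) // data: RDD[LabeledPoint] is assumed
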
diff --git a/mllib/src/main/scala/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
index 6bbc990a5a..89f791e85a 100644
--- a/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
@@ -48,8 +48,7 @@ class LassoWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,
-    var miniBatchFraction: Double,
-    var addIntercept: Boolean)
+    var miniBatchFraction: Double)
   extends GeneralizedLinearAlgorithm[LassoModel]
   with Serializable {
 
@@ -63,7 +62,7 @@ class LassoWithSGD private (
   /**
    * Construct a Lasso object with default parameters
    */
-  def this() = this(1.0, 100, 1.0, 1.0, true)
+  def this() = this(1.0, 100, 1.0, 1.0)
 
   def createModel(weights: Array[Double], intercept: Double) = {
     new LassoModel(weights, intercept)
@@ -98,7 +97,7 @@ object LassoWithSGD {
       initialWeights: Array[Double])
     : LassoModel =
   {
-    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input,
+    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input,
      initialWeights)
   }
 
@@ -121,7 +120,7 @@ object LassoWithSGD {
      miniBatchFraction: Double)
    : LassoModel =
   {
-    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction, true).run(input)
+    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input)
   }
 
 /**
diff --git a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
index 8fa9e4639b..894ae458ad 100644
--- a/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/SVMSuite.scala
@@ -162,5 +162,8 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
     intercept[spark.SparkException] {
       val model = SVMWithSGD.train(testRDDInvalid, 100)
     }
+
+    // Turning off data validation should not throw an exception
+    val noValidationModel = new SVMWithSGD().setValidateData(false).run(testRDDInvalid)
   }
 }
--
cgit v1.2.3