author     Xinghao <pxinghao@gmail.com>  2013-07-28 22:12:39 -0700
committer  Xinghao <pxinghao@gmail.com>  2013-07-28 22:12:39 -0700
commit     96e04f4cb7de3a7c9d31aa7acba496d81066634e (patch)
tree       a81b6c706f31681ab3013bdac1f2403a48b7312d /mllib
parent     9398dced0331c0ec098ef5eb4616571874ceefb6 (diff)
Fixed SVM and LR train functions to take Int instead of Double for Classification
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala       | 17
-rw-r--r--  mllib/src/main/scala/spark/mllib/classification/SVM.scala                      | 16
-rw-r--r--  mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala  | 10
3 files changed, 21 insertions(+), 22 deletions(-)
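
With this change, callers supply Int class labels, while MLUtils.loadLabeledData continues to return Double labels, so call sites map labels with .toInt before training (exactly as the updated main methods in the diffs below do). A minimal caller-side sketch, assuming the pre-rename spark.* package layout of this era and an illustrative data path and parameter values:

import spark.SparkContext
import spark.mllib.classification.LogisticRegressionLocalRandomSGD
import spark.mllib.util.MLUtils

object IntLabelCaller {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "IntLabelCaller")
    // loadLabeledData still yields (Double, Array[Double]) pairs ...
    val raw = MLUtils.loadLabeledData(sc, "data/classification.txt")
    // ... so classification callers now narrow the label to Int.
    val data = raw.map { case (y, x) => (y.toInt, x) }
    // numIterations = 100, stepSize = 1.0, miniBatchFraction = 1.0 (illustrative values)
    val model = LogisticRegressionLocalRandomSGD.train(data, 100, 1.0, 1.0)
    sc.stop()
  }
}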
diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 0a7effb1d7..cbc0d03ae1 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -86,19 +86,19 @@ class LogisticRegressionLocalRandomSGD private (var stepSize: Double, var miniBa
this
}
- def train(input: RDD[(Double, Array[Double])]): LogisticRegressionModel = {
+ def train(input: RDD[(Int, Array[Double])]): LogisticRegressionModel = {
val nfeatures: Int = input.take(1)(0)._2.length
val initialWeights = Array.fill(nfeatures)(1.0)
train(input, initialWeights)
}
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
initialWeights: Array[Double]): LogisticRegressionModel = {
// Add a extra variable consisting of all 1.0's for the intercept.
val data = input.map { case (y, features) =>
- (y, Array(1.0, features:_*))
+ (y.toDouble, Array(1.0, features:_*))
}
val initalWeightsWithIntercept = Array(1.0, initialWeights:_*)
@@ -141,13 +141,12 @@ object LogisticRegressionLocalRandomSGD {
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @param stepSize Step size to be used for each iteration of gradient descent.
-
* @param miniBatchFraction Fraction of data to be used per iteration.
* @param initialWeights Initial set of weights to be used. Array should be equal in size to
* the number of features in the data.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
@@ -170,7 +169,7 @@ object LogisticRegressionLocalRandomSGD {
* @param miniBatchFraction Fraction of data to be used per iteration.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
@@ -192,7 +191,7 @@ object LogisticRegressionLocalRandomSGD {
* @return a LogisticRegressionModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double
)
@@ -211,7 +210,7 @@ object LogisticRegressionLocalRandomSGD {
* @return a LogisticRegressionModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int)
: LogisticRegressionModel =
{
@@ -224,7 +223,7 @@ object LogisticRegressionLocalRandomSGD {
System.exit(1)
}
val sc = new SparkContext(args(0), "LogisticRegression")
- val data = MLUtils.loadLabeledData(sc, args(1))
+ val data = MLUtils.loadLabeledData(sc, args(1)).map(yx => (yx._1.toInt, yx._2))
val model = LogisticRegressionLocalRandomSGD.train(data, args(4).toInt, args(2).toDouble, args(3).toDouble)
sc.stop()
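
Internally the gradient-descent optimizer still works in Double, so train widens each Int label back to Double at the same point where it prepends the 1.0 intercept column. A self-contained sketch of just that transformation, using a plain Seq in place of the RDD for brevity:

object InterceptPrependSketch {
  def main(args: Array[String]) {
    val input: Seq[(Int, Array[Double])] =
      Seq((1, Array(0.5, 2.0)), (0, Array(-1.0, 0.25)))
    // Mirrors the commit's map: the Int label is widened to Double and the
    // feature vector gains a leading constant 1.0 for the intercept term.
    val data = input.map { case (y, features) =>
      (y.toDouble, Array(1.0, features: _*))
    }
    data.foreach { case (y, x) => println(y + " -> " + x.mkString(", ")) }
  }
}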
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index 30766a4c64..15b689e7e0 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -94,19 +94,19 @@ class SVMLocalRandomSGD private (var stepSize: Double, var regParam: Double, var
this
}
- def train(input: RDD[(Double, Array[Double])]): SVMModel = {
+ def train(input: RDD[(Int, Array[Double])]): SVMModel = {
val nfeatures: Int = input.take(1)(0)._2.length
val initialWeights = Array.fill(nfeatures)(1.0)
train(input, initialWeights)
}
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
initialWeights: Array[Double]): SVMModel = {
// Add a extra variable consisting of all 1.0's for the intercept.
val data = input.map { case (y, features) =>
- (y, Array(1.0, features:_*))
+ (y.toDouble, Array(1.0, features:_*))
}
val initalWeightsWithIntercept = Array(1.0, initialWeights:_*)
@@ -155,7 +155,7 @@ object SVMLocalRandomSGD {
* the number of features in the data.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
regParam: Double,
@@ -178,7 +178,7 @@ object SVMLocalRandomSGD {
* @param miniBatchFraction Fraction of data to be used per iteration.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
regParam: Double,
@@ -200,7 +200,7 @@ object SVMLocalRandomSGD {
* @return a SVMModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
regParam: Double)
@@ -219,7 +219,7 @@ object SVMLocalRandomSGD {
* @return a SVMModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int)
: SVMModel =
{
@@ -232,7 +232,7 @@ object SVMLocalRandomSGD {
System.exit(1)
}
val sc = new SparkContext(args(0), "SVM")
- val data = MLUtils.loadLabeledData(sc, args(1))
+ val data = MLUtils.loadLabeledData(sc, args(1)).map(yx => (yx._1.toInt, yx._2))
val model = SVMLocalRandomSGD.train(data, args(4).toInt, args(2).toDouble, args(3).toDouble)
sc.stop()
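
SVMLocalRandomSGD changes identically; its overloads additionally take a regularization parameter. A hedged usage sketch of the four-argument overload exercised by the main method above (the data path and parameter values are illustrative, not from the commit):

import spark.SparkContext
import spark.mllib.classification.SVMLocalRandomSGD
import spark.mllib.util.MLUtils

object SVMIntLabelCaller {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "SVMIntLabelCaller")
    val data = MLUtils.loadLabeledData(sc, "data/svm.txt")
      .map { case (y, x) => (y.toInt, x) } // Int labels, per this commit
    // numIterations = 100, stepSize = 1.0, regParam = 0.1 (illustrative values)
    val model = SVMLocalRandomSGD.train(data, 100, 1.0, 0.1)
    sc.stop()
  }
}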
diff --git a/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala
index 144b8b1bc7..3aa9fe6d12 100644
--- a/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -38,7 +38,7 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll {
offset: Double,
scale: Double,
nPoints: Int,
- seed: Int): Seq[(Double, Array[Double])] = {
+ seed: Int): Seq[(Int, Array[Double])] = {
val rnd = new Random(seed)
val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian())
@@ -51,19 +51,19 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll {
// y <- A + B*x + rLogis()
// y <- as.numeric(y > 0)
- val y: Seq[Double] = (0 until nPoints).map { i =>
+ val y: Seq[Int] = (0 until nPoints).map { i =>
val yVal = offset + scale * x1(i) + rLogis(i)
- if (yVal > 0) 1.0 else 0.0
+ if (yVal > 0) 1 else 0
}
val testData = (0 until nPoints).map(i => (y(i), Array(x1(i))))
testData
}
- def validatePrediction(predictions: Seq[Double], input: Seq[(Double, Array[Double])]) {
+ def validatePrediction(predictions: Seq[Int], input: Seq[(Int, Array[Double])]) {
val numOffPredictions = predictions.zip(input).filter { case (prediction, (expected, _)) =>
// A prediction is off if the prediction is more than 0.5 away from expected value.
- math.abs(prediction - expected) > 0.5
+ math.abs(prediction.toDouble - expected.toDouble) > 0.5
}.size
// At least 80% of the predictions should be on.
assert(numOffPredictions < input.length / 5)
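
The suite's tolerance check is unchanged in spirit: a prediction is "off" when it differs from the expected label by more than 0.5, which for 0/1 labels means any mismatch. A standalone rendering of the updated check (the method body matches the suite; the fixture data in main is made up):

object ValidatePredictionSketch {
  def validatePrediction(predictions: Seq[Int], input: Seq[(Int, Array[Double])]) {
    val numOffPredictions = predictions.zip(input).count {
      case (prediction, (expected, _)) =>
        // Widen both Ints so the 0.5-tolerance comparison stays in Double.
        math.abs(prediction.toDouble - expected.toDouble) > 0.5
    }
    // At least 80% of the predictions should be on.
    assert(numOffPredictions < input.length / 5)
  }

  def main(args: Array[String]) {
    val labels = Seq(1, 0, 1, 1, 0, 0, 1, 0, 1, 0)
    val input  = labels.map(y => (y, Array(y.toDouble))) // trivial one-feature rows
    validatePrediction(labels, input) // perfect predictions: 0 off out of 10
    println("All predictions within tolerance.")
  }
}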