author     Xinghao <pxinghao@gmail.com>  2013-07-28 22:12:39 -0700
committer  Xinghao <pxinghao@gmail.com>  2013-07-28 22:12:39 -0700
commit     96e04f4cb7de3a7c9d31aa7acba496d81066634e (patch)
tree       a81b6c706f31681ab3013bdac1f2403a48b7312d /mllib
parent     9398dced0331c0ec098ef5eb4616571874ceefb6 (diff)
Fixed SVM and LR train functions to take Int instead of Double for Classification
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala       | 17
-rw-r--r--  mllib/src/main/scala/spark/mllib/classification/SVM.scala                      | 16
-rw-r--r--  mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala  | 10
3 files changed, 21 insertions(+), 22 deletions(-)
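
With this change, callers supply Int class labels, while MLUtils.loadLabeledData continues to return Double labels, so call sites map labels with .toInt before training (exactly as the updated main methods in the diffs below do). A minimal caller-side sketch, assuming the pre-rename spark.* package layout of this era and an illustrative data path and parameter values:

import spark.SparkContext
import spark.mllib.classification.LogisticRegressionLocalRandomSGD
import spark.mllib.util.MLUtils

object IntLabelCaller {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "IntLabelCaller")
    // loadLabeledData still yields (Double, Array[Double]) pairs ...
    val raw = MLUtils.loadLabeledData(sc, "data/classification.txt")
    // ... so classification callers now narrow the label to Int.
    val data = raw.map { case (y, x) => (y.toInt, x) }
    // numIterations = 100, stepSize = 1.0, miniBatchFraction = 1.0 (illustrative values)
    val model = LogisticRegressionLocalRandomSGD.train(data, 100, 1.0, 1.0)
    sc.stop()
  }
}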
diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 0a7effb1d7..cbc0d03ae1 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -86,19 +86,19 @@ class LogisticRegressionLocalRandomSGD private (var stepSize: Double, var miniBa
this
}
- def train(input: RDD[(Double, Array[Double])]): LogisticRegressionModel = {
+ def train(input: RDD[(Int, Array[Double])]): LogisticRegressionModel = {
val nfeatures: Int = input.take(1)(0)._2.length
val initialWeights = Array.fill(nfeatures)(1.0)
train(input, initialWeights)
}
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
initialWeights: Array[Double]): LogisticRegressionModel = {
// Add a extra variable consisting of all 1.0's for the intercept.
val data = input.map { case (y, features) =>
- (y, Array(1.0, features:_*))
+ (y.toDouble, Array(1.0, features:_*))
}
val initalWeightsWithIntercept = Array(1.0, initialWeights:_*)
@@ -141,13 +141,12 @@ object LogisticRegressionLocalRandomSGD {
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @param stepSize Step size to be used for each iteration of gradient descent.
-
* @param miniBatchFraction Fraction of data to be used per iteration.
* @param initialWeights Initial set of weights to be used. Array should be equal in size to
* the number of features in the data.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
@@ -170,7 +169,7 @@ object LogisticRegressionLocalRandomSGD {
* @param miniBatchFraction Fraction of data to be used per iteration.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
@@ -192,7 +191,7 @@ object LogisticRegressionLocalRandomSGD {
* @return a LogisticRegressionModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double
)
@@ -211,7 +210,7 @@ object LogisticRegressionLocalRandomSGD {
* @return a LogisticRegressionModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int)
: LogisticRegressionModel =
{
@@ -224,7 +223,7 @@ object LogisticRegressionLocalRandomSGD {
System.exit(1)
}
val sc = new SparkContext(args(0), "LogisticRegression")
- val data = MLUtils.loadLabeledData(sc, args(1))
+ val data = MLUtils.loadLabeledData(sc, args(1)).map(yx => (yx._1.toInt, yx._2))
val model = LogisticRegressionLocalRandomSGD.train(data, args(4).toInt, args(2).toDouble, args(3).toDouble)
sc.stop()
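
Internally the gradient-descent optimizer still works in Double, so train widens each Int label back to Double at the same point where it prepends the 1.0 intercept column. A self-contained sketch of just that transformation, using a plain Seq in place of the RDD for brevity:

object InterceptPrependSketch {
  def main(args: Array[String]) {
    val input: Seq[(Int, Array[Double])] =
      Seq((1, Array(0.5, 2.0)), (0, Array(-1.0, 0.25)))
    // Mirrors the commit's map: the Int label is widened to Double and the
    // feature vector gains a leading constant 1.0 for the intercept term.
    val data = input.map { case (y, features) =>
      (y.toDouble, Array(1.0, features: _*))
    }
    data.foreach { case (y, x) => println(y + " -> " + x.mkString(", ")) }
  }
}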
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index 30766a4c64..15b689e7e0 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -94,19 +94,19 @@ class SVMLocalRandomSGD private (var stepSize: Double, var regParam: Double, var
this
}
- def train(input: RDD[(Double, Array[Double])]): SVMModel = {
+ def train(input: RDD[(Int, Array[Double])]): SVMModel = {
val nfeatures: Int = input.take(1)(0)._2.length
val initialWeights = Array.fill(nfeatures)(1.0)
train(input, initialWeights)
}
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
initialWeights: Array[Double]): SVMModel = {
// Add a extra variable consisting of all 1.0's for the intercept.
val data = input.map { case (y, features) =>
- (y, Array(1.0, features:_*))
+ (y.toDouble, Array(1.0, features:_*))
}
val initalWeightsWithIntercept = Array(1.0, initialWeights:_*)
@@ -155,7 +155,7 @@ object SVMLocalRandomSGD {
* the number of features in the data.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
regParam: Double,
@@ -178,7 +178,7 @@ object SVMLocalRandomSGD {
* @param miniBatchFraction Fraction of data to be used per iteration.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
regParam: Double,
@@ -200,7 +200,7 @@ object SVMLocalRandomSGD {
* @return a SVMModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int,
stepSize: Double,
regParam: Double)
@@ -219,7 +219,7 @@ object SVMLocalRandomSGD {
* @return a SVMModel which has the weights and offset from training.
*/
def train(
- input: RDD[(Double, Array[Double])],
+ input: RDD[(Int, Array[Double])],
numIterations: Int)
: SVMModel =
{
@@ -232,7 +232,7 @@ object SVMLocalRandomSGD {
System.exit(1)
}
val sc = new SparkContext(args(0), "SVM")
- val data = MLUtils.loadLabeledData(sc, args(1))
+ val data = MLUtils.loadLabeledData(sc, args(1)).map(yx => (yx._1.toInt, yx._2))
val model = SVMLocalRandomSGD.train(data, args(4).toInt, args(2).toDouble, args(3).toDouble)
sc.stop()
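
SVMLocalRandomSGD changes identically; its overloads additionally take a regularization parameter. A hedged usage sketch of the four-argument overload exercised by the main method above (the data path and parameter values are illustrative, not from the commit):

import spark.SparkContext
import spark.mllib.classification.SVMLocalRandomSGD
import spark.mllib.util.MLUtils

object SVMIntLabelCaller {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "SVMIntLabelCaller")
    val data = MLUtils.loadLabeledData(sc, "data/svm.txt")
      .map { case (y, x) => (y.toInt, x) } // Int labels, per this commit
    // numIterations = 100, stepSize = 1.0, regParam = 0.1 (illustrative values)
    val model = SVMLocalRandomSGD.train(data, 100, 1.0, 0.1)
    sc.stop()
  }
}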
diff --git a/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala
index 144b8b1bc7..3aa9fe6d12 100644
--- a/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -38,7 +38,7 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll {
offset: Double,
scale: Double,
nPoints: Int,
- seed: Int): Seq[(Double, Array[Double])] = {
+ seed: Int): Seq[(Int, Array[Double])] = {
val rnd = new Random(seed)
val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian())
@@ -51,19 +51,19 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll {
// y <- A + B*x + rLogis()
// y <- as.numeric(y > 0)
- val y: Seq[Double] = (0 until nPoints).map { i =>
+ val y: Seq[Int] = (0 until nPoints).map { i =>
val yVal = offset + scale * x1(i) + rLogis(i)
- if (yVal > 0) 1.0 else 0.0
+ if (yVal > 0) 1 else 0
}
val testData = (0 until nPoints).map(i => (y(i), Array(x1(i))))
testData
}
- def validatePrediction(predictions: Seq[Double], input: Seq[(Double, Array[Double])]) {
+ def validatePrediction(predictions: Seq[Int], input: Seq[(Int, Array[Double])]) {
val numOffPredictions = predictions.zip(input).filter { case (prediction, (expected, _)) =>
// A prediction is off if the prediction is more than 0.5 away from expected value.
- math.abs(prediction - expected) > 0.5
+ math.abs(prediction.toDouble - expected.toDouble) > 0.5
}.size
// At least 80% of the predictions should be on.
assert(numOffPredictions < input.length / 5)
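
The suite's tolerance check is unchanged in spirit: a prediction is "off" when it differs from the expected label by more than 0.5, which for 0/1 labels means any mismatch. A standalone rendering of the updated check (the method body matches the suite; the fixture data in main is made up):

object ValidatePredictionSketch {
  def validatePrediction(predictions: Seq[Int], input: Seq[(Int, Array[Double])]) {
    val numOffPredictions = predictions.zip(input).count {
      case (prediction, (expected, _)) =>
        // Widen both Ints so the 0.5-tolerance comparison stays in Double.
        math.abs(prediction.toDouble - expected.toDouble) > 0.5
    }
    // At least 80% of the predictions should be on.
    assert(numOffPredictions < input.length / 5)
  }

  def main(args: Array[String]) {
    val labels = Seq(1, 0, 1, 1, 0, 0, 1, 0, 1, 0)
    val input  = labels.map(y => (y, Array(y.toDouble))) // trivial one-feature rows
    validatePrediction(labels, input) // perfect predictions: 0 off out of 10
    println("All predictions within tolerance.")
  }
}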