author     Shivaram Venkataraman <shivaram@eecs.berkeley.edu>  2013-08-11 19:02:43 -0700
committer  Shivaram Venkataraman <shivaram@eecs.berkeley.edu>  2013-08-11 19:02:43 -0700
commit     4935a2558b18965f3ec6bc3963b6ce95e9fa3ef3 (patch)
tree       f1b5a8636d951d303a569c544cdf40b8cd93a7ff /mllib
parent     e5b9ed2833911cb894cf7ad05299aa1385a7e600 (diff)
Clean up scaladoc in ML Lib.
Also build and copy ML Lib scaladoc in Spark docs build. Some more minor cleanup with respect to naming, test locations etc.
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala | 15
-rw-r--r--  mllib/src/main/scala/spark/mllib/classification/SVM.scala | 8
-rw-r--r--  mllib/src/main/scala/spark/mllib/optimization/Gradient.scala | 23
-rw-r--r--  mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala | 19
-rw-r--r--  mllib/src/main/scala/spark/mllib/optimization/Updater.scala | 23
-rw-r--r--  mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala | 9
-rw-r--r--  mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala | 34
-rw-r--r--  mllib/src/main/scala/spark/mllib/regression/Lasso.scala | 10
-rw-r--r--  mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala | 4
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/KMeansDataGenerator.scala | 10
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/LassoDataGenerator.scala | 15
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/LogisticRegressionDataGenerator.scala | 8
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/MLUtils.scala | 21
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/RidgeRegressionDataGenerator.scala | 16
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala | 16
-rw-r--r--  mllib/src/test/java/spark/mllib/clustering/JavaKMeansSuite.java (renamed from mllib/src/test/scala/spark/mllib/clustering/JavaKMeansSuite.java) | 0
-rw-r--r--  mllib/src/test/java/spark/mllib/recommendation/JavaALSSuite.java (renamed from mllib/src/test/scala/spark/mllib/recommendation/JavaALSSuite.java) | 0
17 files changed, 171 insertions, 60 deletions
diff --git a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
index 73949b0103..30ee0ab0ff 100644
--- a/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -27,8 +27,10 @@ import scala.math.round
import org.jblas.DoubleMatrix
/**
- * Logistic Regression using Stochastic Gradient Descent.
- * Based on Matlab code written by John Duchi.
+ * Classification model trained using Logistic Regression.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
*/
class LogisticRegressionModel(
override val weights: Array[Double],
@@ -43,7 +45,10 @@ class LogisticRegressionModel(
}
}
-class LogisticRegressionWithSGD (
+/**
+ * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ */
+class LogisticRegressionWithSGD private (
var stepSize: Double,
var numIterations: Int,
var regParam: Double,
@@ -70,10 +75,10 @@ class LogisticRegressionWithSGD (
/**
* Top-level methods for calling Logistic Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
*/
object LogisticRegressionWithSGD {
+ // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+ // Java programs.
/**
* Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
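For context on how the documented class is used, here is a minimal driver sketch. It assumes, since the full signatures are not shown in this hunk, that LogisticRegressionWithSGD exposes a two-argument train(data, numIterations) overload operating on the labeled data produced by MLUtils.loadLabeledData; the input path is a placeholder.

import spark.SparkContext
import spark.mllib.classification.LogisticRegressionWithSGD
import spark.mllib.util.MLUtils

object LogisticRegressionExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "LogisticRegressionExample")
    // Each input line is "<label>, <f1> <f2> ..." (see the MLUtils changes below).
    val data = MLUtils.loadLabeledData(sc, "/path/to/labeled_data")
    // Train with 100 iterations of SGD using the default step size.
    val model = LogisticRegressionWithSGD.train(data, 100)
    // Predict the class (0.0 or 1.0) for a single feature vector.
    println("prediction: " + model.predict(Array(0.5, 1.2)))
    sc.stop()
  }
}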
diff --git a/mllib/src/main/scala/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
index fa9d5a9471..f799cb2829 100644
--- a/mllib/src/main/scala/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -26,7 +26,10 @@ import spark.mllib.util.MLUtils
import org.jblas.DoubleMatrix
/**
- * SVM using Stochastic Gradient Descent.
+ * Model built using SVM.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
*/
class SVMModel(
override val weights: Array[Double],
@@ -40,6 +43,9 @@ class SVMModel(
}
}
+/**
+ * Train an SVM using Stochastic Gradient Descent.
+ */
class SVMWithSGD private (
var stepSize: Double,
var numIterations: Int,
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
index 22b2ec5ed6..e72b8b3a92 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -19,18 +19,29 @@ package spark.mllib.optimization
import org.jblas.DoubleMatrix
+/**
+ * Class used to compute the gradient for a loss function, given a single data point.
+ */
abstract class Gradient extends Serializable {
/**
- * Compute the gradient for a given row of data.
+ * Compute the gradient and loss given features of a single data point.
*
- * @param data - One row of data. Row matrix of size 1xn where n is the number of features.
+ * @param data - Feature values for one data point. Column matrix of size nx1
+ * where n is the number of features.
* @param label - Label for this data item.
* @param weights - Column matrix containing weights for every feature.
+ *
+ * @return A tuple of 2 elements. The first element is a column matrix containing the computed
+ * gradient and the second element is the loss computed at this data point.
+ *
*/
def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double)
}
+/**
+ * Compute gradient and loss for a logistic loss function.
+ */
class LogisticGradient extends Gradient {
override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double) = {
@@ -49,7 +60,9 @@ class LogisticGradient extends Gradient {
}
}
-
+/**
+ * Compute gradient and loss for a least-squares loss function.
+ */
class SquaredGradient extends Gradient {
override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double) = {
@@ -62,7 +75,9 @@ class SquaredGradient extends Gradient {
}
}
-
+/**
+ * Compute gradient and loss for a Hinge loss function.
+ */
class HingeGradient extends Gradient {
override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double) = {
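The compute contract documented above is easiest to see with a concrete loss. A minimal squared-loss gradient might look like the sketch below; this is illustrative only, and the constants in the actual SquaredGradient may differ.

import org.jblas.DoubleMatrix
import spark.mllib.optimization.Gradient

class ExampleSquaredGradient extends Gradient {
  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
      (DoubleMatrix, Double) = {
    // data and weights are both nx1 column matrices.
    val diff = data.dot(weights) - label       // prediction error at this point
    val gradient = data.mul(2.0 * diff)        // d/dw of (w.x - y)^2
    val loss = diff * diff                     // squared loss at this point
    (gradient, loss)
  }
}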
diff --git a/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala
index 1f04398d0c..31917df7e8 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala
@@ -24,12 +24,17 @@ import org.jblas.DoubleMatrix
import scala.collection.mutable.ArrayBuffer
+/**
+ * Class used to solve an optimization problem using Gradient Descent.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {
- var stepSize: Double = 1.0
- var numIterations: Int = 100
- var regParam: Double = 0.0
- var miniBatchFraction: Double = 1.0
+ private var stepSize: Double = 1.0
+ private var numIterations: Int = 100
+ private var regParam: Double = 0.0
+ private var miniBatchFraction: Double = 1.0
/**
* Set the step size per-iteration of SGD. Default 1.0.
@@ -97,10 +102,10 @@ class GradientDescent(var gradient: Gradient, var updater: Updater) extends Opti
}
+// Top-level method to run gradient descent.
object GradientDescent extends Logging {
/**
* Run gradient descent in parallel using mini batches.
- * Based on Matlab code written by John Duchi.
*
* @param data - Input data for SGD. RDD of form (label, [feature values]).
* @param gradient - Gradient object that will be used to compute the gradient.
@@ -137,8 +142,8 @@ object GradientDescent extends Logging {
for (i <- 1 to numIterations) {
val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map {
case (y, features) =>
- val featuresRow = new DoubleMatrix(features.length, 1, features:_*)
- val (grad, loss) = gradient.compute(featuresRow, y, weights)
+ val featuresCol = new DoubleMatrix(features.length, 1, features:_*)
+ val (grad, loss) = gradient.compute(featuresCol, y, weights)
(grad, loss)
}.reduce((a, b) => (a._1.addi(b._1), a._2 + b._2))
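Stripped of the RDD sampling and reduce shown above, one iteration of mini-batch SGD is just: sample a fraction of the data, average the gradients, and take a step whose size shrinks over time. A self-contained local sketch of the same idea follows; the stepSize / sqrt(iter) schedule mirrors SimpleUpdater, and everything else is toy data.

import scala.math.sqrt
import scala.util.Random

object MiniBatchSGDSketch {
  def main(args: Array[String]) {
    val rnd = new Random(42)
    // Toy 1-D least-squares problem: y = 3 * x plus a little noise.
    val data = Array.fill(1000) {
      val x = rnd.nextDouble()
      (x, 3.0 * x + 0.01 * rnd.nextGaussian())
    }
    var weight = 1.0
    val stepSize = 1.0
    val miniBatchFraction = 0.1
    for (iter <- 1 to 100) {
      val batch = data.filter(_ => rnd.nextDouble() < miniBatchFraction)
      // Average gradient of (w*x - y)^2 over the mini batch.
      val gradient = batch.map { case (x, y) => 2.0 * (weight * x - y) * x }.sum /
        math.max(batch.length, 1)
      // Decreasing learning rate, no regularization.
      weight -= (stepSize / sqrt(iter)) * gradient
    }
    println("learned weight (should be close to 3.0): " + weight)
  }
}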
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Updater.scala b/mllib/src/main/scala/spark/mllib/optimization/Updater.scala
index 3ebc1409b6..db67d6b0bc 100644
--- a/mllib/src/main/scala/spark/mllib/optimization/Updater.scala
+++ b/mllib/src/main/scala/spark/mllib/optimization/Updater.scala
@@ -20,10 +20,14 @@ package spark.mllib.optimization
import scala.math._
import org.jblas.DoubleMatrix
+/**
+ * Class used to update weights used in Gradient Descent.
+ */
abstract class Updater extends Serializable {
/**
- * Compute an updated value for weights given the gradient, stepSize and iteration number.
- * Also returns the regularization value computed using the *updated* weights.
+ * Compute an updated value for weights given the gradient, stepSize, iteration number and
+ * regularization parameter. Also returns the regularization value computed using the
+ * *updated* weights.
*
* @param weightsOld - Column matrix of size nx1 where n is the number of features.
* @param gradient - Column matrix of size nx1 where n is the number of features.
@@ -38,6 +42,10 @@ abstract class Updater extends Serializable {
regParam: Double): (DoubleMatrix, Double)
}
+/**
+ * A simple updater that adaptively adjusts the learning rate using the
+ * square root of the number of iterations. Does not perform any regularization.
+ */
class SimpleUpdater extends Updater {
override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
@@ -48,11 +56,15 @@ class SimpleUpdater extends Updater {
}
/**
- * L1 regularization -- corresponding proximal operator is the soft-thresholding function
- * That is, each weight component is shrunk towards 0 by shrinkageVal
+ * Updater that adjusts learning rate and performs L1 regularization.
+ *
+ * The corresponding proximal operator used is the soft-thresholding function.
+ * That is, each weight component is shrunk towards 0 by shrinkageVal.
+ *
* If w > shrinkageVal, set weight component to w-shrinkageVal.
* If w < -shrinkageVal, set weight component to w+shrinkageVal.
* If -shrinkageVal < w < shrinkageVal, set weight component to 0.
+ *
* Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
*/
class L1Updater extends Updater {
@@ -72,6 +84,9 @@ class L1Updater extends Updater {
}
}
+/**
+ * Updater that adjusts the learning rate and performs L2 regularization.
+ */
class SquaredL2Updater extends Updater {
override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
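The shrinkage rule spelled out for L1Updater above can be written directly as element-wise soft thresholding. A small sketch follows; the shrinkage amount shown (regParam scaled by the decayed step size) is an assumption about how shrinkageVal is derived.

import scala.math.{abs, max, signum, sqrt}

object SoftThresholdSketch {
  def softThreshold(weights: Array[Double], stepSize: Double, iter: Int,
      regParam: Double): Array[Double] = {
    // Assumed shrinkage amount: regularization parameter times the decayed step size.
    val shrinkageVal = regParam * (stepSize / sqrt(iter))
    // signum(w) * max(0.0, abs(w) - shrinkageVal), exactly as described above.
    weights.map(w => signum(w) * max(0.0, abs(w) - shrinkageVal))
  }

  def main(args: Array[String]) {
    // With shrinkageVal = 0.1: 0.25 -> 0.15, -0.05 -> 0.0, -0.7 -> -0.6
    println(softThreshold(Array(0.25, -0.05, -0.7), 1.0, 100, 1.0).mkString(", "))
  }
}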
diff --git a/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala
index 38637b3dd1..5e21717da5 100644
--- a/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -22,6 +22,15 @@ import spark.SparkContext._
import org.jblas._
+/**
+ * Model representing the result of matrix factorization.
+ *
+ * @param rank Rank for the features in this model.
+ * @param userFeatures RDD of tuples where each tuple represents the userId and
+ * the features computed for this user.
+ * @param productFeatures RDD of tuples where each tuple represents the productId
+ * and the features computed for this product.
+ */
class MatrixFactorizationModel(
val rank: Int,
val userFeatures: RDD[(Int, Array[Double])],
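Although not spelled out in this hunk, the model documented above predicts a rating for a (user, product) pair as the dot product of the two rank-length feature vectors. A sketch of that idea, taking vectors already looked up from userFeatures and productFeatures:

object FactorizationPredictSketch {
  // Both arrays have `rank` entries; the predicted rating is their dot product.
  def predictedRating(userFeatures: Array[Double], productFeatures: Array[Double]): Double =
    userFeatures.zip(productFeatures).map { case (u, p) => u * p }.sum
}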
diff --git a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 8ea823b307..4ecafff08b 100644
--- a/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -24,8 +24,11 @@ import org.jblas.DoubleMatrix
/**
* GeneralizedLinearModel (GLM) represents a model trained using
- * GeneralizedLinearAlgorithm. GLMs consist of a weight vector,
+ * GeneralizedLinearAlgorithm. GLMs consist of a weight vector and
* an intercept.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
*/
abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
extends Serializable {
@@ -43,6 +46,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
intercept: Double): Double
+ /**
+ * Predict values for the given data set using the model trained.
+ *
+ * @param testData RDD representing data points to be predicted
+ * @return RDD[Double] where each entry contains the corresponding prediction
+ */
def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
// A small optimization to avoid serializing the entire model. Only the weightsMatrix
// and intercept is needed.
@@ -55,6 +64,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
}
}
+ /**
+ * Predict values for a single data point using the model trained.
+ *
+ * @param testData array representing a single data point
+ * @return Double prediction from the trained model
+ */
def predict(testData: Array[Double]): Double = {
val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
predictPoint(dataMat, weightsMatrix, intercept)
@@ -62,7 +77,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
}
/**
- * GeneralizedLinearAlgorithm abstracts out the training for all GLMs.
+ * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM).
* This class should be extended with an Optimizer to create a new GLM.
*/
abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
@@ -70,9 +85,12 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
val optimizer: Optimizer
- def createModel(weights: Array[Double], intercept: Double): M
+ /**
+ * Create a model given the weights and intercept
+ */
+ protected def createModel(weights: Array[Double], intercept: Double): M
- var addIntercept: Boolean
+ protected var addIntercept: Boolean
/**
* Set if the algorithm should add an intercept. Default true.
@@ -82,12 +100,20 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
this
}
+ /**
+ * Run the algorithm with the configured parameters on an input
+ * RDD of LabeledPoint entries.
+ */
def run(input: RDD[LabeledPoint]) : M = {
val nfeatures: Int = input.first().features.length
val initialWeights = Array.fill(nfeatures)(1.0)
run(input, initialWeights)
}
+ /**
+ * Run the algorithm with the configured parameters on an input RDD
+ * of LabeledPoint entries starting from the initial weights provided.
+ */
def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
// Add a extra variable consisting of all 1.0's for the intercept.
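For a concrete GLM, the predictPoint method documented above usually boils down to a dot product plus the intercept, optionally followed by a link such as the threshold used by the classifiers. A sketch of the plain linear case, not the exact body of any one model here:

import org.jblas.DoubleMatrix

object LinearPredictSketch {
  // dataMatrix and weightMatrix both hold n feature values; jblas dot treats them as vectors.
  def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
      intercept: Double): Double =
    dataMatrix.dot(weightMatrix) + intercept
}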
diff --git a/mllib/src/main/scala/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
index 989e5ded58..6bbc990a5a 100644
--- a/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/Lasso.scala
@@ -24,8 +24,10 @@ import spark.mllib.util.MLUtils
import org.jblas.DoubleMatrix
/**
- * Lasso using Stochastic Gradient Descent.
+ * Regression model trained using Lasso.
*
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
*/
class LassoModel(
override val weights: Array[Double],
@@ -39,8 +41,10 @@ class LassoModel(
}
}
-
-class LassoWithSGD (
+/**
+ * Train a regression model with L1-regularization using Stochastic Gradient Descent.
+ */
+class LassoWithSGD private (
var stepSize: Double,
var numIterations: Int,
var regParam: Double,
diff --git a/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala
index de790dde51..b42d94af41 100644
--- a/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala
+++ b/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala
@@ -168,10 +168,10 @@ class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)
/**
* Top-level methods for calling Ridge Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
*/
object RidgeRegression {
+ // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+ // Java programs.
/**
* Train a ridge regression model given an RDD of (response, features) pairs.
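The NOTE moved into the object above exists because Scala default arguments are not visible to Java callers; the train entry points are therefore plain overloads that fill in the defaults by hand. A schematic sketch of the pattern with toy signatures, not the real ones:

object OverloadPatternSketch {
  // Full-control variant.
  def train(data: Seq[Double], numIterations: Int, stepSize: Double): String =
    "trained: " + numIterations + " iterations, step " + stepSize
  // Convenience variant Java callers can use instead of a Scala default argument.
  def train(data: Seq[Double], numIterations: Int): String =
    train(data, numIterations, 1.0)
}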
diff --git a/mllib/src/main/scala/spark/mllib/util/KMeansDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/KMeansDataGenerator.scala
index c89e5dd738..672b63f65a 100644
--- a/mllib/src/main/scala/spark/mllib/util/KMeansDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/KMeansDataGenerator.scala
@@ -21,12 +21,16 @@ import scala.util.Random
import spark.{RDD, SparkContext}
+/**
+ * Generate test data for KMeans. This class first chooses k cluster centers
+ * from a d-dimensional Gaussian distribution scaled by factor r and then creates a Gaussian
+ * cluster with scale 1 around each center.
+ */
+
object KMeansDataGenerator {
/**
- * Generate an RDD containing test data for KMeans. This function chooses k cluster centers
- * from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
- * cluster with scale 1 around each center.
+ * Generate an RDD containing test data for KMeans.
*
* @param sc SparkContext to use for creating the RDD
* @param numPoints Number of points that will be contained in the RDD
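The generation scheme described in the new scaladoc, without the RDD plumbing, is: draw k centers from a d-dimensional Gaussian scaled by r, then scatter unit-scale Gaussian points around them. A local sketch of that process:

import scala.util.Random

object KMeansDataSketch {
  def generate(numPoints: Int, k: Int, d: Int, r: Double, seed: Int = 42): Array[Array[Double]] = {
    val rand = new Random(seed)
    // k cluster centers drawn from a d-dimensional Gaussian scaled by r.
    val centers = Array.fill(k, d)(rand.nextGaussian() * r)
    // Each point: pick a center uniformly, add unit-scale Gaussian noise per dimension.
    Array.fill(numPoints) {
      val c = centers(rand.nextInt(k))
      c.map(_ + rand.nextGaussian())
    }
  }
}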
diff --git a/mllib/src/main/scala/spark/mllib/util/LassoDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/LassoDataGenerator.scala
index 1f185c9de7..eeb14fc4e3 100644
--- a/mllib/src/main/scala/spark/mllib/util/LassoDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/LassoDataGenerator.scala
@@ -1,18 +1,22 @@
-package spark.mllib.regression
+package spark.mllib.util
import scala.util.Random
import org.jblas.DoubleMatrix
import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
+import spark.mllib.regression.LabeledPoint
-object LassoGenerator {
+/**
+ * Generate sample data used for Lasso Regression. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate response variables.
+ */
+object LassoDataGenerator {
def main(args: Array[String]) {
- if (args.length != 5) {
+ if (args.length < 2) {
println("Usage: LassoGenerator " +
- "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+ "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
System.exit(1)
}
@@ -21,7 +25,6 @@ object LassoGenerator {
val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
val parts: Int = if (args.length > 4) args(4).toInt else 2
- val eps = 3
val sc = new SparkContext(sparkMaster, "LassoGenerator")
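Per the new scaladoc, the generator draws uniform random features and forms the response as a weighted sum plus Gaussian noise with weight 0.1. A sketch of generating one (label, features) example under that description, with the true weight vector supplied by the caller:

import scala.util.Random

object LassoDataSketch {
  def onePoint(trueWeights: Array[Double], rnd: Random): (Double, Array[Double]) = {
    // Uniform random features, as described above.
    val x = Array.fill(trueWeights.length)(rnd.nextDouble())
    // Linear response plus Gaussian noise with weight 0.1.
    val y = x.zip(trueWeights).map { case (xi, wi) => xi * wi }.sum + 0.1 * rnd.nextGaussian()
    (y, x)
  }
}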
diff --git a/mllib/src/main/scala/spark/mllib/util/LogisticRegressionDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/LogisticRegressionDataGenerator.scala
index 4fa19c3c23..d6402f23e2 100644
--- a/mllib/src/main/scala/spark/mllib/util/LogisticRegressionDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/LogisticRegressionDataGenerator.scala
@@ -22,11 +22,15 @@ import scala.util.Random
import spark.{RDD, SparkContext}
import spark.mllib.regression.LabeledPoint
+/**
+ * Generate test data for LogisticRegression. This class chooses positive labels
+ * with probability `probOne` and scales features for positive examples by `eps`.
+ */
+
object LogisticRegressionDataGenerator {
/**
- * Generate an RDD containing test data for LogisticRegression. This function chooses
- * positive labels with probability `probOne` and scales positive examples by `eps`.
+ * Generate an RDD containing test data for LogisticRegression.
*
* @param sc SparkContext to use for creating the RDD.
* @param nexamples Number of examples that will be contained in the RDD.
diff --git a/mllib/src/main/scala/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/spark/mllib/util/MLUtils.scala
index 9174e8cea7..4e030a81b4 100644
--- a/mllib/src/main/scala/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/spark/mllib/util/MLUtils.scala
@@ -24,18 +24,19 @@ import org.jblas.DoubleMatrix
import spark.mllib.regression.LabeledPoint
/**
- * Helper methods to load and save data
- * Data format:
- * <l>, <f1> <f2> ...
- * where <f1>, <f2> are feature values in Double and <l> is the corresponding label as Double.
+ * Helper methods to load, save and pre-process data used in ML Lib.
*/
object MLUtils {
/**
+ * Load labeled data from a file. The data format used here is
+ * <L>, <f1> <f2> ...
+ * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+ *
* @param sc SparkContext
* @param dir Directory to the input data files.
- * @return An RDD of tuples. For each tuple, the first element is the label, and the second
- * element represents the feature values (an array of Double).
+ * @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
+ * the label, and the second element represents the feature values (an array of Double).
*/
def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
sc.textFile(dir).map { line =>
@@ -46,6 +47,14 @@ object MLUtils {
}
}
+ /**
+ * Save labeled data to a file. The data format used here is
+ * <L>, <f1> <f2> ...
+ * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+ *
+ * @param data An RDD of LabeledPoints containing data to be saved.
+ * @param dir Directory to save the data.
+ */
def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
val dataStr = data.map(x => x.label + "," + x.features.mkString(" "))
dataStr.saveAsTextFile(dir)
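The "<L>, <f1> <f2> ..." format documented above is one label, a comma, then space-separated feature values per line. A small sketch of parsing and formatting a single line, mirroring loadLabeledData and saveLabeledData, under the assumption that LabeledPoint is the simple (label, features) case class used elsewhere in this change:

import spark.mllib.regression.LabeledPoint

object LabeledLineSketch {
  // "1.0, 2.5 3.1 0.0"  ->  LabeledPoint(1.0, Array(2.5, 3.1, 0.0))
  def parse(line: String): LabeledPoint = {
    val parts = line.split(',')
    val label = parts(0).toDouble
    val features = parts(1).trim.split(' ').map(_.toDouble)
    LabeledPoint(label, features)
  }

  // The inverse, as saveLabeledData writes it.
  def format(p: LabeledPoint): String = p.label + "," + p.features.mkString(" ")
}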
diff --git a/mllib/src/main/scala/spark/mllib/util/RidgeRegressionDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/RidgeRegressionDataGenerator.scala
index c4d65c3f9a..4d329168be 100644
--- a/mllib/src/main/scala/spark/mllib/util/RidgeRegressionDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/RidgeRegressionDataGenerator.scala
@@ -24,18 +24,24 @@ import org.jblas.DoubleMatrix
import spark.{RDD, SparkContext}
import spark.mllib.regression.LabeledPoint
+/**
+ * Generate sample data used for RidgeRegression. This class generates
+ * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
+ * response variable `Y`.
+ *
+ */
object RidgeRegressionDataGenerator {
/**
- * Generate an RDD containing test data used for RidgeRegression. This function generates
- * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
- * response variable `Y`.
+ * Generate an RDD containing sample data for RidgeRegression.
*
* @param sc SparkContext to be used for generating the RDD.
* @param nexamples Number of examples that will be contained in the RDD.
* @param nfeatures Number of features to generate for each example.
* @param eps Epsilon factor by which examples are scaled.
* @param nparts Number of partitions in the RDD. Default value is 2.
+ *
+ * @return RDD of LabeledPoint containing sample data.
*/
def generateRidgeRDD(
sc: SparkContext,
@@ -69,9 +75,9 @@ object RidgeRegressionDataGenerator {
}
def main(args: Array[String]) {
- if (args.length != 5) {
+ if (args.length < 2) {
println("Usage: RidgeRegressionGenerator " +
- "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+ "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
System.exit(1)
}
diff --git a/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
index a37f6eb3b3..e02bd190f6 100644
--- a/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/SVMDataGenerator.scala
@@ -1,22 +1,23 @@
-package spark.mllib.classification
+package spark.mllib.util
import scala.util.Random
import scala.math.signum
-import org.jblas.DoubleMatrix
-
import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
import org.jblas.DoubleMatrix
import spark.mllib.regression.LabeledPoint
-object SVMGenerator {
+/**
+ * Generate sample data used for SVM. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate labels.
+ */
+object SVMDataGenerator {
def main(args: Array[String]) {
- if (args.length != 5) {
+ if (args.length < 2) {
println("Usage: SVMGenerator " +
- "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+ "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
System.exit(1)
}
@@ -25,7 +26,6 @@ object SVMGenerator {
val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
val parts: Int = if (args.length > 4) args(4).toInt else 2
- val eps = 3
val sc = new SparkContext(sparkMaster, "SVMGenerator")
diff --git a/mllib/src/test/scala/spark/mllib/clustering/JavaKMeansSuite.java b/mllib/src/test/java/spark/mllib/clustering/JavaKMeansSuite.java
index 3f2d82bfb4..3f2d82bfb4 100644
--- a/mllib/src/test/scala/spark/mllib/clustering/JavaKMeansSuite.java
+++ b/mllib/src/test/java/spark/mllib/clustering/JavaKMeansSuite.java
diff --git a/mllib/src/test/scala/spark/mllib/recommendation/JavaALSSuite.java b/mllib/src/test/java/spark/mllib/recommendation/JavaALSSuite.java
index 7993629a6d..7993629a6d 100644
--- a/mllib/src/test/scala/spark/mllib/recommendation/JavaALSSuite.java
+++ b/mllib/src/test/java/spark/mllib/recommendation/JavaALSSuite.java