aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorGinger Smith <vsmith@berkeley.edu>2013-08-05 11:22:18 -0700
committerGinger Smith <vsmith@berkeley.edu>2013-08-05 11:22:18 -0700
commit8c8947e2b66169dddb828b801ffaa43cc400b8a5 (patch)
treea44e0e06e801e4e9fe421e28ec46730825cbca85 /mllib
parent4ab4df5edbc1bded810a8a3e1dfc7f8ae40a7c30 (diff)
downloadspark-8c8947e2b66169dddb828b801ffaa43cc400b8a5.tar.gz
spark-8c8947e2b66169dddb828b801ffaa43cc400b8a5.tar.bz2
spark-8c8947e2b66169dddb828b801ffaa43cc400b8a5.zip
fixing formatting
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala39
1 files changed, 23 insertions, 16 deletions
diff --git a/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala
index 8637d27cd0..1d2b5c89f0 100644
--- a/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala
@@ -24,25 +24,32 @@ import org.jblas.DoubleMatrix
import spark.{RDD, SparkContext}
import spark.mllib.util.MLUtils
+/**
+* Generate RDD(s) containing data for Matrix Factorization.
+*
+* This method samples training entries according to the oversampling factor
+* 'tr_samp_fact', a multiplier applied to the number of degrees of freedom
+* of a rank-'rank' m-by-n matrix: rank*(m+n-rank).
+*
+* It optionally samples entries for a testing matrix using
+* 'te_samp_fact', the percentage of the number of training entries
+* to use for testing.
+*
+* This method takes the following inputs:
+* sparkMaster (String) The master URL.
+* outputPath (String) Directory to save output.
+* m (Int) Number of rows in data matrix.
+* n (Int) Number of columns in data matrix.
+* rank (Int) Underlying rank of data matrix.
+* tr_samp_fact (Double) Oversampling factor.
+* noise (Boolean) Whether to add Gaussian noise to training data.
+* sigma (Double) Standard deviation of added Gaussian noise.
+* test (Boolean) Whether to create testing RDD.
+* te_samp_fact (Double) Percentage of training data to use as test data.
+*/
object MFDataGenerator{
- /**
- * Generate RDD(s) containing data for Matrix Factorization. This function chooses
- * positive labels with probability `probOne` and scales positive examples by `eps`.
- *
- * @param sc SparkContext to use for creating the RDD.
- * @param outputPath Directory to save output.
- * @param m Number of rows in data matrix.
- * @param n Number of columns in data matrix.
- * @param rank Underlying rank of data matrix.
- * @param tr_samp_fact Oversampling factor.
- * @param noise Boolean value - whether to add gaussian noise to training data.
- * @param sigma Standard deviation of added gaussian noise.
- * @param test Boolean value - whether to create testing RDD.
- * @param te_samp_fact Percentage of training data to use as test data.
- */
-
def main(args: Array[String]) {
if (args.length != 10) {
println("Usage: MFGenerator " +