author    Ginger Smith <vsmith@berkeley.edu>  2013-08-05 21:26:24 -0700
committer Ginger Smith <vsmith@berkeley.edu>  2013-08-05 21:26:24 -0700
commit    bf7033f3ebf9315ccf9aba09a6e702c3a671fd8d (patch)
tree      cc8e8b799574b2454b6b446c421a83e58d908aa8 /mllib
parent    8c8947e2b66169dddb828b801ffaa43cc400b8a5 (diff)
fixing formatting, style, and input
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala | 73
1 file changed, 37 insertions(+), 36 deletions(-)
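
This commit renames the generator's snake_case parameters to camelCase and, per the new usage string, makes every argument after <master> and <outputDir> optional. A minimal sketch of driving the generator with those defaults; the local master URL and output path are illustrative, not part of the commit:

    import spark.mllib.util.MFDataGenerator

    // Only the master URL and the output directory are required after this
    // change; m, n, rank, trainSampFact, noise, sigma, test, and testSampFact
    // default to 100, 100, 10, 1.0, false, 0.1, false, and 0.1 (see the diff).
    MFDataGenerator.main(Array("local", "/tmp/mf-data"))
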
diff --git a/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala
index 1d2b5c89f0..88992cde0c 100644
--- a/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala
+++ b/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala
@@ -28,32 +28,32 @@ import spark.mllib.util.MLUtils
* Generate RDD(s) containing data for Matrix Factorization.
*
* This method samples training entries according to the oversampling factor
-* 'tr_samp_fact', which is a multiplicative factor of the number of
+* 'trainSampFact', which is a multiplicative factor of the number of
* degrees of freedom of the matrix: rank*(m+n-rank).
*
* It optionally samples entries for a testing matrix using
-* 'te_samp_fact', the percentage of the number of training entries
+* 'testSampFact', the percentage of the number of training entries
* to use for testing.
*
* This method takes the following inputs:
-* sparkMaster (String) The master URL.
-* outputPath (String) Directory to save output.
-* m (Int) Number of rows in data matrix.
-* n (Int) Number of columns in data matrix.
-* rank (Int) Underlying rank of data matrix.
-* tr_samp_fact (Double) Oversampling factor.
-* noise (Boolean) Whether to add gaussian noise to training data.
-* sigma (Double) Standard deviation of added gaussian noise.
-* test (Boolean) Whether to create testing RDD.
-* te_samp_fact (Double) Percentage of training data to use as test data.
+* sparkMaster (String) The master URL.
+* outputPath (String) Directory to save output.
+* m (Int) Number of rows in data matrix.
+* n (Int) Number of columns in data matrix.
+* rank (Int) Underlying rank of data matrix.
+* trainSampFact (Double) Oversampling factor.
+* noise (Boolean) Whether to add Gaussian noise to training data.
+* sigma (Double) Standard deviation of added Gaussian noise.
+* test (Boolean) Whether to create testing RDD.
+* testSampFact (Double) Percentage of training data to use as test data.
*/
object MFDataGenerator{
def main(args: Array[String]) {
- if (args.length != 10) {
- println("Usage: MFGenerator " +
- "<master> <output_dir> <m> <n> <rank> <tr_samp_fact> <noise> <sigma> <test> <te_samp_fact>")
+ if (args.length < 2) {
+ println("Usage: MFDataGenerator " +
+ "<master> <outputDir> [m] [n] [rank] [trainSampFact] [noise] [sigma] [test] [testSampFact]")
System.exit(1)
}
@@ -62,51 +62,52 @@ object MFDataGenerator{
val m: Int = if (args.length > 2) args(2).toInt else 100
val n: Int = if (args.length > 3) args(3).toInt else 100
val rank: Int = if (args.length > 4) args(4).toInt else 10
- val tr_samp_fact: Double = if (args.length > 5) args(5).toDouble else 1.0
+ val trainSampFact: Double = if (args.length > 5) args(5).toDouble else 1.0
val noise: Boolean = if (args.length > 6) args(6).toBoolean else false
val sigma: Double = if (args.length > 7) args(7).toDouble else 0.1
val test: Boolean = if (args.length > 8) args(8).toBoolean else false
- val te_samp_fact: Double = if (args.length > 9) args(9).toDouble else 0.1
+ val testSampFact: Double = if (args.length > 9) args(9).toDouble else 0.1
val sc = new SparkContext(sparkMaster, "MFDataGenerator")
- val A = DoubleMatrix.randn(m,rank)
- val B = DoubleMatrix.randn(rank,n)
- val z = 1/(scala.math.sqrt(scala.math.sqrt(rank)))
+ val A = DoubleMatrix.randn(m, rank)
+ val B = DoubleMatrix.randn(rank, n)
+ val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank)))
A.mmuli(z)
B.mmuli(z)
val fullData = A.mmul(B)
- val df = rank*(m+n-rank)
- val sampsize = scala.math.min(scala.math.round(tr_samp_fact*df), scala.math.round(.99*m*n)).toInt
+ val df = rank * (m + n - rank)
+ val sampSize = scala.math.min(scala.math.round(trainSampFact * df),
+ scala.math.round(.99 * m * n)).toInt
val rand = new Random()
- val mn = m*n
+ val mn = m * n
val shuffled = rand.shuffle(1 to mn toIterable)
- val omega = shuffled.slice(0,sampsize)
+ val omega = shuffled.slice(0, sampSize)
val ordered = omega.sortWith(_ < _).toArray
val trainData: RDD[(Int, Int, Double)] = sc.parallelize(ordered)
- .map(x => (fullData.indexRows(x-1),fullData.indexColumns(x-1),fullData.get(x-1)))
+ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1)))
// optionally add gaussian noise
- if(noise){
- trainData.map(x => (x._1,x._2,x._3+rand.nextGaussian*sigma))
+ if (noise) {
+ trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma))
}
trainData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)
// optionally generate testing data
- if(test){
- val test_sampsize = scala.math
- .min(scala.math.round(sampsize*te_samp_fact),scala.math.round(mn-sampsize))
- .toInt
- val test_omega = shuffled.slice(sampsize,sampsize+test_sampsize)
- val test_ordered = test_omega.sortWith(_ < _).toArray
- val testData: RDD[(Int, Int, Double)] = sc.parallelize(test_ordered)
- .map(x=> (fullData.indexRows(x-1),fullData.indexColumns(x-1),fullData.get(x-1)))
+ if (test) {
+ val testSampSize = scala.math
+ .min(scala.math.round(sampSize * testSampFact), scala.math.round(mn - sampSize)).toInt
+ val testOmega = shuffled.slice(sampSize, sampSize + testSampSize)
+ val testOrdered = testOmega.sortWith(_ < _).toArray
+ val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered)
+ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1)))
testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)
}
- sc.stop()
+ sc.stop()
+
}
}
\ No newline at end of file
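
For reference, the sample-size arithmetic in the new code is self-contained: the training set observes trainSampFact times the matrix's degrees of freedom, rank * (m + n - rank), capped at 99% of the m * n entries, and the optional test set takes a testSampFact fraction of the training count from the entries left over. A standalone sketch with the generator's default parameters (plain Scala, no Spark needed; the printed values follow directly from the formulas above):

    object SampleSizeSketch {
      def main(args: Array[String]) {
        val m = 100; val n = 100; val rank = 10
        val trainSampFact = 1.0
        val testSampFact = 0.1

        // Degrees of freedom of a rank-r m x n matrix: r * (m + n - r).
        val df = rank * (m + n - rank)                                // 1900

        // Training entries: oversampled df, capped at 99% of all entries.
        val sampSize = scala.math.min(scala.math.round(trainSampFact * df),
          scala.math.round(.99 * m * n)).toInt                        // 1900

        // Test entries: a fraction of the training count, drawn from the rest.
        val testSampSize = scala.math.min(
          scala.math.round(sampSize * testSampFact).toInt,
          m * n - sampSize)                                           // 190

        println("df=" + df + " train=" + sampSize + " test=" + testSampSize)
      }
    }
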
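Both RDDs are written with saveAsTextFile, one "row,column,value" triple per line. A short sketch of reading that output back, assuming the pre-0.8 spark package layout used in this file; the master URL, app name, and path are illustrative:

    import spark.SparkContext
    import spark.RDD

    // Illustrative context; in a real job reuse an existing SparkContext.
    val sc = new SparkContext("local", "MFDataReader")

    // Each line has the form "<rowIndex>,<colIndex>,<value>".
    val entries: RDD[(Int, Int, Double)] = sc.textFile("/tmp/mf-data")
      .map { line =>
        val Array(i, j, v) = line.split(",")
        (i.toInt, j.toInt, v.toDouble)
      }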