From bf7033f3ebf9315ccf9aba09a6e702c3a671fd8d Mon Sep 17 00:00:00 2001 From: Ginger Smith Date: Mon, 5 Aug 2013 21:26:24 -0700 Subject: fixing formatting, style, and input --- .../scala/spark/mllib/util/MFDataGenerator.scala | 73 +++++++++++----------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'mllib/src') diff --git a/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala index 1d2b5c89f0..88992cde0c 100644 --- a/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/spark/mllib/util/MFDataGenerator.scala @@ -28,32 +28,32 @@ import spark.mllib.util.MLUtils * Generate RDD(s) containing data for Matrix Factorization. * * This method samples training entries according to the oversampling factor -* 'tr_samp_fact', which is a multiplicative factor of the number of +* 'trainSampFact', which is a multiplicative factor of the number of * degrees of freedom of the matrix: rank*(m+n-rank). * * It optionally samples entries for a testing matrix using -* 'te_samp_fact', the percentage of the number of training entries +* 'testSampFact', the percentage of the number of training entries * to use for testing. * * This method takes the following inputs: -* sparkMaster (String) The master URL. -* outputPath (String) Directory to save output. -* m (Int) Number of rows in data matrix. -* n (Int) Number of columns in data matrix. -* rank (Int) Underlying rank of data matrix. -* tr_samp_fact (Double) Oversampling factor. -* noise (Boolean) Whether to add gaussian noise to training data. -* sigma (Double) Standard deviation of added gaussian noise. -* test (Boolean) Whether to create testing RDD. -* te_samp_fact (Double) Percentage of training data to use as test data. +* sparkMaster (String) The master URL. +* outputPath (String) Directory to save output. +* m (Int) Number of rows in data matrix. +* n (Int) Number of columns in data matrix. +* rank (Int) Underlying rank of data matrix. +* trainSampFact (Double) Oversampling factor. +* noise (Boolean) Whether to add gaussian noise to training data. +* sigma (Double) Standard deviation of added gaussian noise. +* test (Boolean) Whether to create testing RDD. +* testSampFact (Double) Percentage of training data to use as test data. */ object MFDataGenerator{ def main(args: Array[String]) { - if (args.length != 10) { - println("Usage: MFGenerator " + - " ") + if (args.length < 2) { + println("Usage: MFDataGenerator " + + " [m] [n] [rank] [trainSampFact] [noise] [sigma] [test] [testSampFact]") System.exit(1) } @@ -62,51 +62,52 @@ object MFDataGenerator{ val m: Int = if (args.length > 2) args(2).toInt else 100 val n: Int = if (args.length > 3) args(3).toInt else 100 val rank: Int = if (args.length > 4) args(4).toInt else 10 - val tr_samp_fact: Double = if (args.length > 5) args(5).toDouble else 1.0 + val trainSampFact: Double = if (args.length > 5) args(5).toDouble else 1.0 val noise: Boolean = if (args.length > 6) args(6).toBoolean else false val sigma: Double = if (args.length > 7) args(7).toDouble else 0.1 val test: Boolean = if (args.length > 8) args(8).toBoolean else false - val te_samp_fact: Double = if (args.length > 9) args(9).toDouble else 0.1 + val testSampFact: Double = if (args.length > 9) args(9).toDouble else 0.1 val sc = new SparkContext(sparkMaster, "MFDataGenerator") - val A = DoubleMatrix.randn(m,rank) - val B = DoubleMatrix.randn(rank,n) - val z = 1/(scala.math.sqrt(scala.math.sqrt(rank))) + val A = DoubleMatrix.randn(m, rank) + val B = DoubleMatrix.randn(rank, n) + val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank))) A.mmuli(z) B.mmuli(z) val fullData = A.mmul(B) - val df = rank*(m+n-rank) - val sampsize = scala.math.min(scala.math.round(tr_samp_fact*df), scala.math.round(.99*m*n)).toInt + val df = rank * (m + n - rank) + val sampSize = scala.math.min(scala.math.round(trainSampFact * df), + scala.math.round(.99 * m * n)).toInt val rand = new Random() - val mn = m*n + val mn = m * n val shuffled = rand.shuffle(1 to mn toIterable) - val omega = shuffled.slice(0,sampsize) + val omega = shuffled.slice(0, sampSize) val ordered = omega.sortWith(_ < _).toArray val trainData: RDD[(Int, Int, Double)] = sc.parallelize(ordered) - .map(x => (fullData.indexRows(x-1),fullData.indexColumns(x-1),fullData.get(x-1))) + .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) // optionally add gaussian noise - if(noise){ - trainData.map(x => (x._1,x._2,x._3+rand.nextGaussian*sigma)) + if (noise) { + trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma)) } trainData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath) // optionally generate testing data - if(test){ - val test_sampsize = scala.math - .min(scala.math.round(sampsize*te_samp_fact),scala.math.round(mn-sampsize)) - .toInt - val test_omega = shuffled.slice(sampsize,sampsize+test_sampsize) - val test_ordered = test_omega.sortWith(_ < _).toArray - val testData: RDD[(Int, Int, Double)] = sc.parallelize(test_ordered) - .map(x=> (fullData.indexRows(x-1),fullData.indexColumns(x-1),fullData.get(x-1))) + if (test) { + val testSampSize = scala.math + .min(scala.math.round(sampSize * testSampFact),scala.math.round(mn - sampSize)).toInt + val testOmega = shuffled.slice(sampSize, sampSize + testSampSize) + val testOrdered = testOmega.sortWith(_ < _).toArray + val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered) + .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath) } - sc.stop() + sc.stop() + } } \ No newline at end of file -- cgit v1.2.3