diff options
Diffstat (limited to 'mllib/src')
4 files changed, 18 insertions, 15 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index efc0eb9353..efe99a31be 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -106,7 +106,8 @@ class PythonMLLibAPI extends Serializable { bytes } - private def trainRegressionModel(trainFunc: (RDD[LabeledPoint], Array[Double]) => GeneralizedLinearModel, + private def trainRegressionModel( + trainFunc: (RDD[LabeledPoint], Array[Double]) => GeneralizedLinearModel, dataBytesJRDD: JavaRDD[Array[Byte]], initialWeightsBA: Array[Byte]): java.util.LinkedList[java.lang.Object] = { val data = dataBytesJRDD.rdd.map(xBytes => { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala index e476b53450..8803c4c1a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala @@ -105,7 +105,7 @@ object SVD { cols.flatMap{ case (colind1, mval1) => cols.map{ case (colind2, mval2) => ((colind1, colind2), mval1*mval2) } } - }.reduceByKey(_+_) + }.reduceByKey(_ + _) // Construct jblas A^T A locally val ata = DoubleMatrix.zeros(n, n) @@ -145,10 +145,10 @@ object SVD { // Multiply A by VS^-1 val aCols = data.map(entry => (entry.j, (entry.i, entry.mval))) val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2))) - val retUdata = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) ) => ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_) + val retUdata = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal))) => ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_ + _) .map{ case ((row, col), mval) => MatrixEntry(row, col, mval)} - val retU = SparseMatrix(retUdata, m, sigma.length) + val retU = SparseMatrix(retUdata, m, sigma.length) MatrixSVD(retU, retS, retV) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index b77364e08d..cd80134737 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -142,7 +142,7 @@ object GradientDescent extends Logging { var regVal = 0.0 for (i <- 1 to numIterations) { - val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map { + val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42 + i).map { case (y, features) => val featuresCol = new DoubleMatrix(features.length, 1, features:_*) val (grad, loss) = gradient.compute(featuresCol, y, weights) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index c5f64b1350..a990e0fb01 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -84,8 +84,9 @@ case class Rating(val user: Int, val product: Int, val rating: Double) * [[http://research.yahoo.com/pub/2433]], adapted for the blocked approach used here. * * Essentially instead of finding the low-rank approximations to the rating matrix `R`, - * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if r > 0 - * and 0 if r = 0. The ratings then act as 'confidence' values related to strength of indicated user + * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if + * r > 0 and 0 if r = 0. The ratings then act as 'confidence' values related to strength of + * indicated user * preferences rather than explicit ratings given to items. 
*/ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var lambda: Double, @@ -152,8 +153,8 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l val (userInLinks, userOutLinks) = makeLinkRDDs(numBlocks, ratingsByUserBlock) val (productInLinks, productOutLinks) = makeLinkRDDs(numBlocks, ratingsByProductBlock) - // Initialize user and product factors randomly, but use a deterministic seed for each partition - // so that fault recovery works + // Initialize user and product factors randomly, but use a deterministic seed for each + // partition so that fault recovery works val seedGen = new Random() val seed1 = seedGen.nextInt() val seed2 = seedGen.nextInt() @@ -268,7 +269,8 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l val groupedRatings = blockRatings(productBlock).groupBy(_.product).toArray // Sort them by product ID val ordering = new Ordering[(Int, ArrayBuffer[Rating])] { - def compare(a: (Int, ArrayBuffer[Rating]), b: (Int, ArrayBuffer[Rating])): Int = a._1 - b._1 + def compare(a: (Int, ArrayBuffer[Rating]), b: (Int, ArrayBuffer[Rating])): Int = + a._1 - b._1 } Sorting.quickSort(groupedRatings)(ordering) // Translate the user IDs to indices based on userIdToPos @@ -369,7 +371,8 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l val tempXtX = DoubleMatrix.zeros(triangleSize) val fullXtX = DoubleMatrix.zeros(rank, rank) - // Compute the XtX and Xy values for each user by adding products it rated in each product block + // Compute the XtX and Xy values for each user by adding products it rated in each product + // block for (productBlock <- 0 until numBlocks) { for (p <- 0 until blockFactors(productBlock).length) { val x = new DoubleMatrix(blockFactors(productBlock)(p)) @@ -544,9 +547,8 @@ object ALS { * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) */ - def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double) : MatrixFactorizationModel = - { + def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, + alpha: Double): MatrixFactorizationModel = { trainImplicit(ratings, rank, iterations, lambda, -1, alpha) } |