diff options
author | Reza Zadeh <rizlar@gmail.com> | 2014-01-17 14:34:03 -0800 |
---|---|---|
committer | Reza Zadeh <rizlar@gmail.com> | 2014-01-17 14:34:03 -0800 |
commit | caf97a25a2bd70ef5164c3ce0e8b59a8e39eb288 (patch) | |
tree | 3eaffca1eb0c9031c4f9acc2b91ea915beec25b1 /mllib/src/main | |
parent | 4e96757793e7aee165381f80a60b3f46f60c9ebc (diff) | |
parent | d749d472b37448edb322bc7208a3db925c9a4fc2 (diff) | |
download | spark-caf97a25a2bd70ef5164c3ce0e8b59a8e39eb288.tar.gz spark-caf97a25a2bd70ef5164c3ce0e8b59a8e39eb288.tar.bz2 spark-caf97a25a2bd70ef5164c3ce0e8b59a8e39eb288.zip |
Merge remote-tracking branch 'upstream/master' into sparsesvd
Diffstat (limited to 'mllib/src/main')
9 files changed, 17 insertions, 26 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 3fec1a909d..efc0eb9353 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -24,7 +24,6 @@ import org.apache.spark.mllib.recommendation._ import org.apache.spark.rdd.RDD import java.nio.ByteBuffer import java.nio.ByteOrder -import java.nio.DoubleBuffer /** * The Java stubs necessary for the Python mllib bindings. @@ -81,7 +80,6 @@ class PythonMLLibAPI extends Serializable { } val db = bb.asDoubleBuffer() val ans = new Array[Array[Double]](rows.toInt) - var i = 0 for (i <- 0 until rows.toInt) { ans(i) = new Array[Double](cols.toInt) db.get(ans(i)) @@ -236,7 +234,7 @@ class PythonMLLibAPI extends Serializable { * Serialize a Rating object into an array of bytes. * It can be deserialized using RatingDeserializer(). * - * @param rate + * @param rate the Rating object to serialize * @return */ private[spark] def serializeRating(rate: Rating): Array[Byte] = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index f2964ea446..6dff29dfb4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -17,8 +17,6 @@ package org.apache.spark.mllib.classification -import scala.math.signum - import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index cfc81c985a..980be93157 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -19,8 +19,6 @@ package org.apache.spark.mllib.clustering import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.util.MLUtils - /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. @@ -39,6 +37,6 @@ class KMeansModel(val clusterCenters: Array[Array[Double]]) extends Serializable * model on the given data. */ def computeCost(data: RDD[Array[Double]]): Double = { - data.map(p => KMeans.pointCost(clusterCenters, p)).sum + data.map(p => KMeans.pointCost(clusterCenters, p)).sum() } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index fe5cce064b..df599fde76 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.util.MLUtils diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index c125c6797a..0c0e67fb7b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.util.MLUtils @@ -76,7 +76,7 @@ class RidgeRegressionWithSGD private ( def createModel(weights: Array[Double], intercept: Double) = { val weightsMat = new DoubleMatrix(weights.length + 1, 1, (Array(intercept) ++ weights):_*) val weightsScaled = weightsMat.div(xColSd) - val interceptScaled = yMean - (weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0)) + val interceptScaled = yMean - weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0) new RidgeRegressionModel(weightsScaled.data, interceptScaled) } @@ -86,7 +86,7 @@ class RidgeRegressionWithSGD private ( initialWeights: Array[Double]) : RidgeRegressionModel = { - val nfeatures: Int = input.first.features.length + val nfeatures: Int = input.first().features.length val nexamples: Long = input.count() // To avoid penalizing the intercept, we center and scale the data. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index bc5045fb05..2e03684e62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -25,7 +25,6 @@ import org.jblas.DoubleMatrix import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LabeledPoint /** * Generate sample data used for Linear Data. This class generates @@ -73,7 +72,7 @@ object LinearDataGenerator { val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(2 * rnd.nextDouble - 1.0)) val y = x.map { xi => - (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) + intercept + eps * rnd.nextGaussian() + new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + eps * rnd.nextGaussian() } y.zip(x).map(p => LabeledPoint(p._1, p._2)) } @@ -86,7 +85,6 @@ object LinearDataGenerator { * @param nexamples Number of examples that will be contained in the RDD. * @param nfeatures Number of features to generate for each example. * @param eps Epsilon factor by which examples are scaled. - * @param weights Weights associated with the first weights.length features. * @param nparts Number of partitions in the RDD. Default value is 2. * * @return RDD of LabeledPoint containing sample data. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index d5f3f6b8db..348aba1dea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.mllib.recommendation +package org.apache.spark.mllib.util import scala.util.Random @@ -23,7 +23,6 @@ import org.jblas.DoubleMatrix import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.util.MLUtils /** * Generate RDD(s) containing data for Matrix Factorization. @@ -31,9 +30,9 @@ import org.apache.spark.mllib.util.MLUtils * This method samples training entries according to the oversampling factor * 'trainSampFact', which is a multiplicative factor of the number of * degrees of freedom of the matrix: rank*(m+n-rank). -* -* It optionally samples entries for a testing matrix using -* 'testSampFact', the percentage of the number of training entries +* +* It optionally samples entries for a testing matrix using +* 'testSampFact', the percentage of the number of training entries * to use for testing. * * This method takes the following inputs: @@ -73,7 +72,7 @@ object MFDataGenerator{ val A = DoubleMatrix.randn(m, rank) val B = DoubleMatrix.randn(rank, n) - val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank))) + val z = 1 / scala.math.sqrt(scala.math.sqrt(rank)) A.mmuli(z) B.mmuli(z) val fullData = A.mmul(B) @@ -91,7 +90,7 @@ object MFDataGenerator{ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) // optionally add gaussian noise - if (noise) { + if (noise) { trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma)) } @@ -107,8 +106,8 @@ object MFDataGenerator{ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath) } - + sc.stop() - + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index d91b74c3ac..64c6136a8b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -97,7 +97,7 @@ object MLUtils { while (col < nfeatures) { xColMean.put(col, xColSumsMap(col)._1 / nexamples) val variance = - (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / (nexamples) + (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / nexamples xColSd.put(col, math.sqrt(variance)) col += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala index 07022093f3..c96c94f70e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala @@ -56,7 +56,7 @@ object SVMDataGenerator { val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } - val yD = (new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1 + val yD = new DoubleMatrix(1, x.length, x: _*).dot(trueWeights) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, x) } |