author     Reza Zadeh <rizlar@gmail.com>    2014-01-17 14:34:03 -0800
committer  Reza Zadeh <rizlar@gmail.com>    2014-01-17 14:34:03 -0800
commit     caf97a25a2bd70ef5164c3ce0e8b59a8e39eb288 (patch)
tree       3eaffca1eb0c9031c4f9acc2b91ea915beec25b1 /mllib
parent     4e96757793e7aee165381f80a60b3f46f60c9ebc (diff)
parent     d749d472b37448edb322bc7208a3db925c9a4fc2 (diff)
Merge remote-tracking branch 'upstream/master' into sparsesvd
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala            |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala                   |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala               |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala          |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala           |  6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala             |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala                 | 17
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala                         |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala                |  2
-rw-r--r--  mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java     | 17
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala |  6
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala              |  9
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala               |  3
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala              |  1
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala                |  6
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala     |  5
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala      |  3
17 files changed, 45 insertions(+), 48 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 3fec1a909d..efc0eb9353 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -24,7 +24,6 @@ import org.apache.spark.mllib.recommendation._
import org.apache.spark.rdd.RDD
import java.nio.ByteBuffer
import java.nio.ByteOrder
-import java.nio.DoubleBuffer
/**
* The Java stubs necessary for the Python mllib bindings.
@@ -81,7 +80,6 @@ class PythonMLLibAPI extends Serializable {
}
val db = bb.asDoubleBuffer()
val ans = new Array[Array[Double]](rows.toInt)
- var i = 0
for (i <- 0 until rows.toInt) {
ans(i) = new Array[Double](cols.toInt)
db.get(ans(i))
@@ -236,7 +234,7 @@ class PythonMLLibAPI extends Serializable {
* Serialize a Rating object into an array of bytes.
* It can be deserialized using RatingDeserializer().
*
- * @param rate
+ * @param rate the Rating object to serialize
* @return
*/
private[spark] def serializeRating(rate: Rating): Array[Byte] = {
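For orientation, here is a self-contained sketch of packing a Rating into bytes with java.nio, in the same style as the matrix code in the hunk above. The case class and the choice of native byte order are illustrative assumptions, not the exact wire format expected by RatingDeserializer():

    import java.nio.{ByteBuffer, ByteOrder}

    // Illustrative stand-in for org.apache.spark.mllib.recommendation.Rating.
    case class Rating(user: Int, product: Int, rating: Double)

    def serializeRatingSketch(rate: Rating): Array[Byte] = {
      val bytes = new Array[Byte](24)            // three 8-byte doubles
      val bb = ByteBuffer.wrap(bytes)
      bb.order(ByteOrder.nativeOrder())          // assumed byte order
      val db = bb.asDoubleBuffer()
      db.put(rate.user.toDouble)
      db.put(rate.product.toDouble)
      db.put(rate.rating)
      bytes
    }
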
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
index f2964ea446..6dff29dfb4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -17,8 +17,6 @@
package org.apache.spark.mllib.classification
-import scala.math.signum
-
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization._
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index cfc81c985a..980be93157 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -19,8 +19,6 @@ package org.apache.spark.mllib.clustering
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.util.MLUtils
-
/**
* A clustering model for K-means. Each point belongs to the cluster with the closest center.
@@ -39,6 +37,6 @@ class KMeansModel(val clusterCenters: Array[Array[Double]]) extends Serializable
* model on the given data.
*/
def computeCost(data: RDD[Array[Double]]): Double = {
- data.map(p => KMeans.pointCost(clusterCenters, p)).sum
+ data.map(p => KMeans.pointCost(clusterCenters, p)).sum()
}
}
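KMeans.pointCost in the hunk above is the squared distance from a point to its nearest center, so computeCost sums that quantity over the data set. A minimal local analogue, assuming plain Array[Double] points and centers:

    // Squared Euclidean distance between two points of equal length.
    def squaredDistance(a: Array[Double], b: Array[Double]): Double =
      a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum

    // Local analogue of computeCost: sum of each point's cost at its closest center.
    def localCost(centers: Seq[Array[Double]], data: Seq[Array[Double]]): Double =
      data.map(p => centers.map(c => squaredDistance(c, p)).min).sum
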
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala
index fe5cce064b..df599fde76 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.regression
-import org.apache.spark.{Logging, SparkContext}
+import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.util.MLUtils
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
index c125c6797a..0c0e67fb7b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.regression
-import org.apache.spark.{Logging, SparkContext}
+import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.util.MLUtils
@@ -76,7 +76,7 @@ class RidgeRegressionWithSGD private (
def createModel(weights: Array[Double], intercept: Double) = {
val weightsMat = new DoubleMatrix(weights.length + 1, 1, (Array(intercept) ++ weights):_*)
val weightsScaled = weightsMat.div(xColSd)
- val interceptScaled = yMean - (weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0))
+ val interceptScaled = yMean - weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0)
new RidgeRegressionModel(weightsScaled.data, interceptScaled)
}
@@ -86,7 +86,7 @@ class RidgeRegressionWithSGD private (
initialWeights: Array[Double])
: RidgeRegressionModel =
{
- val nfeatures: Int = input.first.features.length
+ val nfeatures: Int = input.first().features.length
val nexamples: Long = input.count()
// To avoid penalizing the intercept, we center and scale the data.
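The createModel hunk above maps weights learned on standardized data back to the original scale: if the fitted model is yhat = yMean + w . (x - xColMean) / xColSd, regrouping gives yhat = (yMean - w . (xColMean / xColSd)) + (w / xColSd) . x, which matches weightsScaled and interceptScaled. A simplified jblas-free sketch that ignores the prepended intercept column handled by the real code:

    // De-standardize a linear model fit on features scaled as (x - mean) / sd
    // with the label offset by yMean.
    def unscale(w: Array[Double], mean: Array[Double], sd: Array[Double], yMean: Double)
      : (Array[Double], Double) = {
      val wOrig = w.indices.map(i => w(i) / sd(i)).toArray
      val intercept = yMean - w.indices.map(i => w(i) * mean(i) / sd(i)).sum
      (wOrig, intercept)
    }
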
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index bc5045fb05..2e03684e62 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -25,7 +25,6 @@ import org.jblas.DoubleMatrix
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.regression.LabeledPoint
/**
* Generate sample data used for Linear Data. This class generates
@@ -73,7 +72,7 @@ object LinearDataGenerator {
val x = Array.fill[Array[Double]](nPoints)(
Array.fill[Double](weights.length)(2 * rnd.nextDouble - 1.0))
val y = x.map { xi =>
- (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) + intercept + eps * rnd.nextGaussian()
+ new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + eps * rnd.nextGaussian()
}
y.zip(x).map(p => LabeledPoint(p._1, p._2))
}
@@ -86,7 +85,6 @@ object LinearDataGenerator {
* @param nexamples Number of examples that will be contained in the RDD.
* @param nfeatures Number of features to generate for each example.
* @param eps Epsilon factor by which examples are scaled.
- * @param weights Weights associated with the first weights.length features.
* @param nparts Number of partitions in the RDD. Default value is 2.
*
* @return RDD of LabeledPoint containing sample data.
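The generatePoints hunk above draws each feature uniformly from [-1, 1) and labels the point as y = w . x + intercept + eps * N(0, 1). A jblas-free sketch of the same recipe, with illustrative names:

    import scala.util.Random
    import org.apache.spark.mllib.regression.LabeledPoint

    // Generate nPoints labeled points with y = w . x + intercept + Gaussian noise of scale eps.
    def generateSketch(weights: Array[Double], intercept: Double, eps: Double,
        nPoints: Int, seed: Int): Seq[LabeledPoint] = {
      val rnd = new Random(seed)
      Seq.fill(nPoints) {
        val x = Array.fill(weights.length)(2 * rnd.nextDouble() - 1.0)
        val y = x.zip(weights).map { case (xi, wi) => xi * wi }.sum +
          intercept + eps * rnd.nextGaussian()
        LabeledPoint(y, x)
      }
    }
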
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
index d5f3f6b8db..348aba1dea 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.spark.mllib.recommendation
+package org.apache.spark.mllib.util
import scala.util.Random
@@ -23,7 +23,6 @@ import org.jblas.DoubleMatrix
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
-import org.apache.spark.mllib.util.MLUtils
/**
* Generate RDD(s) containing data for Matrix Factorization.
@@ -31,9 +30,9 @@ import org.apache.spark.mllib.util.MLUtils
* This method samples training entries according to the oversampling factor
* 'trainSampFact', which is a multiplicative factor of the number of
* degrees of freedom of the matrix: rank*(m+n-rank).
-*
-* It optionally samples entries for a testing matrix using
-* 'testSampFact', the percentage of the number of training entries
+*
+* It optionally samples entries for a testing matrix using
+* 'testSampFact', the percentage of the number of training entries
* to use for testing.
*
* This method takes the following inputs:
@@ -73,7 +72,7 @@ object MFDataGenerator{
val A = DoubleMatrix.randn(m, rank)
val B = DoubleMatrix.randn(rank, n)
- val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank)))
+ val z = 1 / scala.math.sqrt(scala.math.sqrt(rank))
A.mmuli(z)
B.mmuli(z)
val fullData = A.mmul(B)
@@ -91,7 +90,7 @@ object MFDataGenerator{
.map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1)))
// optionally add gaussian noise
- if (noise) {
+ if (noise) {
trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma))
}
@@ -107,8 +106,8 @@ object MFDataGenerator{
.map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1)))
testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)
}
-
+
sc.stop()
-
+
}
}
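Two numbers drive the generator in the hunks above: the factor scale z = 1 / sqrt(sqrt(rank)), which keeps the entries of A * B at roughly unit variance, and the sampling size from the header comment, trainSampFact times the degrees of freedom rank * (m + n - rank). A small sketch of both (the rounding is an assumption, not necessarily what the generator does):

    // z such that entries of A * B (A: m x rank, B: rank x n, i.i.d. N(0, 1) scaled by z)
    // have variance rank * z^4 = 1.
    def factorScale(rank: Int): Double = 1 / math.sqrt(math.sqrt(rank))

    // Observed training entries per the oversampling rule described in the header comment.
    def numTrainEntries(m: Int, n: Int, rank: Int, trainSampFact: Double): Int =
      math.round(trainSampFact * rank * (m + n - rank)).toInt
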
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index d91b74c3ac..64c6136a8b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -97,7 +97,7 @@ object MLUtils {
while (col < nfeatures) {
xColMean.put(col, xColSumsMap(col)._1 / nexamples)
val variance =
- (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / (nexamples)
+ (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / nexamples
xColSd.put(col, math.sqrt(variance))
col += 1
}
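The variance in the hunk above is the one-pass population form Var = E[x^2] - (E[x])^2, computed from the per-column running sums:

    // Per-column variance from running sums: (sum of squares - sum^2 / n) / n.
    def populationVariance(sum: Double, sumSq: Double, n: Long): Double =
      (sumSq - math.pow(sum, 2) / n) / n

The standard deviation stored in xColSd is then math.sqrt of this value.
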
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
index 07022093f3..c96c94f70e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
@@ -56,7 +56,7 @@ object SVMDataGenerator {
val x = Array.fill[Double](nfeatures) {
rnd.nextDouble() * 2.0 - 1.0
}
- val yD = (new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1
+ val yD = new DoubleMatrix(1, x.length, x: _*).dot(trueWeights) + rnd.nextGaussian() * 0.1
val y = if (yD < 0) 0.0 else 1.0
LabeledPoint(y, x)
}
diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
index 23ea3548b9..073ded6f36 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.spark.mllib.classification;
import org.apache.spark.api.java.JavaRDD;
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
index 34c67294e9..02ede71137 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -80,9 +80,9 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll with Shoul
}
def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
- val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) =>
- (prediction != expected.label)
- }.size
+ val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
+ prediction != expected.label
+ }
// At least 83% of the predictions should be on.
((input.length - numOffPredictions).toDouble / input.length) should be > 0.83
}
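The change above (and the matching ones in SVMSuite, LassoSuite and LinearRegressionSuite further down) replaces filter { ... }.size with count { ... }, which yields the same number without building an intermediate collection. A standalone version of the check, with illustrative names:

    // Fraction of predictions that exactly match their labels.
    def accuracy(predictions: Seq[Double], labels: Seq[Double]): Double = {
      val numOff = predictions.zip(labels).count { case (prediction, label) => prediction != label }
      (labels.length - numOff).toDouble / labels.length
    }
    // Used like: accuracy(predictions, labels) should be > 0.83
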
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
index 6a957e3ddc..3357b86f9b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala
@@ -18,7 +18,6 @@
package org.apache.spark.mllib.classification
import scala.util.Random
-import scala.math.signum
import scala.collection.JavaConversions._
import org.scalatest.BeforeAndAfterAll
@@ -50,7 +49,7 @@ object SVMSuite {
val x = Array.fill[Array[Double]](nPoints)(
Array.fill[Double](weights.length)(rnd.nextDouble() * 2.0 - 1.0))
val y = x.map { xi =>
- val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) +
+ val yD = new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) +
intercept + 0.01 * rnd.nextGaussian()
if (yD < 0) 0.0 else 1.0
}
@@ -72,9 +71,9 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll {
}
def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
- val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) =>
- (prediction != expected.label)
- }.size
+ val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
+ prediction != expected.label
+ }
// At least 80% of the predictions should be on.
assert(numOffPredictions < input.length / 5)
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
index 94245f6027..73657cac89 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
@@ -17,15 +17,12 @@
package org.apache.spark.mllib.clustering
-import scala.util.Random
import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite
import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
-import org.jblas._
class KMeansSuite extends FunSuite with BeforeAndAfterAll {
@transient private var sc: SparkContext = _
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
index e683a90f57..4e8dbde658 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
@@ -24,7 +24,6 @@ import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite
import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
import org.jblas._
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
index db980c7bae..b2c8df97a8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala
@@ -17,8 +17,6 @@
package org.apache.spark.mllib.regression
-import scala.collection.JavaConversions._
-import scala.util.Random
import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite
@@ -41,10 +39,10 @@ class LassoSuite extends FunSuite with BeforeAndAfterAll {
}
def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
- val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) =>
+ val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
// A prediction is off if the prediction is more than 0.5 away from expected value.
math.abs(prediction - expected.label) > 0.5
- }.size
+ }
// At least 80% of the predictions should be on.
assert(numOffPredictions < input.length / 5)
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
index ef500c704c..406afbaa3e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala
@@ -21,7 +21,6 @@ import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite
import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
import org.apache.spark.mllib.util.LinearDataGenerator
class LinearRegressionSuite extends FunSuite with BeforeAndAfterAll {
@@ -37,10 +36,10 @@ class LinearRegressionSuite extends FunSuite with BeforeAndAfterAll {
}
def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) {
- val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) =>
+ val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
// A prediction is off if the prediction is more than 0.5 away from expected value.
math.abs(prediction - expected.label) > 0.5
- }.size
+ }
// At least 80% of the predictions should be on.
assert(numOffPredictions < input.length / 5)
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala
index c18092d804..1d6a10b66e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala
@@ -17,15 +17,12 @@
package org.apache.spark.mllib.regression
-import scala.collection.JavaConversions._
-import scala.util.Random
import org.jblas.DoubleMatrix
import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite
import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
import org.apache.spark.mllib.util.LinearDataGenerator
class RidgeRegressionSuite extends FunSuite with BeforeAndAfterAll {