author    Reynold Xin <rxin@databricks.com>    2015-05-31 11:35:30 -0700
committer Reynold Xin <rxin@databricks.com>    2015-05-31 11:35:30 -0700
commit    e1067d0ad1c32c678c23d76d7653b51770795831 (patch)
tree      4c058532a6d72a13f89760b0cd8a16b87a23e23a /mllib
parent    d1d2def2f5f91e86f340656421170d1097f14854 (diff)
[SPARK-3850] Trim trailing spaces for MLlib.
Author: Reynold Xin <rxin@databricks.com>

Closes #6534 from rxin/whitespace-mllib and squashes the following commits:

38926e3 [Reynold Xin] [SPARK-3850] Trim trailing spaces for MLlib.
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala  10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala  86
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala  22
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala  8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala  50
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala  8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala  10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala  8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala  6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala  10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala  54
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala  4
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala  34
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala  6
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala  8
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala  10
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala  14
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala  2
30 files changed, 189 insertions, 189 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index fdd2494fc8..b0fd06d84f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -35,13 +35,13 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
/**
* Centers the data with mean before scaling.
- * It will build a dense output, so this does not work on sparse input
+ * It will build a dense output, so this does not work on sparse input
* and will raise an exception.
* Default: false
* @group param
*/
val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean")
-
+
/**
* Scales the data to unit standard deviation.
* Default: true
@@ -68,13 +68,13 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM
/** @group setParam */
def setOutputCol(value: String): this.type = set(outputCol, value)
-
+
/** @group setParam */
def setWithMean(value: Boolean): this.type = set(withMean, value)
-
+
/** @group setParam */
def setWithStd(value: Boolean): this.type = set(withStd, value)
-
+
override def fit(dataset: DataFrame): StandardScalerModel = {
transformSchema(dataset.schema, logging = true)
val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
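Not part of the patch: the StandardScalerParams doc above describes the withMean/withStd options, so here is a minimal usage sketch of this spark.ml StandardScaler, assuming a DataFrame `df` with a Vector column named "features" (the column names are illustrative).

    import org.apache.spark.ml.feature.StandardScaler

    // Assumes a DataFrame `df` with a Vector column named "features".
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithMean(false)   // mean-centering densifies sparse input, so it is off by default
      .setWithStd(true)
    val scalerModel = scaler.fit(df)        // Estimator.fit produces a StandardScalerModel
    val scaled = scalerModel.transform(df)  // appends the "scaledFeatures" column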
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 7c40db1a40..fe2a71a331 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -321,7 +321,7 @@ private class LeastSquaresAggregator(
}
(weightsArray, -sum + labelMean / labelStd, weightsArray.length)
}
-
+
private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)
private val gradientSumArray = Array.ofDim[Double](dim)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 65f30fdba7..16f3131796 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -399,7 +399,7 @@ private[python] class PythonMLLibAPI extends Serializable {
val sigma = si.map(_.asInstanceOf[DenseMatrix])
val gaussians = Array.tabulate(weight.length){
i => new MultivariateGaussian(mean(i), sigma(i))
- }
+ }
val model = new GaussianMixtureModel(weight, gaussians)
model.predictSoft(data).map(Vectors.dense)
}
@@ -494,7 +494,7 @@ private[python] class PythonMLLibAPI extends Serializable {
def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
new Normalizer(p).transform(rdd)
}
-
+
/**
* Java stub for StandardScaler.fit(). This stub returns a
* handle to the Java object instead of the content of the Java object.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index e9a23e40cc..70b0e40948 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -36,11 +36,11 @@ import org.apache.spark.util.Utils
* independent Gaussian distributions with associated "mixing" weights
* specifying each's contribution to the composite.
*
- * Given a set of sample points, this class will maximize the log-likelihood
- * for a mixture of k Gaussians, iterating until the log-likelihood changes by
+ * Given a set of sample points, this class will maximize the log-likelihood
+ * for a mixture of k Gaussians, iterating until the log-likelihood changes by
* less than convergenceTol, or until it has reached the max number of iterations.
* While this process is generally guaranteed to converge, it is not guaranteed
- * to find a global optimum.
+ * to find a global optimum.
*
* Note: For high-dimensional data (with many features), this algorithm may perform poorly.
* This is due to high-dimensional data (a) making it difficult to cluster at all (based
@@ -53,24 +53,24 @@ import org.apache.spark.util.Utils
*/
@Experimental
class GaussianMixture private (
- private var k: Int,
- private var convergenceTol: Double,
+ private var k: Int,
+ private var convergenceTol: Double,
private var maxIterations: Int,
private var seed: Long) extends Serializable {
-
+
/**
* Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
* maxIterations: 100, seed: random}.
*/
def this() = this(2, 0.01, 100, Utils.random.nextLong())
-
+
// number of samples per cluster to use when initializing Gaussians
private val nSamples = 5
-
- // an initializing GMM can be provided rather than using the
+
+ // an initializing GMM can be provided rather than using the
// default random starting point
private var initialModel: Option[GaussianMixtureModel] = None
-
+
/** Set the initial GMM starting point, bypassing the random initialization.
* You must call setK() prior to calling this method, and the condition
* (model.k == this.k) must be met; failure will result in an IllegalArgumentException
@@ -83,37 +83,37 @@ class GaussianMixture private (
}
this
}
-
+
/** Return the user supplied initial GMM, if supplied */
def getInitialModel: Option[GaussianMixtureModel] = initialModel
-
+
/** Set the number of Gaussians in the mixture model. Default: 2 */
def setK(k: Int): this.type = {
this.k = k
this
}
-
+
/** Return the number of Gaussians in the mixture model */
def getK: Int = k
-
+
/** Set the maximum number of iterations to run. Default: 100 */
def setMaxIterations(maxIterations: Int): this.type = {
this.maxIterations = maxIterations
this
}
-
+
/** Return the maximum number of iterations to run */
def getMaxIterations: Int = maxIterations
-
+
/**
- * Set the largest change in log-likelihood at which convergence is
+ * Set the largest change in log-likelihood at which convergence is
* considered to have occurred.
*/
def setConvergenceTol(convergenceTol: Double): this.type = {
this.convergenceTol = convergenceTol
this
}
-
+
/**
* Return the largest change in log-likelihood at which convergence is
* considered to have occurred.
@@ -132,41 +132,41 @@ class GaussianMixture private (
/** Perform expectation maximization */
def run(data: RDD[Vector]): GaussianMixtureModel = {
val sc = data.sparkContext
-
+
// we will operate on the data as breeze data
val breezeData = data.map(_.toBreeze).cache()
-
+
// Get length of the input vectors
val d = breezeData.first().length
-
+
// Determine initial weights and corresponding Gaussians.
// If the user supplied an initial GMM, we use those values, otherwise
// we start with uniform weights, a random mean from the data, and
// diagonal covariance matrices using component variances
- // derived from the samples
+ // derived from the samples
val (weights, gaussians) = initialModel match {
case Some(gmm) => (gmm.weights, gmm.gaussians)
-
+
case None => {
val samples = breezeData.takeSample(withReplacement = true, k * nSamples, seed)
- (Array.fill(k)(1.0 / k), Array.tabulate(k) { i =>
+ (Array.fill(k)(1.0 / k), Array.tabulate(k) { i =>
val slice = samples.view(i * nSamples, (i + 1) * nSamples)
- new MultivariateGaussian(vectorMean(slice), initCovariance(slice))
+ new MultivariateGaussian(vectorMean(slice), initCovariance(slice))
})
}
}
-
- var llh = Double.MinValue // current log-likelihood
+
+ var llh = Double.MinValue // current log-likelihood
var llhp = 0.0 // previous log-likelihood
-
+
var iter = 0
while (iter < maxIterations && math.abs(llh-llhp) > convergenceTol) {
// create and broadcast curried cluster contribution function
val compute = sc.broadcast(ExpectationSum.add(weights, gaussians)_)
-
+
// aggregate the cluster contribution for all sample points
val sums = breezeData.aggregate(ExpectationSum.zero(k, d))(compute.value, _ += _)
-
+
// Create new distributions based on the partial assignments
// (often referred to as the "M" step in literature)
val sumWeights = sums.weights.sum
@@ -179,22 +179,22 @@ class GaussianMixture private (
gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i))
i = i + 1
}
-
+
llhp = llh // current becomes previous
llh = sums.logLikelihood // this is the freshly computed log-likelihood
iter += 1
- }
-
+ }
+
new GaussianMixtureModel(weights, gaussians)
}
-
+
/** Average of dense breeze vectors */
private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = {
val v = BDV.zeros[Double](x(0).length)
x.foreach(xi => v += xi)
- v / x.length.toDouble
+ v / x.length.toDouble
}
-
+
/**
* Construct matrix where diagonal entries are element-wise
* variance of input vectors (computes biased variance)
@@ -210,14 +210,14 @@ class GaussianMixture private (
// companion class to provide zero constructor for ExpectationSum
private object ExpectationSum {
def zero(k: Int, d: Int): ExpectationSum = {
- new ExpectationSum(0.0, Array.fill(k)(0.0),
+ new ExpectationSum(0.0, Array.fill(k)(0.0),
Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d, d)))
}
-
+
// compute cluster contributions for each input point
// (U, T) => U for aggregation
def add(
- weights: Array[Double],
+ weights: Array[Double],
dists: Array[MultivariateGaussian])
(sums: ExpectationSum, x: BV[Double]): ExpectationSum = {
val p = weights.zip(dists).map {
@@ -235,7 +235,7 @@ private object ExpectationSum {
i = i + 1
}
sums
- }
+ }
}
// Aggregation class for partial expectation results
@@ -244,9 +244,9 @@ private class ExpectationSum(
val weights: Array[Double],
val means: Array[BDV[Double]],
val sigmas: Array[BreezeMatrix[Double]]) extends Serializable {
-
+
val k = weights.length
-
+
def +=(x: ExpectationSum): ExpectationSum = {
var i = 0
while (i < k) {
@@ -257,5 +257,5 @@ private class ExpectationSum(
}
logLikelihood += x.logLikelihood
this
- }
+ }
}
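Not part of the patch: the class doc above describes the EM loop, so here is a minimal sketch of training a mixture with the public setters touched in this file, assuming an existing RDD[Vector] named `points`.

    import org.apache.spark.mllib.clustering.GaussianMixture

    // Assumes an existing RDD[Vector] named `points`.
    val gmm = new GaussianMixture()
      .setK(2)                  // number of Gaussians in the mixture
      .setConvergenceTol(0.01)  // stop once the log-likelihood changes by less than this
      .setMaxIterations(100)
      .run(points)

    gmm.weights.zip(gmm.gaussians).foreach { case (w, g) =>
      println(s"weight=$w  mu=${g.mu}  sigma=${g.sigma}")
    }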
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 86353aed81..5fc2cb1b62 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -34,10 +34,10 @@ import org.apache.spark.sql.{SQLContext, Row}
/**
* :: Experimental ::
*
- * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
- * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are
- * the respective mean and covariance for each Gaussian distribution i=1..k.
- *
+ * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
+ * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are
+ * the respective mean and covariance for each Gaussian distribution i=1..k.
+ *
* @param weights Weights for each Gaussian distribution in the mixture, where weights(i) is
* the weight for Gaussian i, and weights.sum == 1
* @param gaussians Array of MultivariateGaussian where gaussians(i) represents
@@ -45,9 +45,9 @@ import org.apache.spark.sql.{SQLContext, Row}
*/
@Experimental
class GaussianMixtureModel(
- val weights: Array[Double],
+ val weights: Array[Double],
val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable{
-
+
require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
override protected def formatVersion = "1.0"
@@ -64,20 +64,20 @@ class GaussianMixtureModel(
val responsibilityMatrix = predictSoft(points)
responsibilityMatrix.map(r => r.indexOf(r.max))
}
-
+
/**
* Given the input vectors, return the membership value of each vector
- * to all mixture components.
+ * to all mixture components.
*/
def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = {
val sc = points.sparkContext
val bcDists = sc.broadcast(gaussians)
val bcWeights = sc.broadcast(weights)
- points.map { x =>
+ points.map { x =>
computeSoftAssignments(x.toBreeze.toDenseVector, bcDists.value, bcWeights.value, k)
}
}
-
+
/**
* Compute the partial assignments for each vector
*/
@@ -89,7 +89,7 @@ class GaussianMixtureModel(
val p = weights.zip(dists).map {
case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
}
- val pSum = p.sum
+ val pSum = p.sum
for (i <- 0 until k) {
p(i) /= pSum
}
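Written out (not part of the patch), the quantity that computeSoftAssignments normalizes above is, for component i of k,

\[
  p_i(x) \;=\; \frac{w_i\,\mathrm{pdf}_i(x) + \varepsilon}{\sum_{j=1}^{k}\bigl(w_j\,\mathrm{pdf}_j(x) + \varepsilon\bigr)},
\]

where pdf_i is the density of gaussians(i) and the small constant \varepsilon = MLUtils.EPSILON keeps the normalization well defined when every density underflows to zero.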
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 1ed01c9d8b..e7a243f854 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -121,7 +121,7 @@ class PowerIterationClustering private[clustering] (
import org.apache.spark.mllib.clustering.PowerIterationClustering._
/** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100,
- * initMode: "random"}.
+ * initMode: "random"}.
*/
def this() = this(k = 2, maxIterations = 100, initMode = "random")
@@ -243,7 +243,7 @@ object PowerIterationClustering extends Logging {
/**
* Generates random vertex properties (v0) to start power iteration.
- *
+ *
* @param g a graph representing the normalized affinity matrix (W)
* @return a graph with edges representing W and vertices representing a random vector
* with unit 1-norm
@@ -266,7 +266,7 @@ object PowerIterationClustering extends Logging {
* Generates the degree vector as the vertex properties (v0) to start power iteration.
* It is not exactly the node degrees but just the normalized sum similarities. Call it
* as degree vector because it is used in the PIC paper.
- *
+ *
* @param g a graph representing the normalized affinity matrix (W)
* @return a graph with edges representing W and vertices representing the degree vector
*/
@@ -276,7 +276,7 @@ object PowerIterationClustering extends Logging {
val v0 = g.vertices.mapValues(_ / sum)
GraphImpl.fromExistingRDDs(VertexRDD(v0), g.edges)
}
-
+
/**
* Runs power iteration.
* @param g input graph with edges representing the normalized affinity matrix (W) and vertices
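Not part of the patch: the docs above describe the random and degree-vector initializations of the power iteration; a minimal sketch of the public API, assuming an RDD[(Long, Long, Double)] of pairwise similarities named `similarities`:

    import org.apache.spark.mllib.clustering.PowerIterationClustering

    // Assumes an RDD[(Long, Long, Double)] of (srcId, dstId, similarity) named `similarities`.
    val model = new PowerIterationClustering()
      .setK(2)
      .setMaxIterations(100)
      .setInitializationMode("degree")  // "random" is the default
      .run(similarities)

    model.assignments.collect().foreach { a =>
      println(s"vertex ${a.id} -> cluster ${a.cluster}")
    }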
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 466ae95859..51546d41c3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -42,7 +42,7 @@ import org.apache.spark.util.random.XORShiftRandom
import org.apache.spark.sql.{SQLContext, Row}
/**
- * Entry in vocabulary
+ * Entry in vocabulary
*/
private case class VocabWord(
var word: String,
@@ -56,18 +56,18 @@ private case class VocabWord(
* :: Experimental ::
* Word2Vec creates vector representation of words in a text corpus.
* The algorithm first constructs a vocabulary from the corpus
- * and then learns vector representation of words in the vocabulary.
- * The vector representation can be used as features in
+ * and then learns vector representation of words in the vocabulary.
+ * The vector representation can be used as features in
* natural language processing and machine learning algorithms.
- *
- * We used skip-gram model in our implementation and hierarchical softmax
+ *
+ * We used skip-gram model in our implementation and hierarchical softmax
* method to train the model. The variable names in the implementation
* matches the original C implementation.
*
- * For original C implementation, see https://code.google.com/p/word2vec/
- * For research papers, see
+ * For original C implementation, see https://code.google.com/p/word2vec/
+ * For research papers, see
* Efficient Estimation of Word Representations in Vector Space
- * and
+ * and
* Distributed Representations of Words and Phrases and their Compositionality.
*/
@Experimental
@@ -79,7 +79,7 @@ class Word2Vec extends Serializable with Logging {
private var numIterations = 1
private var seed = Utils.random.nextLong()
private var minCount = 5
-
+
/**
* Sets vector size (default: 100).
*/
@@ -122,15 +122,15 @@ class Word2Vec extends Serializable with Logging {
this
}
- /**
- * Sets minCount, the minimum number of times a token must appear to be included in the word2vec
+ /**
+ * Sets minCount, the minimum number of times a token must appear to be included in the word2vec
* model's vocabulary (default: 5).
*/
def setMinCount(minCount: Int): this.type = {
this.minCount = minCount
this
}
-
+
private val EXP_TABLE_SIZE = 1000
private val MAX_EXP = 6
private val MAX_CODE_LENGTH = 40
@@ -150,13 +150,13 @@ class Word2Vec extends Serializable with Logging {
.map(x => VocabWord(
x._1,
x._2,
- new Array[Int](MAX_CODE_LENGTH),
- new Array[Int](MAX_CODE_LENGTH),
+ new Array[Int](MAX_CODE_LENGTH),
+ new Array[Int](MAX_CODE_LENGTH),
0))
.filter(_.cn >= minCount)
.collect()
.sortWith((a, b) => a.cn > b.cn)
-
+
vocabSize = vocab.length
require(vocabSize > 0, "The vocabulary size should be > 0. You may need to check " +
"the setting of minCount, which could be large enough to remove all your words in sentences.")
@@ -198,8 +198,8 @@ class Word2Vec extends Serializable with Logging {
}
var pos1 = vocabSize - 1
var pos2 = vocabSize
-
- var min1i = 0
+
+ var min1i = 0
var min2i = 0
a = 0
@@ -268,15 +268,15 @@ class Word2Vec extends Serializable with Logging {
val words = dataset.flatMap(x => x)
learnVocab(words)
-
+
createBinaryTree()
-
+
val sc = dataset.context
val expTable = sc.broadcast(createExpTable())
val bcVocab = sc.broadcast(vocab)
val bcVocabHash = sc.broadcast(vocabHash)
-
+
val sentences: RDD[Array[Int]] = words.mapPartitions { iter =>
new Iterator[Array[Int]] {
def hasNext: Boolean = iter.hasNext
@@ -297,7 +297,7 @@ class Word2Vec extends Serializable with Logging {
}
}
}
-
+
val newSentences = sentences.repartition(numPartitions).cache()
val initRandom = new XORShiftRandom(seed)
@@ -402,7 +402,7 @@ class Word2Vec extends Serializable with Logging {
}
}
newSentences.unpersist()
-
+
val word2VecMap = mutable.HashMap.empty[String, Array[Float]]
var i = 0
while (i < vocabSize) {
@@ -480,7 +480,7 @@ class Word2VecModel private[mllib] (
/**
* Transforms a word to its vector representation
- * @param word a word
+ * @param word a word
* @return vector representation of word
*/
def transform(word: String): Vector = {
@@ -495,7 +495,7 @@ class Word2VecModel private[mllib] (
/**
* Find synonyms of a word
* @param word a word
- * @param num number of synonyms to find
+ * @param num number of synonyms to find
* @return array of (word, cosineSimilarity)
*/
def findSynonyms(word: String, num: Int): Array[(String, Double)] = {
@@ -506,7 +506,7 @@ class Word2VecModel private[mllib] (
/**
* Find synonyms of the vector representation of a word
* @param vector vector representation of a word
- * @param num number of synonyms to find
+ * @param num number of synonyms to find
* @return array of (word, cosineSimilarity)
*/
def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
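Not part of the patch: the class doc earlier in this file summarizes the skip-gram / hierarchical-softmax model; a minimal usage sketch with the setters and lookup methods that appear in this diff, assuming an RDD[Seq[String]] of tokenized sentences named `sentences`:

    import org.apache.spark.mllib.feature.Word2Vec

    // Assumes an RDD[Seq[String]] of tokenized sentences named `sentences`.
    val word2vec = new Word2Vec()
      .setVectorSize(100)  // dimensionality of the learned vectors
      .setMinCount(5)      // tokens seen fewer than 5 times are dropped from the vocabulary
    val model = word2vec.fit(sentences)

    val vector = model.transform("spark")       // vector representation of one word
    val synonyms = model.findSynonyms("spark", 5) // Array[(word, cosineSimilarity)]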
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index ec38529cf8..557119f7b1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -228,7 +228,7 @@ private[spark] object BLAS extends Serializable with Logging {
}
_nativeBLAS
}
-
+
/**
* A := alpha * x * x^T^ + A
* @param alpha a real scalar that will be multiplied to x * x^T^.
@@ -264,7 +264,7 @@ private[spark] object BLAS extends Serializable with Logging {
j += 1
}
i += 1
- }
+ }
}
private def syr(alpha: Double, x: SparseVector, A: DenseMatrix) {
@@ -505,7 +505,7 @@ private[spark] object BLAS extends Serializable with Logging {
nativeBLAS.dgemv(tStrA, mA, nA, alpha, A.values, mA, x.values, 1, beta,
y.values, 1)
}
-
+
/**
* y := alpha * A * x + beta * y
* For `DenseMatrix` A and `SparseVector` x.
@@ -557,7 +557,7 @@ private[spark] object BLAS extends Serializable with Logging {
}
}
}
-
+
/**
* y := alpha * A * x + beta * y
* For `SparseMatrix` A and `SparseVector` x.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index 866936aa4f..ae3ba3099c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -81,7 +81,7 @@ private[mllib] object EigenValueDecomposition {
require(n * ncv.toLong <= Integer.MAX_VALUE && ncv * (ncv.toLong + 8) <= Integer.MAX_VALUE,
s"k = $k and/or n = $n are too large to compute an eigendecomposition")
-
+
var ido = new intW(0)
var info = new intW(0)
var resid = new Array[Double](n)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
index 34b447584e..622b53a252 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
@@ -27,10 +27,10 @@ import org.apache.spark.mllib.regression.GeneralizedLinearModel
* PMML Model Export for GeneralizedLinearModel class with binary ClassificationModel
*/
private[mllib] class BinaryClassificationPMMLModelExport(
- model : GeneralizedLinearModel,
+ model : GeneralizedLinearModel,
description : String,
normalizationMethod : RegressionNormalizationMethodType,
- threshold: Double)
+ threshold: Double)
extends PMMLModelExport {
populateBinaryClassificationPMML()
@@ -72,7 +72,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
.withUsageType(FieldUsageType.ACTIVE))
regressionTableYES.withNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
}
-
+
// add target field
val targetField = FieldName.create("target")
dataDictionary
@@ -80,9 +80,9 @@ private[mllib] class BinaryClassificationPMMLModelExport(
miningSchema
.withMiningFields(new MiningField(targetField)
.withUsageType(FieldUsageType.TARGET))
-
+
dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)
-
+
pmml.setDataDictionary(dataDictionary)
pmml.withModels(regressionModel)
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
index ebdeae50bb..c5fdecd3ca 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
@@ -25,7 +25,7 @@ import scala.beans.BeanProperty
import org.dmg.pmml.{Application, Header, PMML, Timestamp}
private[mllib] trait PMMLModelExport {
-
+
/**
* Holder of the exported model in PMML format
*/
@@ -33,7 +33,7 @@ private[mllib] trait PMMLModelExport {
val pmml: PMML = new PMML
setHeader(pmml)
-
+
private def setHeader(pmml: PMML): Unit = {
val version = getClass.getPackage.getImplementationVersion
val app = new Application().withName("Apache Spark MLlib").withVersion(version)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
index c16e83d6a0..29bd689e11 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
@@ -27,9 +27,9 @@ import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel
private[mllib] object PMMLModelExportFactory {
-
+
/**
- * Factory object to help creating the necessary PMMLModelExport implementation
+ * Factory object to help creating the necessary PMMLModelExport implementation
* taking as input the machine learning model (for example KMeansModel).
*/
def createPMMLModelExport(model: Any): PMMLModelExport = {
@@ -44,7 +44,7 @@ private[mllib] object PMMLModelExportFactory {
new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
case svm: SVMModel =>
new BinaryClassificationPMMLModelExport(
- svm, "linear SVM", RegressionNormalizationMethodType.NONE,
+ svm, "linear SVM", RegressionNormalizationMethodType.NONE,
svm.getThreshold.getOrElse(0.0))
case logistic: LogisticRegressionModel =>
if (logistic.numClasses == 2) {
@@ -60,5 +60,5 @@ private[mllib] object PMMLModelExportFactory {
"PMML Export not supported for model: " + model.getClass.getName)
}
}
-
+
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index 7db5a14fd4..174d5e0f6c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -234,7 +234,7 @@ object RandomRDDs {
*
* @param sc SparkContext used to create the RDD.
* @param shape shape parameter (> 0) for the gamma distribution
- * @param scale scale parameter (> 0) for the gamma distribution
+ * @param scale scale parameter (> 0) for the gamma distribution
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
@@ -293,7 +293,7 @@ object RandomRDDs {
*
* @param sc SparkContext used to create the RDD.
* @param mean mean for the log normal distribution
- * @param std standard deviation for the log normal distribution
+ * @param std standard deviation for the log normal distribution
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
@@ -671,7 +671,7 @@ object RandomRDDs {
*
* @param sc SparkContext used to create the RDD.
* @param shape shape parameter (> 0) for the gamma distribution.
- * @param scale scale parameter (> 0) for the gamma distribution.
+ * @param scale scale parameter (> 0) for the gamma distribution.
* @param numRows Number of Vectors in the RDD.
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
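Not part of the patch: a minimal sketch of the gamma generators whose @param docs are touched above (the shape/scale values are illustrative):

    import org.apache.spark.mllib.random.RandomRDDs

    // 10,000 draws from Gamma(shape = 2.0, scale = 3.0); partitions default to sc.defaultParallelism.
    val gammaScalars = RandomRDDs.gammaRDD(sc, 2.0, 3.0, 10000L)

    // 10,000 vectors, each holding 5 independent Gamma(2.0, 3.0) draws.
    val gammaVectors = RandomRDDs.gammaVectorRDD(sc, 2.0, 3.0, 10000L, 5)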
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index dddefe1944..93290e6508 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -175,7 +175,7 @@ class ALS private (
/**
* :: DeveloperApi ::
* Sets storage level for final RDDs (user/product used in MatrixFactorizationModel). The default
- * value is `MEMORY_AND_DISK`. Users can change it to a serialized storage, e.g.
+ * value is `MEMORY_AND_DISK`. Users can change it to a serialized storage, e.g.
* `MEMORY_AND_DISK_SER` and set `spark.rdd.compress` to `true` to reduce the space requirement,
* at the cost of speed.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 96e50faca2..f3b46c75c0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -170,15 +170,15 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
case class Data(boundary: Double, prediction: Double)
def save(
- sc: SparkContext,
- path: String,
- boundaries: Array[Double],
- predictions: Array[Double],
+ sc: SparkContext,
+ path: String,
+ boundaries: Array[Double],
+ predictions: Array[Double],
isotonic: Boolean): Unit = {
val sqlContext = new SQLContext(sc)
val metadata = compact(render(
- ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
+ ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
("isotonic" -> isotonic)))
sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path))
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
index cd6add9d60..cf51b24ff7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -29,102 +29,102 @@ import org.apache.spark.mllib.util.MLUtils
* the event that the covariance matrix is singular, the density will be computed in a
* reduced dimensional subspace under which the distribution is supported.
* (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]])
- *
+ *
* @param mu The mean vector of the distribution
* @param sigma The covariance matrix of the distribution
*/
@DeveloperApi
class MultivariateGaussian (
- val mu: Vector,
+ val mu: Vector,
val sigma: Matrix) extends Serializable {
require(sigma.numCols == sigma.numRows, "Covariance matrix must be square")
require(mu.size == sigma.numCols, "Mean vector length must match covariance matrix size")
-
+
private val breezeMu = mu.toBreeze.toDenseVector
-
+
/**
* private[mllib] constructor
- *
+ *
* @param mu The mean vector of the distribution
* @param sigma The covariance matrix of the distribution
*/
private[mllib] def this(mu: DBV[Double], sigma: DBM[Double]) = {
this(Vectors.fromBreeze(mu), Matrices.fromBreeze(sigma))
}
-
+
/**
* Compute distribution dependent constants:
* rootSigmaInv = D^(-1/2)^ * U, where sigma = U * D * U.t
- * u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
+ * u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
*/
private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
-
+
/** Returns density of this multivariate Gaussian at given point, x */
def pdf(x: Vector): Double = {
pdf(x.toBreeze)
}
-
+
/** Returns the log-density of this multivariate Gaussian at given point, x */
def logpdf(x: Vector): Double = {
logpdf(x.toBreeze)
}
-
+
/** Returns density of this multivariate Gaussian at given point, x */
private[mllib] def pdf(x: BV[Double]): Double = {
math.exp(logpdf(x))
}
-
+
/** Returns the log-density of this multivariate Gaussian at given point, x */
private[mllib] def logpdf(x: BV[Double]): Double = {
val delta = x - breezeMu
val v = rootSigmaInv * delta
u + v.t * v * -0.5
}
-
+
/**
* Calculate distribution dependent components used for the density function:
* pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu))
* where k is length of the mean vector.
- *
- * We here compute distribution-fixed parts
+ *
+ * We here compute distribution-fixed parts
* log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
* and
* D^(-1/2)^ * U, where sigma = U * D * U.t
- *
+ *
* Both the determinant and the inverse can be computed from the singular value decomposition
* of sigma. Noting that covariance matrices are always symmetric and positive semi-definite,
* we can use the eigendecomposition. We also do not compute the inverse directly; noting
- * that
- *
+ * that
+ *
* sigma = U * D * U.t
- * inv(Sigma) = U * inv(D) * U.t
+ * inv(Sigma) = U * inv(D) * U.t
* = (D^{-1/2}^ * U).t * (D^{-1/2}^ * U)
- *
+ *
* and thus
- *
+ *
* -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2}^ * U * (x-mu))^2^
- *
- * To guard against singular covariance matrices, this method computes both the
+ *
+ * To guard against singular covariance matrices, this method computes both the
* pseudo-determinant and the pseudo-inverse (Moore-Penrose). Singular values are considered
* to be non-zero only if they exceed a tolerance based on machine precision, matrix size, and
* relation to the maximum singular value (same tolerance used by, e.g., Octave).
*/
private def calculateCovarianceConstants: (DBM[Double], Double) = {
val eigSym.EigSym(d, u) = eigSym(sigma.toBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t
-
+
// For numerical stability, values are considered to be non-zero only if they exceed tol.
// This prevents any inverted value from exceeding (eps * n * max(d))^-1
val tol = MLUtils.EPSILON * max(d) * d.length
-
+
try {
// log(pseudo-determinant) is sum of the logs of all non-zero singular values
val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum
-
- // calculate the root-pseudo-inverse of the diagonal matrix of singular values
+
+ // calculate the root-pseudo-inverse of the diagonal matrix of singular values
// by inverting the square root of all non-zero values
val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))
-
+
(pinvS * u, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
} catch {
case uex: UnsupportedOperationException =>
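Condensing the derivation spelled out in the comment above (not part of the patch): with the eigendecomposition sigma = U * D * U.t, the two constants cached by calculateCovarianceConstants reduce logpdf to a single matrix-vector product,

\[
  \log \mathrm{pdf}(x) \;=\; u \;-\; \tfrac{1}{2}\,\bigl\lVert\, \mathrm{rootSigmaInv}\,(x-\mu) \,\bigr\rVert^{2},
  \qquad
  u \;=\; -\tfrac{1}{2}\Bigl(k\,\log(2\pi) \;+\; \log \operatorname{pdet}(\Sigma)\Bigr),
\]

where rootSigmaInv = D^(-1/2) * U in the comment's notation, pdet is the pseudo-determinant (the product of eigenvalues above the tolerance tol), and k = mu.size.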
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index e3ddc70536..a835f96d5d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -270,7 +270,7 @@ object GradientBoostedTrees extends Logging {
logInfo(s"$timer")
if (persistedInput) input.unpersist()
-
+
if (validate) {
new GradientBoostedTreesModel(
boostingStrategy.treeStrategy.algo,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 99d0e3cf2f..069959976a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -474,7 +474,7 @@ object RandomForest extends Serializable with Logging {
val (treeIndex, node) = nodeQueue.head
// Choose subset of features for node (if subsampling).
val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
- Some(SamplingUtils.reservoirSampleAndCount(Range(0,
+ Some(SamplingUtils.reservoirSampleAndCount(Range(0,
metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong)._1)
} else {
None
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 681f4c618d..541f3288b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -265,7 +265,7 @@ object MLUtils {
}
Vectors.fromBreeze(vector1)
}
-
+
/**
* Returns the squared Euclidean distance between two vectors. The following formula will be used
* if it does not introduce too much numerical error:
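The doc comment above is cut off by the diff context; the formula it refers to is, presumably, the standard expansion (not part of the patch)

\[
  \lVert a - b \rVert_2^2 \;=\; \lVert a \rVert_2^2 \;+\; \lVert b \rVert_2^2 \;-\; 2\,a^{\top} b,
\]

which lets the squared distance be computed from cached norms plus one dot product, with a fall back to direct subtraction when this form would lose too much precision.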
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 9da0618abd..36a1ac6b79 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -38,7 +38,7 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext
val dataset = sqlContext.createDataFrame(
sc.parallelize(LinearDataGenerator.generateLinearInput(
6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
-
+
/**
* Using the following R code to load the data, train the model and evaluate metrics.
*
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
index d4631518e0..7953bd0417 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
@@ -47,7 +47,7 @@ class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext {
test("Binarize continuous features with setter") {
val threshold: Double = 0.2
- val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
+ val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
val dataFrame: DataFrame = sqlContext.createDataFrame(
data.zip(thresholdBinarized)).toDF("feature", "expected")
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index a3b085e441..b218d72f12 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -46,7 +46,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
}
}
-
+
test("two clusters") {
val data = sc.parallelize(GaussianTestData.data)
@@ -62,7 +62,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
val Ew = Array(1.0 / 3.0, 2.0 / 3.0)
val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604))
val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644)))
-
+
val gmm = new GaussianMixture()
.setK(2)
.setInitialModel(initialGmm)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
index 3903712879..19e65f1b53 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
@@ -56,7 +56,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
predictions(a.cluster) += a.id
}
assert(predictions.toSet == Set((0 to 3).toSet, (4 to 15).toSet))
-
+
val model2 = new PowerIterationClustering()
.setK(2)
.setInitializationMode("degree")
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
index bcc2e657f3..b0f3f71113 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
@@ -139,7 +139,7 @@ class BLASSuite extends SparkFunSuite {
syr(alpha, x, dA)
assert(dA ~== expected absTol 1e-15)
-
+
val dB =
new DenseMatrix(3, 4, Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0))
@@ -148,7 +148,7 @@ class BLASSuite extends SparkFunSuite {
syr(alpha, x, dB)
}
}
-
+
val dC =
new DenseMatrix(3, 3, Array(0.0, 1.2, 2.2, 1.2, 3.2, 5.3, 2.2, 5.3, 1.8))
@@ -157,7 +157,7 @@ class BLASSuite extends SparkFunSuite {
syr(alpha, x, dC)
}
}
-
+
val y = new DenseVector(Array(0.0, 2.7, 3.5, 2.1, 1.5))
withClue("Size of vector must match the rank of matrix") {
@@ -255,13 +255,13 @@ class BLASSuite extends SparkFunSuite {
val dA =
new DenseMatrix(4, 3, Array(0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0))
val sA = new SparseMatrix(4, 3, Array(0, 1, 3, 4), Array(1, 0, 2, 3), Array(1.0, 2.0, 1.0, 3.0))
-
+
val dA2 =
new DenseMatrix(4, 3, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0), true)
val sA2 =
new SparseMatrix(4, 3, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0),
true)
-
+
val dx = new DenseVector(Array(1.0, 2.0, 3.0))
val sx = dx.toSparse
val expected = new DenseVector(Array(4.0, 1.0, 2.0, 9.0))
@@ -270,7 +270,7 @@ class BLASSuite extends SparkFunSuite {
assert(sA.multiply(dx) ~== expected absTol 1e-15)
assert(dA.multiply(sx) ~== expected absTol 1e-15)
assert(sA.multiply(sx) ~== expected absTol 1e-15)
-
+
val y1 = new DenseVector(Array(1.0, 3.0, 1.0, 0.0))
val y2 = y1.copy
val y3 = y1.copy
@@ -287,7 +287,7 @@ class BLASSuite extends SparkFunSuite {
val y14 = y1.copy
val y15 = y1.copy
val y16 = y1.copy
-
+
val expected2 = new DenseVector(Array(6.0, 7.0, 4.0, 9.0))
val expected3 = new DenseVector(Array(10.0, 8.0, 6.0, 18.0))
@@ -295,42 +295,42 @@ class BLASSuite extends SparkFunSuite {
gemv(1.0, sA, dx, 2.0, y2)
gemv(1.0, dA, sx, 2.0, y3)
gemv(1.0, sA, sx, 2.0, y4)
-
+
gemv(1.0, dA2, dx, 2.0, y5)
gemv(1.0, sA2, dx, 2.0, y6)
gemv(1.0, dA2, sx, 2.0, y7)
gemv(1.0, sA2, sx, 2.0, y8)
-
+
gemv(2.0, dA, dx, 2.0, y9)
gemv(2.0, sA, dx, 2.0, y10)
gemv(2.0, dA, sx, 2.0, y11)
gemv(2.0, sA, sx, 2.0, y12)
-
+
gemv(2.0, dA2, dx, 2.0, y13)
gemv(2.0, sA2, dx, 2.0, y14)
gemv(2.0, dA2, sx, 2.0, y15)
gemv(2.0, sA2, sx, 2.0, y16)
-
+
assert(y1 ~== expected2 absTol 1e-15)
assert(y2 ~== expected2 absTol 1e-15)
assert(y3 ~== expected2 absTol 1e-15)
assert(y4 ~== expected2 absTol 1e-15)
-
+
assert(y5 ~== expected2 absTol 1e-15)
assert(y6 ~== expected2 absTol 1e-15)
assert(y7 ~== expected2 absTol 1e-15)
assert(y8 ~== expected2 absTol 1e-15)
-
+
assert(y9 ~== expected3 absTol 1e-15)
assert(y10 ~== expected3 absTol 1e-15)
assert(y11 ~== expected3 absTol 1e-15)
assert(y12 ~== expected3 absTol 1e-15)
-
+
assert(y13 ~== expected3 absTol 1e-15)
assert(y14 ~== expected3 absTol 1e-15)
assert(y15 ~== expected3 absTol 1e-15)
assert(y16 ~== expected3 absTol 1e-15)
-
+
withClue("columns of A don't match the rows of B") {
intercept[Exception] {
gemv(1.0, dA.transpose, dx, 2.0, y1)
@@ -345,12 +345,12 @@ class BLASSuite extends SparkFunSuite {
gemv(1.0, sA.transpose, sx, 2.0, y1)
}
}
-
+
val dAT =
new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0))
val sAT =
new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0))
-
+
val dATT = dAT.transpose
val sATT = sAT.transpose
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index c6d29dcdb0..c4ae0a16f7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -214,13 +214,13 @@ class VectorsSuite extends SparkFunSuite {
val squaredDist = breezeSquaredDistance(sparseVector1.toBreeze, sparseVector2.toBreeze)
- // SparseVector vs. SparseVector
- assert(Vectors.sqdist(sparseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
+ // SparseVector vs. SparseVector
+ assert(Vectors.sqdist(sparseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
// DenseVector vs. SparseVector
assert(Vectors.sqdist(denseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
// DenseVector vs. DenseVector
assert(Vectors.sqdist(denseVector1, denseVector2) ~== squaredDist relTol 1E-8)
- }
+ }
}
test("foreachActive") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
index 7a724fc78b..4c6e76e474 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
@@ -53,13 +53,13 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
// ensure logistic regression has normalization method set to LOGIT
assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.LOGIT)
}
-
+
test("linear SVM PMML export") {
val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label)
-
+
val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel)
-
+
// assert that the PMML format is as expected
assert(svmModelExport.isInstanceOf[PMMLModelExport])
val pmml = svmModelExport.getPmml
@@ -80,5 +80,5 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
// ensure linear SVM has normalization method set to NONE
assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.NONE)
}
-
+
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
index a1a683559a..b3f9750afa 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
@@ -45,5 +45,5 @@ class KMeansPMMLModelExportSuite extends SparkFunSuite {
val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel]
assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length)
}
-
+
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
index 0d194005a3..af49450961 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactorySuite.scala
@@ -60,25 +60,25 @@ class PMMLModelExportFactorySuite extends SparkFunSuite {
test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
+ "when passing a LogisticRegressionModel or SVMModel") {
val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
-
+
val logisticRegressionModel =
new LogisticRegressionModel(linearInput(0).features, linearInput(0).label)
val logisticRegressionModelExport =
PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel)
assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport])
-
+
val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label)
val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel)
assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport])
}
-
+
test("PMMLModelExportFactory throw IllegalArgumentException "
+ "when passing a Multinomial Logistic Regression") {
/** 3 classes, 2 features */
val multiclassLogisticRegressionModel = new LogisticRegressionModel(
- weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
+ weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
numFeatures = 2, numClasses = 3)
-
+
intercept[IllegalArgumentException] {
PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel)
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
index 703b623536..aa60deb665 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
@@ -26,39 +26,39 @@ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext
test("univariate") {
val x1 = Vectors.dense(0.0)
val x2 = Vectors.dense(1.5)
-
+
val mu = Vectors.dense(0.0)
val sigma1 = Matrices.dense(1, 1, Array(1.0))
val dist1 = new MultivariateGaussian(mu, sigma1)
assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)
-
+
val sigma2 = Matrices.dense(1, 1, Array(4.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
}
-
+
test("multivariate") {
val x1 = Vectors.dense(0.0, 0.0)
val x2 = Vectors.dense(1.0, 1.0)
-
+
val mu = Vectors.dense(0.0, 0.0)
val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
val dist1 = new MultivariateGaussian(mu, sigma1)
assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)
-
+
val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
}
-
+
test("multivariate degenerate") {
val x1 = Vectors.dense(0.0, 0.0)
val x2 = Vectors.dense(1.0, 1.0)
-
+
val mu = Vectors.dense(0.0, 0.0)
val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
val dist = new MultivariateGaussian(mu, sigma)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index 87b3661f77..734b7babec 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -62,7 +62,7 @@ class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
val fastSquaredDist3 =
fastSquaredDistance(v2, norm2, v3, norm3, precision)
assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m")
- if (m > 10) {
+ if (m > 10) {
val v4 = Vectors.sparse(n, indices.slice(0, m - 10),
indices.map(i => a(i) + 0.5).slice(0, m - 10))
val norm4 = Vectors.norm(v4, 2.0)