From d3db2fd66752e80865e9c7a75d8e8d945121697e Mon Sep 17 00:00:00 2001
From: DB Tsai
Date: Wed, 13 May 2015 22:23:21 -0700
Subject: [SPARK-7620] [ML] [MLLIB] Removed calling size, length in while
 condition to avoid extra JVM call

Author: DB Tsai

Closes #6137 from dbtsai/clean and squashes the following commits:

185816d [DB Tsai] fix compilation issue
f418d08 [DB Tsai] first commit
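
The pattern applied throughout is sketched below (a minimal illustration
with made-up names; this code is not from any of the touched files). For a
plain Array the JIT can often hoist the bound itself, but a call such as
.size goes through an implicit conversion, and on vector types the bound
is a real method call, so reading it once into a local is cheaper:

    // Before: the loop condition re-evaluates the bound on every pass;
    // `xs.size` on an Array goes through the ArrayOps implicit.
    def sumBefore(xs: Array[Double]): Double = {
      var sum = 0.0
      var i = 0
      while (i < xs.size) {
        sum += xs(i)
        i += 1
      }
      sum
    }

    // After: the bound is read once into a local val before the loop.
    def sumAfter(xs: Array[Double]): Double = {
      var sum = 0.0
      val len = xs.length
      var i = 0
      while (i < len) {
        sum += xs(i)
        i += 1
      }
      sum
    }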
---
 .../ml/classification/LogisticRegression.scala     |  9 +++--
 .../org/apache/spark/ml/feature/Bucketizer.scala   |  3 +-
 .../apache/spark/ml/feature/VectorIndexer.scala    |  6 ++-
 .../spark/ml/regression/LinearRegression.scala     |  6 ++-
 .../apache/spark/mllib/feature/ChiSqSelector.scala |  3 +-
 .../apache/spark/mllib/optimization/Updater.scala  |  3 +-
 .../mllib/regression/IsotonicRegression.scala      |  8 ++--
 .../mllib/stat/MultivariateOnlineSummarizer.scala  | 47 +++++++++++++---------
 .../apache/spark/mllib/stat/test/ChiSqTest.scala   |  6 ++-
 .../spark/mllib/tree/impurity/Impurity.scala       | 14 ++++---
 .../spark/mllib/util/LinearDataGenerator.scala     |  3 +-
 .../apache/spark/ml/feature/BucketizerSuite.scala  |  6 ++-
 .../classification/LogisticRegressionSuite.scala   |  3 +-
 13 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 93ba91167b..2b10362687 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -258,7 +258,8 @@ class LogisticRegressionModel private[ml] (
     rawPrediction match {
       case dv: DenseVector =>
         var i = 0
-        while (i < dv.size) {
+        val size = dv.size
+        while (i < size) {
           dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i)))
           i += 1
         }
@@ -357,7 +358,8 @@ private[classification] class MultiClassSummarizer extends Serializable {
   def histogram: Array[Long] = {
     val result = Array.ofDim[Long](numClasses)
     var i = 0
-    while (i < result.length) {
+    val len = result.length
+    while (i < len) {
       result(i) = distinctMap.getOrElse(i, 0L)
       i += 1
     }
@@ -480,7 +482,8 @@ private class LogisticAggregator(
       var i = 0
       val localThisGradientSumArray = this.gradientSumArray
       val localOtherGradientSumArray = other.gradientSumArray
-      while (i < localThisGradientSumArray.length) {
+      val len = localThisGradientSumArray.length
+      while (i < len) {
         localThisGradientSumArray(i) += localOtherGradientSumArray(i)
         i += 1
       }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index e52d797293..d8f1961cb3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -98,7 +98,8 @@ private[feature] object Bucketizer {
       false
     } else {
       var i = 0
-      while (i < splits.length - 1) {
+      val n = splits.length - 1
+      while (i < n) {
        if (splits(i) >= splits(i + 1)) return false
        i += 1
      }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 2e6313ac14..0f83a29c86 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -189,7 +189,8 @@ private object VectorIndexer {
 
     private def addDenseVector(dv: DenseVector): Unit = {
       var i = 0
-      while (i < dv.size) {
+      val size = dv.size
+      while (i < size) {
        if (featureValueSets(i).size <= maxCategories) {
          featureValueSets(i).add(dv(i))
        }
@@ -201,7 +202,8 @@ private object VectorIndexer {
       // TODO: This might be able to handle 0's more efficiently.
       var vecIndex = 0 // index into vector
       var k = 0 // index into non-zero elements
-      while (vecIndex < sv.size) {
+      val size = sv.size
+      while (vecIndex < size) {
        val featureValue = if (k < sv.indices.length && vecIndex == sv.indices(k)) {
          k += 1
          sv.values(k - 1)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 6377923afc..36c242bb5f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -167,7 +167,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress
     val weights = {
       val rawWeights = state.x.toArray.clone()
       var i = 0
-      while (i < rawWeights.length) {
+      val len = rawWeights.length
+      while (i < len) {
        rawWeights(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 }
        i += 1
      }
@@ -307,7 +308,8 @@ private class LeastSquaresAggregator(
     val weightsArray = weights.toArray.clone()
     var sum = 0.0
     var i = 0
-    while (i < weightsArray.length) {
+    val len = weightsArray.length
+    while (i < len) {
       if (featuresStd(i) != 0.0) {
         weightsArray(i) /= featuresStd(i)
         sum += weightsArray(i) * featuresMean(i)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index c6057c7f83..9cc2d0ffca 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -38,7 +38,8 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf
 
   protected def isSorted(array: Array[Int]): Boolean = {
     var i = 1
-    while (i < array.length) {
+    val len = array.length
+    while (i < len) {
       if (array(i) < array(i-1)) return false
       i += 1
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala
index 3ed3a5b9b3..9f463e0caf 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala
@@ -116,7 +116,8 @@ class L1Updater extends Updater {
     // Apply proximal operator (soft thresholding)
     val shrinkageVal = regParam * thisIterStepSize
     var i = 0
-    while (i < brzWeights.length) {
+    val len = brzWeights.length
+    while (i < len) {
       val wi = brzWeights(i)
       brzWeights(i) = signum(wi) * max(0.0, abs(wi) - shrinkageVal)
       i += 1
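
A side note on why these hot paths use manual while loops at all: in the
Scala versions Spark 1.x targeted, a for comprehension over a range
desugars to a Range.foreach call taking a closure, which is noticeably
slower in tight numeric loops. A small illustrative comparison (made-up
name, not code from the patch):

    // Desugars to (0 until xs.length).foreach(closure): closure
    // allocation plus a megamorphic call site per element.
    def sumFor(xs: Array[Double]): Double = {
      var sum = 0.0
      for (i <- 0 until xs.length) { sum += xs(i) }
      sum
    }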
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index be2a00c2df..4ce541ae5b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -69,7 +69,8 @@ class IsotonicRegressionModel (
   /** Asserts the input array is monotone with the given ordering. */
   private def assertOrdered(xs: Array[Double])(implicit ord: Ordering[Double]): Unit = {
     var i = 1
-    while (i < xs.length) {
+    val len = xs.length
+    while (i < len) {
       require(ord.compare(xs(i - 1), xs(i)) <= 0,
         s"Elements (${xs(i - 1)}, ${xs(i)}) are not ordered.")
       i += 1
@@ -329,11 +330,12 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
     }
 
     var i = 0
-    while (i < input.length) {
+    val len = input.length
+    while (i < len) {
       var j = i
 
       // Find monotonicity violating sequence, if any.
-      while (j < input.length - 1 && input(j)._1 > input(j + 1)._1) {
+      while (j < len - 1 && input(j)._1 > input(j + 1)._1) {
        j = j + 1
      }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index fcc2a14879..0b1755613a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -70,23 +70,30 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     require(n == sample.size, s"Dimensions mismatch when adding new sample." +
       s" Expecting $n but got ${sample.size}.")
 
+    val localCurrMean= currMean
+    val localCurrM2n = currM2n
+    val localCurrM2 = currM2
+    val localCurrL1 = currL1
+    val localNnz = nnz
+    val localCurrMax = currMax
+    val localCurrMin = currMin
     sample.foreachActive { (index, value) =>
       if (value != 0.0) {
-        if (currMax(index) < value) {
-          currMax(index) = value
+        if (localCurrMax(index) < value) {
+          localCurrMax(index) = value
         }
-        if (currMin(index) > value) {
-          currMin(index) = value
+        if (localCurrMin(index) > value) {
+          localCurrMin(index) = value
         }
 
-        val prevMean = currMean(index)
+        val prevMean = localCurrMean(index)
         val diff = value - prevMean
-        currMean(index) = prevMean + diff / (nnz(index) + 1.0)
-        currM2n(index) += (value - currMean(index)) * diff
-        currM2(index) += value * value
-        currL1(index) += math.abs(value)
+        localCurrMean(index) = prevMean + diff / (localNnz(index) + 1.0)
+        localCurrM2n(index) += (value - localCurrMean(index)) * diff
+        localCurrM2(index) += value * value
+        localCurrL1(index) += math.abs(value)
 
-        nnz(index) += 1.0
+        localNnz(index) += 1.0
       }
     }
 
@@ -130,14 +137,14 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
       }
     } else if (totalCnt == 0 && other.totalCnt != 0) {
       this.n = other.n
-      this.currMean = other.currMean.clone
-      this.currM2n = other.currM2n.clone
-      this.currM2 = other.currM2.clone
-      this.currL1 = other.currL1.clone
+      this.currMean = other.currMean.clone()
+      this.currM2n = other.currM2n.clone()
+      this.currM2 = other.currM2.clone()
+      this.currL1 = other.currL1.clone()
       this.totalCnt = other.totalCnt
-      this.nnz = other.nnz.clone
-      this.currMax = other.currMax.clone
-      this.currMin = other.currMin.clone
+      this.nnz = other.nnz.clone()
+      this.currMax = other.currMax.clone()
+      this.currMin = other.currMin.clone()
     }
     this
   }
@@ -165,7 +172,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     if (denominator > 0.0) {
       val deltaMean = currMean
       var i = 0
-      while (i < currM2n.size) {
+      val len = currM2n.length
+      while (i < len) {
        realVariance(i) = currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) *
          (totalCnt - nnz(i)) / totalCnt
        realVariance(i) /= denominator
@@ -211,7 +219,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     val realMagnitude = Array.ofDim[Double](n)
 
     var i = 0
-    while (i < currM2.size) {
+    val len = currM2.length
+    while (i < len) {
       realMagnitude(i) = math.sqrt(currM2(i))
       i += 1
     }
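
Beyond hoisting loop bounds, the MultivariateOnlineSummarizer change above
also copies hot instance fields into local vals before the per-element
update, so each access goes through a local reference instead of a field
read through `this`. A minimal sketch of that pattern with illustrative
names (not code from the patch):

    class RunningSums(n: Int) {
      private val sums = Array.ofDim[Double](n)

      def addAll(values: Array[Double]): Unit = {
        // Hoist the field once; the loop then uses a local reference
        // instead of re-reading this.sums on every iteration.
        val localSums = sums
        val len = values.length
        var i = 0
        while (i < len) {
          localSums(i) += values(i)
          i += 1
        }
      }
    }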
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
index ea82d39b72..e597fce2ba 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -205,8 +205,10 @@ private[stat] object ChiSqTest extends Logging {
     val colSums = new Array[Double](numCols)
     val rowSums = new Array[Double](numRows)
     val colMajorArr = counts.toArray
+    val colMajorArrLen = colMajorArr.length
+
     var i = 0
-    while (i < colMajorArr.size) {
+    while (i < colMajorArrLen) {
       val elem = colMajorArr(i)
       if (elem < 0.0) {
         throw new IllegalArgumentException("Contingency table cannot contain negative entries.")
@@ -220,7 +222,7 @@
     // second pass to collect statistic
     var statistic = 0.0
     var j = 0
-    while (j < colMajorArr.size) {
+    while (j < colMajorArrLen) {
       val col = j / numRows
       val colSum = colSums(col)
       if (colSum == 0.0) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
index 60e2ab2bb8..72eb24c492 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
@@ -111,11 +111,12 @@ private[tree] abstract class ImpurityCalculator(val stats: Array[Double]) {
    * Add the stats from another calculator into this one, modifying and returning this calculator.
    */
   def add(other: ImpurityCalculator): ImpurityCalculator = {
-    require(stats.size == other.stats.size,
+    require(stats.length == other.stats.length,
       s"Two ImpurityCalculator instances cannot be added with different counts sizes." +
-      s" Sizes are ${stats.size} and ${other.stats.size}.")
+      s" Sizes are ${stats.length} and ${other.stats.length}.")
     var i = 0
-    while (i < other.stats.size) {
+    val len = other.stats.length
+    while (i < len) {
       stats(i) += other.stats(i)
       i += 1
     }
@@ -127,11 +128,12 @@ private[tree] abstract class ImpurityCalculator(val stats: Array[Double]) {
    * calculator.
    */
   def subtract(other: ImpurityCalculator): ImpurityCalculator = {
-    require(stats.size == other.stats.size,
+    require(stats.length == other.stats.length,
       s"Two ImpurityCalculator instances cannot be subtracted with different counts sizes." +
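
The two files above also swap .size for .length on plain arrays. On an
Array, length is the JVM's built-in array length, while size is only
available through the implicit ArrayOps wrapper from Predef, so each call
may allocate a wrapper object (depending on Scala version and whether JIT
escape analysis removes it). A small illustrative sketch:

    val stats = Array(1.0, 2.0, 3.0)
    val a = stats.size   // goes through the Predef implicit to ArrayOps
    val b = stats.length // reads the array length directly
    assert(a == b)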
+ - s" Sizes are ${stats.size} and ${other.stats.size}.") + s" Sizes are ${stats.length} and ${other.stats.length}.") var i = 0 - while (i < other.stats.size) { + val len = other.stats.length + while (i < len) { stats(i) -= other.stats(i) i += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index b1a4517344..b4e33c98ba 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -107,7 +107,8 @@ object LinearDataGenerator { x.foreach { v => var i = 0 - while (i < v.length) { + val len = v.length + while (i < len) { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) i += 1 } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index 1900820400..20d2f3ac66 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -122,7 +122,8 @@ private object BucketizerSuite extends FunSuite { def linearSearchForBuckets(splits: Array[Double], feature: Double): Double = { require(feature >= splits.head) var i = 0 - while (i < splits.length - 1) { + val n = splits.length - 1 + while (i < n) { if (feature < splits(i + 1)) return i i += 1 } @@ -138,7 +139,8 @@ private object BucketizerSuite extends FunSuite { s" ${splits.mkString(", ")}") } var i = 0 - while (i < splits.length - 1) { + val n = splits.length - 1 + while (i < n) { // Split i should fall in bucket i. testFeature(splits(i), i) // Value between splits i,i+1 should be in i, which is also true if the (i+1)-th split is inf. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala index fb0a194718..966811a5a3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala @@ -101,7 +101,8 @@ object LogisticRegressionSuite { // This doesn't work if `vector` is a sparse vector. val vectorArray = vector.toArray var i = 0 - while (i < vectorArray.length) { + val len = vectorArray.length + while (i < len) { vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) i += 1 } -- cgit v1.2.3