[SPARK-12026][MLLIB] ChiSqTest gets slower and slower over time when number of features is large

jira: https://issues.apache.org/jira/browse/SPARK-12026 The issue is valid: features.toArray.view.zipWithIndex.slice(startCol, endCol) becomes slower as startCol gets larger. I tested locally; the change improves performance, and the running time stays stable. Author: Yuhao Yang <hhbyyh@gmail.com> Closes #10146 from hhbyyh/chiSq.
author: Yuhao Yang <hhbyyh@gmail.com> 2016-01-13 17:43:27 -0800
committer: Joseph K. Bradley <joseph@databricks.com> 2016-01-13 17:43:27 -0800
commit: 021dafc6a05a31dc22c9f9110dedb47a1f913087 (patch)
tree: bd2f61d86f90a8b0d9147f26b104e65550f49e0c /mllib
parent: cd81fc9e8652c07b84f0887a24d67381b4e605fa (diff)
download: spark-021dafc6a05a31dc22c9f9110dedb47a1f913087.tar.gz
spark-021dafc6a05a31dc22c9f9110dedb47a1f913087.tar.bz2
spark-021dafc6a05a31dc22c9f9110dedb47a1f913087.zip
1 file changed, 4 insertions, 2 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
index f22f2df320..4a3fb06469 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -109,7 +109,9 @@ private[stat] object ChiSqTest extends Logging {
           }
           i += 1
           distinctLabels += label
-          features.toArray.view.zipWithIndex.slice(startCol, endCol).map { case (feature, col) =>
+          val brzFeatures = features.toBreeze
+          (startCol until endCol).map { col =>
+            val feature = brzFeatures(col)
             allDistinctFeatures(col) += feature
             (col, feature, label)
           }
@@ -122,7 +124,7 @@ private[stat] object ChiSqTest extends Logging {
           pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap
       }
       val numLabels = labels.size
-      pairCounts.keys.groupBy(_._1).map { case (col, keys) =>
+      pairCounts.keys.groupBy(_._1).foreach { case (col, keys) =>
         val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap
         val numRows = features.size
         val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels))
author	Yuhao Yang <hhbyyh@gmail.com>	2016-01-13 17:43:27 -0800
committer	Joseph K. Bradley <joseph@databricks.com>	2016-01-13 17:43:27 -0800
commit	021dafc6a05a31dc22c9f9110dedb47a1f913087 (patch)
tree	bd2f61d86f90a8b0d9147f26b104e65550f49e0c /mllib
parent	cd81fc9e8652c07b84f0887a24d67381b4e605fa (diff)
download	spark-021dafc6a05a31dc22c9f9110dedb47a1f913087.tar.gz spark-021dafc6a05a31dc22c9f9110dedb47a1f913087.tar.bz2 spark-021dafc6a05a31dc22c9f9110dedb47a1f913087.zip