SPARK-4022 [CORE] [MLLIB] Replace colt dependency (LGPL) with commons-math

This change replaces usages of colt with commons-math3 equivalents, and makes some minor necessary adjustments to related code and tests to match. Author: Sean Owen <sowen@cloudera.com> Closes #2928 from srowen/SPARK-4022 and squashes the following commits: 61a232f [Sean Owen] Fix failure due to different sampling in JavaAPISuite.sample() 16d66b8 [Sean Owen] Simplify seeding with call to reseedRandomGenerator a1a78e0 [Sean Owen] Use Well19937c 31c7641 [Sean Owen] Fix Python Poisson test by choosing a different seed; about 88% of seeds should work but 1 didn't, it seems 5c9c67f [Sean Owen] Additional test fixes from review d8f88e0 [Sean Owen] Replace colt with commons-math3. Some tests do not pass yet.
author: Sean Owen <sowen@cloudera.com> 2014-10-27 10:53:15 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-10-27 10:53:15 -0700
commit: bfa614b12795f1cfce4de0950f90cb8c4f2a7d53 (patch)
tree: e968afcba36be2d27db36bc17c26f38f1a491a48 /mllib/src/main
parent: 1d7bcc88401d66c8d17a075355acfc25a8b7615c (diff)
download: spark-bfa614b12795f1cfce4de0950f90cb8c4f2a7d53.tar.gz
spark-bfa614b12795f1cfce4de0950f90cb8c4f2a7d53.tar.bz2
spark-bfa614b12795f1cfce4de0950f90cb8c4f2a7d53.zip
3 files changed, 22 insertions, 16 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
index 28179fbc45..51f9b8657c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.mllib.random
 
-import cern.jet.random.Poisson
-import cern.jet.random.engine.DRand
+import org.apache.commons.math3.distribution.PoissonDistribution
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
@@ -89,12 +88,13 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] {
 @DeveloperApi
 class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {
 
-  private var rng = new Poisson(mean, new DRand)
+  private var rng = new PoissonDistribution(mean)
 
-  override def nextValue(): Double = rng.nextDouble()
+  override def nextValue(): Double = rng.sample()
 
   override def setSeed(seed: Long) {
-    rng = new Poisson(mean, new DRand(seed.toInt))
+    rng = new PoissonDistribution(mean)
+    rng.reseedRandomGenerator(seed)
   }
 
   override def copy(): PoissonGenerator = new PoissonGenerator(mean)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
index 0089419c2c..ea82d39b72 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.mllib.stat.test
 
 import breeze.linalg.{DenseMatrix => BDM}
-import cern.jet.stat.Probability.chiSquareComplemented
+import org.apache.commons.math3.distribution.ChiSquaredDistribution
 
 import org.apache.spark.{SparkException, Logging}
 import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
@@ -33,7 +33,7 @@ import scala.collection.mutable
  * on an input of type `Matrix` in which independence between columns is assessed.
  * We also provide a method for computing the chi-squared statistic between each feature and the
  * label for an input `RDD[LabeledPoint]`, return an `Array[ChiSquaredTestResult]` of size =
- * number of features in the inpuy RDD.
+ * number of features in the input RDD.
  *
  * Supported methods for goodness of fit: `pearson` (default)
  * Supported methods for independence: `pearson` (default)
@@ -139,7 +139,7 @@ private[stat] object ChiSqTest extends Logging {
   }
 
   /*
-   * Pearon's goodness of fit test on the input observed and expected counts/relative frequencies.
+   * Pearson's goodness of fit test on the input observed and expected counts/relative frequencies.
    * Uniform distribution is assumed when `expected` is not passed in.
    */
   def chiSquared(observed: Vector,
@@ -188,12 +188,12 @@ private[stat] object ChiSqTest extends Logging {
       }
     }
     val df = size - 1
-    val pValue = chiSquareComplemented(df, statistic)
+    val pValue = 1.0 - new ChiSquaredDistribution(df).cumulativeProbability(statistic)
     new ChiSqTestResult(pValue, df, statistic, PEARSON.name, NullHypothesis.goodnessOfFit.toString)
   }
 
   /*
-   * Pearon's independence test on the input contingency matrix.
+   * Pearson's independence test on the input contingency matrix.
    * TODO: optimize for SparseMatrix when it becomes supported.
    */
   def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSqTestResult = {
@@ -238,7 +238,13 @@ private[stat] object ChiSqTest extends Logging {
       j += 1
     }
     val df = (numCols - 1) * (numRows - 1)
-    val pValue = chiSquareComplemented(df, statistic)
-    new ChiSqTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
+    if (df == 0) {
+      // 1 column or 1 row. Constant distribution is independent of anything.
+      // pValue = 1.0 and statistic = 0.0 in this case.
+      new ChiSqTestResult(1.0, 0, 0.0, methodName, NullHypothesis.independence.toString)
+    } else {
+      val pValue = 1.0 - new ChiSquaredDistribution(df).cumulativeProbability(statistic)
+      new ChiSqTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
+    }
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala
index 937c8a2ac5..e7a2127c5d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.mllib.tree.impl
 
-import cern.jet.random.Poisson
-import cern.jet.random.engine.DRand
+import org.apache.commons.math3.distribution.PoissonDistribution
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
@@ -60,12 +59,13 @@ private[tree] object BaggedPoint {
     input.mapPartitionsWithIndex { (partitionIndex, instances) =>
       // TODO: Support different sampling rates, and sampling without replacement.
       // Use random seed = seed + partitionIndex + 1 to make generation reproducible.
-      val poisson = new Poisson(1.0, new DRand(seed + partitionIndex + 1))
+      val poisson = new PoissonDistribution(1.0)
+      poisson.reseedRandomGenerator(seed + partitionIndex + 1)
       instances.map { instance =>
         val subsampleWeights = new Array[Double](numSubsamples)
         var subsampleIndex = 0
         while (subsampleIndex < numSubsamples) {
-          subsampleWeights(subsampleIndex) = poisson.nextInt()
+          subsampleWeights(subsampleIndex) = poisson.sample()
           subsampleIndex += 1
         }
         new BaggedPoint(instance, subsampleWeights)
author	Sean Owen <sowen@cloudera.com>	2014-10-27 10:53:15 -0700
committer	Xiangrui Meng <meng@databricks.com>	2014-10-27 10:53:15 -0700
commit	bfa614b12795f1cfce4de0950f90cb8c4f2a7d53 (patch)
tree	e968afcba36be2d27db36bc17c26f38f1a491a48 /mllib/src/main
parent	1d7bcc88401d66c8d17a075355acfc25a8b7615c (diff)
download	spark-bfa614b12795f1cfce4de0950f90cb8c4f2a7d53.tar.gz spark-bfa614b12795f1cfce4de0950f90cb8c4f2a7d53.tar.bz2 spark-bfa614b12795f1cfce4de0950f90cb8c4f2a7d53.zip