From 69bc2c17f1ca047d4915a4791b624d60c5943dc8 Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 23 Mar 2016 15:08:47 -0700
Subject: [SPARK-13952][ML] Add random seed to GBT

## What changes were proposed in this pull request?

`GBTClassifier` and `GBTRegressor` should use a random seed for reproducible results. Because the current unit tests compare GBTs in ML against GBTs in MLlib for equality, I also added a random seed to the MLlib GBT algorithm. I made alternate constructors in `mllib.tree.GradientBoostedTrees` that accept a random seed, but left them private so as not to change the API unnecessarily.

## How was this patch tested?

Existing unit tests verify that the functionality did not change. Other ML algorithms do not seem to have unit tests that directly exercise random seeding, but reproducibility with seeding for GBTs is effectively verified by the existing tests. I can add more tests if needed.

Author: sethah

Closes #11903 from sethah/SPARK-13952.
---
 .../scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala | 4 +++-
 .../test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala | 4 +++-
 .../scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala | 4 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'mllib/src/test')

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index 29efd675ab..f3680ed044 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -74,6 +74,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
         .setLossType("logistic")
         .setMaxIter(maxIter)
         .setStepSize(learningRate)
+        .setSeed(123)
       compareAPIs(data, None, gbt, categoricalFeatures)
     }
   }
@@ -91,6 +92,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setMaxIter(5)
       .setStepSize(0.1)
       .setCheckpointInterval(2)
+      .setSeed(123)
     val model = gbt.fit(df)

     // copied model must have the same parent.
@@ -159,7 +161,7 @@ private object GBTClassifierSuite extends SparkFunSuite {
     val numFeatures = data.first().features.size
     val oldBoostingStrategy =
       gbt.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Classification)
-    val oldGBT = new OldGBT(oldBoostingStrategy)
+    val oldGBT = new OldGBT(oldBoostingStrategy, gbt.getSeed.toInt)
     val oldModel = oldGBT.run(data)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 2)
     val newModel = gbt.fit(newData)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index db68606397..84148a8a4a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -65,6 +65,7 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
         .setLossType(loss)
         .setMaxIter(maxIter)
         .setStepSize(learningRate)
+        .setSeed(123)
       compareAPIs(data, None, gbt, categoricalFeatures)
     }
   }
@@ -104,6 +105,7 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setMaxIter(5)
       .setStepSize(0.1)
       .setCheckpointInterval(2)
+      .setSeed(123)
     val model = gbt.fit(df)

     sc.checkpointDir = None
@@ -169,7 +171,7 @@ private object GBTRegressorSuite extends SparkFunSuite {
       categoricalFeatures: Map[Int, Int]): Unit = {
     val numFeatures = data.first().features.size
     val oldBoostingStrategy = gbt.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Regression)
-    val oldGBT = new OldGBT(oldBoostingStrategy)
+    val oldGBT = new OldGBT(oldBoostingStrategy, gbt.getSeed.toInt)
     val oldModel = oldGBT.run(data)
     val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
     val newModel = gbt.fit(newData)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
index 58828b3af9..747c267b4f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -171,13 +171,13 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
         categoricalFeaturesInfo = Map.empty)
       val boostingStrategy =
         new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
-      val gbtValidate = new GradientBoostedTrees(boostingStrategy)
+      val gbtValidate = new GradientBoostedTrees(boostingStrategy, seed = 0)
         .runWithValidation(trainRdd, validateRdd)
       val numTrees = gbtValidate.numTrees
       assert(numTrees !== numIterations)

       // Test that it performs better on the validation dataset.
-      val gbt = new GradientBoostedTrees(boostingStrategy).run(trainRdd)
+      val gbt = new GradientBoostedTrees(boostingStrategy, seed = 0).run(trainRdd)
       val (errorWithoutValidation, errorWithValidation) = {
         if (algo == Classification) {
           val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
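For context, here is a minimal sketch (not part of this patch) of how a caller of the ML API could pin the new seed to get reproducible ensembles. The `train` DataFrame and the `trainTwice` helper are illustrative assumptions, not code from the change:

```scala
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.sql.DataFrame

// `train` is an assumed DataFrame with the default "label" and "features" columns.
def trainTwice(train: DataFrame): Unit = {
  val gbt = new GBTClassifier()
    .setLossType("logistic")
    .setMaxIter(5)
    .setStepSize(0.1)
    .setSeed(123) // fixed seed: subsampling and tree construction are deterministic across runs

  val modelA = gbt.fit(train)
  val modelB = gbt.fit(train)
  // With the same seed and the same data, the two ensembles should be structurally identical.
  assert(modelA.toDebugString == modelB.toDebugString)
}
```

Without an explicit `setSeed`, the two fits could differ run to run; with it, the ML estimator passes the same seed down to the underlying MLlib `GradientBoostedTrees` run, which is what the test changes above rely on.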