From 048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng <ruifengz@foxmail.com>
Date: Thu, 24 Mar 2016 09:25:00 +0000
Subject: [SPARK-14030][MLLIB] Add parameter check to MLLIB

## What changes were proposed in this pull request?

add parameter verification to MLLIB, like
numCorrections > 0
tolerance >= 0
iters > 0
regParam >= 0

## How was this patch tested?

manual tests

Author: Ruifeng Zheng <ruifengz@foxmail.com>
Author: Zheng RuiFeng <mllabs@datanode1.(none)>
Author: mllabs <mllabs@datanode1.(none)>
Author: Zheng RuiFeng <ruifengz@foxmail.com>

Closes #11852 from zhengruifeng/lbfgs_check.
---
 .../apache/spark/mllib/classification/NaiveBayes.scala    |  2 ++
 .../apache/spark/mllib/clustering/GaussianMixture.scala   | 14 +++++++++-----
 .../scala/org/apache/spark/mllib/clustering/KMeans.scala  | 11 ++++++++---
 .../scala/org/apache/spark/mllib/clustering/LDA.scala     |  2 ++
 .../spark/mllib/clustering/PowerIterationClustering.scala |  4 ++++
 .../apache/spark/mllib/clustering/StreamingKMeans.scala   |  6 ++++++
 .../main/scala/org/apache/spark/mllib/feature/PCA.scala   |  3 ++-
 .../scala/org/apache/spark/mllib/feature/Word2Vec.scala   | 15 ++++++++++++++-
 .../org/apache/spark/mllib/fpm/AssociationRules.scala     |  3 ++-
 .../main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala  |  4 ++++
 .../apache/spark/mllib/optimization/GradientDescent.scala | 11 ++++++++++-
 .../scala/org/apache/spark/mllib/optimization/LBFGS.scala |  9 ++++++++-
 .../scala/org/apache/spark/mllib/recommendation/ALS.scala | 12 ++++++++++++
 13 files changed, 83 insertions(+), 13 deletions(-)

(limited to 'mllib')

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index bf0d9d9231..eb3ee41f7c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -326,6 +326,8 @@ class NaiveBayes private (
   /** Set the smoothing parameter. Default: 1.0. */
   @Since("0.9.0")
   def setLambda(lambda: Double): NaiveBayes = {
+    require(lambda >= 0,
+      s"Smoothing parameter must be nonnegative but got ${lambda}")
     this.lambda = lambda
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 88dbfe3fcc..03eb903bb8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -78,11 +78,9 @@ class GaussianMixture private (
    */
   @Since("1.3.0")
   def setInitialModel(model: GaussianMixtureModel): this.type = {
-    if (model.k == k) {
-      initialModel = Some(model)
-    } else {
-      throw new IllegalArgumentException("mismatched cluster count (model.k != k)")
-    }
+    require(model.k == k,
+      s"Mismatched cluster count (model.k ${model.k} != k ${k})")
+    initialModel = Some(model)
     this
   }
 
@@ -97,6 +95,8 @@ class GaussianMixture private (
    */
   @Since("1.3.0")
   def setK(k: Int): this.type = {
+    require(k > 0,
+      s"Number of Gaussians must be positive but got ${k}")
     this.k = k
     this
   }
@@ -112,6 +112,8 @@ class GaussianMixture private (
    */
   @Since("1.3.0")
   def setMaxIterations(maxIterations: Int): this.type = {
+    require(maxIterations >= 0,
+      s"Maximum of iterations must be nonnegative but got ${maxIterations}")
     this.maxIterations = maxIterations
     this
   }
@@ -128,6 +130,8 @@ class GaussianMixture private (
    */
   @Since("1.3.0")
   def setConvergenceTol(convergenceTol: Double): this.type = {
+    require(convergenceTol >= 0.0,
+      s"Convergence tolerance must be nonnegative but got ${convergenceTol}")
     this.convergenceTol = convergenceTol
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 26f5600e6c..a7beb81980 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -65,6 +65,8 @@ class KMeans private (
    */
   @Since("0.8.0")
   def setK(k: Int): this.type = {
+    require(k > 0,
+      s"Number of clusters must be positive but got ${k}")
     this.k = k
     this
   }
@@ -80,6 +82,8 @@ class KMeans private (
    */
   @Since("0.8.0")
   def setMaxIterations(maxIterations: Int): this.type = {
+    require(maxIterations >= 0,
+      s"Maximum of iterations must be nonnegative but got ${maxIterations}")
     this.maxIterations = maxIterations
     this
   }
@@ -147,9 +151,8 @@ class KMeans private (
    */
   @Since("0.8.0")
   def setInitializationSteps(initializationSteps: Int): this.type = {
-    if (initializationSteps <= 0) {
-      throw new IllegalArgumentException("Number of initialization steps must be positive")
-    }
+    require(initializationSteps > 0,
+      s"Number of initialization steps must be positive but got ${initializationSteps}")
     this.initializationSteps = initializationSteps
     this
   }
@@ -166,6 +169,8 @@ class KMeans private (
    */
   @Since("0.8.0")
   def setEpsilon(epsilon: Double): this.type = {
+    require(epsilon >= 0,
+      s"Distance threshold must be nonnegative but got ${epsilon}")
     this.epsilon = epsilon
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index fad808857a..12813fd412 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -232,6 +232,8 @@ class LDA private (
    */
   @Since("1.3.0")
   def setMaxIterations(maxIterations: Int): this.type = {
+    require(maxIterations >= 0,
+      s"Maximum of iterations must be nonnegative but got ${maxIterations}")
     this.maxIterations = maxIterations
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index a422303dc9..2e257ff9b7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -137,6 +137,8 @@ class PowerIterationClustering private[clustering] (
    */
   @Since("1.3.0")
   def setK(k: Int): this.type = {
+    require(k > 0,
+      s"Number of clusters must be positive but got ${k}")
     this.k = k
     this
   }
@@ -146,6 +148,8 @@ class PowerIterationClustering private[clustering] (
    */
   @Since("1.3.0")
   def setMaxIterations(maxIterations: Int): this.type = {
+    require(maxIterations >= 0,
+      s"Maximum of iterations must be nonnegative but got ${maxIterations}")
     this.maxIterations = maxIterations
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index a8d7b8fded..4eb8fc049e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -178,6 +178,8 @@ class StreamingKMeans @Since("1.2.0") (
    */
   @Since("1.2.0")
   def setK(k: Int): this.type = {
+    require(k > 0,
+      s"Number of clusters must be positive but got ${k}")
     this.k = k
     this
   }
@@ -187,6 +189,8 @@ class StreamingKMeans @Since("1.2.0") (
    */
   @Since("1.2.0")
   def setDecayFactor(a: Double): this.type = {
+    require(a >= 0,
+      s"Decay factor must be nonnegative but got ${a}")
     this.decayFactor = a
     this
   }
@@ -198,6 +202,8 @@ class StreamingKMeans @Since("1.2.0") (
    */
   @Since("1.2.0")
   def setHalfLife(halfLife: Double, timeUnit: String): this.type = {
+    require(halfLife > 0,
+      s"Half life must be positive but got ${halfLife}")
     if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) {
       throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit)
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 24e0a98c39..30c403e547 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -30,7 +30,8 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.4.0")
 class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
-  require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k")
+  require(k > 0,
+    s"Number of principal components must be positive but got ${k}")
 
   /**
    * Computes a [[PCAModel]] that contains the principal components of the input vectors.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index d3356b783f..5b079fce3a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -84,6 +84,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("2.0.0")
   def setMaxSentenceLength(maxSentenceLength: Int): this.type = {
+    require(maxSentenceLength > 0,
+      s"Maximum length of sentences must be positive but got ${maxSentenceLength}")
     this.maxSentenceLength = maxSentenceLength
     this
   }
@@ -93,6 +95,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("1.1.0")
   def setVectorSize(vectorSize: Int): this.type = {
+    require(vectorSize > 0,
+      s"vector size must be positive but got ${vectorSize}")
     this.vectorSize = vectorSize
     this
   }
@@ -102,6 +106,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("1.1.0")
   def setLearningRate(learningRate: Double): this.type = {
+    require(learningRate > 0,
+      s"Initial learning rate must be positive but got ${learningRate}")
     this.learningRate = learningRate
     this
   }
@@ -111,7 +117,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("1.1.0")
   def setNumPartitions(numPartitions: Int): this.type = {
-    require(numPartitions > 0, s"numPartitions must be greater than 0 but got $numPartitions")
+    require(numPartitions > 0,
+      s"Number of partitions must be positive but got ${numPartitions}")
     this.numPartitions = numPartitions
     this
   }
@@ -122,6 +129,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("1.1.0")
   def setNumIterations(numIterations: Int): this.type = {
+    require(numIterations >= 0,
+      s"Number of iterations must be nonnegative but got ${numIterations}")
     this.numIterations = numIterations
     this
   }
@@ -140,6 +149,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("1.6.0")
   def setWindowSize(window: Int): this.type = {
+    require(window > 0,
+      s"Window of words must be positive but got ${window}")
     this.window = window
     this
   }
@@ -150,6 +161,8 @@ class Word2Vec extends Serializable with Logging {
    */
   @Since("1.3.0")
   def setMinCount(minCount: Int): this.type = {
+    require(minCount >= 0,
+      s"Minimum number of times must be nonnegative but got ${minCount}")
     this.minCount = minCount
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index 5592416964..9a63cc29da 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -50,7 +50,8 @@ class AssociationRules private[fpm] (
    */
   @Since("1.5.0")
   def setMinConfidence(minConfidence: Double): this.type = {
-    require(minConfidence >= 0.0 && minConfidence <= 1.0)
+    require(minConfidence >= 0.0 && minConfidence <= 1.0,
+      s"Minimal confidence must be in range [0, 1] but got ${minConfidence}")
     this.minConfidence = minConfidence
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 3f40af8f3a..4f4996f3be 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -180,6 +180,8 @@ class FPGrowth private (
    */
   @Since("1.3.0")
   def setMinSupport(minSupport: Double): this.type = {
+    require(minSupport >= 0.0 && minSupport <= 1.0,
+      s"Minimal support level must be in range [0, 1] but got ${minSupport}")
     this.minSupport = minSupport
     this
   }
@@ -190,6 +192,8 @@ class FPGrowth private (
    */
   @Since("1.3.0")
   def setNumPartitions(numPartitions: Int): this.type = {
+    require(numPartitions > 0,
+      s"Number of partitions must be positive but got ${numPartitions}")
     this.numPartitions = numPartitions
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index fbf657b0fa..a67ea836e5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -46,6 +46,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
    * In subsequent steps, the step size will decrease with stepSize/sqrt(t)
    */
   def setStepSize(step: Double): this.type = {
+    require(step > 0,
+      s"Initial step size must be positive but got ${step}")
     this.stepSize = step
     this
   }
@@ -57,6 +59,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
    */
   @Experimental
   def setMiniBatchFraction(fraction: Double): this.type = {
+    require(fraction > 0 && fraction <= 1.0,
+      s"Fraction for mini-batch SGD must be in range (0, 1] but got ${fraction}")
     this.miniBatchFraction = fraction
     this
   }
@@ -65,6 +69,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
    * Set the number of iterations for SGD. Default 100.
    */
   def setNumIterations(iters: Int): this.type = {
+    require(iters >= 0,
+      s"Number of iterations must be nonnegative but got ${iters}")
     this.numIterations = iters
     this
   }
@@ -73,6 +79,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
    * Set the regularization parameter. Default 0.0.
    */
   def setRegParam(regParam: Double): this.type = {
+    require(regParam >= 0,
+      s"Regularization parameter must be nonnegative but got ${regParam}")
     this.regParam = regParam
     this
   }
@@ -91,7 +99,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
    * Must be between 0.0 and 1.0 inclusively.
    */
   def setConvergenceTol(tolerance: Double): this.type = {
-    require(0.0 <= tolerance && tolerance <= 1.0)
+    require(tolerance >= 0.0 && tolerance <= 1.0,
+      s"Convergence tolerance must be in range [0, 1] but got ${tolerance}")
     this.convergenceTol = tolerance
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
index 82c2ce4790..16a3352641 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
@@ -52,7 +52,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
    * Restriction: numCorrections > 0
    */
   def setNumCorrections(corrections: Int): this.type = {
-    assert(corrections > 0)
+    require(corrections > 0,
+      s"Number of corrections must be positive but got ${corrections}")
     this.numCorrections = corrections
     this
   }
@@ -64,6 +65,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
    * and therefore generally cause more iterations to be run.
    */
   def setConvergenceTol(tolerance: Double): this.type = {
+    require(tolerance >= 0,
+      s"Convergence tolerance must be nonnegative but got ${tolerance}")
     this.convergenceTol = tolerance
     this
   }
@@ -88,6 +91,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
    * Set the maximal number of iterations for L-BFGS. Default 100.
    */
   def setNumIterations(iters: Int): this.type = {
+    require(iters >= 0,
+      s"Maximum of iterations must be nonnegative but got ${iters}")
     this.maxNumIterations = iters
     this
   }
@@ -103,6 +108,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
    * Set the regularization parameter. Default 0.0.
    */
   def setRegParam(regParam: Double): this.type = {
+    require(regParam >= 0,
+      s"Regularization parameter must be nonnegative but got ${regParam}")
     this.regParam = regParam
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index c5b02d6b2e..467cb83cd1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -97,6 +97,8 @@ class ALS private (
    */
   @Since("0.8.0")
   def setBlocks(numBlocks: Int): this.type = {
+    require(numBlocks == -1 || numBlocks > 0,
+      s"Number of blocks must be -1 or positive but got ${numBlocks}")
     this.numUserBlocks = numBlocks
     this.numProductBlocks = numBlocks
     this
@@ -107,6 +109,8 @@ class ALS private (
    */
   @Since("1.1.0")
   def setUserBlocks(numUserBlocks: Int): this.type = {
+    require(numUserBlocks == -1 || numUserBlocks > 0,
+      s"Number of blocks must be -1 or positive but got ${numUserBlocks}")
     this.numUserBlocks = numUserBlocks
     this
   }
@@ -116,6 +120,8 @@ class ALS private (
    */
   @Since("1.1.0")
   def setProductBlocks(numProductBlocks: Int): this.type = {
+    require(numProductBlocks == -1 || numProductBlocks > 0,
+      s"Number of product blocks must be -1 or positive but got ${numProductBlocks}")
     this.numProductBlocks = numProductBlocks
     this
   }
@@ -123,6 +129,8 @@ class ALS private (
   /** Set the rank of the feature matrices computed (number of features). Default: 10. */
   @Since("0.8.0")
   def setRank(rank: Int): this.type = {
+    require(rank > 0,
+      s"Rank of the feature matrices must be positive but got ${rank}")
     this.rank = rank
     this
   }
@@ -130,6 +138,8 @@ class ALS private (
   /** Set the number of iterations to run. Default: 10. */
   @Since("0.8.0")
   def setIterations(iterations: Int): this.type = {
+    require(iterations >= 0,
+      s"Number of iterations must be nonnegative but got ${iterations}")
     this.iterations = iterations
     this
   }
@@ -137,6 +147,8 @@ class ALS private (
   /** Set the regularization parameter, lambda. Default: 0.01. */
   @Since("0.8.0")
   def setLambda(lambda: Double): this.type = {
+    require(lambda >= 0.0,
+      s"Regularization parameter must be nonnegative but got ${lambda}")
     this.lambda = lambda
     this
   }
-- 
cgit v1.2.3