From 048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 24 Mar 2016 09:25:00 +0000 Subject: [SPARK-14030][MLLIB] Add parameter check to MLLIB ## What changes were proposed in this pull request? Add parameter verification to MLLIB, such as numCorrections > 0, tolerance >= 0, iters >= 0, regParam >= 0. ## How was this patch tested? manual tests Author: Ruifeng Zheng Author: Zheng RuiFeng Author: mllabs Author: Zheng RuiFeng Closes #11852 from zhengruifeng/lbfgs_check. --- .../apache/spark/mllib/classification/NaiveBayes.scala | 2 ++ .../apache/spark/mllib/clustering/GaussianMixture.scala | 14 +++++++++----- .../scala/org/apache/spark/mllib/clustering/KMeans.scala | 11 ++++++++--- .../scala/org/apache/spark/mllib/clustering/LDA.scala | 2 ++ .../spark/mllib/clustering/PowerIterationClustering.scala | 4 ++++ .../apache/spark/mllib/clustering/StreamingKMeans.scala | 6 ++++++ .../main/scala/org/apache/spark/mllib/feature/PCA.scala | 3 ++- .../scala/org/apache/spark/mllib/feature/Word2Vec.scala | 15 ++++++++++++++- .../org/apache/spark/mllib/fpm/AssociationRules.scala | 3 ++- .../main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 4 ++++ .../apache/spark/mllib/optimization/GradientDescent.scala | 11 ++++++++++- .../scala/org/apache/spark/mllib/optimization/LBFGS.scala | 9 ++++++++- .../scala/org/apache/spark/mllib/recommendation/ALS.scala | 12 ++++++++++++ 13 files changed, 83 insertions(+), 13 deletions(-) (limited to 'mllib') diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index bf0d9d9231..eb3ee41f7c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -326,6 +326,8 @@ class NaiveBayes private ( /** Set the smoothing parameter. Default: 1.0. 
*/ @Since("0.9.0") def setLambda(lambda: Double): NaiveBayes = { + require(lambda >= 0, + s"Smoothing parameter must be nonnegative but got ${lambda}") this.lambda = lambda this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index 88dbfe3fcc..03eb903bb8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -78,11 +78,9 @@ class GaussianMixture private ( */ @Since("1.3.0") def setInitialModel(model: GaussianMixtureModel): this.type = { - if (model.k == k) { - initialModel = Some(model) - } else { - throw new IllegalArgumentException("mismatched cluster count (model.k != k)") - } + require(model.k == k, + s"Mismatched cluster count (model.k ${model.k} != k ${k})") + initialModel = Some(model) this } @@ -97,6 +95,8 @@ class GaussianMixture private ( */ @Since("1.3.0") def setK(k: Int): this.type = { + require(k > 0, + s"Number of Gaussians must be positive but got ${k}") this.k = k this } @@ -112,6 +112,8 @@ class GaussianMixture private ( */ @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { + require(maxIterations >= 0, + s"Maximum of iterations must be nonnegative but got ${maxIterations}") this.maxIterations = maxIterations this } @@ -128,6 +130,8 @@ class GaussianMixture private ( */ @Since("1.3.0") def setConvergenceTol(convergenceTol: Double): this.type = { + require(convergenceTol >= 0.0, + s"Convergence tolerance must be nonnegative but got ${convergenceTol}") this.convergenceTol = convergenceTol this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 26f5600e6c..a7beb81980 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -65,6 +65,8 @@ class KMeans private ( */ @Since("0.8.0") def setK(k: Int): this.type = { + require(k > 0, + s"Number of clusters must be positive but got ${k}") this.k = k this } @@ -80,6 +82,8 @@ class KMeans private ( */ @Since("0.8.0") def setMaxIterations(maxIterations: Int): this.type = { + require(maxIterations >= 0, + s"Maximum of iterations must be nonnegative but got ${maxIterations}") this.maxIterations = maxIterations this } @@ -147,9 +151,8 @@ class KMeans private ( */ @Since("0.8.0") def setInitializationSteps(initializationSteps: Int): this.type = { - if (initializationSteps <= 0) { - throw new IllegalArgumentException("Number of initialization steps must be positive") - } + require(initializationSteps > 0, + s"Number of initialization steps must be positive but got ${initializationSteps}") this.initializationSteps = initializationSteps this } @@ -166,6 +169,8 @@ class KMeans private ( */ @Since("0.8.0") def setEpsilon(epsilon: Double): this.type = { + require(epsilon >= 0, + s"Distance threshold must be nonnegative but got ${epsilon}") this.epsilon = epsilon this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index fad808857a..12813fd412 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -232,6 +232,8 @@ class LDA private ( */ @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { + require(maxIterations >= 0, + s"Maximum of iterations must be nonnegative but got ${maxIterations}") this.maxIterations = maxIterations this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index a422303dc9..2e257ff9b7 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -137,6 +137,8 @@ class PowerIterationClustering private[clustering] ( */ @Since("1.3.0") def setK(k: Int): this.type = { + require(k > 0, + s"Number of clusters must be positive but got ${k}") this.k = k this } @@ -146,6 +148,8 @@ class PowerIterationClustering private[clustering] ( */ @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { + require(maxIterations >= 0, + s"Maximum of iterations must be nonnegative but got ${maxIterations}") this.maxIterations = maxIterations this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index a8d7b8fded..4eb8fc049e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -178,6 +178,8 @@ class StreamingKMeans @Since("1.2.0") ( */ @Since("1.2.0") def setK(k: Int): this.type = { + require(k > 0, + s"Number of clusters must be positive but got ${k}") this.k = k this } @@ -187,6 +189,8 @@ class StreamingKMeans @Since("1.2.0") ( */ @Since("1.2.0") def setDecayFactor(a: Double): this.type = { + require(a >= 0, + s"Decay factor must be nonnegative but got ${a}") this.decayFactor = a this } @@ -198,6 +202,8 @@ class StreamingKMeans @Since("1.2.0") ( */ @Since("1.2.0") def setHalfLife(halfLife: Double, timeUnit: String): this.type = { + require(halfLife > 0, + s"Half life must be positive but got ${halfLife}") if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) { throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index 24e0a98c39..30c403e547 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -30,7 +30,8 @@ import org.apache.spark.rdd.RDD */ @Since("1.4.0") class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) { - require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k") + require(k > 0, + s"Number of principal components must be positive but got ${k}") /** * Computes a [[PCAModel]] that contains the principal components of the input vectors. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index d3356b783f..5b079fce3a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -84,6 +84,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("2.0.0") def setMaxSentenceLength(maxSentenceLength: Int): this.type = { + require(maxSentenceLength > 0, + s"Maximum length of sentences must be positive but got ${maxSentenceLength}") this.maxSentenceLength = maxSentenceLength this } @@ -93,6 +95,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("1.1.0") def setVectorSize(vectorSize: Int): this.type = { + require(vectorSize > 0, + s"vector size must be positive but got ${vectorSize}") this.vectorSize = vectorSize this } @@ -102,6 +106,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("1.1.0") def setLearningRate(learningRate: Double): this.type = { + require(learningRate > 0, + s"Initial learning rate must be positive but got ${learningRate}") this.learningRate = learningRate this } @@ -111,7 +117,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("1.1.0") def setNumPartitions(numPartitions: Int): this.type = { - require(numPartitions > 0, 
s"numPartitions must be greater than 0 but got $numPartitions") + require(numPartitions > 0, + s"Number of partitions must be positive but got ${numPartitions}") this.numPartitions = numPartitions this } @@ -122,6 +129,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("1.1.0") def setNumIterations(numIterations: Int): this.type = { + require(numIterations >= 0, + s"Number of iterations must be nonnegative but got ${numIterations}") this.numIterations = numIterations this } @@ -140,6 +149,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("1.6.0") def setWindowSize(window: Int): this.type = { + require(window > 0, + s"Window of words must be positive but got ${window}") this.window = window this } @@ -150,6 +161,8 @@ class Word2Vec extends Serializable with Logging { */ @Since("1.3.0") def setMinCount(minCount: Int): this.type = { + require(minCount >= 0, + s"Minimum number of times must be nonnegative but got ${minCount}") this.minCount = minCount this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 5592416964..9a63cc29da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -50,7 +50,8 @@ class AssociationRules private[fpm] ( */ @Since("1.5.0") def setMinConfidence(minConfidence: Double): this.type = { - require(minConfidence >= 0.0 && minConfidence <= 1.0) + require(minConfidence >= 0.0 && minConfidence <= 1.0, + s"Minimal confidence must be in range [0, 1] but got ${minConfidence}") this.minConfidence = minConfidence this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index 3f40af8f3a..4f4996f3be 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -180,6 +180,8 @@ class FPGrowth private ( */ @Since("1.3.0") def setMinSupport(minSupport: Double): this.type = { + require(minSupport >= 0.0 && minSupport <= 1.0, + s"Minimal support level must be in range [0, 1] but got ${minSupport}") this.minSupport = minSupport this } @@ -190,6 +192,8 @@ class FPGrowth private ( */ @Since("1.3.0") def setNumPartitions(numPartitions: Int): this.type = { + require(numPartitions > 0, + s"Number of partitions must be positive but got ${numPartitions}") this.numPartitions = numPartitions this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index fbf657b0fa..a67ea836e5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -46,6 +46,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va * In subsequent steps, the step size will decrease with stepSize/sqrt(t) */ def setStepSize(step: Double): this.type = { + require(step > 0, + s"Initial step size must be positive but got ${step}") this.stepSize = step this } @@ -57,6 +59,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va */ @Experimental def setMiniBatchFraction(fraction: Double): this.type = { + require(fraction > 0 && fraction <= 1.0, + s"Fraction for mini-batch SGD must be in range (0, 1] but got ${fraction}") this.miniBatchFraction = fraction this } @@ -65,6 +69,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va * Set the number of iterations for SGD. Default 100. 
*/ def setNumIterations(iters: Int): this.type = { + require(iters >= 0, + s"Number of iterations must be nonnegative but got ${iters}") this.numIterations = iters this } @@ -73,6 +79,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va * Set the regularization parameter. Default 0.0. */ def setRegParam(regParam: Double): this.type = { + require(regParam >= 0, + s"Regularization parameter must be nonnegative but got ${regParam}") this.regParam = regParam this } @@ -91,7 +99,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va * Must be between 0.0 and 1.0 inclusively. */ def setConvergenceTol(tolerance: Double): this.type = { - require(0.0 <= tolerance && tolerance <= 1.0) + require(tolerance >= 0.0 && tolerance <= 1.0, + s"Convergence tolerance must be in range [0, 1] but got ${tolerance}") this.convergenceTol = tolerance this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala index 82c2ce4790..16a3352641 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala @@ -52,7 +52,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater) * Restriction: numCorrections > 0 */ def setNumCorrections(corrections: Int): this.type = { - assert(corrections > 0) + require(corrections > 0, + s"Number of corrections must be positive but got ${corrections}") this.numCorrections = corrections this } @@ -64,6 +65,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater) * and therefore generally cause more iterations to be run. 
*/ def setConvergenceTol(tolerance: Double): this.type = { + require(tolerance >= 0, + s"Convergence tolerance must be nonnegative but got ${tolerance}") this.convergenceTol = tolerance this } @@ -88,6 +91,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater) * Set the maximal number of iterations for L-BFGS. Default 100. */ def setNumIterations(iters: Int): this.type = { + require(iters >= 0, + s"Maximum of iterations must be nonnegative but got ${iters}") this.maxNumIterations = iters this } @@ -103,6 +108,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater) * Set the regularization parameter. Default 0.0. */ def setRegParam(regParam: Double): this.type = { + require(regParam >= 0, + s"Regularization parameter must be nonnegative but got ${regParam}") this.regParam = regParam this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index c5b02d6b2e..467cb83cd1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -97,6 +97,8 @@ class ALS private ( */ @Since("0.8.0") def setBlocks(numBlocks: Int): this.type = { + require(numBlocks == -1 || numBlocks > 0, + s"Number of blocks must be -1 or positive but got ${numBlocks}") this.numUserBlocks = numBlocks this.numProductBlocks = numBlocks this @@ -107,6 +109,8 @@ class ALS private ( */ @Since("1.1.0") def setUserBlocks(numUserBlocks: Int): this.type = { + require(numUserBlocks == -1 || numUserBlocks > 0, + s"Number of blocks must be -1 or positive but got ${numUserBlocks}") this.numUserBlocks = numUserBlocks this } @@ -116,6 +120,8 @@ class ALS private ( */ @Since("1.1.0") def setProductBlocks(numProductBlocks: Int): this.type = { + require(numProductBlocks == -1 || numProductBlocks > 0, + s"Number of product blocks must be -1 or positive but got 
${numProductBlocks}") this.numProductBlocks = numProductBlocks this } @@ -123,6 +129,8 @@ class ALS private ( /** Set the rank of the feature matrices computed (number of features). Default: 10. */ @Since("0.8.0") def setRank(rank: Int): this.type = { + require(rank > 0, + s"Rank of the feature matrices must be positive but got ${rank}") this.rank = rank this } @@ -130,6 +138,8 @@ class ALS private ( /** Set the number of iterations to run. Default: 10. */ @Since("0.8.0") def setIterations(iterations: Int): this.type = { + require(iterations >= 0, + s"Number of iterations must be nonnegative but got ${iterations}") this.iterations = iterations this } @@ -137,6 +147,8 @@ class ALS private ( /** Set the regularization parameter, lambda. Default: 0.01. */ @Since("0.8.0") def setLambda(lambda: Double): this.type = { + require(lambda >= 0.0, + s"Regularization parameter must be nonnegative but got ${lambda}") this.lambda = lambda this } -- cgit v1.2.3