aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorRuifeng Zheng <ruifengz@foxmail.com>2016-03-24 09:25:00 +0000
committerSean Owen <sowen@cloudera.com>2016-03-24 09:25:00 +0000
commit048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2 (patch)
tree5a67f3ba083422d13d49be835bab9879431c6023 /mllib
parent1803bf63338ff20cf983e60724d169f62a1663c2 (diff)
downloadspark-048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2.tar.gz
spark-048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2.tar.bz2
spark-048a7594e2bfd2a3e531ecfa8ebbcc2032c1dac2.zip
[SPARK-14030][MLLIB] Add parameter check to MLLIB
## What changes were proposed in this pull request? Add parameter verification to MLlib, e.g.: numCorrections > 0, tolerance >= 0, iters > 0, regParam >= 0. ## How was this patch tested? Manual tests. Author: Ruifeng Zheng <ruifengz@foxmail.com> Author: Zheng RuiFeng <mllabs@datanode1.(none)> Author: mllabs <mllabs@datanode1.(none)> Author: Zheng RuiFeng <ruifengz@foxmail.com> Closes #11852 from zhengruifeng/lbfgs_check.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala2
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala14
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala11
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala2
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala4
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala6
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala3
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala15
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala3
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala4
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala11
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala9
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala12
13 files changed, 83 insertions, 13 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index bf0d9d9231..eb3ee41f7c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -326,6 +326,8 @@ class NaiveBayes private (
/** Set the smoothing parameter. Default: 1.0. */
@Since("0.9.0")
def setLambda(lambda: Double): NaiveBayes = {
+ require(lambda >= 0,
+ s"Smoothing parameter must be nonnegative but got ${lambda}")
this.lambda = lambda
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 88dbfe3fcc..03eb903bb8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -78,11 +78,9 @@ class GaussianMixture private (
*/
@Since("1.3.0")
def setInitialModel(model: GaussianMixtureModel): this.type = {
- if (model.k == k) {
- initialModel = Some(model)
- } else {
- throw new IllegalArgumentException("mismatched cluster count (model.k != k)")
- }
+ require(model.k == k,
+ s"Mismatched cluster count (model.k ${model.k} != k ${k})")
+ initialModel = Some(model)
this
}
@@ -97,6 +95,8 @@ class GaussianMixture private (
*/
@Since("1.3.0")
def setK(k: Int): this.type = {
+ require(k > 0,
+ s"Number of Gaussians must be positive but got ${k}")
this.k = k
this
}
@@ -112,6 +112,8 @@ class GaussianMixture private (
*/
@Since("1.3.0")
def setMaxIterations(maxIterations: Int): this.type = {
+ require(maxIterations >= 0,
+ s"Maximum of iterations must be nonnegative but got ${maxIterations}")
this.maxIterations = maxIterations
this
}
@@ -128,6 +130,8 @@ class GaussianMixture private (
*/
@Since("1.3.0")
def setConvergenceTol(convergenceTol: Double): this.type = {
+ require(convergenceTol >= 0.0,
+ s"Convergence tolerance must be nonnegative but got ${convergenceTol}")
this.convergenceTol = convergenceTol
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 26f5600e6c..a7beb81980 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -65,6 +65,8 @@ class KMeans private (
*/
@Since("0.8.0")
def setK(k: Int): this.type = {
+ require(k > 0,
+ s"Number of clusters must be positive but got ${k}")
this.k = k
this
}
@@ -80,6 +82,8 @@ class KMeans private (
*/
@Since("0.8.0")
def setMaxIterations(maxIterations: Int): this.type = {
+ require(maxIterations >= 0,
+ s"Maximum of iterations must be nonnegative but got ${maxIterations}")
this.maxIterations = maxIterations
this
}
@@ -147,9 +151,8 @@ class KMeans private (
*/
@Since("0.8.0")
def setInitializationSteps(initializationSteps: Int): this.type = {
- if (initializationSteps <= 0) {
- throw new IllegalArgumentException("Number of initialization steps must be positive")
- }
+ require(initializationSteps > 0,
+ s"Number of initialization steps must be positive but got ${initializationSteps}")
this.initializationSteps = initializationSteps
this
}
@@ -166,6 +169,8 @@ class KMeans private (
*/
@Since("0.8.0")
def setEpsilon(epsilon: Double): this.type = {
+ require(epsilon >= 0,
+ s"Distance threshold must be nonnegative but got ${epsilon}")
this.epsilon = epsilon
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index fad808857a..12813fd412 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -232,6 +232,8 @@ class LDA private (
*/
@Since("1.3.0")
def setMaxIterations(maxIterations: Int): this.type = {
+ require(maxIterations >= 0,
+ s"Maximum of iterations must be nonnegative but got ${maxIterations}")
this.maxIterations = maxIterations
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index a422303dc9..2e257ff9b7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -137,6 +137,8 @@ class PowerIterationClustering private[clustering] (
*/
@Since("1.3.0")
def setK(k: Int): this.type = {
+ require(k > 0,
+ s"Number of clusters must be positive but got ${k}")
this.k = k
this
}
@@ -146,6 +148,8 @@ class PowerIterationClustering private[clustering] (
*/
@Since("1.3.0")
def setMaxIterations(maxIterations: Int): this.type = {
+ require(maxIterations >= 0,
+ s"Maximum of iterations must be nonnegative but got ${maxIterations}")
this.maxIterations = maxIterations
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index a8d7b8fded..4eb8fc049e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -178,6 +178,8 @@ class StreamingKMeans @Since("1.2.0") (
*/
@Since("1.2.0")
def setK(k: Int): this.type = {
+ require(k > 0,
+ s"Number of clusters must be positive but got ${k}")
this.k = k
this
}
@@ -187,6 +189,8 @@ class StreamingKMeans @Since("1.2.0") (
*/
@Since("1.2.0")
def setDecayFactor(a: Double): this.type = {
+ require(a >= 0,
+ s"Decay factor must be nonnegative but got ${a}")
this.decayFactor = a
this
}
@@ -198,6 +202,8 @@ class StreamingKMeans @Since("1.2.0") (
*/
@Since("1.2.0")
def setHalfLife(halfLife: Double, timeUnit: String): this.type = {
+ require(halfLife > 0,
+ s"Half life must be positive but got ${halfLife}")
if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) {
throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit)
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 24e0a98c39..30c403e547 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -30,7 +30,8 @@ import org.apache.spark.rdd.RDD
*/
@Since("1.4.0")
class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
- require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k")
+ require(k > 0,
+ s"Number of principal components must be positive but got ${k}")
/**
* Computes a [[PCAModel]] that contains the principal components of the input vectors.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index d3356b783f..5b079fce3a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -84,6 +84,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("2.0.0")
def setMaxSentenceLength(maxSentenceLength: Int): this.type = {
+ require(maxSentenceLength > 0,
+ s"Maximum length of sentences must be positive but got ${maxSentenceLength}")
this.maxSentenceLength = maxSentenceLength
this
}
@@ -93,6 +95,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("1.1.0")
def setVectorSize(vectorSize: Int): this.type = {
+ require(vectorSize > 0,
+ s"vector size must be positive but got ${vectorSize}")
this.vectorSize = vectorSize
this
}
@@ -102,6 +106,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("1.1.0")
def setLearningRate(learningRate: Double): this.type = {
+ require(learningRate > 0,
+ s"Initial learning rate must be positive but got ${learningRate}")
this.learningRate = learningRate
this
}
@@ -111,7 +117,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("1.1.0")
def setNumPartitions(numPartitions: Int): this.type = {
- require(numPartitions > 0, s"numPartitions must be greater than 0 but got $numPartitions")
+ require(numPartitions > 0,
+ s"Number of partitions must be positive but got ${numPartitions}")
this.numPartitions = numPartitions
this
}
@@ -122,6 +129,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("1.1.0")
def setNumIterations(numIterations: Int): this.type = {
+ require(numIterations >= 0,
+ s"Number of iterations must be nonnegative but got ${numIterations}")
this.numIterations = numIterations
this
}
@@ -140,6 +149,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("1.6.0")
def setWindowSize(window: Int): this.type = {
+ require(window > 0,
+ s"Window of words must be positive but got ${window}")
this.window = window
this
}
@@ -150,6 +161,8 @@ class Word2Vec extends Serializable with Logging {
*/
@Since("1.3.0")
def setMinCount(minCount: Int): this.type = {
+ require(minCount >= 0,
+ s"Minimum number of times must be nonnegative but got ${minCount}")
this.minCount = minCount
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index 5592416964..9a63cc29da 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -50,7 +50,8 @@ class AssociationRules private[fpm] (
*/
@Since("1.5.0")
def setMinConfidence(minConfidence: Double): this.type = {
- require(minConfidence >= 0.0 && minConfidence <= 1.0)
+ require(minConfidence >= 0.0 && minConfidence <= 1.0,
+ s"Minimal confidence must be in range [0, 1] but got ${minConfidence}")
this.minConfidence = minConfidence
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 3f40af8f3a..4f4996f3be 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -180,6 +180,8 @@ class FPGrowth private (
*/
@Since("1.3.0")
def setMinSupport(minSupport: Double): this.type = {
+ require(minSupport >= 0.0 && minSupport <= 1.0,
+ s"Minimal support level must be in range [0, 1] but got ${minSupport}")
this.minSupport = minSupport
this
}
@@ -190,6 +192,8 @@ class FPGrowth private (
*/
@Since("1.3.0")
def setNumPartitions(numPartitions: Int): this.type = {
+ require(numPartitions > 0,
+ s"Number of partitions must be positive but got ${numPartitions}")
this.numPartitions = numPartitions
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index fbf657b0fa..a67ea836e5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -46,6 +46,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
* In subsequent steps, the step size will decrease with stepSize/sqrt(t)
*/
def setStepSize(step: Double): this.type = {
+ require(step > 0,
+ s"Initial step size must be positive but got ${step}")
this.stepSize = step
this
}
@@ -57,6 +59,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
*/
@Experimental
def setMiniBatchFraction(fraction: Double): this.type = {
+ require(fraction > 0 && fraction <= 1.0,
+ s"Fraction for mini-batch SGD must be in range (0, 1] but got ${fraction}")
this.miniBatchFraction = fraction
this
}
@@ -65,6 +69,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
* Set the number of iterations for SGD. Default 100.
*/
def setNumIterations(iters: Int): this.type = {
+ require(iters >= 0,
+ s"Number of iterations must be nonnegative but got ${iters}")
this.numIterations = iters
this
}
@@ -73,6 +79,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
* Set the regularization parameter. Default 0.0.
*/
def setRegParam(regParam: Double): this.type = {
+ require(regParam >= 0,
+ s"Regularization parameter must be nonnegative but got ${regParam}")
this.regParam = regParam
this
}
@@ -91,7 +99,8 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
* Must be between 0.0 and 1.0 inclusively.
*/
def setConvergenceTol(tolerance: Double): this.type = {
- require(0.0 <= tolerance && tolerance <= 1.0)
+ require(tolerance >= 0.0 && tolerance <= 1.0,
+ s"Convergence tolerance must be in range [0, 1] but got ${tolerance}")
this.convergenceTol = tolerance
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
index 82c2ce4790..16a3352641 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
@@ -52,7 +52,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
* Restriction: numCorrections > 0
*/
def setNumCorrections(corrections: Int): this.type = {
- assert(corrections > 0)
+ require(corrections > 0,
+ s"Number of corrections must be positive but got ${corrections}")
this.numCorrections = corrections
this
}
@@ -64,6 +65,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
* and therefore generally cause more iterations to be run.
*/
def setConvergenceTol(tolerance: Double): this.type = {
+ require(tolerance >= 0,
+ s"Convergence tolerance must be nonnegative but got ${tolerance}")
this.convergenceTol = tolerance
this
}
@@ -88,6 +91,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
* Set the maximal number of iterations for L-BFGS. Default 100.
*/
def setNumIterations(iters: Int): this.type = {
+ require(iters >= 0,
+ s"Maximum of iterations must be nonnegative but got ${iters}")
this.maxNumIterations = iters
this
}
@@ -103,6 +108,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
* Set the regularization parameter. Default 0.0.
*/
def setRegParam(regParam: Double): this.type = {
+ require(regParam >= 0,
+ s"Regularization parameter must be nonnegative but got ${regParam}")
this.regParam = regParam
this
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index c5b02d6b2e..467cb83cd1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -97,6 +97,8 @@ class ALS private (
*/
@Since("0.8.0")
def setBlocks(numBlocks: Int): this.type = {
+ require(numBlocks == -1 || numBlocks > 0,
+ s"Number of blocks must be -1 or positive but got ${numBlocks}")
this.numUserBlocks = numBlocks
this.numProductBlocks = numBlocks
this
@@ -107,6 +109,8 @@ class ALS private (
*/
@Since("1.1.0")
def setUserBlocks(numUserBlocks: Int): this.type = {
+ require(numUserBlocks == -1 || numUserBlocks > 0,
+ s"Number of blocks must be -1 or positive but got ${numUserBlocks}")
this.numUserBlocks = numUserBlocks
this
}
@@ -116,6 +120,8 @@ class ALS private (
*/
@Since("1.1.0")
def setProductBlocks(numProductBlocks: Int): this.type = {
+ require(numProductBlocks == -1 || numProductBlocks > 0,
+ s"Number of product blocks must be -1 or positive but got ${numProductBlocks}")
this.numProductBlocks = numProductBlocks
this
}
@@ -123,6 +129,8 @@ class ALS private (
/** Set the rank of the feature matrices computed (number of features). Default: 10. */
@Since("0.8.0")
def setRank(rank: Int): this.type = {
+ require(rank > 0,
+ s"Rank of the feature matrices must be positive but got ${rank}")
this.rank = rank
this
}
@@ -130,6 +138,8 @@ class ALS private (
/** Set the number of iterations to run. Default: 10. */
@Since("0.8.0")
def setIterations(iterations: Int): this.type = {
+ require(iterations >= 0,
+ s"Number of iterations must be nonnegative but got ${iterations}")
this.iterations = iterations
this
}
@@ -137,6 +147,8 @@ class ALS private (
/** Set the regularization parameter, lambda. Default: 0.01. */
@Since("0.8.0")
def setLambda(lambda: Double): this.type = {
+ require(lambda >= 0.0,
+ s"Regularization parameter must be nonnegative but got ${lambda}")
this.lambda = lambda
this
}