1 files changed, 7 insertions, 2 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index e09800877c..1e59d71a70 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -39,7 +39,7 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * default: 2
    * @group param
    */
-  val numBuckets = new IntParam(this, "numBuckets", "Maximum number of buckets (quantiles, or " +
+  val numBuckets = new IntParam(this, "numBuckets", "Number of buckets (quantiles, or " +
     "categories) into which data points are grouped. Must be >= 2.",
     ParamValidators.gtEq(2))
   setDefault(numBuckets -> 2)
@@ -65,7 +65,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
 
 /**
  * `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
- * categorical features. The number of bins can be set using the `numBuckets` parameter.
+ * categorical features. The number of bins can be set using the `numBuckets` parameter. It is
+ * possible that the number of buckets used will be less than this value, for example, if there
+ * are too few distinct values of the input to create enough distinct quantiles. Note also that
+ * NaN values are handled specially and placed into their own bucket. For example, if 4 buckets
+ * are used, then non-NaN data will be put into buckets 0-3, but NaNs will be counted in a special
+ * bucket 4.
  * The bin ranges are chosen using an approximate algorithm (see the documentation for
  * [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]]
  * for a detailed description). The precision of the approximation can be controlled with the