aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- R/pkg/R/stats.R 3
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala 4
-rw-r--r-- python/pyspark/sql/dataframe.py 3
3 files changed, 6 insertions, 4 deletions
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 8d1d165052..d78a10893f 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -149,7 +149,8 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
#' optimizations). The algorithm was first presented in [[http://dx.doi.org/10.1145/375663.375670
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
-#' Note that rows containing any NA values will be removed before calculation.
+#' Note that NA values will be ignored in numerical columns before calculation. For
+#' columns only containing NA values, an empty list is returned.
#'
#' @param x A SparkDataFrame.
#' @param cols A single column name, or a list of names for multiple columns.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index 80c7f55e26..feceeba866 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -93,8 +93,8 @@ private[feature] trait QuantileDiscretizerBase extends Params
* are too few distinct values of the input to create enough distinct quantiles.
*
* NaN handling:
- * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
- * produce a `Bucketizer` model for making predictions. During the transformation,
+ * null and NaN values will be ignored in the column during `QuantileDiscretizer` fitting. This
+ * will produce a `Bucketizer` model for making predictions. During the transformation,
* `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
* also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
* If the user chooses to keep NaN values, they will be handled specially and placed into their own
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index bb6df22682..a24512f53c 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1384,7 +1384,8 @@ class DataFrame(object):
Space-efficient Online Computation of Quantile Summaries]]
by Greenwald and Khanna.
- Note that rows containing any null values will be removed before calculation.
+ Note that null values will be ignored in numerical columns before calculation.
+ For columns only containing null values, an empty list is returned.
:param col: str, list.
Can be a single column name, or a list of names for multiple columns.