aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst/src
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2015-03-30 20:47:10 -0700
committerReynold Xin <rxin@databricks.com>2015-03-30 20:47:10 -0700
commitb8ff2bc61c9835867f56afa1860ab5eb727c4a58 (patch)
treee29f737f32f9c21e22ff6fd7778549ec907c6015 /sql/catalyst/src
parentfde6945417355ae57500b67d034c9cad4f20d240 (diff)
downloadspark-b8ff2bc61c9835867f56afa1860ab5eb727c4a58.tar.gz
spark-b8ff2bc61c9835867f56afa1860ab5eb727c4a58.tar.bz2
spark-b8ff2bc61c9835867f56afa1860ab5eb727c4a58.zip
[SPARK-6119][SQL] DataFrame support for missing data handling
This pull request adds variants of DataFrame.na.drop and DataFrame.na.fill to the Scala/Java API, and DataFrame.fillna and DataFrame.dropna to the Python API. Author: Reynold Xin <rxin@databricks.com> Closes #5274 from rxin/df-missing-value and squashes the following commits: 4ee1b98 [Reynold Xin] Improve error reporting in Python. 33a330c [Reynold Xin] Remove replace for now. bc4fdbb [Reynold Xin] Added documentation for replace. d56f5a5 [Reynold Xin] Added replace for Scala/Java. 2385d00 [Reynold Xin] Feedback from Xiangrui on "how". 914a374 [Reynold Xin] fill with map. 185c67e [Reynold Xin] Allow specifying column subsets in fill. 749eb47 [Reynold Xin] fillna 249b94e [Reynold Xin] Removing undefined functions. 6a73c68 [Reynold Xin] Missing file. 67d7003 [Reynold Xin] [SPARK-6119][SQL] DataFrame.na.drop (Scala/Java) and DataFrame.dropna (Python)
Diffstat (limited to 'sql/catalyst/src')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala25
1 files changed, 24 insertions, 1 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index d1f3d4f4ee..f9161cf34f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -35,7 +35,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
override def toString: String = s"Coalesce(${children.mkString(",")})"
- def dataType: DataType = if (resolved) {
+ override def dataType: DataType = if (resolved) {
children.head.dataType
} else {
val childTypes = children.map(c => s"$c: ${c.dataType}").mkString(", ")
@@ -74,3 +74,26 @@ case class IsNotNull(child: Expression) extends Predicate with trees.UnaryNode[E
child.eval(input) != null
}
}
+
+/**
+ * A predicate that is evaluated to be true if there are at least `n` non-null values.
+ */
+case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate {
+ override def nullable: Boolean = false
+ override def foldable: Boolean = false
+ override def toString: String = s"AtLeastNNulls(n, ${children.mkString(",")})"
+
+ private[this] val childrenArray = children.toArray
+
+ override def eval(input: Row): Boolean = {
+ var numNonNulls = 0
+ var i = 0
+ while (i < childrenArray.length && numNonNulls < n) {
+ if (childrenArray(i).eval(input) != null) {
+ numNonNulls += 1
+ }
+ i += 1
+ }
+ numNonNulls >= n
+ }
+}