aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2015-08-06 10:39:16 -0700
committerReynold Xin <rxin@databricks.com>2015-08-06 10:39:16 -0700
commit5e1b0ef07942a041195b3decd05d86c289bc8d2b (patch)
tree3ee8fa52d14bce8b62e152da4aa560eae780338b
parent98e69467d4fda2c26a951409b5b7c6f1e9345ce4 (diff)
downloadspark-5e1b0ef07942a041195b3decd05d86c289bc8d2b.tar.gz
spark-5e1b0ef07942a041195b3decd05d86c289bc8d2b.tar.bz2
spark-5e1b0ef07942a041195b3decd05d86c289bc8d2b.zip
[SPARK-9659][SQL] Rename inSet to isin to match Pandas function.
Inspiration drawn from this blog post: https://lab.getbase.com/pandarize-spark-dataframes/ Author: Reynold Xin <rxin@databricks.com> Closes #7977 from rxin/isin and squashes the following commits: 9b1d3d6 [Reynold Xin] Added return. 2197d37 [Reynold Xin] Fixed test case. 7c1b6cf [Reynold Xin] Import warnings. 4f4a35d [Reynold Xin] [SPARK-9659][SQL] Rename inSet to isin to match Pandas function.
-rw-r--r--python/pyspark/sql/column.py20
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/Column.scala13
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala14
3 files changed, 38 insertions, 9 deletions
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 0a85da7443..8af8637cf9 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -16,6 +16,7 @@
#
import sys
+import warnings
if sys.version >= '3':
basestring = str
@@ -254,12 +255,29 @@ class Column(object):
[Row(age=5, name=u'Bob')]
>>> df[df.age.inSet([1, 2, 3])].collect()
[Row(age=2, name=u'Alice')]
+
+ .. note:: Deprecated in 1.5, use :func:`Column.isin` instead.
+ """
+ warnings.warn("inSet is deprecated. Use isin() instead.")
+ return self.isin(*cols)
+
+ @ignore_unicode_prefix
+ @since(1.5)
+ def isin(self, *cols):
+ """
+ A boolean expression that is evaluated to true if the value of this
+ expression is contained by the evaluated values of the arguments.
+
+ >>> df[df.name.isin("Bob", "Mike")].collect()
+ [Row(age=5, name=u'Bob')]
+ >>> df[df.age.isin([1, 2, 3])].collect()
+ [Row(age=2, name=u'Alice')]
"""
if len(cols) == 1 and isinstance(cols[0], (list, set)):
cols = cols[0]
cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols]
sc = SparkContext._active_spark_context
- jc = getattr(self._jc, "in")(_to_seq(sc, cols))
+ jc = getattr(self._jc, "isin")(_to_seq(sc, cols))
return Column(jc)
# order
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index b25dcbca82..75365fbcec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -627,8 +627,19 @@ class Column(protected[sql] val expr: Expression) extends Logging {
* @group expr_ops
* @since 1.3.0
*/
+ @deprecated("use isin", "1.5.0")
@scala.annotation.varargs
- def in(list: Any*): Column = In(expr, list.map(lit(_).expr))
+ def in(list: Any*): Column = isin(list : _*)
+
+ /**
+ * A boolean expression that is evaluated to true if the value of this expression is contained
+ * by the evaluated values of the arguments.
+ *
+ * @group expr_ops
+ * @since 1.5.0
+ */
+ @scala.annotation.varargs
+ def isin(list: Any*): Column = In(expr, list.map(lit(_).expr))
/**
* SQL like expression.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index b351380373..e1b3443d74 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -345,23 +345,23 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils {
test("in") {
val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b")
- checkAnswer(df.filter($"a".in(1, 2)),
+ checkAnswer(df.filter($"a".isin(1, 2)),
df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2))
- checkAnswer(df.filter($"a".in(3, 2)),
+ checkAnswer(df.filter($"a".isin(3, 2)),
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2))
- checkAnswer(df.filter($"a".in(3, 1)),
+ checkAnswer(df.filter($"a".isin(3, 1)),
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1))
- checkAnswer(df.filter($"b".in("y", "x")),
+ checkAnswer(df.filter($"b".isin("y", "x")),
df.collect().toSeq.filter(r => r.getString(1) == "y" || r.getString(1) == "x"))
- checkAnswer(df.filter($"b".in("z", "x")),
+ checkAnswer(df.filter($"b".isin("z", "x")),
df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "x"))
- checkAnswer(df.filter($"b".in("z", "y")),
+ checkAnswer(df.filter($"b".isin("z", "y")),
df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "y"))
val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b")
intercept[AnalysisException] {
- df2.filter($"a".in($"b"))
+ df2.filter($"a".isin($"b"))
}
}