diff options
author | Wenchen Fan <wenchen@databricks.com> | 2016-01-05 10:23:36 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-01-05 10:23:36 -0800 |
commit | 76768337beec6842660db7522ad15c25ee66d346 (patch) | |
tree | d58976552a906917b9126712e75a7a2136fbd01c | |
parent | 9a6ba7e2c538124f539b50512a7f95059f81cc16 (diff) | |
download | spark-76768337beec6842660db7522ad15c25ee66d346.tar.gz spark-76768337beec6842660db7522ad15c25ee66d346.tar.bz2 spark-76768337beec6842660db7522ad15c25ee66d346.zip |
[SPARK-12480][FOLLOW-UP] use a single column vararg for hash
address comments in #10435
This makes the API easier to use if users programmatically generate the call to hash, and they will get an analysis exception if the arguments of hash are empty.
Author: Wenchen Fan <wenchen@databricks.com>
Closes #10588 from cloud-fan/hash.
4 files changed, 16 insertions, 3 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 7c15e38458..b0390cb994 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1018,6 +1018,18 @@ def sha2(col, numBits): return Column(jc) +@since(2.0) +def hash(*cols): + """Calculates the hash code of given columns, and returns the result as a int column. + + >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect() + [Row(hash=1358996357)] + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.hash(_to_seq(sc, cols, _to_java_column)) + return Column(jc) + + # ---------------------- String/Binary functions ------------------------------ _string_functions = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 8834924687..6697d46361 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -200,7 +200,7 @@ case class Murmur3Hash(children: Seq[Expression], seed: Int) extends Expression override def checkInputDataTypes(): TypeCheckResult = { if (children.isEmpty) { - TypeCheckResult.TypeCheckFailure("arguments of function hash cannot be empty") + TypeCheckResult.TypeCheckFailure("function hash requires at least one argument") } else { TypeCheckResult.TypeCheckSuccess } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 915c585ec9..f3df716a57 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -163,6 +163,7 @@ 
class ExpressionTypeCheckingSuite extends SparkFunSuite { assertError(Coalesce(Seq('intField, 'booleanField)), "input to function coalesce should all be the same type") assertError(Coalesce(Nil), "input to function coalesce cannot be empty") + assertError(new Murmur3Hash(Nil), "function hash requires at least one argument") assertError(Explode('intField), "input to function explode should be array or map type") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index e223e32fd7..1c96f647b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1820,8 +1820,8 @@ object functions extends LegacyFunctions { * @since 2.0 */ @scala.annotation.varargs - def hash(col: Column, cols: Column*): Column = withExpr { - new Murmur3Hash((col +: cols).map(_.expr)) + def hash(cols: Column*): Column = withExpr { + new Murmur3Hash(cols.map(_.expr)) } ////////////////////////////////////////////////////////////////////////////////////////////// |