about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Dongjoon Hyun <dongjoon@apache.org> 2016-05-23 14:19:25 -0700
committer: Michael Armbrust <michael@databricks.com> 2016-05-23 14:19:25 -0700
commit: 37c617e4f580482b59e1abbe3c0c27c7125cf605 (patch)
tree: f6608e06c3732555e9ec3d2ca33464010cf7b7c5
parent: 2585d2b322f3b6b85a0a12ddf7dcde957453000d (diff)
download: spark-37c617e4f580482b59e1abbe3c0c27c7125cf605.tar.gz
download: spark-37c617e4f580482b59e1abbe3c0c27c7125cf605.tar.bz2
download: spark-37c617e4f580482b59e1abbe3c0c27c7125cf605.zip
[MINOR][SQL][DOCS] Add notes of the deterministic assumption on UDF functions
## What changes were proposed in this pull request?

Spark assumes that UDF functions are deterministic. This PR adds explicit notes about that.

## How was this patch tested?

It's only about docs.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #13087 from dongjoon-hyun/SPARK-15282.
-rw-r--r-- python/pyspark/sql/functions.py 3
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala 1
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala 3
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 3
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala 1
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala 3
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala 1
7 files changed, 15 insertions, 0 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index dac842c0ce..716b16fdc9 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1756,6 +1756,9 @@ class UserDefinedFunction(object):
@since(1.3)
def udf(f, returnType=StringType()):
"""Creates a :class:`Column` expression representing a user defined function (UDF).
+ Note that the user-defined functions must be deterministic. Due to optimization,
+ duplicate invocations may be eliminated or the function may even be invoked more times than
+ it is present in the query.
>>> from pyspark.sql.types import IntegerType
>>> slen = udf(lambda s: len(s), IntegerType())
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
index 0038cf65e2..21390644bc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.types.DataType
/**
* User-defined function.
+ * Note that the user-defined functions must be deterministic.
* @param function The user defined scala function to run.
* Note that if you use primitive parameters, you are not able to check if it is
* null or not, and the UDF will return null for you if the primitive input is
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 14d12d30bc..7013e316ea 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -199,6 +199,9 @@ class SQLContext private[sql](
/**
* A collection of methods for registering user-defined functions (UDF).
+ * Note that the user-defined functions must be deterministic. Due to optimization,
+ * duplicate invocations may be eliminated or the function may even be invoked more times than
+ * it is present in the query.
*
* The following example registers a Scala closure as UDF:
* {{{
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index f697769bdc..5c87c84418 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -145,6 +145,9 @@ class SparkSession private(
/**
* A collection of methods for registering user-defined functions (UDF).
+ * Note that the user-defined functions must be deterministic. Due to optimization,
+ * duplicate invocations may be eliminated or the function may even be invoked more times than
+ * it is present in the query.
*
* The following example registers a Scala closure as UDF:
* {{{
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
index 3a043dcc6a..b006236481 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.types.DataType
/**
* Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this.
+ * Note that the user-defined functions must be deterministic.
*
* @since 1.3.0
*/
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
index bd35d19aa2..49fdec5755 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
@@ -25,6 +25,9 @@ import org.apache.spark.sql.types.DataType
/**
* A user-defined function. To create one, use the `udf` functions in [[functions]].
+ * Note that the user-defined functions must be deterministic. Due to optimization,
+ * duplicate invocations may be eliminated or the function may even be invoked more times than
+ * it is present in the query.
* As an example:
* {{{
* // Defined a UDF that returns true or false based on some numeric score.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
index 939b9195ca..c9cc2ba04a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
@@ -100,6 +100,7 @@ private[sql] class SessionState(sparkSession: SparkSession) {
/**
* Interface exposed to the user for registering user-defined functions.
+ * Note that the user-defined functions must be deterministic.
*/
lazy val udf: UDFRegistration = new UDFRegistration(functionRegistry)