aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst
diff options
context:
space:
mode:
authorJosh Rosen <joshrosen@databricks.com>2015-10-13 15:09:31 -0700
committerMichael Armbrust <michael@databricks.com>2015-10-13 15:09:31 -0700
commitef72673b234579c161b8cbb6cafc851d9eba1bfb (patch)
tree15ed300794eba02ef46890d190e70b9c5ba546f6 /sql/catalyst
parent0d1b73b78b600420121ea8e58ff659ae8b4feebe (diff)
downloadspark-ef72673b234579c161b8cbb6cafc851d9eba1bfb.tar.gz
spark-ef72673b234579c161b8cbb6cafc851d9eba1bfb.tar.bz2
spark-ef72673b234579c161b8cbb6cafc851d9eba1bfb.zip
[SPARK-11080] [SQL] Incorporate per-JVM id into ExprId to prevent unsafe cross-JVM comparisions
In the current implementation of named expressions' `ExprIds`, we rely on a per-JVM AtomicLong to ensure that expression ids are unique within a JVM. However, these expression ids will not be _globally_ unique. This opens the potential for id collisions if new expression ids happen to be created inside of tasks rather than on the driver. There are currently a few cases where tasks allocate expression ids, which happen to be safe because those expressions are never compared to expressions created on the driver. In order to guard against the introduction of invalid comparisons between driver-created and executor-created expression ids, this patch extends `ExprId` to incorporate a UUID to identify the JVM that created the id, which prevents collisions. Author: Josh Rosen <joshrosen@databricks.com> Closes #9093 from JoshRosen/SPARK-11080.
Diffstat (limited to 'sql/catalyst')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala15
1 files changed, 12 insertions, 3 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 5768c6087d..8957df0be6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -17,6 +17,8 @@
package org.apache.spark.sql.catalyst.expressions
+import java.util.UUID
+
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -24,16 +26,23 @@ import org.apache.spark.sql.types._
object NamedExpression {
private val curId = new java.util.concurrent.atomic.AtomicLong()
- def newExprId: ExprId = ExprId(curId.getAndIncrement())
+ private[expressions] val jvmId = UUID.randomUUID()
+ def newExprId: ExprId = ExprId(curId.getAndIncrement(), jvmId)
def unapply(expr: NamedExpression): Option[(String, DataType)] = Some(expr.name, expr.dataType)
}
/**
- * A globally unique (within this JVM) id for a given named expression.
+ * A globally unique id for a given named expression.
* Used to identify which attribute output by a relation is being
* referenced in a subsequent computation.
+ *
+ * The `id` field is unique within a given JVM, while the `uuid` is used to uniquely identify JVMs.
*/
-case class ExprId(id: Long)
+case class ExprId(id: Long, jvmId: UUID)
+
+object ExprId {
+ def apply(id: Long): ExprId = ExprId(id, NamedExpression.jvmId)
+}
/**
* An [[Expression]] that is named.