aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2015-03-24 12:08:19 -0700
committerMichael Armbrust <michael@databricks.com>2015-03-24 12:08:19 -0700
commit6bdddb6f6ffbd1bee4c45880904e9561b18764a7 (patch)
treed184db4915e72bebd1735702b293717de9340ef1 /sql
parenta1d1529daebee30b76b954d16a30849407f795d1 (diff)
downloadspark-6bdddb6f6ffbd1bee4c45880904e9561b18764a7.tar.gz
spark-6bdddb6f6ffbd1bee4c45880904e9561b18764a7.tar.bz2
spark-6bdddb6f6ffbd1bee4c45880904e9561b18764a7.zip
[SPARK-6361][SQL] support adding a column with metadata in DF
This is used by ML pipelines to embed ML attributes in columns created by ML transformers/estimators. marmbrus Author: Xiangrui Meng <meng@databricks.com> Closes #5151 from mengxr/SPARK-6361 and squashes the following commits: bb30de3 [Xiangrui Meng] support adding a column with metadata in DF
Diffstat (limited to 'sql')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala20
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/Column.scala13
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala15
3 files changed, 38 insertions, 10 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 17f7f9fe51..3dd7d38847 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -95,9 +95,12 @@ abstract class Attribute extends NamedExpression {
* @param name the name to be associated with the result of computing [[child]].
* @param exprId A globally unique id used to check if an [[AttributeReference]] refers to this
* alias. Auto-assigned if left blank.
+ * @param explicitMetadata Explicit metadata associated with this alias that overwrites child's.
*/
-case class Alias(child: Expression, name: String)
- (val exprId: ExprId = NamedExpression.newExprId, val qualifiers: Seq[String] = Nil)
+case class Alias(child: Expression, name: String)(
+ val exprId: ExprId = NamedExpression.newExprId,
+ val qualifiers: Seq[String] = Nil,
+ val explicitMetadata: Option[Metadata] = None)
extends NamedExpression with trees.UnaryNode[Expression] {
override type EvaluatedType = Any
@@ -107,9 +110,11 @@ case class Alias(child: Expression, name: String)
override def dataType = child.dataType
override def nullable = child.nullable
override def metadata: Metadata = {
- child match {
- case named: NamedExpression => named.metadata
- case _ => Metadata.empty
+ explicitMetadata.getOrElse {
+ child match {
+ case named: NamedExpression => named.metadata
+ case _ => Metadata.empty
+ }
}
}
@@ -123,11 +128,12 @@ case class Alias(child: Expression, name: String)
override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix"
- override protected final def otherCopyArgs = exprId :: qualifiers :: Nil
+ override protected final def otherCopyArgs = exprId :: qualifiers :: explicitMetadata :: Nil
override def equals(other: Any): Boolean = other match {
case a: Alias =>
- name == a.name && exprId == a.exprId && child == a.child && qualifiers == a.qualifiers
+ name == a.name && exprId == a.exprId && child == a.child && qualifiers == a.qualifiers &&
+ explicitMetadata == a.explicitMetadata
case _ => false
}
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index ec7d15f5bc..2ae47e07d4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -595,6 +595,19 @@ class Column(protected[sql] val expr: Expression) {
def as(alias: Symbol): Column = Alias(expr, alias.name)()
/**
+ * Gives the column an alias with metadata.
+ * {{{
+ * val metadata: Metadata = ...
+ * df.select($"colA".as("colB", metadata))
+ * }}}
+ *
+ * @group expr_ops
+ */
+ def as(alias: String, metadata: Metadata): Column = {
+ Alias(expr, alias)(explicitMetadata = Some(metadata))
+ }
+
+ /**
* Casts the column to a different data type.
* {{{
* // Casts colA to IntegerType.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index a53ae97d62..bc8fae100d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -17,12 +17,10 @@
package org.apache.spark.sql
-import org.apache.spark.sql.catalyst.expressions.NamedExpression
-import org.apache.spark.sql.catalyst.plans.logical.{Project, NoRelation}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.TestSQLContext
import org.apache.spark.sql.test.TestSQLContext.implicits._
-import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType}
+import org.apache.spark.sql.types._
class ColumnExpressionSuite extends QueryTest {
@@ -322,4 +320,15 @@ class ColumnExpressionSuite extends QueryTest {
assert('key.desc == 'key.desc)
assert('key.desc != 'key.asc)
}
+
+ test("alias with metadata") {
+ val metadata = new MetadataBuilder()
+ .putString("originName", "value")
+ .build()
+ val schema = testData
+ .select($"*", col("value").as("abc", metadata))
+ .schema
+ assert(schema("value").metadata === Metadata.empty)
+ assert(schema("abc").metadata === metadata)
+ }
}