about summary refs log tree commit diff
path: root/sql/catalyst
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2014-11-03 19:29:11 -0800
committerXiangrui Meng <meng@databricks.com>2014-11-03 19:30:32 -0800
commit42d02db86cd973cf31ceeede0c5a723238bbe746 (patch)
tree4d773eec8740849bdbca1007f7a0b0af03a1e1bc /sql/catalyst
parent0826eed9c84a73544e3d8289834c8b5ebac47e03 (diff)
downloadspark-42d02db86cd973cf31ceeede0c5a723238bbe746.tar.gz
spark-42d02db86cd973cf31ceeede0c5a723238bbe746.tar.bz2
spark-42d02db86cd973cf31ceeede0c5a723238bbe746.zip
[SPARK-4192][SQL] Internal API for Python UDT
Following #2919, this PR adds Python UDT (for internal use only) with tests under "pyspark.tests". Before `SQLContext.applySchema`, we check whether we need to convert user-type instances into SQL recognizable data. In the current implementation, a Python UDT must be paired with a Scala UDT for serialization on the JVM side. A following PR will add VectorUDT in MLlib for both Scala and Python. marmbrus jkbradley davies Author: Xiangrui Meng <meng@databricks.com> Closes #3068 from mengxr/SPARK-4192-sql and squashes the following commits: acff637 [Xiangrui Meng] merge master dba5ea7 [Xiangrui Meng] only use pyClass for Python UDT output sqlType as well 2c9d7e4 [Xiangrui Meng] move import to global setup; update needsConversion 7c4a6a9 [Xiangrui Meng] address comments 75223db [Xiangrui Meng] minor update f740379 [Xiangrui Meng] remove UDT from default imports e98d9d0 [Xiangrui Meng] fix py style 4e84fce [Xiangrui Meng] remove local hive tests and add more tests 39f19e0 [Xiangrui Meng] add tests b7f666d [Xiangrui Meng] add Python UDT (cherry picked from commit 04450d11548cfb25d4fb77d4a33e3a7cd4254183) Signed-off-by: Xiangrui Meng <meng@databricks.com>
Diffstat (limited to 'sql/catalyst')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala9
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala
index e1b5992a36..5dd19dd12d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala
@@ -71,6 +71,8 @@ object DataType {
case JSortedObject(
("class", JString(udtClass)),
+ ("pyClass", _),
+ ("sqlType", _),
("type", JString("udt"))) =>
Class.forName(udtClass).newInstance().asInstanceOf[UserDefinedType[_]]
}
@@ -593,6 +595,9 @@ abstract class UserDefinedType[UserType] extends DataType with Serializable {
/** Underlying storage type for this UDT */
def sqlType: DataType
+ /** Paired Python UDT class, if exists. */
+ def pyUDT: String = null
+
/**
* Convert the user type to a SQL datum
*
@@ -606,7 +611,9 @@ abstract class UserDefinedType[UserType] extends DataType with Serializable {
override private[sql] def jsonValue: JValue = {
("type" -> "udt") ~
- ("class" -> this.getClass.getName)
+ ("class" -> this.getClass.getName) ~
+ ("pyClass" -> pyUDT) ~
+ ("sqlType" -> sqlType.jsonValue)
}
/**