From f0d880e288eba97c86dceb1b5edab4f3a935943b Mon Sep 17 00:00:00 2001
From: Davies Liu <davies.liu@gmail.com>
Date: Tue, 29 Jul 2014 12:31:39 -0700
Subject: [SPARK-2674] [SQL] [PySpark] support datetime type for SchemaRDD

Datetime and time in Python will be converted into java.util.Calendar after serialization, it will be converted into java.sql.Timestamp during inferSchema().

In javaToPython(), Timestamp will be converted into Calendar, then be converted into datetime in Python after pickling.

Author: Davies Liu <davies.liu@gmail.com>

Closes #1601 from davies/date and squashes the following commits:

f0599b0 [Davies Liu] remove tests for sets and tuple in sql, fix list of list
c9d607a [Davies Liu] convert datetype for runtime
709d40d [Davies Liu] remove brackets
96db384 [Davies Liu] support datetime type for SchemaRDD
---
 python/pyspark/sql.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'python')

diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index cb83e89176..a6b3277db3 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -47,12 +47,14 @@ class SQLContext:
             ...
         ValueError:...
 
-        >>> allTypes = sc.parallelize([{"int" : 1, "string" : "string", "double" : 1.0, "long": 1L,
-        ... "boolean" : True}])
+        >>> from datetime import datetime
+        >>> allTypes = sc.parallelize([{"int": 1, "string": "string", "double": 1.0, "long": 1L,
+        ... "boolean": True, "time": datetime(2010, 1, 1, 1, 1, 1), "dict": {"a": 1},
+        ... "list": [1, 2, 3]}])
         >>> srdd = sqlCtx.inferSchema(allTypes).map(lambda x: (x.int, x.string, x.double, x.long,
-        ... x.boolean))
+        ... x.boolean, x.time, x.dict["a"], x.list))
         >>> srdd.collect()[0]
-        (1, u'string', 1.0, 1, True)
+        (1, u'string', 1.0, 1, True, datetime.datetime(2010, 1, 1, 1, 1, 1), 1, [1, 2, 3])
         """
         self._sc = sparkContext
         self._jsc = self._sc._jsc
@@ -88,13 +90,13 @@ class SQLContext:
 
         >>> from array import array
         >>> srdd = sqlCtx.inferSchema(nestedRdd1)
-        >>> srdd.collect() == [{"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}},
-        ...                    {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}]
+        >>> srdd.collect() == [{"f1" : [1, 2], "f2" : {"row1" : 1.0}},
+        ...                    {"f1" : [2, 3], "f2" : {"row2" : 2.0}}]
         True
 
         >>> srdd = sqlCtx.inferSchema(nestedRdd2)
-        >>> srdd.collect() == [{"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)},
-        ...                    {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}]
+        >>> srdd.collect() == [{"f1" : [[1, 2], [2, 3]], "f2" : [1, 2]},
+        ...                    {"f1" : [[2, 3], [3, 4]], "f2" : [2, 3]}]
         True
         """
         if (rdd.__class__ is SchemaRDD):
@@ -509,8 +511,8 @@ def _test():
         {"f1": array('i', [1, 2]), "f2": {"row1": 1.0}},
         {"f1": array('i', [2, 3]), "f2": {"row2": 2.0}}])
     globs['nestedRdd2'] = sc.parallelize([
-        {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
-        {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)}])
+        {"f1": [[1, 2], [2, 3]], "f2": [1, 2]},
+        {"f1": [[2, 3], [3, 4]], "f2": [2, 3]}])
     (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
     globs['sc'].stop()
     if failure_count:
-- 
cgit v1.2.3