From 424b0075a1a31c251451c6a75c6ba8e81c39453d Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Thu, 11 Jun 2015 01:00:41 -0700
Subject: [SPARK-6411] [SQL] [PySpark] support date/datetime with timezone in
 Python

Spark SQL does not support timezones, and Pyrolite does not handle timezones
well. This patch converts datetime objects into POSIX timestamps (avoiding any
timezone confusion), which is what SQL uses. If a datetime object has no
timezone, it is treated as local time. The timezone in an RDD is lost after one
round trip, and all datetimes coming back from SQL are in local time. Because
of Pyrolite, datetimes from SQL only have millisecond precision.

This PR also drops the timezone from dates, converting them to the number of
days since the epoch (as used in SQL).

Author: Davies Liu

Closes #6250 from davies/tzone and squashes the following commits:

44d8497 [Davies Liu] add timezone support for DateType
99d9d9c [Davies Liu] use int for timestamp
10aa7ca [Davies Liu] Merge branch 'master' of github.com:apache/spark into tzone
6a29aa4 [Davies Liu] support datetime with timezone
---
 python/pyspark/sql/tests.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'python/pyspark/sql/tests.py')

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index a6fce50c76..b5fbb7d098 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -26,6 +26,7 @@ import shutil
 import tempfile
 import pickle
 import functools
+import time
 import datetime
 
 import py4j
@@ -47,6 +48,20 @@ from pyspark.sql.functions import UserDefinedFunction
 from pyspark.sql.window import Window
 
 
+class UTC(datetime.tzinfo):
+    """UTC"""
+    ZERO = datetime.timedelta(0)
+
+    def utcoffset(self, dt):
+        return self.ZERO
+
+    def tzname(self, dt):
+        return "UTC"
+
+    def dst(self, dt):
+        return self.ZERO
+
+
 class ExamplePointUDT(UserDefinedType):
     """
     User-defined type (UDT) for ExamplePoint.
@@ -588,6 +603,23 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual(0, df.filter(df.date > date).count())
         self.assertEqual(0, df.filter(df.time > time).count())
 
+    def test_time_with_timezone(self):
+        day = datetime.date.today()
+        now = datetime.datetime.now()
+        ts = time.mktime(now.timetuple()) + now.microsecond / 1e6
+        # class in __main__ is not serializable
+        from pyspark.sql.tests import UTC
+        utc = UTC()
+        utcnow = datetime.datetime.fromtimestamp(ts, utc)
+        df = self.sqlCtx.createDataFrame([(day, now, utcnow)])
+        day1, now1, utcnow1 = df.first()
+        # Pyrolite serialize java.sql.Date as datetime, will be fixed in new version
+        self.assertEqual(day1.date(), day)
+        # Pyrolite does not support microsecond, the error should be
+        # less than 1 millisecond
+        self.assertTrue(now - now1 < datetime.timedelta(0.001))
+        self.assertTrue(now - utcnow1 < datetime.timedelta(0.001))
+
     def test_dropna(self):
         schema = StructType([
             StructField("name", StringType(), True),
--
cgit v1.2.3
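
For context, a minimal sketch of the conversions the commit message describes,
written as plain Python rather than Spark's internal converters (the helper
names datetime_to_posix and date_to_days are illustrative, not PySpark APIs):
an aware datetime is reduced to a POSIX timestamp via its UTC offset, a naive
datetime is treated as local time, and a date becomes a count of days since
the epoch with any timezone dropped.

    import calendar
    import datetime
    import time


    def datetime_to_posix(dt):
        # Aware datetimes go through their UTC offset (utctimetuple applies it);
        # naive datetimes are treated as local time, as the commit message states.
        if dt.tzinfo is not None:
            seconds = calendar.timegm(dt.utctimetuple())
        else:
            seconds = time.mktime(dt.timetuple())
        # timetuple()/utctimetuple() drop microseconds, so add them back.
        return seconds + dt.microsecond / 1e6


    def date_to_days(d):
        # Dates are stored as days since the Unix epoch; no timezone is kept.
        return (d - datetime.date(1970, 1, 1)).days


    if __name__ == "__main__":
        # Python 3 stand-in for the UTC tzinfo class defined in the test above.
        utc = datetime.timezone.utc
        aware = datetime.datetime(2015, 6, 11, 8, 0, 41, tzinfo=utc)
        naive = datetime.datetime(2015, 6, 11, 1, 0, 41)
        print(datetime_to_posix(aware))                  # 1434009641.0
        print(datetime_to_posix(naive))                  # depends on the local timezone
        print(date_to_days(datetime.date(2015, 6, 11)))  # 16597

Round-tripping through such a timestamp is what makes the timezone disappear:
only an absolute instant is kept, so values read back from SQL come out as
local time, which is exactly what test_time_with_timezone asserts (to within
the 1 millisecond precision Pyrolite allows).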