aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author0x0FFF <programmerag@gmail.com>2015-09-01 14:34:59 -0700
committerDavies Liu <davies.liu@gmail.com>2015-09-01 14:34:59 -0700
commitbf550a4b551b6dd18fea3eb3f70497f9a6ad8e6c (patch)
tree24904c89a11ee8420b3998234a54d6b7047dc48f /python
parentec012805337926e56343be2761a1037296446880 (diff)
downloadspark-bf550a4b551b6dd18fea3eb3f70497f9a6ad8e6c.tar.gz
spark-bf550a4b551b6dd18fea3eb3f70497f9a6ad8e6c.tar.bz2
spark-bf550a4b551b6dd18fea3eb3f70497f9a6ad8e6c.zip
[SPARK-10162] [SQL] Fix the timezone omitting for PySpark Dataframe filter function
This PR addresses [SPARK-10162](https://issues.apache.org/jira/browse/SPARK-10162) The issue is with DataFrame filter() function, if datetime.datetime is passed to it: * Timezone information of this datetime is ignored * This datetime is assumed to be in local timezone, which depends on the OS timezone setting Fix includes both code change and regression test. Problem reproduction code on master: ```python import pytz from datetime import datetime from pyspark.sql import * from pyspark.sql.types import * sqc = SQLContext(sc) df = sqc.createDataFrame([], StructType([StructField("dt", TimestampType())])) m1 = pytz.timezone('UTC') m2 = pytz.timezone('Etc/GMT+3') df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m1)).explain() df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m2)).explain() ``` It gives the same timestamp ignoring time zone: ``` >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m1)).explain() Filter (dt#0 > 946713600000000) Scan PhysicalRDD[dt#0] >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m2)).explain() Filter (dt#0 > 946713600000000) Scan PhysicalRDD[dt#0] ``` After the fix: ``` >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m1)).explain() Filter (dt#0 > 946684800000000) Scan PhysicalRDD[dt#0] >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m2)).explain() Filter (dt#0 > 946695600000000) Scan PhysicalRDD[dt#0] ``` PR [8536](https://github.com/apache/spark/pull/8536) was occasionally closed by me dropping the repo Author: 0x0FFF <programmerag@gmail.com> Closes #8555 from 0x0FFF/SPARK-10162.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/sql/tests.py26
-rw-r--r--python/pyspark/sql/types.py7
2 files changed, 23 insertions, 10 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index cd32e26c64..59a891bd7c 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -50,16 +50,17 @@ from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException, IllegalArgumentException
-class UTC(datetime.tzinfo):
- """UTC"""
- ZERO = datetime.timedelta(0)
+class UTCOffsetTimezone(datetime.tzinfo):
+ """
+ Specifies timezone in UTC offset
+ """
+
+ def __init__(self, offset=0):
+ self.ZERO = datetime.timedelta(hours=offset)
def utcoffset(self, dt):
return self.ZERO
- def tzname(self, dt):
- return "UTC"
-
def dst(self, dt):
return self.ZERO
@@ -841,13 +842,22 @@ class SQLTests(ReusedPySparkTestCase):
self.assertEqual(0, df.filter(df.date > date).count())
self.assertEqual(0, df.filter(df.time > time).count())
+ def test_filter_with_datetime_timezone(self):
+ dt1 = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000, tzinfo=UTCOffsetTimezone(0))
+ dt2 = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000, tzinfo=UTCOffsetTimezone(1))
+ row = Row(date=dt1)
+ df = self.sqlCtx.createDataFrame([row])
+ self.assertEqual(0, df.filter(df.date == dt2).count())
+ self.assertEqual(1, df.filter(df.date > dt2).count())
+ self.assertEqual(0, df.filter(df.date < dt2).count())
+
def test_time_with_timezone(self):
day = datetime.date.today()
now = datetime.datetime.now()
ts = time.mktime(now.timetuple())
# class in __main__ is not serializable
- from pyspark.sql.tests import UTC
- utc = UTC()
+ from pyspark.sql.tests import UTCOffsetTimezone
+ utc = UTCOffsetTimezone()
utcnow = datetime.datetime.utcfromtimestamp(ts) # without microseconds
# add microseconds to utcnow (keeping year,month,day,hour,minute,second)
utcnow = datetime.datetime(*(utcnow.timetuple()[:6] + (now.microsecond, utc)))
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 94e581a783..f84d08d709 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -1290,8 +1290,11 @@ class DatetimeConverter(object):
def convert(self, obj, gateway_client):
Timestamp = JavaClass("java.sql.Timestamp", gateway_client)
- return Timestamp(int(time.mktime(obj.timetuple())) * 1000 + obj.microsecond // 1000)
-
+ seconds = (calendar.timegm(obj.utctimetuple()) if obj.tzinfo
+ else time.mktime(obj.timetuple()))
+ t = Timestamp(int(seconds) * 1000)
+ t.setNanos(obj.microsecond * 1000)
+ return t
# datetime is a subclass of date, we should register DatetimeConverter first
register_input_converter(DatetimeConverter())