[SPARK-6677] [SQL] [PySpark] fix cached classes

It's possible to have two DataType object with same id (memory address) at different time, we should check the cached classes to verify that it's generated by given datatype. This PR also change `__FIELDS__` and `__DATATYPE__` to lower case to match Python code style. Author: Davies Liu <davies@databricks.com> Closes #5445 from davies/fix_type_cache and squashes the following commits: 63b3238 [Davies Liu] typo 47bdede [Davies Liu] fix cached classes
author: Davies Liu <davies@databricks.com> 2015-04-11 22:33:23 -0700
committer: Josh Rosen <joshrosen@databricks.com> 2015-04-11 22:33:23 -0700
commit: 5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4 (patch)
tree: d74e28f0ccfc386c962bdd66126361b969f0d17b /python
parent: 0cc8fcb4cd20cb90a1fac50b1a3ffed833ce5eac (diff)
download: spark-5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4.tar.gz
spark-5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4.tar.bz2
spark-5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4.zip
1 files changed, 20 insertions, 19 deletions
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 7e0124b136..ef76d84c00 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -567,8 +567,8 @@ def _infer_schema(row):
     elif isinstance(row, (tuple, list)):
         if hasattr(row, "_fields"):  # namedtuple
             items = zip(row._fields, tuple(row))
-        elif hasattr(row, "__FIELDS__"):  # Row
-            items = zip(row.__FIELDS__, tuple(row))
+        elif hasattr(row, "__fields__"):  # Row
+            items = zip(row.__fields__, tuple(row))
         else:
             names = ['_%d' % i for i in range(1, len(row) + 1)]
             items = zip(names, row)
@@ -647,7 +647,7 @@ def _python_to_sql_converter(dataType):
             if isinstance(obj, dict):
                 return tuple(c(obj.get(n)) for n, c in zip(names, converters))
             elif isinstance(obj, tuple):
-                if hasattr(obj, "_fields") or hasattr(obj, "__FIELDS__"):
+                if hasattr(obj, "_fields") or hasattr(obj, "__fields__"):
                     return tuple(c(v) for c, v in zip(converters, obj))
                 elif all(isinstance(x, tuple) and len(x) == 2 for x in obj):  # k-v pairs
                     d = dict(obj)
@@ -997,12 +997,13 @@ def _restore_object(dataType, obj):
     # same object in most cases.
     k = id(dataType)
     cls = _cached_cls.get(k)
-    if cls is None:
+    if cls is None or cls.__datatype is not dataType:
         # use dataType as key to avoid create multiple class
         cls = _cached_cls.get(dataType)
         if cls is None:
             cls = _create_cls(dataType)
             _cached_cls[dataType] = cls
+        cls.__datatype = dataType
         _cached_cls[k] = cls
     return cls(obj)
 
@@ -1119,8 +1120,8 @@ def _create_cls(dataType):
     class Row(tuple):
 
         """ Row in DataFrame """
-        __DATATYPE__ = dataType
-        __FIELDS__ = tuple(f.name for f in dataType.fields)
+        __datatype = dataType
+        __fields__ = tuple(f.name for f in dataType.fields)
         __slots__ = ()
 
         # create property for fast access
@@ -1128,22 +1129,22 @@ def _create_cls(dataType):
 
         def asDict(self):
             """ Return as a dict """
-            return dict((n, getattr(self, n)) for n in self.__FIELDS__)
+            return dict((n, getattr(self, n)) for n in self.__fields__)
 
         def __repr__(self):
             # call collect __repr__ for nested objects
             return ("Row(%s)" % ", ".join("%s=%r" % (n, getattr(self, n))
-                                          for n in self.__FIELDS__))
+                                          for n in self.__fields__))
 
         def __reduce__(self):
-            return (_restore_object, (self.__DATATYPE__, tuple(self)))
+            return (_restore_object, (self.__datatype, tuple(self)))
 
     return Row
 
 
 def _create_row(fields, values):
     row = Row(*values)
-    row.__FIELDS__ = fields
+    row.__fields__ = fields
     return row
 
 
@@ -1183,7 +1184,7 @@ class Row(tuple):
             # create row objects
             names = sorted(kwargs.keys())
             row = tuple.__new__(self, [kwargs[n] for n in names])
-            row.__FIELDS__ = names
+            row.__fields__ = names
             return row
 
         else:
@@ -1193,11 +1194,11 @@ class Row(tuple):
         """
         Return as an dict
         """
-        if not hasattr(self, "__FIELDS__"):
+        if not hasattr(self, "__fields__"):
             raise TypeError("Cannot convert a Row class into dict")
-        return dict(zip(self.__FIELDS__, self))
+        return dict(zip(self.__fields__, self))
 
-    # let obect acs like class
+    # let object acts like class
     def __call__(self, *args):
         """create new Row object"""
         return _create_row(self, args)
@@ -1208,21 +1209,21 @@ class Row(tuple):
         try:
             # it will be slow when it has many fields,
             # but this will not be used in normal cases
-            idx = self.__FIELDS__.index(item)
+            idx = self.__fields__.index(item)
             return self[idx]
         except IndexError:
             raise AttributeError(item)
 
     def __reduce__(self):
-        if hasattr(self, "__FIELDS__"):
-            return (_create_row, (self.__FIELDS__, tuple(self)))
+        if hasattr(self, "__fields__"):
+            return (_create_row, (self.__fields__, tuple(self)))
         else:
             return tuple.__reduce__(self)
 
     def __repr__(self):
-        if hasattr(self, "__FIELDS__"):
+        if hasattr(self, "__fields__"):
             return "Row(%s)" % ", ".join("%s=%r" % (k, v)
-                                         for k, v in zip(self.__FIELDS__, self))
+                                         for k, v in zip(self.__fields__, tuple(self)))
         else:
             return "<Row(%s)>" % ", ".join(self)
author	Davies Liu <davies@databricks.com>	2015-04-11 22:33:23 -0700
committer	Josh Rosen <joshrosen@databricks.com>	2015-04-11 22:33:23 -0700
commit	5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4 (patch)
tree	d74e28f0ccfc386c962bdd66126361b969f0d17b /python
parent	0cc8fcb4cd20cb90a1fac50b1a3ffed833ce5eac (diff)
download	spark-5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4.tar.gz spark-5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4.tar.bz2 spark-5d8f7b9e87e8066d54717a1a78b06e8531d8b0d4.zip