[SPARK-5722] [SQL] [PySpark] infer int as LongType

Python's `int` is 64-bit on 64-bit machines (very common now), so we should infer it as LongType in Spark SQL. Also, LongType in SQL will come back as `int`. Author: Davies Liu <davies@databricks.com> Closes #4666 from davies/long and squashes the following commits: 6bc6cc4 [Davies Liu] infer int as LongType
author: Davies Liu <davies@databricks.com> 2015-02-18 14:17:04 -0800
committer: Michael Armbrust <michael@databricks.com> 2015-02-18 14:17:04 -0800
commit: aa8f10e82a743d59ce87348af19c0177eb618a66 (patch)
tree: 87fc8bfc978015fcf3d7ff9ff2aa3717b0885f28 /python/pyspark/sql/tests.py
parent: f0e3b71077a6c28aba29a7a75e901a9e0911b9f0 (diff)
download: spark-aa8f10e82a743d59ce87348af19c0177eb618a66.tar.gz
spark-aa8f10e82a743d59ce87348af19c0177eb618a66.tar.bz2
spark-aa8f10e82a743d59ce87348af19c0177eb618a66.zip
1 files changed, 21 insertions, 1 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 52f7e65d9c..8e1bb36598 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -38,7 +38,7 @@ else:
 
 from pyspark.sql import SQLContext, HiveContext, Column
 from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \
-    UserDefinedType, DoubleType, LongType, StringType
+    UserDefinedType, DoubleType, LongType, StringType, _infer_type
 from pyspark.tests import ReusedPySparkTestCase
 
 
@@ -324,6 +324,26 @@ class SQLTests(ReusedPySparkTestCase):
         pydoc.render_doc(df.foo)
         pydoc.render_doc(df.take(1))
 
+    def test_infer_long_type(self):
+        longrow = [Row(f1='a', f2=100000000000000)]
+        df = self.sc.parallelize(longrow).toDF()
+        self.assertEqual(df.schema.fields[1].dataType, LongType())
+
+        # this saving as Parquet caused issues as well.
+        output_dir = os.path.join(self.tempdir.name, "infer_long_type")
+        df.saveAsParquetFile(output_dir)
+        df1 = self.sqlCtx.parquetFile(output_dir)
+        self.assertEquals('a', df1.first().f1)
+        self.assertEquals(100000000000000, df1.first().f2)
+
+        self.assertEqual(_infer_type(1), LongType())
+        self.assertEqual(_infer_type(2**10), LongType())
+        self.assertEqual(_infer_type(2**20), LongType())
+        self.assertEqual(_infer_type(2**31 - 1), LongType())
+        self.assertEqual(_infer_type(2**31), LongType())
+        self.assertEqual(_infer_type(2**61), LongType())
+        self.assertEqual(_infer_type(2**71), LongType())
+
 
 class HiveContextSQLTests(ReusedPySparkTestCase):
author	Davies Liu <davies@databricks.com>	2015-02-18 14:17:04 -0800
committer	Michael Armbrust <michael@databricks.com>	2015-02-18 14:17:04 -0800
commit	aa8f10e82a743d59ce87348af19c0177eb618a66 (patch)
tree	87fc8bfc978015fcf3d7ff9ff2aa3717b0885f28 /python/pyspark/sql/tests.py
parent	f0e3b71077a6c28aba29a7a75e901a9e0911b9f0 (diff)
download	spark-aa8f10e82a743d59ce87348af19c0177eb618a66.tar.gz spark-aa8f10e82a743d59ce87348af19c0177eb618a66.tar.bz2 spark-aa8f10e82a743d59ce87348af19c0177eb618a66.zip