path: root/python/pyspark/sql/tests.py
author     Davies Liu <davies@databricks.com>    2015-02-18 14:17:04 -0800
committer  Michael Armbrust <michael@databricks.com>    2015-02-18 14:17:04 -0800
commit     aa8f10e82a743d59ce87348af19c0177eb618a66 (patch)
tree       87fc8bfc978015fcf3d7ff9ff2aa3717b0885f28 /python/pyspark/sql/tests.py
parent     f0e3b71077a6c28aba29a7a75e901a9e0911b9f0 (diff)
[SPARK-5722] [SQL] [PySpark] infer int as LongType
Python's `int` is 64-bit on 64-bit machines (which are very common now), so Spark SQL should infer it as LongType; conversely, LongType values in SQL come back to Python as `int`.

Author: Davies Liu <davies@databricks.com>

Closes #4666 from davies/long and squashes the following commits:

6bc6cc4 [Davies Liu] infer int as LongType
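For context, here is a minimal sketch of the behaviour described above, assuming a SparkContext `sc` and a SQLContext have already been created with a PySpark build that includes this patch (the variable names are illustrative, not part of the commit):

    from pyspark.sql import Row
    from pyspark.sql.types import LongType, _infer_type

    # With this change, a plain Python int is inferred as LongType,
    # regardless of how small the value is.
    assert _infer_type(1) == LongType()

    # A DataFrame built from Python ints therefore gets a LongType column,
    # and collecting it yields plain Python ints again.
    df = sc.parallelize([Row(value=100000000000000)]).toDF()
    assert df.schema.fields[0].dataType == LongType()
    assert df.first().value == 100000000000000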
Diffstat (limited to 'python/pyspark/sql/tests.py')
-rw-r--r--  python/pyspark/sql/tests.py  22
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 52f7e65d9c..8e1bb36598 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -38,7 +38,7 @@ else:
from pyspark.sql import SQLContext, HiveContext, Column
from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \
-    UserDefinedType, DoubleType, LongType, StringType
+    UserDefinedType, DoubleType, LongType, StringType, _infer_type
from pyspark.tests import ReusedPySparkTestCase
@@ -324,6 +324,26 @@ class SQLTests(ReusedPySparkTestCase):
pydoc.render_doc(df.foo)
pydoc.render_doc(df.take(1))
+    def test_infer_long_type(self):
+        longrow = [Row(f1='a', f2=100000000000000)]
+        df = self.sc.parallelize(longrow).toDF()
+        self.assertEqual(df.schema.fields[1].dataType, LongType())
+
+        # saving as Parquet also caused issues with long values
+        output_dir = os.path.join(self.tempdir.name, "infer_long_type")
+        df.saveAsParquetFile(output_dir)
+        df1 = self.sqlCtx.parquetFile(output_dir)
+        self.assertEqual('a', df1.first().f1)
+        self.assertEqual(100000000000000, df1.first().f2)
+
+        self.assertEqual(_infer_type(1), LongType())
+        self.assertEqual(_infer_type(2**10), LongType())
+        self.assertEqual(_infer_type(2**20), LongType())
+        self.assertEqual(_infer_type(2**31 - 1), LongType())
+        self.assertEqual(_infer_type(2**31), LongType())
+        self.assertEqual(_infer_type(2**61), LongType())
+        self.assertEqual(_infer_type(2**71), LongType())
+
class HiveContextSQLTests(ReusedPySparkTestCase):