diff options
author | Davies Liu <davies@databricks.com> | 2015-02-18 14:17:04 -0800 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2015-02-18 14:17:04 -0800 |
commit | aa8f10e82a743d59ce87348af19c0177eb618a66 (patch) | |
tree | 87fc8bfc978015fcf3d7ff9ff2aa3717b0885f28 /python/pyspark/sql/tests.py | |
parent | f0e3b71077a6c28aba29a7a75e901a9e0911b9f0 (diff) | |
download | spark-aa8f10e82a743d59ce87348af19c0177eb618a66.tar.gz spark-aa8f10e82a743d59ce87348af19c0177eb618a66.tar.bz2 spark-aa8f10e82a743d59ce87348af19c0177eb618a66.zip |
[SPARK-5722] [SQL] [PySpark] infer int as LongType
Python's `int` is 64-bit on 64-bit machines (very common now), so we should infer it as LongType in Spark SQL.
Also, LongType in SQL will come back as `int`.
Author: Davies Liu <davies@databricks.com>
Closes #4666 from davies/long and squashes the following commits:
6bc6cc4 [Davies Liu] infer int as LongType
Diffstat (limited to 'python/pyspark/sql/tests.py')
-rw-r--r-- | python/pyspark/sql/tests.py | 22 |
1 files changed, 21 insertions, 1 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 52f7e65d9c..8e1bb36598 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -38,7 +38,7 @@ else: from pyspark.sql import SQLContext, HiveContext, Column from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \ - UserDefinedType, DoubleType, LongType, StringType + UserDefinedType, DoubleType, LongType, StringType, _infer_type from pyspark.tests import ReusedPySparkTestCase @@ -324,6 +324,26 @@ class SQLTests(ReusedPySparkTestCase): pydoc.render_doc(df.foo) pydoc.render_doc(df.take(1)) + def test_infer_long_type(self): + longrow = [Row(f1='a', f2=100000000000000)] + df = self.sc.parallelize(longrow).toDF() + self.assertEqual(df.schema.fields[1].dataType, LongType()) + + # this saving as Parquet caused issues as well. + output_dir = os.path.join(self.tempdir.name, "infer_long_type") + df.saveAsParquetFile(output_dir) + df1 = self.sqlCtx.parquetFile(output_dir) + self.assertEquals('a', df1.first().f1) + self.assertEquals(100000000000000, df1.first().f2) + + self.assertEqual(_infer_type(1), LongType()) + self.assertEqual(_infer_type(2**10), LongType()) + self.assertEqual(_infer_type(2**20), LongType()) + self.assertEqual(_infer_type(2**31 - 1), LongType()) + self.assertEqual(_infer_type(2**31), LongType()) + self.assertEqual(_infer_type(2**61), LongType()) + self.assertEqual(_infer_type(2**71), LongType()) + class HiveContextSQLTests(ReusedPySparkTestCase): |