diff options
author | Davies Liu <davies@databricks.com> | 2015-02-23 17:29:25 -0800 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2015-02-23 17:29:25 -0800 |
commit | 71173de7ac1bde60afb4e8473817ec766eda013a (patch) | |
tree | 379ffc2987cf5d1f64703504d811caa4951ed121 /python/pyspark/tests.py | |
parent | 5cea859fd27dc6a216fa9d31d293c93407fbff01 (diff) | |
download | spark-71173de7ac1bde60afb4e8473817ec766eda013a.tar.gz spark-71173de7ac1bde60afb4e8473817ec766eda013a.tar.bz2 spark-71173de7ac1bde60afb4e8473817ec766eda013a.zip |
[SPARK-5722] [SQL] [PySpark] infer int as LongType in Python (for 1.2 branch)
This PR changes inferSchema() to infer LongType for Python int values, because IntegerType in SQL is not wide enough to hold a Python int (which is 64-bit on 64-bit machines).
Closes #4521
cc dondrake marmbrus
Author: Davies Liu <davies@databricks.com>
Closes #4681 from davies/long2 and squashes the following commits:
05ef1c8 [Davies Liu] infer LongType for int in Python
Diffstat (limited to 'python/pyspark/tests.py')
-rw-r--r-- | python/pyspark/tests.py | 23 |
1 files changed, 22 insertions, 1 deletions
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 1349384d0f..1fc690a649 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -51,7 +51,7 @@ from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, CloudPickleSerializer, CompressedSerializer from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \ - UserDefinedType, DoubleType + UserDefinedType, DoubleType, LongType, _infer_type from pyspark import shuffle _have_scipy = False @@ -985,6 +985,27 @@ class SQLTests(ReusedPySparkTestCase): point = srdd1.first().point self.assertEquals(point, ExamplePoint(1.0, 2.0)) + def test_infer_long_type(self): + longrow = [Row(f1='a', f2=100000000000000)] + rdd = self.sc.parallelize(longrow) + srdd = self.sqlCtx.inferSchema(rdd) + self.assertEqual(srdd.schema().fields[1].dataType, LongType()) + + # this saving as Parquet caused issues as well. + output_dir = os.path.join(self.tempdir.name, "infer_long_type") + srdd.saveAsParquetFile(output_dir) + df1 = self.sqlCtx.parquetFile(output_dir) + self.assertEquals('a', df1.first().f1) + self.assertEquals(100000000000000, df1.first().f2) + + self.assertEqual(_infer_type(1), LongType()) + self.assertEqual(_infer_type(2**10), LongType()) + self.assertEqual(_infer_type(2**20), LongType()) + self.assertEqual(_infer_type(2**31 - 1), LongType()) + self.assertEqual(_infer_type(2**31), LongType()) + self.assertEqual(_infer_type(2**61), LongType()) + self.assertEqual(_infer_type(2**71), LongType()) + class InputFormatTests(ReusedPySparkTestCase): |