aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/tests.py
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2015-02-23 17:29:25 -0800
committerMichael Armbrust <michael@databricks.com>2015-02-23 17:29:25 -0800
commit71173de7ac1bde60afb4e8473817ec766eda013a (patch)
tree379ffc2987cf5d1f64703504d811caa4951ed121 /python/pyspark/tests.py
parent5cea859fd27dc6a216fa9d31d293c93407fbff01 (diff)
downloadspark-71173de7ac1bde60afb4e8473817ec766eda013a.tar.gz
spark-71173de7ac1bde60afb4e8473817ec766eda013a.tar.bz2
spark-71173de7ac1bde60afb4e8473817ec766eda013a.zip
[SPARK-5722] [SQL] [PySpark] infer int as LongType in Python (for 1.2 branch)
This PR changes inferSchema() to infer LongType for Python int values, because IntegerType in SQL is not wide enough to hold a Python int (which is 64-bit on 64-bit machines). Closes #4521. cc dondrake marmbrus. Author: Davies Liu <davies@databricks.com>. Closes #4681 from davies/long2 and squashes the following commits: 05ef1c8 [Davies Liu] infer LongType for int in Python
Diffstat (limited to 'python/pyspark/tests.py')
-rw-r--r--python/pyspark/tests.py23
1 files changed, 22 insertions, 1 deletions
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 1349384d0f..1fc690a649 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -51,7 +51,7 @@ from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer,
CloudPickleSerializer, CompressedSerializer
from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter
from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \
- UserDefinedType, DoubleType
+ UserDefinedType, DoubleType, LongType, _infer_type
from pyspark import shuffle
_have_scipy = False
@@ -985,6 +985,27 @@ class SQLTests(ReusedPySparkTestCase):
point = srdd1.first().point
self.assertEquals(point, ExamplePoint(1.0, 2.0))
+ def test_infer_long_type(self):
+ longrow = [Row(f1='a', f2=100000000000000)]
+ rdd = self.sc.parallelize(longrow)
+ srdd = self.sqlCtx.inferSchema(rdd)
+ self.assertEqual(srdd.schema().fields[1].dataType, LongType())
+
+ # this saving as Parquet caused issues as well.
+ output_dir = os.path.join(self.tempdir.name, "infer_long_type")
+ srdd.saveAsParquetFile(output_dir)
+ df1 = self.sqlCtx.parquetFile(output_dir)
+ self.assertEquals('a', df1.first().f1)
+ self.assertEquals(100000000000000, df1.first().f2)
+
+ self.assertEqual(_infer_type(1), LongType())
+ self.assertEqual(_infer_type(2**10), LongType())
+ self.assertEqual(_infer_type(2**20), LongType())
+ self.assertEqual(_infer_type(2**31 - 1), LongType())
+ self.assertEqual(_infer_type(2**31), LongType())
+ self.assertEqual(_infer_type(2**61), LongType())
+ self.assertEqual(_infer_type(2**71), LongType())
+
class InputFormatTests(ReusedPySparkTestCase):