about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: Matei Zaharia <matei@databricks.com> 2014-11-01 19:29:14 -0700
committer: Michael Armbrust <michael@databricks.com> 2014-11-01 19:29:14 -0700
commit: 23f966f47523f85ba440b4080eee665271f53b5e (patch)
tree: d796351567f8b187511b9049199cbf99c5826fb3 /python
parent: 56f2c61cde3f5d906c2a58e9af1a661222f2c679 (diff)
download: spark-23f966f47523f85ba440b4080eee665271f53b5e.tar.gz
          spark-23f966f47523f85ba440b4080eee665271f53b5e.tar.bz2
          spark-23f966f47523f85ba440b4080eee665271f53b5e.zip
[SPARK-3930] [SPARK-3933] Support fixed-precision decimal in SQL, and some optimizations
- Adds optional precision and scale to Spark SQL's decimal type, which behave similarly to those in Hive 13 (https://cwiki.apache.org/confluence/download/attachments/27362075/Hive_Decimal_Precision_Scale_Support.pdf)
- Replaces our internal representation of decimals with a Decimal class that can store small values in a mutable Long, saving memory in this situation and letting some operations happen directly on Longs

This is still marked WIP because there are a few TODOs, but I'll remove that tag when done.

Author: Matei Zaharia <matei@databricks.com>

Closes #2983 from mateiz/decimal-1 and squashes the following commits:

35e6b02 [Matei Zaharia] Fix issues after merge
227f24a [Matei Zaharia] Review comments
31f915e [Matei Zaharia] Implement Davies's suggestions in Python
eb84820 [Matei Zaharia] Support reading/writing decimals as fixed-length binary in Parquet
4dc6bae [Matei Zaharia] Fix decimal support in PySpark
d1d9d68 [Matei Zaharia] Fix compile error and test issues after rebase
b28933d [Matei Zaharia] Support decimal precision/scale in Hive metastore
2118c0d [Matei Zaharia] Some test and bug fixes
81db9cb [Matei Zaharia] Added mutable Decimal that will be more efficient for small precisions
7af0c3b [Matei Zaharia] Add optional precision and scale to DecimalType, but use Unlimited for now
ec0a947 [Matei Zaharia] Make the result of AVG on Decimals be Decimal, not Double
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/sql.py 35
1 files changed, 32 insertions, 3 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 93bfc25bca..98e41f8575 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -35,6 +35,7 @@ import datetime
import keyword
import warnings
import json
+import re
from array import array
from operator import itemgetter
from itertools import imap
@@ -148,13 +149,30 @@ class TimestampType(PrimitiveType):
"""
-class DecimalType(PrimitiveType):
+class DecimalType(DataType):
"""Spark SQL DecimalType
The data type representing decimal.Decimal values.
"""
+ def __init__(self, precision=None, scale=None):
+ self.precision = precision
+ self.scale = scale
+ self.hasPrecisionInfo = precision is not None
+
+ def jsonValue(self):
+ if self.hasPrecisionInfo:
+ return "decimal(%d,%d)" % (self.precision, self.scale)
+ else:
+ return "decimal"
+
+ def __repr__(self):
+ if self.hasPrecisionInfo:
+ return "DecimalType(%d,%d)" % (self.precision, self.scale)
+ else:
+ return "DecimalType()"
+
class DoubleType(PrimitiveType):
@@ -446,9 +464,20 @@ def _parse_datatype_json_string(json_string):
return _parse_datatype_json_value(json.loads(json_string))
+_FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)")
+
+
def _parse_datatype_json_value(json_value):
- if type(json_value) is unicode and json_value in _all_primitive_types.keys():
- return _all_primitive_types[json_value]()
+ if type(json_value) is unicode:
+ if json_value in _all_primitive_types.keys():
+ return _all_primitive_types[json_value]()
+ elif json_value == u'decimal':
+ return DecimalType()
+ elif _FIXED_DECIMAL.match(json_value):
+ m = _FIXED_DECIMAL.match(json_value)
+ return DecimalType(int(m.group(1)), int(m.group(2)))
+ else:
+ raise ValueError("Could not parse datatype: %s" % json_value)
else:
return _all_complex_types[json_value["type"]].fromJson(json_value)