diff options
author | Matei Zaharia <matei@databricks.com> | 2014-11-01 19:29:14 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2014-11-01 19:29:14 -0700 |
commit | 23f966f47523f85ba440b4080eee665271f53b5e (patch) | |
tree | d796351567f8b187511b9049199cbf99c5826fb3 /python/pyspark | |
parent | 56f2c61cde3f5d906c2a58e9af1a661222f2c679 (diff) | |
download | spark-23f966f47523f85ba440b4080eee665271f53b5e.tar.gz spark-23f966f47523f85ba440b4080eee665271f53b5e.tar.bz2 spark-23f966f47523f85ba440b4080eee665271f53b5e.zip |
[SPARK-3930] [SPARK-3933] Support fixed-precision decimal in SQL, and some optimizations
- Adds optional precision and scale to Spark SQL's decimal type, which behave similarly to those in Hive 13 (https://cwiki.apache.org/confluence/download/attachments/27362075/Hive_Decimal_Precision_Scale_Support.pdf)
- Replaces our internal representation of decimals with a Decimal class that can store small values in a mutable Long, saving memory in this situation and letting some operations happen directly on Longs
This is still marked WIP because there are a few TODOs, but I'll remove that tag when done.
Author: Matei Zaharia <matei@databricks.com>
Closes #2983 from mateiz/decimal-1 and squashes the following commits:
35e6b02 [Matei Zaharia] Fix issues after merge
227f24a [Matei Zaharia] Review comments
31f915e [Matei Zaharia] Implement Davies's suggestions in Python
eb84820 [Matei Zaharia] Support reading/writing decimals as fixed-length binary in Parquet
4dc6bae [Matei Zaharia] Fix decimal support in PySpark
d1d9d68 [Matei Zaharia] Fix compile error and test issues after rebase
b28933d [Matei Zaharia] Support decimal precision/scale in Hive metastore
2118c0d [Matei Zaharia] Some test and bug fixes
81db9cb [Matei Zaharia] Added mutable Decimal that will be more efficient for small precisions
7af0c3b [Matei Zaharia] Add optional precision and scale to DecimalType, but use Unlimited for now
ec0a947 [Matei Zaharia] Make the result of AVG on Decimals be Decimal, not Double
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/sql.py | 35 |
1 file changed, 32 insertions, 3 deletions
```diff
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 93bfc25bca..98e41f8575 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -35,6 +35,7 @@ import datetime
 import keyword
 import warnings
 import json
+import re
 from array import array
 from operator import itemgetter
 from itertools import imap
@@ -148,13 +149,30 @@ class TimestampType(PrimitiveType):
     """
 
 
-class DecimalType(PrimitiveType):
+class DecimalType(DataType):
 
     """Spark SQL DecimalType
 
     The data type representing decimal.Decimal values.
     """
 
+    def __init__(self, precision=None, scale=None):
+        self.precision = precision
+        self.scale = scale
+        self.hasPrecisionInfo = precision is not None
+
+    def jsonValue(self):
+        if self.hasPrecisionInfo:
+            return "decimal(%d,%d)" % (self.precision, self.scale)
+        else:
+            return "decimal"
+
+    def __repr__(self):
+        if self.hasPrecisionInfo:
+            return "DecimalType(%d,%d)" % (self.precision, self.scale)
+        else:
+            return "DecimalType()"
+
 
 class DoubleType(PrimitiveType):
 
@@ -446,9 +464,20 @@ def _parse_datatype_json_string(json_string):
     return _parse_datatype_json_value(json.loads(json_string))
 
 
+_FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)")
+
+
 def _parse_datatype_json_value(json_value):
-    if type(json_value) is unicode and json_value in _all_primitive_types.keys():
-        return _all_primitive_types[json_value]()
+    if type(json_value) is unicode:
+        if json_value in _all_primitive_types.keys():
+            return _all_primitive_types[json_value]()
+        elif json_value == u'decimal':
+            return DecimalType()
+        elif _FIXED_DECIMAL.match(json_value):
+            m = _FIXED_DECIMAL.match(json_value)
+            return DecimalType(int(m.group(1)), int(m.group(2)))
+        else:
+            raise ValueError("Could not parse datatype: %s" % json_value)
     else:
         return _all_complex_types[json_value["type"]].fromJson(json_value)
 
```