author     Cheng Lian <lian@databricks.com>    2015-08-08 18:09:48 +0800
committer  Cheng Lian <lian@databricks.com>    2015-08-08 18:09:48 +0800
commit     11caf1ce290b6931647c2f71268f847d1d48930e (patch)
tree       a6b8ba615404ba7554998901f38671be645bdd3d /sql
parent     ef062c15992b0d08554495b8ea837bef3fabf6e9 (diff)
[SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for precision <= 18 rather than 8
This PR fixes a minor bug introduced in #7455: when writing decimals, we should use the unscaled Long for better performance when the precision is <= 18, not <= 8 (the 8 was apparently a typo). This bug doesn't affect correctness, but it hurts Parquet decimal writing performance. This PR also replaces similar magic numbers with newly defined constants.

Author: Cheng Lian <lian@databricks.com>

Closes #8031 from liancheng/spark-4176/minor-fix-for-writing-decimals and squashes the following commits:

10d4ea3 [Cheng Lian] Should use unscaled Long to write decimals for precision <= 18 rather than 8
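Why 18 is the right threshold: an unscaled decimal value fits in a signed 64-bit Long only up to 18 digits, since Long.MaxValue = 9223372036854775807 has 19 digits and some 19-digit values overflow. A minimal standalone Scala sketch of the boundary (illustrative only, not part of the patch):

    object PrecisionBoundary extends App {
      // Largest unscaled values at precision 18 and 19.
      val max18 = BigInt("9" * 18)  // 999999999999999999
      val max19 = BigInt("9" * 19)  // 9999999999999999999

      println(max18.isValidLong)  // true:  every 18-digit value fits in a Long
      println(max19.isValidLong)  // false: a 19-digit value can overflow
    }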
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala    |  2 +-
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala | 29 +++++++++++++++++------------
 2 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
index 6938b07106..4fe8a39f20 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -264,7 +264,7 @@ private[parquet] class CatalystRowConverter(
       val scale = decimalType.scale
       val bytes = value.getBytes
 
-      if (precision <= 8) {
+      if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) {
         // Constructs a `Decimal` with an unscaled `Long` value if possible.
         var unscaled = 0L
         var i = 0
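The hunk ends where the diff context runs out; the surrounding code folds the big-endian two's-complement bytes of the Parquet binary into that unscaled Long. A hedged sketch of the technique (variable names match the snippet above, but this body is illustrative rather than a verbatim quote of the file):

    // Fold big-endian two's-complement bytes into an unscaled Long.
    // Safe because precision <= 18 implies at most 8 bytes.
    def unscaledLong(bytes: Array[Byte]): Long = {
      var unscaled = 0L
      var i = 0
      while (i < bytes.length) {
        unscaled = (unscaled << 8) | (bytes(i) & 0xff)  // accumulate one byte
        i += 1
      }
      // Sign-extend from the top byte so negative values survive.
      val bits = 8 * bytes.length
      (unscaled << (64 - bits)) >> (64 - bits)
    }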
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index d43ca95b4e..b12149dcf1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -25,6 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
 import org.apache.parquet.schema.Type.Repetition._
 import org.apache.parquet.schema._
 
+import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{AnalysisException, SQLConf}
@@ -155,7 +156,7 @@ private[parquet] class CatalystSchemaConverter(
           case INT_16 => ShortType
           case INT_32 | null => IntegerType
           case DATE => DateType
-          case DECIMAL => makeDecimalType(maxPrecisionForBytes(4))
+          case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT32)
           case TIME_MILLIS => typeNotImplemented()
           case _ => illegalType()
         }
@@ -163,7 +164,7 @@ private[parquet] class CatalystSchemaConverter(
       case INT64 =>
         originalType match {
           case INT_64 | null => LongType
-          case DECIMAL => makeDecimalType(maxPrecisionForBytes(8))
+          case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT64)
           case TIMESTAMP_MILLIS => typeNotImplemented()
           case _ => illegalType()
         }
@@ -405,7 +406,7 @@ private[parquet] class CatalystSchemaConverter(
 
       // Uses INT32 for 1 <= precision <= 9
       case DecimalType.Fixed(precision, scale)
-        if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec =>
+        if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec =>
         Types
           .primitive(INT32, repetition)
           .as(DECIMAL)
@@ -415,7 +416,7 @@ private[parquet] class CatalystSchemaConverter(
 
       // Uses INT64 for 1 <= precision <= 18
      case DecimalType.Fixed(precision, scale)
-        if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec =>
+        if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec =>
         Types
           .primitive(INT64, repetition)
           .as(DECIMAL)
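For a concrete feel of what these builder chains emit, here is a hedged Scala sketch using parquet-mr's Types builder directly (assuming the builder API of that era; "price" and the precision/scale values are made up for illustration):

    import org.apache.parquet.schema.OriginalType.DECIMAL
    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32
    import org.apache.parquet.schema.Type.Repetition.OPTIONAL
    import org.apache.parquet.schema.Types

    // DecimalType(9, 2) is within MAX_PRECISION_FOR_INT32, so it maps to INT32.
    val t = Types.primitive(INT32, OPTIONAL)
      .as(DECIMAL)
      .precision(9)
      .scale(2)
      .named("price")
    println(t)  // roughly: optional int32 price (DECIMAL(9,2))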
@@ -534,14 +535,6 @@ private[parquet] class CatalystSchemaConverter(
         throw new AnalysisException(s"Unsupported data type $field.dataType")
     }
   }
-
-  // Max precision of a decimal value stored in `numBytes` bytes
-  private def maxPrecisionForBytes(numBytes: Int): Int = {
-    Math.round( // convert double to long
-      Math.floor(Math.log10( // number of base-10 digits
-        Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes
-      .asInstanceOf[Int]
-  }
 }
@@ -584,4 +577,16 @@ private[parquet] object CatalystSchemaConverter {
       computeMinBytesForPrecision(precision)
     }
   }
+
+  val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4)
+
+  val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8)
+
+  // Max precision of a decimal value stored in `numBytes` bytes
+  def maxPrecisionForBytes(numBytes: Int): Int = {
+    Math.round( // convert double to long
+      Math.floor(Math.log10( // number of base-10 digits
+        Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes
+      .asInstanceOf[Int]
+  }
 }
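As a sanity check of the relocated helper: it computes floor(log10(2^(8 * numBytes - 1) - 1)), i.e. the number of base-10 digits guaranteed to fit in the largest signed integer of numBytes bytes. A standalone sketch reproducing the values cited in the hunks above:

    object MaxPrecisionCheck extends App {
      def maxPrecisionForBytes(numBytes: Int): Int =
        Math.round(
          Math.floor(Math.log10(
            Math.pow(2, 8 * numBytes - 1) - 1)))
          .asInstanceOf[Int]

      // 4 bytes: Int.MaxValue = 2147483647 has 10 digits, but only all
      // 9-digit values are guaranteed to fit, hence precision 9.
      println(maxPrecisionForBytes(4))  // 9
      // 8 bytes: Long.MaxValue = 9223372036854775807 has 19 digits, so
      // precision 18 is the largest that always fits.
      println(maxPrecisionForBytes(8))  // 18
    }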