From a133057ce5817f834babe9f25023092aec3c321d Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Thu, 25 Aug 2016 23:22:40 +0200
Subject: [SPARK-17229][SQL] PostgresDialect shouldn't widen float and short
 types during reads

## What changes were proposed in this pull request?

When reading float4 and smallint columns from PostgreSQL, Spark's `PostgresDialect` widens these types to Decimal and Integer rather than using the narrower Float and Short types. According to https://www.postgresql.org/docs/7.1/static/datatype.html#DATATYPE-TABLE, Postgres maps the `smallint` type to a signed two-byte integer and the `real` / `float4` types to single-precision floating-point numbers.

This patch fixes the issue by adding more special cases to `getCatalystType`, similar to what was done for the Derby JDBC dialect. I also fixed a similar problem in the write path, which caused Spark to create integer columns in Postgres for what should have been ShortType columns.

## How was this patch tested?

New test cases in `PostgresIntegrationSuite` (which I ran manually because Jenkins can't run it right now).

Author: Josh Rosen

Closes #14796 from JoshRosen/postgres-jdbc-type-fixes.
---
 .../org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala  | 4 ++++
 .../src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)
(limited to 'sql')

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala
index 6dad8cbef7..8d9048ab82 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala
@@ -390,6 +390,10 @@ private[jdbc] class JDBCRDD(
       (rs: ResultSet, row: MutableRow, pos: Int) =>
         row.setLong(pos, rs.getLong(pos + 1))
 
+    case ShortType =>
+      (rs: ResultSet, row: MutableRow, pos: Int) =>
+        row.setShort(pos, rs.getShort(pos + 1))
+
     case StringType =>
       (rs: ResultSet, row: MutableRow, pos: Int) =>
         // TODO(davies): use getBytes for better performance, if the encoding is UTF-8
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala
index fb959d881e..3f540d6258 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala
@@ -29,7 +29,11 @@ private object PostgresDialect extends JdbcDialect {
 
   override def getCatalystType(
       sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
-    if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
+    if (sqlType == Types.REAL) {
+      Some(FloatType)
+    } else if (sqlType == Types.SMALLINT) {
+      Some(ShortType)
+    } else if (sqlType == Types.BIT && typeName.equals("bit") && size != 1) {
       Some(BinaryType)
     } else if (sqlType == Types.OTHER) {
       Some(StringType)
@@ -66,6 +70,7 @@ private object PostgresDialect extends JdbcDialect {
     case BooleanType => Some(JdbcType("BOOLEAN", Types.BOOLEAN))
     case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT))
     case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE))
+    case ShortType => Some(JdbcType("SMALLINT", Types.SMALLINT))
     case t: DecimalType => Some(
       JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC))
     case ArrayType(et, _) if et.isInstanceOf[AtomicType] =>
-- 
cgit v1.2.3
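
For readers who want to see the behavior change from the user's side, here is a minimal sketch against a local Postgres instance. The JDBC URL, credentials, and the `numeric_types` table and column names are illustrative assumptions, not part of this patch; the real coverage lives in `PostgresIntegrationSuite`.

```scala
import java.util.Properties

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{FloatType, ShortType}

object PostgresTypeMappingCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("PostgresTypeMappingCheck").getOrCreate()

    // Placeholder connection details -- adjust for a real Postgres instance.
    val url = "jdbc:postgresql://localhost:5432/testdb"
    val props = new Properties()
    props.setProperty("user", "test")

    // Assumes a table created with: CREATE TABLE numeric_types (f float4, s smallint)
    val df = spark.read.jdbc(url, "numeric_types", props)

    // With this patch, float4 is read back as FloatType and smallint as
    // ShortType; previously they were widened to Decimal and Integer.
    assert(df.schema("f").dataType == FloatType)
    assert(df.schema("s").dataType == ShortType)

    // The write path is fixed symmetrically: the ShortType column is created
    // as SMALLINT in Postgres rather than INTEGER.
    df.write.jdbc(url, "numeric_types_copy", props)

    spark.stop()
  }
}
```

Reading the table back through `spark.read.jdbc` is enough to observe the fix: the schema now reports the narrow types directly, so downstream code no longer pays for Decimal arithmetic on what is really a four-byte float.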