aboutsummaryrefslogtreecommitdiff
path: root/common
diff options
context:
space:
mode:
authorWenchen Fan <wenchen@databricks.com>2017-01-12 22:52:34 -0800
committergatorsmile <gatorsmile@gmail.com>2017-01-12 22:52:34 -0800
commit6b34e745bb8bdcf5a8bb78359fa39bbe8c6563cc (patch)
treeec818366a8134bbe1a62e0770e712694410d9860 /common
parent7f24a0b6c32c56a38cf879d953bbd523922ab9c9 (diff)
downloadspark-6b34e745bb8bdcf5a8bb78359fa39bbe8c6563cc.tar.gz
spark-6b34e745bb8bdcf5a8bb78359fa39bbe8c6563cc.tar.bz2
spark-6b34e745bb8bdcf5a8bb78359fa39bbe8c6563cc.zip
[SPARK-19178][SQL] convert string of large numbers to int should return null
## What changes were proposed in this pull request? When we convert a string to an integral type, we first convert the string to `decimal(20, 0)`, so that a string in decimal format is truncated to an integral value, e.g. `CAST('1.2' AS int)` returns `1`. However, this causes problems when we convert a string containing a large number to an integral type, e.g. `CAST('1234567890123' AS int)` returns `1912276171`, while Hive returns null, which is the expected result. This is a long-standing bug (it seems to have been there since the first day Spark SQL was created). This PR fixes the bug by adding native support for converting `UTF8String` to integral types. ## How was this patch tested? New regression tests. Author: Wenchen Fan <wenchen@databricks.com> Closes #16550 from cloud-fan/string-to-int.
Diffstat (limited to 'common')
-rw-r--r--common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java184
1 files changed, 184 insertions, 0 deletions
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 0255f53113..3800d53c02 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -835,6 +835,190 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
return fromString(sb.toString());
}
+ private int getDigit(byte b) {
+ if (b >= '0' && b <= '9') {
+ return b - '0';
+ }
+ throw new NumberFormatException(toString());
+ }
+
+ /**
+  * Parses this UTF8String to long.
+  *
+  * A leading '+' or '-' sign is accepted. A decimal separator '.' is accepted as well: the bytes
+  * after it must all be digits, but they do not contribute to the result, so the value is
+  * truncated toward zero.
+  *
+  * Note that, in this method we accumulate the result in negative format, and convert it to
+  * positive format at the end, if this string is not started with '-'. This is because the
+  * magnitude of the min value is bigger than the max value, e.g. Long.MAX_VALUE is
+  * '9223372036854775807' and Long.MIN_VALUE is '-9223372036854775808'.
+  *
+  * This code is mostly copied from LazyLong.parseLong in Hive.
+  *
+  * @return the parsed long value
+  * @throws NumberFormatException if this string is empty, consists only of a sign, contains a
+  *         byte that is neither a digit nor the first '.', or does not fit in a long
+  */
+ public long toLong() {
+   if (numBytes == 0) {
+     throw new NumberFormatException("Empty string");
+   }
+
+   byte b = getByte(0);
+   final boolean negative = b == '-';
+   int offset = 0;
+   if (negative || b == '+') {
+     offset++;
+     // A sign with nothing after it is not a number.
+     if (numBytes == 1) {
+       throw new NumberFormatException(toString());
+     }
+   }
+
+   final byte separator = '.';
+   final int radix = 10;
+   final long stopValue = Long.MIN_VALUE / radix;
+   long result = 0;
+
+   while (offset < numBytes) {
+     b = getByte(offset);
+     offset++;
+     if (b == separator) {
+       // We allow decimals and will return a truncated integral in that case.
+       // Therefore we won't throw an exception here (checking the fractional
+       // part happens below.)
+       break;
+     }
+
+     int digit = getDigit(b);
+     // We are going to process the new digit and accumulate the result. However, before doing
+     // this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
+     // result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
+     if (result < stopValue) {
+       throw new NumberFormatException(toString());
+     }
+
+     result = result * radix - digit;
+     // Since the previous result is greater than or equal to stopValue(Long.MIN_VALUE / radix),
+     // the subtraction can wrap around at most once, and a wrapped value is always positive. A
+     // valid accumulated value is never positive, so `result > 0` detects overflow.
+     if (result > 0) {
+       throw new NumberFormatException(toString());
+     }
+   }
+
+   // This is the case when we've encountered a decimal separator. The fractional
+   // part will not change the number, but we will verify that the fractional part
+   // is well formed.
+   while (offset < numBytes) {
+     // getDigit throws NumberFormatException for any non-digit byte; its value is not needed.
+     getDigit(getByte(offset));
+     offset++;
+   }
+
+   if (!negative) {
+     result = -result;
+     // Negating Long.MIN_VALUE yields Long.MIN_VALUE again, so a negative value here means the
+     // magnitude is one past Long.MAX_VALUE.
+     if (result < 0) {
+       throw new NumberFormatException(toString());
+     }
+   }
+
+   return result;
+ }
+
+ /**
+  * Parses this UTF8String to int.
+  *
+  * A leading '+' or '-' sign is accepted. A decimal separator '.' is accepted as well: the bytes
+  * after it must all be digits, but they do not contribute to the result, so the value is
+  * truncated toward zero.
+  *
+  * Note that, in this method we accumulate the result in negative format, and convert it to
+  * positive format at the end, if this string is not started with '-'. This is because the
+  * magnitude of the min value is bigger than the max value, e.g. Integer.MAX_VALUE is
+  * '2147483647' and Integer.MIN_VALUE is '-2147483648'.
+  *
+  * This code is mostly copied from LazyInt.parseInt in Hive.
+  *
+  * Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
+  * reasons, like Hive does.
+  *
+  * @return the parsed int value
+  * @throws NumberFormatException if this string is empty, consists only of a sign, contains a
+  *         byte that is neither a digit nor the first '.', or does not fit in an int
+  */
+ public int toInt() {
+   if (numBytes == 0) {
+     throw new NumberFormatException("Empty string");
+   }
+
+   byte b = getByte(0);
+   final boolean negative = b == '-';
+   int offset = 0;
+   if (negative || b == '+') {
+     offset++;
+     // A sign with nothing after it is not a number.
+     if (numBytes == 1) {
+       throw new NumberFormatException(toString());
+     }
+   }
+
+   final byte separator = '.';
+   final int radix = 10;
+   final int stopValue = Integer.MIN_VALUE / radix;
+   int result = 0;
+
+   while (offset < numBytes) {
+     b = getByte(offset);
+     offset++;
+     if (b == separator) {
+       // We allow decimals and will return a truncated integral in that case.
+       // Therefore we won't throw an exception here (checking the fractional
+       // part happens below.)
+       break;
+     }
+
+     int digit = getDigit(b);
+     // We are going to process the new digit and accumulate the result. However, before doing
+     // this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
+     // result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
+     if (result < stopValue) {
+       throw new NumberFormatException(toString());
+     }
+
+     result = result * radix - digit;
+     // Since the previous result is greater than or equal to stopValue(Integer.MIN_VALUE / radix),
+     // the subtraction can wrap around at most once, and a wrapped value is always positive. A
+     // valid accumulated value is never positive, so `result > 0` detects overflow.
+     if (result > 0) {
+       throw new NumberFormatException(toString());
+     }
+   }
+
+   // This is the case when we've encountered a decimal separator. The fractional
+   // part will not change the number, but we will verify that the fractional part
+   // is well formed.
+   while (offset < numBytes) {
+     // getDigit throws NumberFormatException for any non-digit byte; its value is not needed.
+     getDigit(getByte(offset));
+     offset++;
+   }
+
+   if (!negative) {
+     result = -result;
+     // Negating Integer.MIN_VALUE yields Integer.MIN_VALUE again, so a negative value here means
+     // the magnitude is one past Integer.MAX_VALUE.
+     if (result < 0) {
+       throw new NumberFormatException(toString());
+     }
+   }
+
+   return result;
+ }
+
+ /**
+  * Parses this UTF8String to short by parsing it as an int and narrowing the result.
+  *
+  * @throws NumberFormatException if `toInt` fails, or if the parsed value lies outside the
+  *         range of short
+  */
+ public short toShort() {
+   final int parsed = toInt();
+   if (parsed < Short.MIN_VALUE || parsed > Short.MAX_VALUE) {
+     throw new NumberFormatException(toString());
+   }
+   return (short) parsed;
+ }
+
+ /**
+  * Parses this UTF8String to byte by parsing it as an int and narrowing the result.
+  *
+  * @throws NumberFormatException if `toInt` fails, or if the parsed value lies outside the
+  *         range of byte
+  */
+ public byte toByte() {
+   final int parsed = toInt();
+   if (parsed < Byte.MIN_VALUE || parsed > Byte.MAX_VALUE) {
+     throw new NumberFormatException(toString());
+   }
+   return (byte) parsed;
+ }
+
@Override
public String toString() {
return new String(getBytes(), StandardCharsets.UTF_8);