From c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce Mon Sep 17 00:00:00 2001 From: "zhichao.li" Date: Sat, 1 Aug 2015 08:48:46 -0700 Subject: [SPARK-8263] [SQL] substr/substring should also support binary type This is based on #7641, thanks to zhichao-li Closes #7641 Author: zhichao.li Author: Davies Liu Closes #7848 from davies/substr and squashes the following commits: 461b709 [Davies Liu] remove bytearry from tests b45377a [Davies Liu] Merge branch 'master' of github.com:apache/spark into substr 01d795e [zhichao.li] scala style 99aa130 [zhichao.li] add substring to dataframe 4f68bfe [zhichao.li] add binary type support for substring --- .../java/org/apache/spark/unsafe/types/UTF8String.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'unsafe/src') diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index f6dafe94c6..208503d2fd 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -198,7 +198,7 @@ public final class UTF8String implements Comparable, Serializable { */ public UTF8String substring(final int start, final int until) { if (until <= start || start >= numBytes) { - return UTF8String.EMPTY_UTF8; + return EMPTY_UTF8; } int i = 0; @@ -214,9 +214,13 @@ public final class UTF8String implements Comparable, Serializable { c += 1; } - byte[] bytes = new byte[i - j]; - copyMemory(base, offset + j, bytes, BYTE_ARRAY_OFFSET, i - j); - return fromBytes(bytes); + if (i > j) { + byte[] bytes = new byte[i - j]; + copyMemory(base, offset + j, bytes, BYTE_ARRAY_OFFSET, i - j); + return fromBytes(bytes); + } else { + return EMPTY_UTF8; + } } public UTF8String substringSQL(int pos, int length) { @@ -226,8 +230,9 @@ public final class UTF8String implements Comparable, Serializable { // refers to element i-1 in the sequence. If a start index i is less than 0, it refers // to the -ith element before the end of the sequence. If a start index i is 0, it // refers to the first element. - int start = (pos > 0) ? pos -1 : ((pos < 0) ? numChars() + pos : 0); - int end = (length == Integer.MAX_VALUE) ? Integer.MAX_VALUE : start + length; + int len = numChars(); + int start = (pos > 0) ? pos -1 : ((pos < 0) ? len + pos : 0); + int end = (length == Integer.MAX_VALUE) ? len : start + length; return substring(start, end); } -- cgit v1.2.3