From 560b355ccd038ca044726c9c9fcffd14d02e6696 Mon Sep 17 00:00:00 2001
From: Tarek Auel
Date: Mon, 20 Jul 2015 22:43:30 -0700
Subject: [SPARK-9157] [SQL] codegen substring

https://issues.apache.org/jira/browse/SPARK-9157

Author: Tarek Auel

Closes #7534 from tarekauel/SPARK-9157 and squashes the following commits:

e65e3e9 [Tarek Auel] [SPARK-9157] indent fix
44e89f8 [Tarek Auel] [SPARK-9157] use EMPTY_UTF8
37d54c4 [Tarek Auel] Merge branch 'master' into SPARK-9157
60732ea [Tarek Auel] [SPARK-9157] created substringSQL in UTF8String
18c3576 [Tarek Auel] [SPARK-9157][SQL] remove slice pos
1a2e611 [Tarek Auel] [SPARK-9157][SQL] codegen substring
---
 .../org/apache/spark/unsafe/types/UTF8String.java  | 28 ++++++++++++++++++++++++++++
 .../apache/spark/unsafe/types/UTF8StringSuite.java | 21 +++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'unsafe')

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index ed354f7f87..946d355f1f 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -165,6 +165,34 @@ public final class UTF8String implements Comparable, Serializable {
     return fromBytes(bytes);
   }
 
+  /**
+   * Returns the substring selected by SQL-style (one-based) {@code pos} and {@code length}.
+   *
+   * @param pos start position: positive values are one-based, 0 means the first character,
+   *            and negative values count back from the end of the string
+   * @param length number of characters to take from the resolved start position
+   * @return {@code substring(start, end)} for the resolved zero-based, end-exclusive range
+   */
+  public UTF8String substringSQL(int pos, int length) {
+    // Information regarding the pos calculation:
+    // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
+    // negative indices for start positions. If a start index i is greater than 0, it
+    // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
+    // to the -ith element before the end of the sequence. If a start index i is 0, it
+    // refers to the first element.
+    int start = (pos > 0) ? pos - 1 : ((pos < 0) ? numChars() + pos : 0);
+    int end;
+    if (length == Integer.MAX_VALUE) {
+      end = Integer.MAX_VALUE;
+    } else {
+      // Add in 64-bit arithmetic and clamp to the int range: `start + length` computed as
+      // an int wraps around for large magnitudes and would select the wrong range.
+      long unclamped = (long) start + length;
+      end = (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, unclamped));
+    }
+    return substring(start, end);
+  }
+
   /**
    * Returns whether this contains `substring` or not.
    */
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 1f5572c509..e2a5628ff4 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -272,6 +272,27 @@ public class UTF8StringSuite {
       fromString("数据砖头").rpad(12, fromString("孙行者")));
   }
 
+  @Test
+  public void substringSQL() {
+    UTF8String e = fromString("example");
+    assertEquals(fromString("ex"), e.substringSQL(0, 2));
+    assertEquals(fromString("ex"), e.substringSQL(1, 2));
+    assertEquals(fromString("example"), e.substringSQL(0, 7));
+    assertEquals(fromString("pl"), e.substringSQL(-3, 2));
+    assertEquals(fromString("example"), e.substringSQL(0, 100));
+    assertEquals(fromString("example"), e.substringSQL(1, 100));
+    assertEquals(fromString("xa"), e.substringSQL(2, 2));
+    assertEquals(fromString("exampl"), e.substringSQL(1, 6));
+    assertEquals(fromString("xample"), e.substringSQL(2, 100));
+    assertEquals(fromString(""), e.substringSQL(0, 0));
+    assertEquals(EMPTY_UTF8, e.substringSQL(100, 4));
+    assertEquals(fromString("example"), e.substringSQL(0, Integer.MAX_VALUE));
+    assertEquals(fromString("example"), e.substringSQL(1, Integer.MAX_VALUE));
+    assertEquals(fromString("xample"), e.substringSQL(2, Integer.MAX_VALUE));
+    // A large length other than Integer.MAX_VALUE must not overflow the end index.
+    assertEquals(fromString("ample"), e.substringSQL(3, Integer.MAX_VALUE - 1));
+  }
+
   @Test
   public void split() {
     assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),
-- 
cgit v1.2.3