aboutsummaryrefslogtreecommitdiff
path: root/unsafe/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'unsafe/src/main')
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java80
1 files changed, 79 insertions, 1 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 9d4998fd48..2561c1c2a1 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -198,7 +198,7 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
*/
public UTF8String substring(final int start, final int until) {
if (until <= start || start >= numBytes) {
- return fromBytes(new byte[0]);
+ return UTF8String.EMPTY_UTF8;
}
int i = 0;
@@ -407,6 +407,84 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
}
/**
+ * Returns the first byte offset at or after `start` where `str` matches, or -1 if none.
+ */
+ private int find(UTF8String str, int start) {
+ assert (str.numBytes > 0);
+ while (start <= numBytes - str.numBytes) {
+ if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
+ return start;
+ }
+ start += 1;
+ }
+ return -1;
+ }
+
+ /**
+ * Returns the last byte offset at or before `start` where `str` matches, or -1 if none.
+ */
+ private int rfind(UTF8String str, int start) {
+ assert (str.numBytes > 0);
+ while (start >= 0) {
+ if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
+ return start;
+ }
+ start -= 1;
+ }
+ return -1;
+ }
+
+ /**
+ * Returns the substring of this string before count occurrences of the delimiter delim.
+ * If count is positive, everything to the left of the final delimiter (counting from the left)
+ * is returned. If count is negative, everything to the right of the final delimiter (counting
+ * from the right) is returned. subStringIndex performs a case-sensitive match for delim.
+ */
+ public UTF8String subStringIndex(UTF8String delim, int count) {
+ if (delim.numBytes == 0 || count == 0) {
+ return EMPTY_UTF8;
+ }
+ if (count > 0) {
+ int idx = -1;
+ while (count > 0) {
+ idx = find(delim, idx + 1);
+ if (idx >= 0) {
+ count --;
+ } else {
+ // fewer occurrences of delim than requested: return the whole string
+ return this;
+ }
+ }
+ if (idx == 0) {
+ return EMPTY_UTF8;
+ }
+ byte[] bytes = new byte[idx];
+ copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, idx);
+ return fromBytes(bytes);
+
+ } else {
+ int idx = numBytes - delim.numBytes + 1;
+ count = -count;
+ while (count > 0) {
+ idx = rfind(delim, idx - 1);
+ if (idx >= 0) {
+ count --;
+ } else {
+ // fewer occurrences of delim than requested: return the whole string
+ return this;
+ }
+ }
+ if (idx + delim.numBytes == numBytes) {
+ return EMPTY_UTF8;
+ }
+ int size = numBytes - delim.numBytes - idx;
+ byte[] bytes = new byte[size];
+ copyMemory(base, offset + idx + delim.numBytes, bytes, BYTE_ARRAY_OFFSET, size);
+ return fromBytes(bytes);
+ }
+ }
+
+ /**
* Returns str, right-padded with pad to a length of len
* For example:
* ('hi', 5, '??') =&gt; 'hi???'