aboutsummaryrefslogtreecommitdiff
path: root/unsafe
diff options
context:
space:
mode:
authorCheng Hao <hao.cheng@intel.com>2015-07-09 11:11:34 -0700
committerDavies Liu <davies.liu@gmail.com>2015-07-09 11:11:34 -0700
commit0b0b9ceaf73de472198c9804fb7ae61fa2a2e097 (patch)
treee6e86d9c5921fdd26a1393beffa3e1b7bc6f2504 /unsafe
parent0cd84c86cac68600a74d84e50ad40c0c8b84822a (diff)
downloadspark-0b0b9ceaf73de472198c9804fb7ae61fa2a2e097.tar.gz
spark-0b0b9ceaf73de472198c9804fb7ae61fa2a2e097.tar.bz2
spark-0b0b9ceaf73de472198c9804fb7ae61fa2a2e097.zip
[SPARK-8247] [SPARK-8249] [SPARK-8252] [SPARK-8254] [SPARK-8257] [SPARK-8258] [SPARK-8259] [SPARK-8261] [SPARK-8262] [SPARK-8253] [SPARK-8260] [SPARK-8267] [SQL] Add String Expressions
Author: Cheng Hao <hao.cheng@intel.com> Closes #6762 from chenghao-intel/str_funcs and squashes the following commits: b09a909 [Cheng Hao] update the code as feedback 7ebbf4c [Cheng Hao] Add more string expressions
Diffstat (limited to 'unsafe')
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java191
-rw-r--r--unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java94
2 files changed, 279 insertions, 6 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 847d80ad58..60d050b0a0 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -25,6 +25,7 @@ import org.apache.spark.unsafe.array.ByteArrayMethods;
import static org.apache.spark.unsafe.PlatformDependent.*;
+
/**
* A UTF-8 String for internal Spark use.
* <p>
@@ -204,6 +205,196 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
return fromString(toString().toLowerCase());
}
+ /**
+ * Copy the bytes from the current UTF8String, and make a new UTF8String.
+ * @param start the start position of the current UTF8String in bytes.
+ * @param end the end position of the current UTF8String in bytes.
+ * @return a new UTF8String in the position of [start, end] of current UTF8String bytes.
+ */
+ private UTF8String copyUTF8String(int start, int end) {
+ int len = end - start + 1;
+ byte[] newBytes = new byte[len];
+ copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len);
+ return UTF8String.fromBytes(newBytes);
+ }
+
+ public UTF8String trim() {
+ int s = 0;
+ int e = this.numBytes - 1;
+ // skip all of the space (0x20) in the left side
+ while (s < this.numBytes && getByte(s) == 0x20) s++;
+ // skip all of the space (0x20) in the right side
+ while (e >= 0 && getByte(e) == 0x20) e--;
+
+ if (s > e) {
+ // empty string
+ return UTF8String.fromBytes(new byte[0]);
+ } else {
+ return copyUTF8String(s, e);
+ }
+ }
+
+ public UTF8String trimLeft() {
+ int s = 0;
+ // skip all of the space (0x20) in the left side
+ while (s < this.numBytes && getByte(s) == 0x20) s++;
+ if (s == this.numBytes) {
+ // empty string
+ return UTF8String.fromBytes(new byte[0]);
+ } else {
+ return copyUTF8String(s, this.numBytes - 1);
+ }
+ }
+
+ public UTF8String trimRight() {
+ int e = numBytes - 1;
+ // skip all of the space (0x20) in the right side
+ while (e >= 0 && getByte(e) == 0x20) e--;
+
+ if (e < 0) {
+ // empty string
+ return UTF8String.fromBytes(new byte[0]);
+ } else {
+ return copyUTF8String(0, e);
+ }
+ }
+
+ public UTF8String reverse() {
+ byte[] bytes = getBytes();
+ byte[] result = new byte[bytes.length];
+
+ int i = 0; // position in byte
+ while (i < numBytes) {
+ int len = numBytesForFirstByte(getByte(i));
+ System.arraycopy(bytes, i, result, result.length - i - len, len);
+
+ i += len;
+ }
+
+ return UTF8String.fromBytes(result);
+ }
+
+ public UTF8String repeat(int times) {
+ if (times <=0) {
+ return fromBytes(new byte[0]);
+ }
+
+ byte[] newBytes = new byte[numBytes * times];
+ System.arraycopy(getBytes(), 0, newBytes, 0, numBytes);
+
+ int copied = 1;
+ while (copied < times) {
+ int toCopy = Math.min(copied, times - copied);
+ System.arraycopy(newBytes, 0, newBytes, copied * numBytes, numBytes * toCopy);
+ copied += toCopy;
+ }
+
+ return UTF8String.fromBytes(newBytes);
+ }
+
+ /**
+ * Returns the position of the first occurrence of substr in
+ * current string from the specified position (0-based index).
+ *
+ * @param v the string to be searched
+ * @param start the start position of the current string for searching
+ * @return the position of the first occurrence of substr, if not found, -1 returned.
+ */
+ public int indexOf(UTF8String v, int start) {
+ if (v.numBytes() == 0) {
+ return 0;
+ }
+
+ // locate to the start position.
+ int i = 0; // position in byte
+ int c = 0; // position in character
+ while (i < numBytes && c < start) {
+ i += numBytesForFirstByte(getByte(i));
+ c += 1;
+ }
+
+ do {
+ if (i + v.numBytes > numBytes) {
+ return -1;
+ }
+ if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) {
+ return c;
+ }
+ i += numBytesForFirstByte(getByte(i));
+ c += 1;
+ } while(i < numBytes);
+
+ return -1;
+ }
+
+ /**
+ * Returns str, right-padded with pad to a length of len
+ * For example:
+ * ('hi', 5, '??') => 'hi???'
+ * ('hi', 1, '??') => 'h'
+ */
+ public UTF8String rpad(int len, UTF8String pad) {
+ int spaces = len - this.numChars(); // number of char need to pad
+ if (spaces <= 0) {
+ // no padding at all, return the substring of the current string
+ return substring(0, len);
+ } else {
+ int padChars = pad.numChars();
+ int count = spaces / padChars; // how many padding string needed
+ // the partial string of the padding
+ UTF8String remain = pad.substring(0, spaces - padChars * count);
+
+ byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
+ System.arraycopy(getBytes(), 0, data, 0, this.numBytes);
+ int offset = this.numBytes;
+ int idx = 0;
+ byte[] padBytes = pad.getBytes();
+ while (idx < count) {
+ System.arraycopy(padBytes, 0, data, offset, pad.numBytes);
+ ++idx;
+ offset += pad.numBytes;
+ }
+ System.arraycopy(remain.getBytes(), 0, data, offset, remain.numBytes);
+
+ return UTF8String.fromBytes(data);
+ }
+ }
+
+ /**
+ * Returns str, left-padded with pad to a length of len.
+ * For example:
+ * ('hi', 5, '??') => '???hi'
+ * ('hi', 1, '??') => 'h'
+ */
+ public UTF8String lpad(int len, UTF8String pad) {
+ int spaces = len - this.numChars(); // number of char need to pad
+ if (spaces <= 0) {
+ // no padding at all, return the substring of the current string
+ return substring(0, len);
+ } else {
+ int padChars = pad.numChars();
+ int count = spaces / padChars; // how many padding string needed
+ // the partial string of the padding
+ UTF8String remain = pad.substring(0, spaces - padChars * count);
+
+ byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
+
+ int offset = 0;
+ int idx = 0;
+ byte[] padBytes = pad.getBytes();
+ while (idx < count) {
+ System.arraycopy(padBytes, 0, data, offset, pad.numBytes);
+ ++idx;
+ offset += pad.numBytes;
+ }
+ System.arraycopy(remain.getBytes(), 0, data, offset, remain.numBytes);
+ offset += remain.numBytes;
+ System.arraycopy(getBytes(), 0, data, offset, numBytes());
+
+ return UTF8String.fromBytes(data);
+ }
+ }
+
@Override
public String toString() {
try {
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index fb463ba17f..694bdc29f3 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -121,12 +121,94 @@ public class UTF8StringSuite {
@Test
public void substring() {
- assertEquals(fromString("hello").substring(0, 0), fromString(""));
- assertEquals(fromString("hello").substring(1, 3), fromString("el"));
- assertEquals(fromString("数据砖头").substring(0, 1), fromString("数"));
- assertEquals(fromString("数据砖头").substring(1, 3), fromString("据砖"));
- assertEquals(fromString("数据砖头").substring(3, 5), fromString("头"));
- assertEquals(fromString("ߵ梷").substring(0, 2), fromString("ߵ梷"));
+ assertEquals(fromString(""), fromString("hello").substring(0, 0));
+ assertEquals(fromString("el"), fromString("hello").substring(1, 3));
+ assertEquals(fromString("数"), fromString("数据砖头").substring(0, 1));
+ assertEquals(fromString("据砖"), fromString("数据砖头").substring(1, 3));
+ assertEquals(fromString("头"), fromString("数据砖头").substring(3, 5));
+ assertEquals(fromString("ߵ梷"), fromString("ߵ梷").substring(0, 2));
+ }
+
+ @Test
+ public void trims() {
+ assertEquals(fromString("hello"), fromString(" hello ").trim());
+ assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
+ assertEquals(fromString(" hello"), fromString(" hello ").trimRight());
+
+ assertEquals(fromString(""), fromString(" ").trim());
+ assertEquals(fromString(""), fromString(" ").trimLeft());
+ assertEquals(fromString(""), fromString(" ").trimRight());
+
+ assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim());
+ assertEquals(fromString("数据砖头 "), fromString(" 数据砖头 ").trimLeft());
+ assertEquals(fromString(" 数据砖头"), fromString(" 数据砖头 ").trimRight());
+
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
+ }
+
+ @Test
+ public void indexOf() {
+ assertEquals(0, fromString("").indexOf(fromString(""), 0));
+ assertEquals(-1, fromString("").indexOf(fromString("l"), 0));
+ assertEquals(0, fromString("hello").indexOf(fromString(""), 0));
+ assertEquals(2, fromString("hello").indexOf(fromString("l"), 0));
+ assertEquals(3, fromString("hello").indexOf(fromString("l"), 3));
+ assertEquals(-1, fromString("hello").indexOf(fromString("a"), 0));
+ assertEquals(2, fromString("hello").indexOf(fromString("ll"), 0));
+ assertEquals(-1, fromString("hello").indexOf(fromString("ll"), 4));
+ assertEquals(1, fromString("数据砖头").indexOf(fromString("据砖"), 0));
+ assertEquals(-1, fromString("数据砖头").indexOf(fromString("数"), 3));
+ assertEquals(0, fromString("数据砖头").indexOf(fromString("数"), 0));
+ assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
+ }
+
+ @Test
+ public void reverse() {
+ assertEquals(fromString("olleh"), fromString("hello").reverse());
+ assertEquals(fromString(""), fromString("").reverse());
+ assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
+ assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
+ }
+
+ @Test
+ public void repeat() {
+ assertEquals(fromString("数d数d数d数d数d"), fromString("数d").repeat(5));
+ assertEquals(fromString("数d"), fromString("数d").repeat(1));
+ assertEquals(fromString(""), fromString("数d").repeat(-1));
+ }
+
+ @Test
+ public void pad() {
+ assertEquals(fromString("hel"), fromString("hello").lpad(3, fromString("????")));
+ assertEquals(fromString("hello"), fromString("hello").lpad(5, fromString("????")));
+ assertEquals(fromString("?hello"), fromString("hello").lpad(6, fromString("????")));
+ assertEquals(fromString("???????hello"), fromString("hello").lpad(12, fromString("????")));
+ assertEquals(fromString("?????hello"), fromString("hello").lpad(10, fromString("?????")));
+ assertEquals(fromString("???????"), fromString("").lpad(7, fromString("?????")));
+
+ assertEquals(fromString("hel"), fromString("hello").rpad(3, fromString("????")));
+ assertEquals(fromString("hello"), fromString("hello").rpad(5, fromString("????")));
+ assertEquals(fromString("hello?"), fromString("hello").rpad(6, fromString("????")));
+ assertEquals(fromString("hello???????"), fromString("hello").rpad(12, fromString("????")));
+ assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
+ assertEquals(fromString("???????"), fromString("").rpad(7, fromString("?????")));
+
+
+ assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
+ assertEquals(fromString("?数据砖头"), fromString("数据砖头").lpad(5, fromString("????")));
+ assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
+ assertEquals(fromString("孙行数据砖头"), fromString("数据砖头").lpad(6, fromString("孙行者")));
+ assertEquals(fromString("孙行者数据砖头"), fromString("数据砖头").lpad(7, fromString("孙行者")));
+ assertEquals(fromString("孙行者孙行者孙行数据砖头"), fromString("数据砖头").lpad(12, fromString("孙行者")));
+
+ assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, fromString("????")));
+ assertEquals(fromString("数据砖头?"), fromString("数据砖头").rpad(5, fromString("????")));
+ assertEquals(fromString("数据砖头??"), fromString("数据砖头").rpad(6, fromString("????")));
+ assertEquals(fromString("数据砖头孙行"), fromString("数据砖头").rpad(6, fromString("孙行者")));
+ assertEquals(fromString("数据砖头孙行者"), fromString("数据砖头").rpad(7, fromString("孙行者")));
+ assertEquals(fromString("数据砖头孙行者孙行者孙行"), fromString("数据砖头").rpad(12, fromString("孙行者")));
}
@Test