aboutsummaryrefslogtreecommitdiff
path: root/common
diff options
context:
space:
mode:
authorSandeep Singh <sandeep@techaddict.me>2016-09-06 22:18:28 +0100
committerSean Owen <sowen@cloudera.com>2016-09-06 22:18:28 +0100
commit7775d9f224e22400c6c8c093652a383f4af66ee0 (patch)
tree183e806ab3805be973e9a2700dc823bad85d3bb2 /common
parent6c08dbf683875ff1ba724447e0531f673bcff8ba (diff)
downloadspark-7775d9f224e22400c6c8c093652a383f4af66ee0.tar.gz
spark-7775d9f224e22400c6c8c093652a383f4af66ee0.tar.bz2
spark-7775d9f224e22400c6c8c093652a383f4af66ee0.zip
[SPARK-17299] TRIM/LTRIM/RTRIM should not strip characters other than spaces
## What changes were proposed in this pull request? TRIM/LTRIM/RTRIM should not strip characters other than spaces; previously we were trimming every character less than or equal to ASCII 0x20 (space), which also removed control characters. ## How was this patch tested? Fixed existing tests. Author: Sandeep Singh <sandeep@techaddict.me> Closes #14924 from techaddict/SPARK-17299.
Diffstat (limited to 'common')
-rw-r--r--common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java8
-rw-r--r--common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java10
-rw-r--r--common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala8
3 files changed, 18 insertions, 8 deletions
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index dc03d893a5..e09a6b7d93 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -465,9 +465,9 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
int s = 0;
int e = this.numBytes - 1;
// skip all of the space (0x20) in the left side
- while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
+ while (s < this.numBytes && getByte(s) == 0x20) s++;
// skip all of the space (0x20) in the right side
- while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
+ while (e >= 0 && getByte(e) == 0x20) e--;
if (s > e) {
// empty string
return EMPTY_UTF8;
@@ -479,7 +479,7 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trimLeft() {
int s = 0;
// skip all of the space (0x20) in the left side
- while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
+ while (s < this.numBytes && getByte(s) == 0x20) s++;
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
@@ -491,7 +491,7 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trimRight() {
int e = numBytes - 1;
// skip all of the space (0x20) in the right side
- while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
+ while (e >= 0 && getByte(e) == 0x20) e--;
if (e < 0) {
// empty string
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index d4160ad029..7f03686dce 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -232,6 +232,16 @@ public class UTF8StringSuite {
assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
+
+ char[] charsLessThan0x20 = new char[10];
+ Arrays.fill(charsLessThan0x20, (char)(' ' - 1));
+ String stringStartingWithSpace =
+ new String(charsLessThan0x20) + "hello" + new String(charsLessThan0x20);
+ assertEquals(fromString(stringStartingWithSpace), fromString(stringStartingWithSpace).trim());
+ assertEquals(fromString(stringStartingWithSpace),
+ fromString(stringStartingWithSpace).trimLeft());
+ assertEquals(fromString(stringStartingWithSpace),
+ fromString(stringStartingWithSpace).trimRight());
}
@Test
diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
index 8a6b9e3e45..62d4176d00 100644
--- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
+++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
@@ -98,7 +98,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
}
}
- val whitespaceChar: Gen[Char] = Gen.choose(0x00, 0x20).map(_.toChar)
+ val whitespaceChar: Gen[Char] = Gen.const(0x20.toChar)
val whitespaceString: Gen[String] = Gen.listOf(whitespaceChar).map(_.mkString)
val randomString: Gen[String] = Arbitrary.arbString.arbitrary
@@ -107,7 +107,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
def lTrim(s: String): String = {
var st = 0
val array: Array[Char] = s.toCharArray
- while ((st < s.length) && (array(st) <= ' ')) {
+ while ((st < s.length) && (array(st) == ' ')) {
st += 1
}
if (st > 0) s.substring(st, s.length) else s
@@ -115,7 +115,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
def rTrim(s: String): String = {
var len = s.length
val array: Array[Char] = s.toCharArray
- while ((len > 0) && (array(len - 1) <= ' ')) {
+ while ((len > 0) && (array(len - 1) == ' ')) {
len -= 1
}
if (len < s.length) s.substring(0, len) else s
@@ -127,7 +127,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
whitespaceString
) { (start: String, middle: String, end: String) =>
val s = start + middle + end
- assert(toUTF8(s).trim() === toUTF8(s.trim()))
+ assert(toUTF8(s).trim() === toUTF8(rTrim(lTrim(s))))
assert(toUTF8(s).trimLeft() === toUTF8(lTrim(s)))
assert(toUTF8(s).trimRight() === toUTF8(rTrim(s)))
}