diff options
author | Tarek Auel <tarek.auel@googlemail.com> | 2015-06-20 20:03:59 -0700 |
---|---|---|
committer | Davies Liu <davies@databricks.com> | 2015-06-20 20:03:59 -0700 |
commit | 41ab2853f41de2abc415358b69671f37a0653533 (patch) | |
tree | 37210295ec1c2cd3cb6f81015d09aa0a18d1f949 /sql | |
parent | 004f57374b98c4df32d9f1e19221f68e92639a49 (diff) | |
download | spark-41ab2853f41de2abc415358b69671f37a0653533.tar.gz spark-41ab2853f41de2abc415358b69671f37a0653533.tar.bz2 spark-41ab2853f41de2abc415358b69671f37a0653533.zip |
[SPARK-8301] [SQL] Improve UTF8String substring/startsWith/endsWith/contains performance
Jira: https://issues.apache.org/jira/browse/SPARK-8301
Added the private method startsWith(prefix, offset) to implement startsWith, endsWith and contains without copying the array
I hope that the component SQL is still correct. I copied it from the Jira ticket.
Author: Tarek Auel <tarek.auel@googlemail.com>
Author: Tarek Auel <tarek.auel@gmail.com>
Closes #6804 from tarekauel/SPARK-8301 and squashes the following commits:
f5d6b9a [Tarek Auel] fixed parentheses and annotation
6d7b068 [Tarek Auel] [SPARK-8301] removed null checks
9ca0473 [Tarek Auel] [SPARK-8301] removed null checks
1c327eb [Tarek Auel] [SPARK-8301] removed new
9f17cc8 [Tarek Auel] [SPARK-8301] fixed conversion byte to string in codegen
3a0040f [Tarek Auel] [SPARK-8301] changed call of UTF8String.set to UTF8String.from
e4530d2 [Tarek Auel] [SPARK-8301] changed call of UTF8String.set to UTF8String.from
a5f853a [Tarek Auel] [SPARK-8301] changed visibility of set to protected. Changed annotation of bytes from Nullable to Nonnull
d2fb05f [Tarek Auel] [SPARK-8301] added additional null checks
79cb55b [Tarek Auel] [SPARK-8301] null check. Added test cases for null check.
b17909e [Tarek Auel] [SPARK-8301] removed unnecessary copying of UTF8String. Added a private function startsWith(prefix, offset) to implement the check for startsWith, endsWith and contains.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java | 4 | ||||
-rw-r--r-- | sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala | 6 |
2 files changed, 4 insertions, 6 deletions
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index c4b7f8490a..ed04d2e50e 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -312,7 +312,6 @@ public final class UnsafeRow extends BaseMutableRow { public UTF8String getUTF8String(int i) { assertIndexIsValid(i); - final UTF8String str = new UTF8String(); final long offsetToStringSize = getLong(i); final int stringSizeInBytes = (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize); @@ -324,8 +323,7 @@ public final class UnsafeRow extends BaseMutableRow { PlatformDependent.BYTE_ARRAY_OFFSET, stringSizeInBytes ); - str.set(strBytes); - return str; + return UTF8String.fromBytes(strBytes); } @Override diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index b20086bcc4..ad920f2878 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -438,17 +438,17 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w case (BinaryType, StringType) => defineCodeGen (ctx, ev, c => - s"new ${ctx.stringType}().set($c)") + s"${ctx.stringType}.fromBytes($c)") case (DateType, StringType) => defineCodeGen(ctx, ev, c => - s"""new ${ctx.stringType}().set( + s"""${ctx.stringType}.fromString( org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""") // Special handling required for timestamps in hive test cases since the toString function // does not match the expected output. case (TimestampType, StringType) => super.genCode(ctx, ev) case (_, StringType) => - defineCodeGen(ctx, ev, c => s"new ${ctx.stringType}().set(String.valueOf($c))") + defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))") // fallback for DecimalType, this must be before other numeric types case (_, dt: DecimalType) => |