aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
author: Tarek Auel <tarek.auel@googlemail.com> 2015-06-20 20:03:59 -0700
committer: Davies Liu <davies@databricks.com> 2015-06-20 20:03:59 -0700
commit: 41ab2853f41de2abc415358b69671f37a0653533 (patch)
tree: 37210295ec1c2cd3cb6f81015d09aa0a18d1f949 /sql
parent: 004f57374b98c4df32d9f1e19221f68e92639a49 (diff)
download: spark-41ab2853f41de2abc415358b69671f37a0653533.tar.gz
spark-41ab2853f41de2abc415358b69671f37a0653533.tar.bz2
spark-41ab2853f41de2abc415358b69671f37a0653533.zip
[SPARK-8301] [SQL] Improve UTF8String substring/startsWith/endsWith/contains performance
Jira: https://issues.apache.org/jira/browse/SPARK-8301

Added the private method startsWith(prefix, offset) to implement startsWith, endsWith and contains without copying the array.

I hope that the component SQL is still correct. I copied it from the Jira ticket.

Author: Tarek Auel <tarek.auel@googlemail.com>
Author: Tarek Auel <tarek.auel@gmail.com>

Closes #6804 from tarekauel/SPARK-8301 and squashes the following commits:

f5d6b9a [Tarek Auel] fixed parentheses and annotation
6d7b068 [Tarek Auel] [SPARK-8301] removed null checks
9ca0473 [Tarek Auel] [SPARK-8301] removed null checks
1c327eb [Tarek Auel] [SPARK-8301] removed new
9f17cc8 [Tarek Auel] [SPARK-8301] fixed conversion byte to string in codegen
3a0040f [Tarek Auel] [SPARK-8301] changed call of UTF8String.set to UTF8String.from
e4530d2 [Tarek Auel] [SPARK-8301] changed call of UTF8String.set to UTF8String.from
a5f853a [Tarek Auel] [SPARK-8301] changed visibility of set to protected. Changed annotation of bytes from Nullable to Nonnull
d2fb05f [Tarek Auel] [SPARK-8301] added additional null checks
79cb55b [Tarek Auel] [SPARK-8301] null check. Added test cases for null check.
b17909e [Tarek Auel] [SPARK-8301] removed unnecessary copying of UTF8String. Added a private function startsWith(prefix, offset) to implement the check for startsWith, endsWith and contains.
Diffstat (limited to 'sql')
-rw-r--r-- sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java | 4
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala | 6
2 files changed, 4 insertions, 6 deletions
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index c4b7f8490a..ed04d2e50e 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -312,7 +312,6 @@ public final class UnsafeRow extends BaseMutableRow {
public UTF8String getUTF8String(int i) {
assertIndexIsValid(i);
- final UTF8String str = new UTF8String();
final long offsetToStringSize = getLong(i);
final int stringSizeInBytes =
(int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize);
@@ -324,8 +323,7 @@ public final class UnsafeRow extends BaseMutableRow {
PlatformDependent.BYTE_ARRAY_OFFSET,
stringSizeInBytes
);
- str.set(strBytes);
- return str;
+ return UTF8String.fromBytes(strBytes);
}
@Override
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index b20086bcc4..ad920f2878 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -438,17 +438,17 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
case (BinaryType, StringType) =>
defineCodeGen (ctx, ev, c =>
- s"new ${ctx.stringType}().set($c)")
+ s"${ctx.stringType}.fromBytes($c)")
case (DateType, StringType) =>
defineCodeGen(ctx, ev, c =>
- s"""new ${ctx.stringType}().set(
+ s"""${ctx.stringType}.fromString(
org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""")
// Special handling required for timestamps in hive test cases since the toString function
// does not match the expected output.
case (TimestampType, StringType) =>
super.genCode(ctx, ev)
case (_, StringType) =>
- defineCodeGen(ctx, ev, c => s"new ${ctx.stringType}().set(String.valueOf($c))")
+ defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))")
// fallback for DecimalType, this must be before other numeric types
case (_, dt: DecimalType) =>