aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2015-07-07 17:57:17 -0700
committerReynold Xin <rxin@databricks.com>2015-07-07 17:57:17 -0700
commit4ca90935c5ff15ccca13452572e317b86814f238 (patch)
treea5b0eba4f52ca6fb0e481b464859f3fcae191271 /sql
parent770ff1025e751ad139b6284c1d7cc3de778b1d87 (diff)
downloadspark-4ca90935c5ff15ccca13452572e317b86814f238.tar.gz
spark-4ca90935c5ff15ccca13452572e317b86814f238.tar.bz2
spark-4ca90935c5ff15ccca13452572e317b86814f238.zip
[SPARK-7190] [SPARK-8804] [SPARK-7815] [SQL] unsafe UTF8String
Let UTF8String work with a binary buffer. Until we have a better idea of how to manage the lifecycle of UTF8String in Row, we still do the copy when calling `UnsafeRow.get()` for StringType. cc rxin JoshRosen Author: Davies Liu <davies@databricks.com> Closes #7197 from davies/unsafe_string and squashes the following commits: 51b0ea0 [Davies Liu] fix test 50c1ebf [Davies Liu] remove optimization for upper/lower case 315d491 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_string 93fce17 [Davies Liu] address comment e9ff7ba [Davies Liu] clean up 67ec266 [Davies Liu] fix bug 7b74b1f [Davies Liu] fallback to String if local dependent ab7857c [Davies Liu] address comments 7da92f5 [Davies Liu] handle local in toUpperCase/toLowerCase 59dbb23 [Davies Liu] revert python change d1e0716 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_string 002e35f [Davies Liu] rollback hashCode change a87b7a8 [Davies Liu] improve toLowerCase and toUpperCase 76e794a [Davies Liu] fix test 8b2d5ce [Davies Liu] fix tests fd3f0a6 [Davies Liu] bug fix c4e9c88 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_string c45d921 [Davies Liu] address comments 175405f [Davies Liu] unsafe UTF8String
Diffstat (limited to 'sql')
-rw-r--r--sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java1
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala2
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala6
3 files changed, 5 insertions, 4 deletions
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index f077064a02..aeb64b0458 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -264,6 +264,7 @@ public final class UnsafeRow extends MutableRow {
int offset = (int) ((v >> OFFSET_BITS) & Integer.MAX_VALUE);
int size = (int) (v & Integer.MAX_VALUE);
final byte[] bytes = new byte[size];
+ // TODO(davies): Avoid the copy once we can manage the life cycle of Row well.
PlatformDependent.copyMemory(
baseObject,
baseOffset + offset,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index cfe201bf83..662ceeca77 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -139,7 +139,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
// UDFToBoolean
private[this] def castToBoolean(from: DataType): Any => Any = from match {
case StringType =>
- buildCast[UTF8String](_, _.length() != 0)
+ buildCast[UTF8String](_, _.numBytes() != 0)
case TimestampType =>
buildCast[Long](_, t => t != 0)
case DateType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 2e51cbb28c..47fc7cdaa8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -250,7 +250,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
val (st, end) = slicePos(start, length, () => ba.length)
ba.slice(st, end)
case s: UTF8String =>
- val (st, end) = slicePos(start, length, () => s.length())
+ val (st, end) = slicePos(start, length, () => s.numChars())
s.substring(st, end)
}
}
@@ -265,10 +265,10 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
override def inputTypes: Seq[DataType] = Seq(StringType)
protected override def nullSafeEval(string: Any): Any =
- string.asInstanceOf[UTF8String].length
+ string.asInstanceOf[UTF8String].numChars
override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
- defineCodeGen(ctx, ev, c => s"($c).length()")
+ defineCodeGen(ctx, ev, c => s"($c).numChars()")
}
override def prettyName: String = "length"