5 files changed, 144 insertions, 138 deletions
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 37d2ecf48e..33c2cbd293 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -116,7 +116,7 @@ This file is divided into 3 sections:
 
   <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
 
-  <check level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
+  <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
 
   <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java
deleted file mode 100644
index 01f89112a7..0000000000
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.parser;
-
-import java.nio.charset.StandardCharsets;
-
-/**
- * A couple of utility methods that help with parsing ASTs.
- *
- * The 'unescapeSQLString' method in this class was take from the SemanticAnalyzer in Hive:
- * ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
- */
-public final class ParseUtils {
-  private ParseUtils() {
-    super();
-  }
-
-  private static final int[] multiplier = new int[] {1000, 100, 10, 1};
-
-  @SuppressWarnings("nls")
-  public static String unescapeSQLString(String b) {
-    Character enclosure = null;
-
-    // Some of the strings can be passed in as unicode. For example, the
-    // delimiter can be passed in as \002 - So, we first check if the
-    // string is a unicode number, else go back to the old behavior
-    StringBuilder sb = new StringBuilder(b.length());
-    for (int i = 0; i < b.length(); i++) {
-
-      char currentChar = b.charAt(i);
-      if (enclosure == null) {
-        if (currentChar == '\'' || b.charAt(i) == '\"') {
-          enclosure = currentChar;
-        }
-        // ignore all other chars outside the enclosure
-        continue;
-      }
-
-      if (enclosure.equals(currentChar)) {
-        enclosure = null;
-        continue;
-      }
-
-      if (currentChar == '\\' && (i + 6 < b.length()) && b.charAt(i + 1) == 'u') {
-        int code = 0;
-        int base = i + 2;
-        for (int j = 0; j < 4; j++) {
-          int digit = Character.digit(b.charAt(j + base), 16);
-          code += digit * multiplier[j];
-        }
-        sb.append((char)code);
-        i += 5;
-        continue;
-      }
-
-      if (currentChar == '\\' && (i + 4 < b.length())) {
-        char i1 = b.charAt(i + 1);
-        char i2 = b.charAt(i + 2);
-        char i3 = b.charAt(i + 3);
-        if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7')
-            && (i3 >= '0' && i3 <= '7')) {
-          byte bVal = (byte) ((i3 - '0') + ((i2 - '0') * 8) + ((i1 - '0') * 8 * 8));
-          byte[] bValArr = new byte[1];
-          bValArr[0] = bVal;
-          String tmp = new String(bValArr, StandardCharsets.UTF_8);
-          sb.append(tmp);
-          i += 3;
-          continue;
-        }
-      }
-
-      if (currentChar == '\\' && (i + 2 < b.length())) {
-        char n = b.charAt(i + 1);
-        switch (n) {
-        case '0':
-          sb.append("\0");
-          break;
-        case '\'':
-          sb.append("'");
-          break;
-        case '"':
-          sb.append("\"");
-          break;
-        case 'b':
-          sb.append("\b");
-          break;
-        case 'n':
-          sb.append("\n");
-          break;
-        case 'r':
-          sb.append("\r");
-          break;
-        case 't':
-          sb.append("\t");
-          break;
-        case 'Z':
-          sb.append("\u001A");
-          break;
-        case '\\':
-          sb.append("\\");
-          break;
-        // The following 2 lines are exactly what MySQL does TODO: why do we do this?
-        case '%':
-          sb.append("\\%");
-          break;
-        case '_':
-          sb.append("\\_");
-          break;
-        default:
-          sb.append(n);
-        }
-        i++;
-      } else {
-        sb.append(currentChar);
-      }
-    }
-    return sb.toString();
-  }
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
index 90b76dc314..cb9fefec8f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -16,11 +16,12 @@
  */
 package org.apache.spark.sql.catalyst.parser
 
+import scala.collection.mutable.StringBuilder
+
 import org.antlr.v4.runtime.{CharStream, ParserRuleContext, Token}
 import org.antlr.v4.runtime.misc.Interval
 import org.antlr.v4.runtime.tree.TerminalNode
 
-import org.apache.spark.sql.catalyst.parser.ParseUtils.unescapeSQLString
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin}
 
@@ -87,6 +88,81 @@ object ParserUtils {
     }
   }
 
+  /** Unescapes a backslash-escaped string enclosed by single or double quotes. */
+  def unescapeSQLString(b: String): String = {
+    var enclosure: Character = null
+    val sb = new StringBuilder(b.length())
+
+    def appendEscapedChar(n: Char) {
+      n match {
+        case '0' => sb.append('\u0000')
+        case '\'' => sb.append('\'')
+        case '"' => sb.append('\"')
+        case 'b' => sb.append('\b')
+        case 'n' => sb.append('\n')
+        case 'r' => sb.append('\r')
+        case 't' => sb.append('\t')
+        case 'Z' => sb.append('\u001A')
+        case '\\' => sb.append('\\')
+        // Like MySQL, keep the backslash for \% and \_. TODO: why do we do this?
+        case '%' => sb.append("\\%")
+        case '_' => sb.append("\\_")
+        case _ => sb.append(n)
+      }
+    }
+
+    var i = 0
+    val strLength = b.length
+    while (i < strLength) {
+      val currentChar = b.charAt(i)
+      if (enclosure == null) {
+        if (currentChar == '\'' || currentChar == '\"') {
+          enclosure = currentChar
+        }
+      } else if (enclosure == currentChar) {
+        enclosure = null
+      } else if (currentChar == '\\') {
+
+        if ((i + 6 < strLength) && b.charAt(i + 1) == 'u') {
+          // \u0000 style character literals. NOTE(review): non-hex digits are not rejected.
+
+          val base = i + 2
+          val code = (0 until 4).foldLeft(0) { (mid, j) =>
+            val digit = Character.digit(b.charAt(j + base), 16)
+            (mid << 4) + digit
+          }
+          sb.append(code.asInstanceOf[Char])
+          i += 5
+        } else if (i + 4 < strLength) {
+          // \000 style character literals: three octal digits, the first limited to 0 or 1.
+
+          val i1 = b.charAt(i + 1)
+          val i2 = b.charAt(i + 2)
+          val i3 = b.charAt(i + 3)
+
+          if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7') && (i3 >= '0' && i3 <= '7')) {
+            val tmp = ((i3 - '0') + ((i2 - '0') << 3) + ((i1 - '0') << 6)).asInstanceOf[Char]
+            sb.append(tmp)
+            i += 3
+          } else {
+            appendEscapedChar(i1)
+            i += 1
+          }
+        } else if (i + 2 < strLength) {
+          // Single escaped character literals, translated by appendEscapedChar.
+          val n = b.charAt(i + 1)
+          appendEscapedChar(n)
+          i += 1
+        }
+      } else {
+        // Non-escaped character literals are copied through unchanged.
+        sb.append(currentChar)
+      }
+      i += 1
+    }
+    sb.toString()
+  }
+
   /** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
   implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
     /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
index a80d29ce5d..6f40ec67ec 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
@@ -415,7 +415,7 @@ class ExpressionParserSuite extends PlanTest {
     assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!")
 
     // Unicode
-    assertEqual("'\\u0087\\u0111\\u0114\\u0108\\u0100\\u0032\\u0058\\u0041'", "World :)")
+    assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)")
   }
 
   test("intervals") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
new file mode 100644
index 0000000000..d090daf7b4
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst.parser
+
+import org.apache.spark.SparkFunSuite
+
+class ParserUtilsSuite extends SparkFunSuite {
+
+  import ParserUtils._
+
+  test("unescapeSQLString") {
+    // scalastyle:off nonascii
+
+    // Plain string without escaped characters, enclosed by double quotes.
+    assert(unescapeSQLString(""""abcdefg"""") == "abcdefg")
+
+    // Plain string enclosed by single quotes.
+    assert(unescapeSQLString("""'C0FFEE'""") == "C0FFEE")
+
+    // Single-character escapes; \% and \_ are expected to keep their backslash.
+    assert(unescapeSQLString("""'\0'""") == "\u0000")
+    assert(unescapeSQLString(""""\'"""") == "\'")
+    assert(unescapeSQLString("""'\"'""") == "\"")
+    assert(unescapeSQLString(""""\b"""") == "\b")
+    assert(unescapeSQLString("""'\n'""") == "\n")
+    assert(unescapeSQLString(""""\r"""") == "\r")
+    assert(unescapeSQLString("""'\t'""") == "\t")
+    assert(unescapeSQLString(""""\Z"""") == "\u001A")
+    assert(unescapeSQLString("""'\\'""") == "\\")
+    assert(unescapeSQLString(""""\%"""") == "\\%")
+    assert(unescapeSQLString("""'\_'""") == "\\_")
+
+    // Strings including '\000' style (octal) literal characters.
+    assert(unescapeSQLString("""'3 + 5 = \070'""") == "3 + 5 = \u0038")
+    assert(unescapeSQLString(""""\000"""") == "\u0000")
+
+    // An invalid '\000' style literal: the digits are expected to come through as-is.
+    assert(unescapeSQLString(""""\256"""") == "256")
+
+    // String including a '\u0000' style literal character (\u732B is a cat in Kanji).
+    assert(unescapeSQLString(""""How cute \u732B are"""")  == "How cute \u732B are")
+
+    // String including a surrogate pair character
+    // (\uD867\uDE3D is Okhotsk atka mackerel in Kanji).
+    assert(unescapeSQLString(""""\uD867\uDE3D is a fish"""") == "\uD867\uDE3D is a fish")
+
+    // scalastyle:on nonascii
+  }
+
+  // TODO: Add test cases for other methods in ParserUtils
+}