aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--scalastyle-config.xml2
-rw-r--r--sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java135
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala78
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala2
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala65
5 files changed, 144 insertions, 138 deletions
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 37d2ecf48e..33c2cbd293 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -116,7 +116,7 @@ This file is divided into 3 sections:
<check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
+ <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java
deleted file mode 100644
index 01f89112a7..0000000000
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/parser/ParseUtils.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.parser;
-
-import java.nio.charset.StandardCharsets;
-
-/**
- * A couple of utility methods that help with parsing ASTs.
- *
- * The 'unescapeSQLString' method in this class was take from the SemanticAnalyzer in Hive:
- * ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
- */
-public final class ParseUtils {
- private ParseUtils() {
- super();
- }
-
- private static final int[] multiplier = new int[] {1000, 100, 10, 1};
-
- @SuppressWarnings("nls")
- public static String unescapeSQLString(String b) {
- Character enclosure = null;
-
- // Some of the strings can be passed in as unicode. For example, the
- // delimiter can be passed in as \002 - So, we first check if the
- // string is a unicode number, else go back to the old behavior
- StringBuilder sb = new StringBuilder(b.length());
- for (int i = 0; i < b.length(); i++) {
-
- char currentChar = b.charAt(i);
- if (enclosure == null) {
- if (currentChar == '\'' || b.charAt(i) == '\"') {
- enclosure = currentChar;
- }
- // ignore all other chars outside the enclosure
- continue;
- }
-
- if (enclosure.equals(currentChar)) {
- enclosure = null;
- continue;
- }
-
- if (currentChar == '\\' && (i + 6 < b.length()) && b.charAt(i + 1) == 'u') {
- int code = 0;
- int base = i + 2;
- for (int j = 0; j < 4; j++) {
- int digit = Character.digit(b.charAt(j + base), 16);
- code += digit * multiplier[j];
- }
- sb.append((char)code);
- i += 5;
- continue;
- }
-
- if (currentChar == '\\' && (i + 4 < b.length())) {
- char i1 = b.charAt(i + 1);
- char i2 = b.charAt(i + 2);
- char i3 = b.charAt(i + 3);
- if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7')
- && (i3 >= '0' && i3 <= '7')) {
- byte bVal = (byte) ((i3 - '0') + ((i2 - '0') * 8) + ((i1 - '0') * 8 * 8));
- byte[] bValArr = new byte[1];
- bValArr[0] = bVal;
- String tmp = new String(bValArr, StandardCharsets.UTF_8);
- sb.append(tmp);
- i += 3;
- continue;
- }
- }
-
- if (currentChar == '\\' && (i + 2 < b.length())) {
- char n = b.charAt(i + 1);
- switch (n) {
- case '0':
- sb.append("\0");
- break;
- case '\'':
- sb.append("'");
- break;
- case '"':
- sb.append("\"");
- break;
- case 'b':
- sb.append("\b");
- break;
- case 'n':
- sb.append("\n");
- break;
- case 'r':
- sb.append("\r");
- break;
- case 't':
- sb.append("\t");
- break;
- case 'Z':
- sb.append("\u001A");
- break;
- case '\\':
- sb.append("\\");
- break;
- // The following 2 lines are exactly what MySQL does TODO: why do we do this?
- case '%':
- sb.append("\\%");
- break;
- case '_':
- sb.append("\\_");
- break;
- default:
- sb.append(n);
- }
- i++;
- } else {
- sb.append(currentChar);
- }
- }
- return sb.toString();
- }
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
index 90b76dc314..cb9fefec8f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -16,11 +16,12 @@
*/
package org.apache.spark.sql.catalyst.parser
+import scala.collection.mutable.StringBuilder
+
import org.antlr.v4.runtime.{CharStream, ParserRuleContext, Token}
import org.antlr.v4.runtime.misc.Interval
import org.antlr.v4.runtime.tree.TerminalNode
-import org.apache.spark.sql.catalyst.parser.ParseUtils.unescapeSQLString
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin}
@@ -87,6 +88,81 @@ object ParserUtils {
}
}
+ /** Unescape baskslash-escaped string enclosed by quotes. */
+ def unescapeSQLString(b: String): String = {
+ var enclosure: Character = null
+ val sb = new StringBuilder(b.length())
+
+ def appendEscapedChar(n: Char) {
+ n match {
+ case '0' => sb.append('\u0000')
+ case '\'' => sb.append('\'')
+ case '"' => sb.append('\"')
+ case 'b' => sb.append('\b')
+ case 'n' => sb.append('\n')
+ case 'r' => sb.append('\r')
+ case 't' => sb.append('\t')
+ case 'Z' => sb.append('\u001A')
+ case '\\' => sb.append('\\')
+ // The following 2 lines are exactly what MySQL does TODO: why do we do this?
+ case '%' => sb.append("\\%")
+ case '_' => sb.append("\\_")
+ case _ => sb.append(n)
+ }
+ }
+
+ var i = 0
+ val strLength = b.length
+ while (i < strLength) {
+ val currentChar = b.charAt(i)
+ if (enclosure == null) {
+ if (currentChar == '\'' || currentChar == '\"') {
+ enclosure = currentChar
+ }
+ } else if (enclosure == currentChar) {
+ enclosure = null
+ } else if (currentChar == '\\') {
+
+ if ((i + 6 < strLength) && b.charAt(i + 1) == 'u') {
+ // \u0000 style character literals.
+
+ val base = i + 2
+ val code = (0 until 4).foldLeft(0) { (mid, j) =>
+ val digit = Character.digit(b.charAt(j + base), 16)
+ (mid << 4) + digit
+ }
+ sb.append(code.asInstanceOf[Char])
+ i += 5
+ } else if (i + 4 < strLength) {
+ // \000 style character literals.
+
+ val i1 = b.charAt(i + 1)
+ val i2 = b.charAt(i + 2)
+ val i3 = b.charAt(i + 3)
+
+ if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7') && (i3 >= '0' && i3 <= '7')) {
+ val tmp = ((i3 - '0') + ((i2 - '0') << 3) + ((i1 - '0') << 6)).asInstanceOf[Char]
+ sb.append(tmp)
+ i += 3
+ } else {
+ appendEscapedChar(i1)
+ i += 1
+ }
+ } else if (i + 2 < strLength) {
+ // escaped character literals.
+ val n = b.charAt(i + 1)
+ appendEscapedChar(n)
+ i += 1
+ }
+ } else {
+ // non-escaped character literals.
+ sb.append(currentChar)
+ }
+ i += 1
+ }
+ sb.toString()
+ }
+
/** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
/**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
index a80d29ce5d..6f40ec67ec 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
@@ -415,7 +415,7 @@ class ExpressionParserSuite extends PlanTest {
assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!")
// Unicode
- assertEqual("'\\u0087\\u0111\\u0114\\u0108\\u0100\\u0032\\u0058\\u0041'", "World :)")
+ assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)")
}
test("intervals") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
new file mode 100644
index 0000000000..d090daf7b4
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst.parser
+
+import org.apache.spark.SparkFunSuite
+
+class ParserUtilsSuite extends SparkFunSuite {
+
+ import ParserUtils._
+
+ test("unescapeSQLString") {
+ // scalastyle:off nonascii
+
+ // String not including escaped characters and enclosed by double quotes.
+ assert(unescapeSQLString(""""abcdefg"""") == "abcdefg")
+
+ // String enclosed by single quotes.
+ assert(unescapeSQLString("""'C0FFEE'""") == "C0FFEE")
+
+ // Strings including single escaped characters.
+ assert(unescapeSQLString("""'\0'""") == "\u0000")
+ assert(unescapeSQLString(""""\'"""") == "\'")
+ assert(unescapeSQLString("""'\"'""") == "\"")
+ assert(unescapeSQLString(""""\b"""") == "\b")
+ assert(unescapeSQLString("""'\n'""") == "\n")
+ assert(unescapeSQLString(""""\r"""") == "\r")
+ assert(unescapeSQLString("""'\t'""") == "\t")
+ assert(unescapeSQLString(""""\Z"""") == "\u001A")
+ assert(unescapeSQLString("""'\\'""") == "\\")
+ assert(unescapeSQLString(""""\%"""") == "\\%")
+ assert(unescapeSQLString("""'\_'""") == "\\_")
+
+ // String including '\000' style literal characters.
+ assert(unescapeSQLString("""'3 + 5 = \070'""") == "3 + 5 = \u0038")
+ assert(unescapeSQLString(""""\000"""") == "\u0000")
+
+ // String including invalid '\000' style literal characters.
+ assert(unescapeSQLString(""""\256"""") == "256")
+
+ // String including a '\u0000' style literal characters (\u732B is a cat in Kanji).
+ assert(unescapeSQLString(""""How cute \u732B are"""") == "How cute \u732B are")
+
+ // String including a surrogate pair character
+ // (\uD867\uDE3D is Okhotsk atka mackerel in Kanji).
+ assert(unescapeSQLString(""""\uD867\uDE3D is a fish"""") == "\uD867\uDE3D is a fish")
+
+ // scalastyle:on nonascii
+ }
+
+ // TODO: Add test cases for other methods in ParserUtils
+}