Diffstat (limited to 'sql/hive/src/main/java/org/apache/spark/sql/parser/SemanticAnalyzer.java')
-rw-r--r-- | sql/hive/src/main/java/org/apache/spark/sql/parser/SemanticAnalyzer.java | 406 |
1 files changed, 406 insertions, 0 deletions
diff --git a/sql/hive/src/main/java/org/apache/spark/sql/parser/SemanticAnalyzer.java b/sql/hive/src/main/java/org/apache/spark/sql/parser/SemanticAnalyzer.java
new file mode 100644
index 0000000000..4b2015e0df
--- /dev/null
+++ b/sql/hive/src/main/java/org/apache/spark/sql/parser/SemanticAnalyzer.java
@@ -0,0 +1,406 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parser;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.antlr.runtime.tree.Tree;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
+
+/**
+ * SemanticAnalyzer.
+ *
+ */
+public abstract class SemanticAnalyzer {
+  public static String charSetString(String charSetName, String charSetString)
+      throws SemanticException {
+    try {
+      // The character set name starts with a _, so strip that
+      charSetName = charSetName.substring(1);
+      if (charSetString.charAt(0) == '\'') {
+        return new String(unescapeSQLString(charSetString).getBytes(),
+            charSetName);
+      } else // hex input is also supported
+      {
+        assert charSetString.charAt(0) == '0';
+        assert charSetString.charAt(1) == 'x';
+        charSetString = charSetString.substring(2);
+
+        byte[] bArray = new byte[charSetString.length() / 2];
+        int j = 0;
+        for (int i = 0; i < charSetString.length(); i += 2) {
+          int val = Character.digit(charSetString.charAt(i), 16) * 16
+              + Character.digit(charSetString.charAt(i + 1), 16);
+          if (val > 127) {
+            val = val - 256;
+          }
+          bArray[j++] = (byte)val;
+        }
+
+        String res = new String(bArray, charSetName);
+        return res;
+      }
+    } catch (UnsupportedEncodingException e) {
+      throw new SemanticException(e);
+    }
+  }
+
+  /**
+   * Remove the encapsulating "`" pair from the identifier. We allow users to
+   * use "`" to escape identifier for table names, column names and aliases,
+   * in case that coincide with Hive language keywords.
+   */
+  public static String unescapeIdentifier(String val) {
+    if (val == null) {
+      return null;
+    }
+    if (val.charAt(0) == '`' && val.charAt(val.length() - 1) == '`') {
+      val = val.substring(1, val.length() - 1);
+    }
+    return val;
+  }
+
+  /**
+   * Converts parsed key/value properties pairs into a map.
+   *
+   * @param prop ASTNode parent of the key/value pairs
+   *
+   * @param mapProp property map which receives the mappings
+   */
+  public static void readProps(
+      ASTNode prop, Map<String, String> mapProp) {
+
+    for (int propChild = 0; propChild < prop.getChildCount(); propChild++) {
+      String key = unescapeSQLString(prop.getChild(propChild).getChild(0)
+          .getText());
+      String value = null;
+      if (prop.getChild(propChild).getChild(1) != null) {
+        value = unescapeSQLString(prop.getChild(propChild).getChild(1).getText());
+      }
+      mapProp.put(key, value);
+    }
+  }
+
+  private static final int[] multiplier = new int[] {1000, 100, 10, 1};
+
+  @SuppressWarnings("nls")
+  public static String unescapeSQLString(String b) {
+    Character enclosure = null;
+
+    // Some of the strings can be passed in as unicode. For example, the
+    // delimiter can be passed in as \002 - So, we first check if the
+    // string is a unicode number, else go back to the old behavior
+    StringBuilder sb = new StringBuilder(b.length());
+    for (int i = 0; i < b.length(); i++) {
+
+      char currentChar = b.charAt(i);
+      if (enclosure == null) {
+        if (currentChar == '\'' || b.charAt(i) == '\"') {
+          enclosure = currentChar;
+        }
+        // ignore all other chars outside the enclosure
+        continue;
+      }
+
+      if (enclosure.equals(currentChar)) {
+        enclosure = null;
+        continue;
+      }
+
+      if (currentChar == '\\' && (i + 6 < b.length()) && b.charAt(i + 1) == 'u') {
+        int code = 0;
+        int base = i + 2;
+        for (int j = 0; j < 4; j++) {
+          int digit = Character.digit(b.charAt(j + base), 16);
+          code += digit * multiplier[j];
+        }
+        sb.append((char)code);
+        i += 5;
+        continue;
+      }
+
+      if (currentChar == '\\' && (i + 4 < b.length())) {
+        char i1 = b.charAt(i + 1);
+        char i2 = b.charAt(i + 2);
+        char i3 = b.charAt(i + 3);
+        if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7')
+            && (i3 >= '0' && i3 <= '7')) {
+          byte bVal = (byte) ((i3 - '0') + ((i2 - '0') * 8) + ((i1 - '0') * 8 * 8));
+          byte[] bValArr = new byte[1];
+          bValArr[0] = bVal;
+          String tmp = new String(bValArr);
+          sb.append(tmp);
+          i += 3;
+          continue;
+        }
+      }
+
+      if (currentChar == '\\' && (i + 2 < b.length())) {
+        char n = b.charAt(i + 1);
+        switch (n) {
+        case '0':
+          sb.append("\0");
+          break;
+        case '\'':
+          sb.append("'");
+          break;
+        case '"':
+          sb.append("\"");
+          break;
+        case 'b':
+          sb.append("\b");
+          break;
+        case 'n':
+          sb.append("\n");
+          break;
+        case 'r':
+          sb.append("\r");
+          break;
+        case 't':
+          sb.append("\t");
+          break;
+        case 'Z':
+          sb.append("\u001A");
+          break;
+        case '\\':
+          sb.append("\\");
+          break;
+        // The following 2 lines are exactly what MySQL does TODO: why do we do this?
+        case '%':
+          sb.append("\\%");
+          break;
+        case '_':
+          sb.append("\\_");
+          break;
+        default:
+          sb.append(n);
+        }
+        i++;
+      } else {
+        sb.append(currentChar);
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Get the list of FieldSchema out of the ASTNode.
+   */
+  public static List<FieldSchema> getColumns(ASTNode ast, boolean lowerCase) throws SemanticException {
+    List<FieldSchema> colList = new ArrayList<FieldSchema>();
+    int numCh = ast.getChildCount();
+    for (int i = 0; i < numCh; i++) {
+      FieldSchema col = new FieldSchema();
+      ASTNode child = (ASTNode) ast.getChild(i);
+      Tree grandChild = child.getChild(0);
+      if(grandChild != null) {
+        String name = grandChild.getText();
+        if(lowerCase) {
+          name = name.toLowerCase();
+        }
+        // child 0 is the name of the column
+        col.setName(unescapeIdentifier(name));
+        // child 1 is the type of the column
+        ASTNode typeChild = (ASTNode) (child.getChild(1));
+        col.setType(getTypeStringFromAST(typeChild));
+
+        // child 2 is the optional comment of the column
+        if (child.getChildCount() == 3) {
+          col.setComment(unescapeSQLString(child.getChild(2).getText()));
+        }
+      }
+      colList.add(col);
+    }
+    return colList;
+  }
+
+  protected static String getTypeStringFromAST(ASTNode typeNode)
+      throws SemanticException {
+    switch (typeNode.getType()) {
+    case SparkSqlParser.TOK_LIST:
+      return serdeConstants.LIST_TYPE_NAME + "<"
+          + getTypeStringFromAST((ASTNode) typeNode.getChild(0)) + ">";
+    case SparkSqlParser.TOK_MAP:
+      return serdeConstants.MAP_TYPE_NAME + "<"
+          + getTypeStringFromAST((ASTNode) typeNode.getChild(0)) + ","
+          + getTypeStringFromAST((ASTNode) typeNode.getChild(1)) + ">";
+    case SparkSqlParser.TOK_STRUCT:
+      return getStructTypeStringFromAST(typeNode);
+    case SparkSqlParser.TOK_UNIONTYPE:
+      return getUnionTypeStringFromAST(typeNode);
+    default:
+      return getTypeName(typeNode);
+    }
+  }
+
+  private static String getStructTypeStringFromAST(ASTNode typeNode)
+      throws SemanticException {
+    String typeStr = serdeConstants.STRUCT_TYPE_NAME + "<";
+    typeNode = (ASTNode) typeNode.getChild(0);
+    int children = typeNode.getChildCount();
+    if (children <= 0) {
+      throw new SemanticException("empty struct not allowed.");
+    }
+    StringBuilder buffer = new StringBuilder(typeStr);
+    for (int i = 0; i < children; i++) {
+      ASTNode child = (ASTNode) typeNode.getChild(i);
+      buffer.append(unescapeIdentifier(child.getChild(0).getText())).append(":");
+      buffer.append(getTypeStringFromAST((ASTNode) child.getChild(1)));
+      if (i < children - 1) {
+        buffer.append(",");
+      }
+    }
+
+    buffer.append(">");
+    return buffer.toString();
+  }
+
+  private static String getUnionTypeStringFromAST(ASTNode typeNode)
+      throws SemanticException {
+    String typeStr = serdeConstants.UNION_TYPE_NAME + "<";
+    typeNode = (ASTNode) typeNode.getChild(0);
+    int children = typeNode.getChildCount();
+    if (children <= 0) {
+      throw new SemanticException("empty union not allowed.");
+    }
+    StringBuilder buffer = new StringBuilder(typeStr);
+    for (int i = 0; i < children; i++) {
+      buffer.append(getTypeStringFromAST((ASTNode) typeNode.getChild(i)));
+      if (i < children - 1) {
+        buffer.append(",");
+      }
+    }
+    buffer.append(">");
+    typeStr = buffer.toString();
+    return typeStr;
+  }
+
+  public static String getAstNodeText(ASTNode tree) {
+    return tree.getChildCount() == 0?tree.getText() :
+        getAstNodeText((ASTNode)tree.getChild(tree.getChildCount() - 1));
+  }
+
+  public static String generateErrorMessage(ASTNode ast, String message) {
+    StringBuilder sb = new StringBuilder();
+    if (ast == null) {
+      sb.append(message).append(". Cannot tell the position of null AST.");
+      return sb.toString();
+    }
+    sb.append(ast.getLine());
+    sb.append(":");
+    sb.append(ast.getCharPositionInLine());
+    sb.append(" ");
+    sb.append(message);
+    sb.append(". Error encountered near token '");
+    sb.append(getAstNodeText(ast));
+    sb.append("'");
+    return sb.toString();
+  }
+
+  private static final Map<Integer, String> TokenToTypeName = new HashMap<Integer, String>();
+
+  static {
+    TokenToTypeName.put(SparkSqlParser.TOK_BOOLEAN, serdeConstants.BOOLEAN_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_TINYINT, serdeConstants.TINYINT_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_SMALLINT, serdeConstants.SMALLINT_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_INT, serdeConstants.INT_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_BIGINT, serdeConstants.BIGINT_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_FLOAT, serdeConstants.FLOAT_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_DOUBLE, serdeConstants.DOUBLE_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_STRING, serdeConstants.STRING_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_CHAR, serdeConstants.CHAR_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_VARCHAR, serdeConstants.VARCHAR_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_BINARY, serdeConstants.BINARY_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_DATE, serdeConstants.DATE_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_DATETIME, serdeConstants.DATETIME_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_TIMESTAMP, serdeConstants.TIMESTAMP_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_INTERVAL_YEAR_MONTH, serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_INTERVAL_DAY_TIME, serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME);
+    TokenToTypeName.put(SparkSqlParser.TOK_DECIMAL, serdeConstants.DECIMAL_TYPE_NAME);
+  }
+
+  public static String getTypeName(ASTNode node) throws SemanticException {
+    int token = node.getType();
+    String typeName;
+
+    // datetime type isn't currently supported
+    if (token == SparkSqlParser.TOK_DATETIME) {
+      throw new SemanticException(ErrorMsg.UNSUPPORTED_TYPE.getMsg());
+    }
+
+    switch (token) {
+    case SparkSqlParser.TOK_CHAR:
+      CharTypeInfo charTypeInfo = ParseUtils.getCharTypeInfo(node);
+      typeName = charTypeInfo.getQualifiedName();
+      break;
+    case SparkSqlParser.TOK_VARCHAR:
+      VarcharTypeInfo varcharTypeInfo = ParseUtils.getVarcharTypeInfo(node);
+      typeName = varcharTypeInfo.getQualifiedName();
+      break;
+    case SparkSqlParser.TOK_DECIMAL:
+      DecimalTypeInfo decTypeInfo = ParseUtils.getDecimalTypeTypeInfo(node);
+      typeName = decTypeInfo.getQualifiedName();
+      break;
+    default:
+      typeName = TokenToTypeName.get(token);
+    }
+    return typeName;
+  }
+
+  public static String relativeToAbsolutePath(HiveConf conf, String location) throws SemanticException {
+    boolean testMode = conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODE);
+    if (testMode) {
+      URI uri = new Path(location).toUri();
+      String scheme = uri.getScheme();
+      String authority = uri.getAuthority();
+      String path = uri.getPath();
+      if (!path.startsWith("/")) {
+        path = (new Path(System.getProperty("test.tmp.dir"),
+            path)).toUri().getPath();
+      }
+      if (StringUtils.isEmpty(scheme)) {
+        scheme = "pfile";
+      }
+      try {
+        uri = new URI(scheme, authority, path, null, null);
+      } catch (URISyntaxException e) {
+        throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(), e);
+      }
+      return uri.toString();
+    } else {
+      //no-op for non-test mode for now
+      return location;
+    }
+  }
+}
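Illustrative usage (not part of the commit): a minimal sketch of how the unescaping helpers added above behave, assuming a hypothetical caller class named SemanticAnalyzerDemo in some other package.

import org.apache.spark.sql.parser.SemanticAnalyzer;

public class SemanticAnalyzerDemo {
  public static void main(String[] args) {
    // Backtick-quoted identifiers lose the enclosing backticks.
    System.out.println(SemanticAnalyzer.unescapeIdentifier("`key`"));   // key

    // SQL string literals keep only the content between the quotes,
    // with backslash escapes such as \t resolved to the real character.
    String s = SemanticAnalyzer.unescapeSQLString("'a\\tb'");
    System.out.println(s.equals("a\tb"));                               // true
  }
}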