aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorMichael Armbrust <michael@databricks.com>2014-04-03 15:45:34 -0700
committerMatei Zaharia <matei@databricks.com>2014-04-03 15:45:34 -0700
commitb8f534196f9a8c99f75728a06e62282d139dee28 (patch)
tree426e03d88695719bc58ff6f0aaa4794bd2fb40a2 /sql
parentc1ea3afb516c204925259f0928dfb17d0fa89621 (diff)
downloadspark-b8f534196f9a8c99f75728a06e62282d139dee28.tar.gz
spark-b8f534196f9a8c99f75728a06e62282d139dee28.tar.bz2
spark-b8f534196f9a8c99f75728a06e62282d139dee28.zip
[SQL] SPARK-1333 First draft of java API
WIP: Some work remains... * [x] Hive support * [x] Tests * [x] Update docs Feedback welcome! Author: Michael Armbrust <michael@databricks.com> Closes #248 from marmbrus/javaSchemaRDD and squashes the following commits: b393913 [Michael Armbrust] @srowen 's java style suggestions. f531eb1 [Michael Armbrust] Address matei's comments. 33a1b1a [Michael Armbrust] Ignore JavaHiveSuite. 822f626 [Michael Armbrust] improve docs. ab91750 [Michael Armbrust] Improve Java SQL API: * Change JavaRow => Row * Add support for querying RDDs of JavaBeans * Docs * Tests * Hive support 0b859c8 [Michael Armbrust] First draft of java API.
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala42
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala66
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala100
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala48
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/api/java/Row.scala93
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala53
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala12
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/api/java/JavaHiveContext.scala42
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveSuite.scala41
9 files changed, 460 insertions, 37 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
index 770cabcb31..a62cb8aa13 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
@@ -17,13 +17,13 @@
package org.apache.spark.sql
+import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
import org.apache.spark.sql.catalyst.types.BooleanType
-import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
/**
* <span class="badge" style="float: right; background-color: darkblue;">ALPHA COMPONENT</span>
@@ -92,23 +92,10 @@ import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
*/
class SchemaRDD(
@transient val sqlContext: SQLContext,
- @transient val logicalPlan: LogicalPlan)
- extends RDD[Row](sqlContext.sparkContext, Nil) {
+ @transient protected[spark] val logicalPlan: LogicalPlan)
+ extends RDD[Row](sqlContext.sparkContext, Nil) with SchemaRDDLike {
- /**
- * A lazily computed query execution workflow. All other RDD operations are passed
- * through to the RDD that is produced by this workflow.
- *
- * We want this to be lazy because invoking the whole query optimization pipeline can be
- * expensive.
- */
- @transient
- protected[spark] lazy val queryExecution = sqlContext.executePlan(logicalPlan)
-
- override def toString =
- s"""${super.toString}
- |== Query Plan ==
- |${queryExecution.executedPlan}""".stripMargin.trim
+ def baseSchemaRDD = this
// =========================================================================================
// RDD functions: Copy the interal row representation so we present immutable data to users.
@@ -313,30 +300,11 @@ class SchemaRDD(
InsertIntoTable(UnresolvedRelation(None, tableName), Map.empty, logicalPlan, overwrite))
/**
- * Saves the contents of this `SchemaRDD` as a parquet file, preserving the schema. Files that
- * are written out using this method can be read back in as a SchemaRDD using the ``function
- *
- * @group schema
- */
- def saveAsParquetFile(path: String): Unit = {
- sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd
- }
-
- /**
- * Registers this RDD as a temporary table using the given name. The lifetime of this temporary
- * table is tied to the [[SQLContext]] that was used to create this SchemaRDD.
- *
- * @group schema
- */
- def registerAsTable(tableName: String): Unit = {
- sqlContext.registerRDDAsTable(this, tableName)
- }
-
- /**
* Returns this RDD as a SchemaRDD.
* @group schema
*/
def toSchemaRDD = this
+ /** FOR INTERNAL USE ONLY */
def analyze = sqlContext.analyzer(logicalPlan)
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala
new file mode 100644
index 0000000000..840803a52c
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala
@@ -0,0 +1,66 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.catalyst.plans.logical._
+
+/**
+ * Contains functions that are shared between all SchemaRDD types (i.e., Scala, Java)
+ */
+trait SchemaRDDLike {
+ @transient val sqlContext: SQLContext
+ @transient protected[spark] val logicalPlan: LogicalPlan
+
+ private[sql] def baseSchemaRDD: SchemaRDD
+
+ /**
+ * A lazily computed query execution workflow. All other RDD operations are passed
+ * through to the RDD that is produced by this workflow.
+ *
+ * We want this to be lazy because invoking the whole query optimization pipeline can be
+ * expensive.
+ */
+ @transient
+ protected[spark] lazy val queryExecution = sqlContext.executePlan(logicalPlan)
+
+ override def toString =
+ s"""${super.toString}
+ |== Query Plan ==
+ |${queryExecution.executedPlan}""".stripMargin.trim
+
+
+ /**
+ * Saves the contents of this `SchemaRDD` as a parquet file, preserving the schema. Files that
+ * are written out using this method can be read back in as a SchemaRDD using the ``function
+ *
+ * @group schema
+ */
+ def saveAsParquetFile(path: String): Unit = {
+ sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd
+ }
+
+ /**
+ * Registers this RDD as a temporary table using the given name. The lifetime of this temporary
+ * table is tied to the [[SQLContext]] that was used to create this SchemaRDD.
+ *
+ * @group schema
+ */
+ def registerAsTable(tableName: String): Unit = {
+ sqlContext.registerRDDAsTable(baseSchemaRDD, tableName)
+ }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
new file mode 100644
index 0000000000..7b41aa1f1b
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
@@ -0,0 +1,100 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql.api.java
+
+import java.beans.{Introspector, PropertyDescriptor}
+
+import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericRow, Row => ScalaRow}
+import org.apache.spark.sql.catalyst.types._
+import org.apache.spark.sql.parquet.ParquetRelation
+import org.apache.spark.sql.execution.{ExistingRdd, SparkLogicalPlan}
+
+/**
+ * The entry point for executing Spark SQL queries from a Java program.
+ */
+class JavaSQLContext(sparkContext: JavaSparkContext) {
+
+ val sqlContext = new SQLContext(sparkContext.sc)
+
+ /**
+ * Executes a query expressed in SQL, returning the result as a JavaSchemaRDD
+ */
+ def sql(sqlQuery: String): JavaSchemaRDD = {
+ val result = new JavaSchemaRDD(sqlContext, sqlContext.parseSql(sqlQuery))
+ // We force query optimization to happen right away instead of letting it happen lazily like
+ // when using the query DSL. This is so DDL commands behave as expected. This is only
+ // generates the RDD lineage for DML queries, but do not perform any execution.
+ result.queryExecution.toRdd
+ result
+ }
+
+ /**
+ * Applies a schema to an RDD of Java Beans.
+ */
+ def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): JavaSchemaRDD = {
+ // TODO: All of this could probably be moved to Catalyst as it is mostly not Spark specific.
+ val beanInfo = Introspector.getBeanInfo(beanClass)
+
+ val fields = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class")
+ val schema = fields.map { property =>
+ val dataType = property.getPropertyType match {
+ case c: Class[_] if c == classOf[java.lang.String] => StringType
+ case c: Class[_] if c == java.lang.Short.TYPE => ShortType
+ case c: Class[_] if c == java.lang.Integer.TYPE => IntegerType
+ case c: Class[_] if c == java.lang.Long.TYPE => LongType
+ case c: Class[_] if c == java.lang.Double.TYPE => DoubleType
+ case c: Class[_] if c == java.lang.Byte.TYPE => ByteType
+ case c: Class[_] if c == java.lang.Float.TYPE => FloatType
+ case c: Class[_] if c == java.lang.Boolean.TYPE => BooleanType
+ }
+
+ AttributeReference(property.getName, dataType, true)()
+ }
+
+ val className = beanClass.getCanonicalName
+ val rowRdd = rdd.rdd.mapPartitions { iter =>
+ // BeanInfo is not serializable so we must rediscover it remotely for each partition.
+ val localBeanInfo = Introspector.getBeanInfo(Class.forName(className))
+ val extractors =
+ localBeanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
+
+ iter.map { row =>
+ new GenericRow(extractors.map(e => e.invoke(row)).toArray[Any]): ScalaRow
+ }
+ }
+ new JavaSchemaRDD(sqlContext, SparkLogicalPlan(ExistingRdd(schema, rowRdd)))
+ }
+
+
+ /**
+ * Loads a parquet file, returning the result as a [[JavaSchemaRDD]].
+ */
+ def parquetFile(path: String): JavaSchemaRDD =
+ new JavaSchemaRDD(sqlContext, ParquetRelation("ParquetFile", path))
+
+
+ /**
+ * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only
+ * during the lifetime of this instance of SQLContext.
+ */
+ def registerRDDAsTable(rdd: JavaSchemaRDD, tableName: String): Unit = {
+ sqlContext.registerRDDAsTable(rdd.baseSchemaRDD, tableName)
+ }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala
new file mode 100644
index 0000000000..d43d672938
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java
+
+import org.apache.spark.api.java.{JavaRDDLike, JavaRDD}
+import org.apache.spark.sql.{SQLContext, SchemaRDD, SchemaRDDLike}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.rdd.RDD
+
+/**
+ * An RDD of [[Row]] objects that is returned as the result of a Spark SQL query. In addition to
+ * standard RDD operations, a JavaSchemaRDD can also be registered as a table in the JavaSQLContext
+ * that was used to create. Registering a JavaSchemaRDD allows its contents to be queried in
+ * future SQL statement.
+ *
+ * @groupname schema SchemaRDD Functions
+ * @groupprio schema -1
+ * @groupname Ungrouped Base RDD Functions
+ */
+class JavaSchemaRDD(
+ @transient val sqlContext: SQLContext,
+ @transient protected[spark] val logicalPlan: LogicalPlan)
+ extends JavaRDDLike[Row, JavaRDD[Row]]
+ with SchemaRDDLike {
+
+ private[sql] val baseSchemaRDD = new SchemaRDD(sqlContext, logicalPlan)
+
+ override val classTag = scala.reflect.classTag[Row]
+
+ override def wrapRDD(rdd: RDD[Row]): JavaRDD[Row] = JavaRDD.fromRDD(rdd)
+
+ val rdd = baseSchemaRDD.map(new Row(_))
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/Row.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/Row.scala
new file mode 100644
index 0000000000..362fe76958
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/Row.scala
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java
+
+import org.apache.spark.sql.catalyst.expressions.{Row => ScalaRow}
+
+/**
+ * A result row from a SparkSQL query.
+ */
+class Row(row: ScalaRow) extends Serializable {
+
+ /** Returns the number of columns present in this Row. */
+ def length: Int = row.length
+
+ /** Returns the value of column `i`. */
+ def get(i: Int): Any =
+ row(i)
+
+ /** Returns true if value at column `i` is NULL. */
+ def isNullAt(i: Int) = get(i) == null
+
+ /**
+ * Returns the value of column `i` as an int. This function will throw an exception if the value
+ * is at `i` is not an integer, or if it is null.
+ */
+ def getInt(i: Int): Int =
+ row.getInt(i)
+
+ /**
+ * Returns the value of column `i` as a long. This function will throw an exception if the value
+ * is at `i` is not a long, or if it is null.
+ */
+ def getLong(i: Int): Long =
+ row.getLong(i)
+
+ /**
+ * Returns the value of column `i` as a double. This function will throw an exception if the
+ * value is at `i` is not a double, or if it is null.
+ */
+ def getDouble(i: Int): Double =
+ row.getDouble(i)
+
+ /**
+ * Returns the value of column `i` as a bool. This function will throw an exception if the value
+ * is at `i` is not a boolean, or if it is null.
+ */
+ def getBoolean(i: Int): Boolean =
+ row.getBoolean(i)
+
+ /**
+ * Returns the value of column `i` as a short. This function will throw an exception if the value
+ * is at `i` is not a short, or if it is null.
+ */
+ def getShort(i: Int): Short =
+ row.getShort(i)
+
+ /**
+ * Returns the value of column `i` as a byte. This function will throw an exception if the value
+ * is at `i` is not a byte, or if it is null.
+ */
+ def getByte(i: Int): Byte =
+ row.getByte(i)
+
+ /**
+ * Returns the value of column `i` as a float. This function will throw an exception if the value
+ * is at `i` is not a float, or if it is null.
+ */
+ def getFloat(i: Int): Float =
+ row.getFloat(i)
+
+ /**
+ * Returns the value of column `i` as a String. This function will throw an exception if the
+ * value is at `i` is not a String.
+ */
+ def getString(i: Int): String =
+ row.getString(i)
+}
+
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala
new file mode 100644
index 0000000000..def0e046a3
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java
+
+import scala.beans.BeanProperty
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.sql.test.TestSQLContext
+
+// Implicits
+import scala.collection.JavaConversions._
+
+class PersonBean extends Serializable {
+ @BeanProperty
+ var name: String = _
+
+ @BeanProperty
+ var age: Int = _
+}
+
+class JavaSQLSuite extends FunSuite {
+ val javaCtx = new JavaSparkContext(TestSQLContext.sparkContext)
+ val javaSqlCtx = new JavaSQLContext(javaCtx)
+
+ test("schema from JavaBeans") {
+ val person = new PersonBean
+ person.setName("Michael")
+ person.setAge(29)
+
+ val rdd = javaCtx.parallelize(person :: Nil)
+ val schemaRDD = javaSqlCtx.applySchema(rdd, classOf[PersonBean])
+
+ schemaRDD.registerAsTable("people")
+ javaSqlCtx.sql("SELECT * FROM people").collect()
+ }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 46febbfad0..ff8eaacded 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -71,6 +71,18 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
override def executePlan(plan: LogicalPlan): this.QueryExecution =
new this.QueryExecution { val logical = plan }
+ /**
+ * Executes a query expressed in HiveQL using Spark, returning the result as a SchemaRDD.
+ */
+ def hql(hqlQuery: String): SchemaRDD = {
+ val result = new SchemaRDD(this, HiveQl.parseSql(hqlQuery))
+ // We force query optimization to happen right away instead of letting it happen lazily like
+ // when using the query DSL. This is so DDL commands behave as expected. This is only
+ // generates the RDD lineage for DML queries, but do not perform any execution.
+ result.queryExecution.toRdd
+ result
+ }
+
// Circular buffer to hold what hive prints to STDOUT and ERR. Only printed when failures occur.
@transient
protected val outputBuffer = new java.io.OutputStream {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/api/java/JavaHiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/api/java/JavaHiveContext.scala
new file mode 100644
index 0000000000..6df76fa825
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/api/java/JavaHiveContext.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.api.java
+
+import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.sql.api.java.{JavaSQLContext, JavaSchemaRDD}
+import org.apache.spark.sql.hive.{HiveContext, HiveQl}
+
+/**
+ * The entry point for executing Spark SQL queries from a Java program.
+ */
+class JavaHiveContext(sparkContext: JavaSparkContext) extends JavaSQLContext(sparkContext) {
+
+ override val sqlContext = new HiveContext(sparkContext)
+
+ /**
+ * Executes a query expressed in HiveQL, returning the result as a JavaSchemaRDD.
+ */
+ def hql(hqlQuery: String): JavaSchemaRDD = {
+ val result = new JavaSchemaRDD(sqlContext, HiveQl.parseSql(hqlQuery))
+ // We force query optimization to happen right away instead of letting it happen lazily like
+ // when using the query DSL. This is so DDL commands behave as expected. This is only
+ // generates the RDD lineage for DML queries, but do not perform any execution.
+ result.queryExecution.toRdd
+ result
+ }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveSuite.scala
new file mode 100644
index 0000000000..8137f99b22
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveSuite.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.api.java
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.sql.test.TestSQLContext
+import org.apache.spark.sql.hive.TestHive
+
+// Implicits
+import scala.collection.JavaConversions._
+
+class JavaHiveSQLSuite extends FunSuite {
+ ignore("SELECT * FROM src") {
+ val javaCtx = new JavaSparkContext(TestSQLContext.sparkContext)
+ // There is a little trickery here to avoid instantiating two HiveContexts in the same JVM
+ val javaSqlCtx = new JavaHiveContext(javaCtx) {
+ override val sqlContext = TestHive
+ }
+
+ assert(
+ javaSqlCtx.hql("SELECT * FROM src").collect().map(_.getInt(0)) ===
+ TestHive.sql("SELECT * FROM src").collect().map(_.getInt(0)).toSeq)
+ }
+}