about | summary | refs | log | tree | commit | diff
path: root/sql
diff options
context:
space:
mode:
author: Cheng Lian <lian@databricks.com> 2016-02-27 00:28:30 +0800
committer: Cheng Lian <lian@databricks.com> 2016-02-27 00:28:30 +0800
commit: 99dfcedbfd4c83c7b6a343456f03e8c6e29968c5 (patch)
tree: eb08a31c82b94e1582a0b6aa265038cb5fd44403 /sql
parent: 5c3912e5c90ce659146c3056430d100604378b71 (diff)
download: spark-99dfcedbfd4c83c7b6a343456f03e8c6e29968c5.tar.gz
spark-99dfcedbfd4c83c7b6a343456f03e8c6e29968c5.tar.bz2
spark-99dfcedbfd4c83c7b6a343456f03e8c6e29968c5.zip
[SPARK-13457][SQL] Removes DataFrame RDD operations
## What changes were proposed in this pull request? This is another try of PR #11323. This PR removes DataFrame RDD operations except for `foreach` and `foreachPartition` (they are actions rather than transformations). Original calls are now replaced by calls to methods of `DataFrame.rdd`. PR #11323 was reverted because it introduced a regression: both `DataFrame.foreach` and `DataFrame.foreachPartition` wrap the underlying RDD operations with `withNewExecutionId` to track Spark jobs, but #11323 removed these two methods, so that tracking was lost. ## How was this patch tested? No extra tests are added. Existing tests should do the work. Author: Cheng Lian <lian@databricks.com> Closes #11388 from liancheng/remove-df-rdd-ops.
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala 24
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala 1
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala 2
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala 2
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala 2
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala 2
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala 2
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala 2
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala 5
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala 2
10 files changed, 12 insertions, 32 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index abb8fe552b..5f5b7f4c19 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1427,30 +1427,6 @@ class DataFrame private[sql](
def transform[U](t: DataFrame => DataFrame): DataFrame = t(this)
/**
- * Returns a new RDD by applying a function to all rows of this DataFrame.
- * @group rdd
- * @since 1.3.0
- */
- def map[R: ClassTag](f: Row => R): RDD[R] = rdd.map(f)
-
- /**
- * Returns a new RDD by first applying a function to all rows of this [[DataFrame]],
- * and then flattening the results.
- * @group rdd
- * @since 1.3.0
- */
- def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = rdd.flatMap(f)
-
- /**
- * Returns a new RDD by applying a function to each partition of this DataFrame.
- * @group rdd
- * @since 1.3.0
- */
- def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = {
- rdd.mapPartitions(f)
- }
-
- /**
* Applies a function `f` to all rows.
* @group rdd
* @since 1.3.0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index f06d16116e..a7258d742a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -306,6 +306,7 @@ class GroupedData protected[sql](
val values = df.select(pivotColumn)
.distinct()
.sort(pivotColumn) // ensure that the output columns are in a consistent logical order
+ .rdd
.map(_.get(0))
.take(maxValues + 1)
.toSeq
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
index d912aeb70d..68a251757c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
@@ -100,7 +100,7 @@ private[r] object SQLUtils {
}
def dfToRowRDD(df: DataFrame): JavaRDD[Array[Byte]] = {
- df.map(r => rowToRBytes(r))
+ df.rdd.map(r => rowToRBytes(r))
}
private[this] def doConversion(data: Object, dataType: DataType): Object = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index f54bff9f18..7d96ef6fe0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -257,7 +257,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
}
test("count") {
- assert(testData2.count() === testData2.map(_ => 1).count())
+ assert(testData2.count() === testData2.rdd.map(_ => 1).count())
checkAnswer(
testData2.agg(count('a), sumDistinct('a)), // non-partial
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
index fbffe867e4..bd51154c58 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
@@ -101,7 +101,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
(implicit df: DataFrame): Unit = {
def checkBinaryAnswer(df: DataFrame, expected: Seq[Row]) = {
assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).sorted) {
- df.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted
+ df.rdd.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
index 3c74464d57..c85eeddc2c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
@@ -599,7 +599,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
test("null and non-null strings") {
// Create a dataset where the first values are NULL and then some non-null values. The
// number of non-nulls needs to be bigger than the ParquetReader batch size.
- val data = sqlContext.range(200).map { i =>
+ val data = sqlContext.range(200).rdd.map { i =>
if (i.getLong(0) < 150) Row(None)
else Row("a")
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index f141a9bd0f..12a5542bd4 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -210,7 +210,7 @@ object SparkSubmitClassLoaderTest extends Logging {
}
// Second, we load classes at the executor side.
logInfo("Testing load classes at the executor side.")
- val result = df.mapPartitions { x =>
+ val result = df.rdd.mapPartitions { x =>
var exception: String = null
try {
Utils.classForName(args(0))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 3208ebc9ff..1002487447 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -664,11 +664,13 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
test("implement identity function using case statement") {
val actual = sql("SELECT (CASE key WHEN key THEN key END) FROM src")
+ .rdd
.map { case Row(i: Int) => i }
.collect()
.toSet
val expected = sql("SELECT key FROM src")
+ .rdd
.map { case Row(i: Int) => i }
.collect()
.toSet
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index b11d1d9de0..68249517f5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -119,6 +119,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
// expr = (not leaf-0)
assertResult(10) {
sql("SELECT name, contacts FROM t where age > 5")
+ .rdd
.flatMap(_.getAs[Seq[_]]("contacts"))
.count()
}
@@ -131,7 +132,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
val df = sql("SELECT name, contacts FROM t WHERE age > 5 AND age < 8")
assert(df.count() === 2)
assertResult(4) {
- df.flatMap(_.getAs[Seq[_]]("contacts")).count()
+ df.rdd.flatMap(_.getAs[Seq[_]]("contacts")).count()
}
}
@@ -143,7 +144,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
val df = sql("SELECT name, contacts FROM t WHERE age < 2 OR age > 8")
assert(df.count() === 3)
assertResult(6) {
- df.flatMap(_.getAs[Seq[_]]("contacts")).count()
+ df.rdd.flatMap(_.getAs[Seq[_]]("contacts")).count()
}
}
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 68d5c7da1f..a127cf6e4b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -854,7 +854,7 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with
test(s"hive udfs $table") {
checkAnswer(
sql(s"SELECT concat(stringField, stringField) FROM $table"),
- sql(s"SELECT stringField FROM $table").map {
+ sql(s"SELECT stringField FROM $table").rdd.map {
case Row(s: String) => Row(s + s)
}.collect().toSeq)
}