diff options
author | Michael Armbrust <michael@databricks.com> | 2014-11-16 21:55:57 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2014-11-16 21:56:30 -0800 |
commit | 8305e803e23808507b68fa9a6876ee455e58ac27 (patch) | |
tree | 9120aa92a106e61a112c139339767d3e74c743bb | |
parent | f3b93c1bac292fccb05bf16d1da4b1863b3031fd (diff) | |
download | spark-8305e803e23808507b68fa9a6876ee455e58ac27.tar.gz spark-8305e803e23808507b68fa9a6876ee455e58ac27.tar.bz2 spark-8305e803e23808507b68fa9a6876ee455e58ac27.zip |
[SPARK-4410][SQL] Add support for external sort
Adds a new operator that uses Spark's `ExternalSort` class. It is off by default now, but we might consider making it the default if benchmarks show that it does not regress performance.
Author: Michael Armbrust <michael@databricks.com>
Closes #3268 from marmbrus/externalSort and squashes the following commits:
48b9726 [Michael Armbrust] comments
b98799d [Michael Armbrust] Add test
afd7562 [Michael Armbrust] Add support for external sort.
(cherry picked from commit 64c6b9bad559c21f25cd9fbe37c8813cdab939f2)
Signed-off-by: Reynold Xin <rxin@databricks.com>
4 files changed, 59 insertions, 6 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index cd7d78e684..9697beb132 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -39,6 +39,10 @@ private[spark] object SQLConf { val COLUMN_NAME_OF_CORRUPT_RECORD = "spark.sql.columnNameOfCorruptRecord" + // Options that control which operators can be chosen by the query planner. These should be + // considered hints and may be ignored by future versions of Spark SQL. + val EXTERNAL_SORT = "spark.sql.planner.externalSort" + // This is only used for the thriftserver val THRIFTSERVER_POOL = "spark.sql.thriftserver.scheduler.pool" @@ -96,6 +100,9 @@ private[sql] trait SQLConf { private[spark] def parquetFilterPushDown = getConf(PARQUET_FILTER_PUSHDOWN_ENABLED, "false").toBoolean + /** When true the planner will use the external sort, which may spill to disk. */ + private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT, "false").toBoolean + /** * When set to true, Spark SQL will use the Scala compiler at runtime to generate custom bytecode * that evaluates expressions found in queries. In general this custom code runs much faster diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 03cd5bd627..7ef1f9f2c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -263,9 +263,12 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.Distinct(child) => execution.Distinct(partial = false, execution.Distinct(partial = true, planLater(child))) :: Nil + + case logical.Sort(sortExprs, child) if sqlContext.externalSortEnabled => + execution.ExternalSort(sortExprs, global = true, planLater(child)):: Nil case logical.Sort(sortExprs, child) => - // This sort is a global sort. Its requiredDistribution will be an OrderedDistribution. execution.Sort(sortExprs, global = true, planLater(child)):: Nil + case logical.SortPartitions(sortExprs, child) => // This sort only sorts tuples within a partition. Its requiredDistribution will be // an UnspecifiedDistribution. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 1b8ba3ace2..e53723c176 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, OrderedDistribution, SinglePartition, UnspecifiedDistribution} import org.apache.spark.util.MutablePair +import org.apache.spark.util.collection.ExternalSorter /** * :: DeveloperApi :: @@ -189,6 +190,9 @@ case class TakeOrdered(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan) /** * :: DeveloperApi :: + * Performs a sort on-heap. + * @param global when true performs a global sort of all partitions by shuffling the data first + * if necessary. */ @DeveloperApi case class Sort( @@ -199,12 +203,37 @@ case class Sort( override def requiredChildDistribution = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil + override def execute() = attachTree(this, "sort") { + child.execute().mapPartitions( { iterator => + val ordering = newOrdering(sortOrder, child.output) + iterator.map(_.copy()).toArray.sorted(ordering).iterator + }, preservesPartitioning = true) + } + + override def output = child.output +} + +/** + * :: DeveloperApi :: + * Performs a sort, spilling to disk as needed. + * @param global when true performs a global sort of all partitions by shuffling the data first + * if necessary. + */ +@DeveloperApi +case class ExternalSort( + sortOrder: Seq[SortOrder], + global: Boolean, + child: SparkPlan) + extends UnaryNode { + override def requiredChildDistribution = + if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil override def execute() = attachTree(this, "sort") { - child.execute() - .mapPartitions( { iterator => - val ordering = newOrdering(sortOrder, child.output) - iterator.map(_.copy()).toArray.sorted(ordering).iterator + child.execute().mapPartitions( { iterator => + val ordering = newOrdering(sortOrder, child.output) + val sorter = new ExternalSorter[Row, Null, Row](ordering = Some(ordering)) + sorter.insertAll(iterator.map(r => (r, null))) + sorter.iterator.map(_._1) }, preservesPartitioning = true) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index ce5672c086..a63515464c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -196,7 +196,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Seq(Seq("1"))) } - test("sorting") { + def sortTest() = { checkAnswer( sql("SELECT * FROM testData2 ORDER BY a ASC, b ASC"), Seq((1,1), (1,2), (2,1), (2,2), (3,1), (3,2))) @@ -238,6 +238,20 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { mapData.collect().sortBy(_.data(1)).reverse.toSeq) } + test("sorting") { + val before = externalSortEnabled + setConf(SQLConf.EXTERNAL_SORT, "false") + sortTest() + setConf(SQLConf.EXTERNAL_SORT, before.toString) + } + + test("external sorting") { + val before = externalSortEnabled + setConf(SQLConf.EXTERNAL_SORT, "true") + sortTest() + setConf(SQLConf.EXTERNAL_SORT, before.toString) + } + test("limit") { checkAnswer( sql("SELECT * FROM testData LIMIT 10"), |