diff options
author | Reynold Xin <rxin@databricks.com> | 2015-06-23 19:30:25 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-06-23 19:30:25 -0700 |
commit | a458efc66c31dc281af379b914bfa2b077ca6635 (patch) | |
tree | bc5955310ec43cb175ea77a147fc3bd99340e27b /sql | |
parent | 0401cbaa8ee51c71f43604f338b65022a479da0a (diff) | |
download | spark-a458efc66c31dc281af379b914bfa2b077ca6635.tar.gz spark-a458efc66c31dc281af379b914bfa2b077ca6635.tar.bz2 spark-a458efc66c31dc281af379b914bfa2b077ca6635.zip |
Revert "[SPARK-7157][SQL] add sampleBy to DataFrame"
This reverts commit 0401cbaa8ee51c71f43604f338b65022a479da0a.
The new test case on Jenkins is failing.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 24 | ||||
-rw-r--r-- | sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala | 12 |
2 files changed, 2 insertions, 34 deletions
```diff
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 955d28771b..edb9ed7bba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql
 
-import java.util.UUID
-
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.execution.stat._
 
@@ -165,26 +163,4 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
   def freqItems(cols: Seq[String]): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, 0.01)
   }
-
-  /**
-   * Returns a stratified sample without replacement based on the fraction given on each stratum.
-   * @param col column that defines strata
-   * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat
-   * its fraction as zero.
-   * @param seed random seed
-   * @return a new [[DataFrame]] that represents the stratified sample
-   *
-   * @since 1.5.0
-   */
-  def sampleBy(col: String, fractions: Map[Any, Double], seed: Long): DataFrame = {
-    require(fractions.values.forall(p => p >= 0.0 && p <= 1.0),
-      s"Fractions must be in [0, 1], but got $fractions.")
-    import org.apache.spark.sql.functions.rand
-    val c = Column(col)
-    val r = rand(seed).as("rand_" + UUID.randomUUID().toString.take(8))
-    val expr = fractions.toSeq.map { case (k, v) =>
-      (c === k) && (r < v)
-    }.reduce(_ || _) || false
-    df.filter(expr)
-  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 3dd4688912..0d3ff899da 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -19,9 +19,9 @@ package org.apache.spark.sql
 
 import org.scalatest.Matchers._
 
-import org.apache.spark.sql.functions.col
+import org.apache.spark.SparkFunSuite
 
-class DataFrameStatSuite extends QueryTest {
+class DataFrameStatSuite extends SparkFunSuite {
   private val sqlCtx = org.apache.spark.sql.test.TestSQLContext
   import sqlCtx.implicits._
 
@@ -98,12 +98,4 @@ class DataFrameStatSuite extends QueryTest {
     val items2 = singleColResults.collect().head
     items2.getSeq[Double](0) should contain (-1.0)
   }
-
-  test("sampleBy") {
-    val df = sqlCtx.range(0, 100).select((col("id") % 3).as("key"))
-    val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L)
-    checkAnswer(
-      sampled.groupBy("key").count().orderBy("key"),
-      Seq(Row(0, 4), Row(1, 9)))
-  }
 }
```