authorBurak Yavuz <brkyvz@gmail.com>2015-08-24 13:48:01 -0700
committerReynold Xin <rxin@databricks.com>2015-08-24 13:48:01 -0700
commit9ce0c7ad333f4a3c01207e5e9ed42bcafb99d894 (patch)
tree361725d0a63c2b909faa3aed7e86e66a1226434a /sql
parent7478c8b66d6a2b1179f20c38b49e27e37b0caec3 (diff)
[SPARK-7710] [SPARK-7998] [DOCS] Docs for DataFrameStatFunctions
This PR adds examples showing how to use some of the stat functions available for DataFrames under `df.stat`. cc rxin

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #8378 from brkyvz/update-sql-docs.
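For orientation, a minimal setup sketch for the examples below: they assume a spark-shell style session where `sc` and `sqlContext` (with its implicits) are already in scope, and all of the documented statistics hang off `df.stat`.

{{{
// Setup sketch (assumptions: a spark-shell session on Spark 1.4+, with `sc`
// and `sqlContext` already defined, as in the examples in this patch).
import sqlContext.implicits._
import org.apache.spark.sql.functions.rand

val df = sc.parallelize(0 until 10).toDF("id")
  .withColumn("rand1", rand(seed = 10))
  .withColumn("rand2", rand(seed = 27))

df.stat.cov("rand1", "rand2")   // covariance of two columns
df.stat.corr("rand1", "rand2")  // Pearson correlation
}}}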
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala              |   2 +-
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 101 ++++++++++++++++++++++
2 files changed, 102 insertions(+), 1 deletion(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d6688b24ae..791c10c3d7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -684,7 +684,7 @@ class DataFrame private[sql](
// make it a NamedExpression.
case Column(u: UnresolvedAttribute) => UnresolvedAlias(u)
case Column(expr: NamedExpression) => expr
- // Leave an unaliased explode with an empty list of names since the analzyer will generate the
+ // Leave an unaliased explode with an empty list of names since the analyzer will generate the
// correct defaults after the nested expression's type has been resolved.
case Column(explode: Explode) => MultiAlias(explode, Nil)
case Column(expr: Expression) => Alias(expr, expr.prettyString)()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 2e68e358f2..69c9847175 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -39,6 +39,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the second column
* @return the covariance of the two columns.
*
+ * {{{
+ * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+ * .withColumn("rand2", rand(seed=27))
+ * df.stat.cov("rand1", "rand2")
+ * res1: Double = 0.065...
+ * }}}
+ *
* @since 1.4.0
*/
def cov(col1: String, col2: String): Double = {
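The covariance of a column with itself is that column's variance, which gives a cheap cross-check; a sketch reusing the `df` built above, assuming `sqlContext.implicits._` is imported and that `cov` uses the sample (n - 1) normalization (treat that normalization as an assumption):

{{{
import org.apache.spark.sql.functions.{avg, sum}
val n = df.count()
val mu = df.agg(avg($"rand1")).first().getDouble(0)
// Sum of squared deviations from the mean of "rand1".
val ss = df.select((($"rand1" - mu) * ($"rand1" - mu)).as("sq"))
  .agg(sum($"sq")).first().getDouble(0)
df.stat.cov("rand1", "rand1")  // expected to be close to ss / (n - 1)
}}}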
@@ -54,6 +61,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the column to calculate the correlation against
* @return The Pearson Correlation Coefficient as a Double.
*
+ * {{{
+ * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+ * .withColumn("rand2", rand(seed=27))
+ * df.stat.corr("rand1", "rand2")
+ * res1: Double = 0.613...
+ * }}}
+ *
* @since 1.4.0
*/
def corr(col1: String, col2: String, method: String): Double = {
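A hedged aside on the `method` argument (an assumption, not stated in this hunk): only "pearson" appears to be implemented in this version, so the explicit form behaves like the two-argument overload.

{{{
// Sketch: passing the method string explicitly; other values are expected
// to be rejected in this version (assumption).
df.stat.corr("rand1", "rand2", "pearson")
}}}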
@@ -69,6 +83,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the column to calculate the correlation against
* @return The Pearson Correlation Coefficient as a Double.
*
+ * {{{
+ * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
+ * .withColumn("rand2", rand(seed=27))
+ * df.stat.corr("rand1", "rand2", "pearson")
+ * res1: Double = 0.613...
+ * }}}
+ *
* @since 1.4.0
*/
def corr(col1: String, col2: String): Double = {
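By definition the Pearson coefficient of a column with itself is 1.0, which gives a quick sanity check; a sketch reusing the `df` from the example above:

{{{
df.stat.corr("id", "id")        // expected: 1.0 (up to floating-point error)
df.stat.corr("rand1", "rand1")  // likewise 1.0
}}}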
@@ -92,6 +113,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* of the DataFrame.
* @return A DataFrame containing the contingency table.
*
+ * {{{
+ * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+ * (3, 3))).toDF("key", "value")
+ * val ct = df.stat.crosstab("key", "value")
+ * ct.show()
+ * +---------+---+---+---+
+ * |key_value| 1| 2| 3|
+ * +---------+---+---+---+
+ * | 2| 2| 0| 1|
+ * | 1| 1| 1| 0|
+ * | 3| 0| 1| 1|
+ * +---------+---+---+---+
+ * }}}
+ *
* @since 1.4.0
*/
def crosstab(col1: String, col2: String): DataFrame = {
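Each cell of the contingency table is a plain co-occurrence count, so the output can be cross-checked with an explicit group-by; a sketch over the same `df`: the pair (2, 1) occurs twice in the input, matching the 2 in row "2", column "1".

{{{
df.groupBy("key", "value").count().show()
}}}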
@@ -112,6 +147,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* than 1e-4.
* @return A Local DataFrame with the Array of frequent items for each column.
*
+ * {{{
+ * val rows = Seq.tabulate(100) { i =>
+ * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
+ * }
+ * val df = sqlContext.createDataFrame(rows).toDF("a", "b")
+ * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
+ * // "a" and "b"
+ * val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4)
+ * freqSingles.show()
+ * +-----------+-------------+
+ * |a_freqItems| b_freqItems|
+ * +-----------+-------------+
+ * | [1, 99]|[-1.0, -99.0]|
+ * +-----------+-------------+
+ * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
+ * val pairDf = df.select(struct("a", "b").as("a-b"))
+ * val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1)
+ * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show()
+ * +----------+
+ * | freq_ab|
+ * +----------+
+ * | [1,-1.0]|
+ * | ... |
+ * +----------+
+ * }}}
+ *
* @since 1.4.0
*/
def freqItems(cols: Array[String], support: Double): DataFrame = {
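Since the single-pass algorithm can return false positives (in the output above, 99 and -99.0 each occur only once), an exact count is a useful cross-check when the data is small enough; a sketch over the same `df`:

{{{
// Values of "a" that are truly observed in more than 40% of the 100 rows.
df.groupBy("a").count().where($"count" > 100 * 0.4).show()
}}}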
@@ -147,6 +208,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
*
+ * {{{
+ * val rows = Seq.tabulate(100) { i =>
+ * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
+ * }
+ * val df = sqlContext.createDataFrame(rows).toDF("a", "b")
+ * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
+ * // "a" and "b"
+ * val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)
+ * freqSingles.show()
+ * +-----------+-------------+
+ * |a_freqItems| b_freqItems|
+ * +-----------+-------------+
+ * | [1, 99]|[-1.0, -99.0]|
+ * +-----------+-------------+
+ * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
+ * val pairDf = df.select(struct("a", "b").as("a-b"))
+ * val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1)
+ * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show()
+ * +----------+
+ * | freq_ab|
+ * +----------+
+ * | [1,-1.0]|
+ * | ... |
+ * +----------+
+ * }}}
+ *
* @since 1.4.0
*/
def freqItems(cols: Seq[String], support: Double): DataFrame = {
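The Seq variant can also be called without an explicit support; a sketch, assuming the no-support overload defaults to 0.01 (treat that default as an assumption):

{{{
val freqDefault = df.stat.freqItems(Seq("a", "b"))
freqDefault.show()
}}}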
@@ -180,6 +267,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @tparam T stratum type
* @return a new [[DataFrame]] that represents the stratified sample
*
+ * {{{
+ * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+ * (3, 3))).toDF("key", "value")
+ * val fractions = Map(1 -> 1.0, 3 -> 0.5)
+ * df.stat.sampleBy("key", fractions, 36L).show()
+ * +---+-----+
+ * |key|value|
+ * +---+-----+
+ * | 1| 1|
+ * | 1| 2|
+ * | 3| 2|
+ * +---+-----+
+ * }}}
+ *
* @since 1.5.0
*/
def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = {
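Strata missing from `fractions` are sampled at a rate of zero, which is why no key-2 rows appear in the output above despite three of them in the input; listing every key makes that explicit (a sketch over the same `df`):

{{{
df.stat.sampleBy("key", Map(1 -> 1.0, 2 -> 0.0, 3 -> 0.5), 36L).show()
}}}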