author     Reynold Xin <rxin@databricks.com>    2015-05-06 08:50:56 -0700
committer  Reynold Xin <rxin@databricks.com>    2015-05-06 08:50:56 -0700
commit     322e7e7f689947aef29909572ee0c0e110ea23b8 (patch)
tree       e615ab3f8f4ecbda3e4917b529d85237b8e90b07
parent     32cdc815c6fc19b5c8c4eca35f88a61302d67cd5 (diff)
[SQL] JavaDoc update for various DataFrame functions.
Author: Reynold Xin <rxin@databricks.com>

Closes #5935 from rxin/df-doc1 and squashes the following commits:

aaeaadb [Reynold Xin] [SQL] JavaDoc update for various DataFrame functions.
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/Column.scala                 |  4
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala              | 16
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 19
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/functions.scala              | 14
4 files changed, 32 insertions(+), 21 deletions(-)
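
The bulk of this change swaps the doubled braces `{{ ... }}` for the triple braces `{{{ ... }}}` that Scaladoc requires to render a code block; the doubled form renders as plain text. A minimal sketch of the corrected markup, using a hypothetical countActive helper:

import org.apache.spark.sql.DataFrame

/**
 * Counts the active rows of a DataFrame.
 * {{{
 *   // assumes df has a boolean column "isActive"
 *   countActive(df)
 * }}}
 */
def countActive(df: DataFrame): Long = df.filter(df("isActive")).count()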
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index c0503bf047..8eb632d3d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -84,14 +84,14 @@ class Column(protected[sql] val expr: Expression) extends Logging {
/**
* Inversion of boolean expression, i.e. NOT.
- * {{
+ * {{{
* // Scala: select rows that are not active (isActive === false)
* df.filter( !df("isActive") )
*
* // Java:
* import static org.apache.spark.sql.functions.*;
* df.filter( not(df.col("isActive")) );
- * }}
+ * }}}
*
* @group expr_ops
*/
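
A minimal sketch of the documented negation in use, assuming a DataFrame `df` with a boolean column `isActive`:

import org.apache.spark.sql.functions.not

val inactive  = df.filter(!df("isActive"))      // Scala operator form
val inactive2 = df.filter(not(df("isActive")))  // function form, as in the Java example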
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index aceb7a9627..9d2cd7aae3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -681,11 +681,11 @@ class DataFrame private[sql](
/**
* (Scala-specific) Aggregates on the entire [[DataFrame]] without groups.
- * {{
+ * {{{
* // df.agg(...) is a shorthand for df.groupBy().agg(...)
* df.agg("age" -> "max", "salary" -> "avg")
* df.groupBy().agg("age" -> "max", "salary" -> "avg")
- * }}
+ * }}}
* @group dfops
*/
def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = {
@@ -694,33 +694,33 @@ class DataFrame private[sql](
/**
* (Scala-specific) Aggregates on the entire [[DataFrame]] without groups.
- * {{
+ * {{{
* // df.agg(...) is a shorthand for df.groupBy().agg(...)
* df.agg(Map("age" -> "max", "salary" -> "avg"))
* df.groupBy().agg(Map("age" -> "max", "salary" -> "avg"))
- * }}
+ * }}}
* @group dfops
*/
def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs)
/**
* (Java-specific) Aggregates on the entire [[DataFrame]] without groups.
- * {{
+ * {{{
* // df.agg(...) is a shorthand for df.groupBy().agg(...)
* df.agg(Map("age" -> "max", "salary" -> "avg"))
* df.groupBy().agg(Map("age" -> "max", "salary" -> "avg"))
- * }}
+ * }}}
* @group dfops
*/
def agg(exprs: java.util.Map[String, String]): DataFrame = groupBy().agg(exprs)
/**
* Aggregates on the entire [[DataFrame]] without groups.
- * {{
+ * {{{
* // df.agg(...) is a shorthand for df.groupBy().agg(...)
* df.agg(max($"age"), avg($"salary"))
* df.groupBy().agg(max($"age"), avg($"salary"))
- * }}
+ * }}}
* @group dfops
*/
@scala.annotation.varargs
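
A minimal sketch of the whole-DataFrame aggregation shorthand these docs describe, assuming a DataFrame `df` with numeric columns `age` and `salary`:

import org.apache.spark.sql.functions.{avg, max}

// Each of these aggregates over the entire DataFrame;
// df.agg(...) is shorthand for df.groupBy().agg(...):
df.agg("age" -> "max", "salary" -> "avg")
df.agg(Map("age" -> "max", "salary" -> "avg"))
df.agg(max(df("age")), avg(df("salary")))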
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index cb88deab35..a1e74470af 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -37,7 +37,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
StatFunctions.calculateCov(df, Seq(col1, col2))
}
- /*
+ /**
* Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
* Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
* MLlib's Statistics.
@@ -75,7 +75,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* each row.
* @param col2 The name of the second column. Distinct items will make the column names
* of the DataFrame.
- * @return A Local DataFrame containing the table
+ * @return A DataFrame containing the contingency table.
*/
def crosstab(col1: String, col2: String): DataFrame = {
StatFunctions.crossTabulate(df, col1, col2)
@@ -110,14 +110,25 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
}
/**
- * Python friendly implementation for `freqItems`
+ * (Scala-specific) Finds frequent items for columns, possibly with false positives, using the
+ * frequent element count algorithm described in
+ * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+ *
+ * @param cols the names of the columns in which to search for frequent items.
+ * @return A local DataFrame with an array of frequent items for each column.
*/
def freqItems(cols: Seq[String], support: Double): DataFrame = {
FrequentItems.singlePassFreqItems(df, cols, support)
}
/**
- * Python friendly implementation for `freqItems` with a default `support` of 1%.
+ * (Scala-specific) Finds frequent items for columns, possibly with false positives, using the
+ * frequent element count algorithm described in
+ * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+ * Uses a default `support` of 1%.
+ *
+ * @param cols the names of the columns in which to search for frequent items.
+ * @return A local DataFrame with an array of frequent items for each column.
*/
def freqItems(cols: Seq[String]): DataFrame = {
FrequentItems.singlePassFreqItems(df, cols, 0.01)
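
A minimal sketch of the stat functions documented above, assuming a DataFrame `df` with columns `word` and `site` (hypothetical names):

// Frequent items at an explicit 10% support, and at the 1% default:
val freq        = df.stat.freqItems(Seq("word"), 0.1)
val freqDefault = df.stat.freqItems(Seq("word"))

// Contingency table of the two columns, as returned by crosstab:
val table = df.stat.crosstab("word", "site")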
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index f2bb4534c7..830b501771 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -22,7 +22,7 @@ import scala.reflect.runtime.universe.{TypeTag, typeTag}
import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, Star}
+import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.mathfuncs._
import org.apache.spark.sql.types._
@@ -86,10 +86,10 @@ object functions {
/**
* Returns a sort expression based on ascending order of the column.
- * {{
+ * {{{
* // Sort by dept in ascending order, and then age in descending order.
* df.sort(asc("dept"), desc("age"))
- * }}
+ * }}}
*
* @group sort_funcs
*/
@@ -97,10 +97,10 @@ object functions {
/**
* Returns a sort expression based on the descending order of the column.
- * {{
+ * {{{
* // Sort by dept in ascending order, and then age in descending order.
* df.sort(asc("dept"), desc("age"))
- * }}
+ * }}}
*
* @group sort_funcs
*/
@@ -353,13 +353,13 @@ object functions {
/**
* Inversion of boolean expression, i.e. NOT.
- * {{
+ * {{{
* // Scala: select rows that are not active (isActive === false)
* df.filter( !df("isActive") )
*
* // Java:
* df.filter( not(df.col("isActive")) );
- * }}
+ * }}}
*
* @group normal_funcs
*/
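
Finally, a minimal sketch of the sort expressions from functions.scala, assuming a DataFrame `df` with columns `dept` and `age`:

import org.apache.spark.sql.functions.{asc, desc}

// Sort by dept in ascending order, then by age in descending order:
val sorted = df.sort(asc("dept"), desc("age"))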