diff options
author | Cheng Lian <lian@databricks.com> | 2016-03-21 10:06:02 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-03-21 10:06:02 -0700 |
commit | 060a28c633e559376976561248bcb30c4739b76d (patch) | |
tree | 5019a54a6c39cbcf10e2e5be0c0ffd33aa9689d7 /sql | |
parent | a2a90780281115b72ebe2e4334d6e83058436e27 (diff) | |
download | spark-060a28c633e559376976561248bcb30c4739b76d.tar.gz spark-060a28c633e559376976561248bcb30c4739b76d.tar.bz2 spark-060a28c633e559376976561248bcb30c4739b76d.zip |
[SPARK-13826][SQL] Ad-hoc Dataset API ScalaDoc fixes
## What changes were proposed in this pull request?
Ad-hoc Dataset API ScalaDoc fixes
## How was this patch tested?
By building and checking ScalaDoc locally.
Author: Cheng Lian <lian@databricks.com>
Closes #11862 from liancheng/ds-doc-fixes.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 39 |
1 file changed, 21 insertions, 18 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 6e7d208723..295cb67eb4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -67,24 +67,24 @@ private[sql] object Dataset { * * Operations available on Datasets are divided into transformations and actions. Transformations * are the ones that produce new Datasets, and actions are the ones that trigger computation and - * return results. Example transformations include map, filter, select, aggregate (groupBy). + * return results. Example transformations include map, filter, select, and aggregate (`groupBy`). * Example actions count, show, or writing data out to file systems. * * Datasets are "lazy", i.e. computations are only triggered when an action is invoked. Internally, * a Dataset represents a logical plan that describes the computation required to produce the data. * When an action is invoked, Spark's query optimizer optimizes the logical plan and generates a - * physical plan for efficient execution in a parallel or distributed manner. To explore the + * physical plan for efficient execution in a parallel and distributed manner. To explore the * logical plan as well as optimized physical plan, use the `explain` function. * * To efficiently support domain-specific objects, an [[Encoder]] is required. The encoder maps - * the domain specific type T to Spark's internal type system. For example, given a class Person - * with two fields, name (string) and age (int), an encoder is used to tell Spark to generate code - * at runtime to serialize the Person object into a binary structure. This binary structure often - * has much lower memory footprint as well as are optimized for efficiency in data processing + * the domain specific type `T` to Spark's internal type system. 
For example, given a class `Person` + * with two fields, `name` (string) and `age` (int), an encoder is used to tell Spark to generate + * code at runtime to serialize the `Person` object into a binary structure. This binary structure + * often has much lower memory footprint as well as are optimized for efficiency in data processing * (e.g. in a columnar format). To understand the internal binary representation for data, use the * `schema` function. * - * There are typically two ways to create a Dataset. The most common way to by pointing Spark + * There are typically two ways to create a Dataset. The most common way is by pointing Spark * to some files on storage systems, using the `read` function available on a `SparkSession`. * {{{ * val people = session.read.parquet("...").as[Person] // Scala @@ -98,7 +98,7 @@ private[sql] object Dataset { * Dataset<String> names = people.map((Person p) -> p.name, Encoders.STRING) // in Java 8 * }}} * - * Dataset operations can also be untyped, through the various domain-specific-language (DSL) + * Dataset operations can also be untyped, through various domain-specific-language (DSL) * functions defined in: [[Dataset]] (this class), [[Column]], and [[functions]]. These operations * are very similar to the operations available in the data frame abstraction in R or Python. 
* @@ -118,8 +118,8 @@ private[sql] object Dataset { * A more concrete example in Scala: * {{{ * // To create Dataset[Row] using SQLContext - * val people = sqlContext.read.parquet("...") - * val department = sqlContext.read.parquet("...") + * val people = session.read.parquet("...") + * val department = session.read.parquet("...") * * people.filter("age > 30") * .join(department, people("deptId") === department("id")) @@ -130,8 +130,8 @@ private[sql] object Dataset { * and in Java: * {{{ * // To create Dataset<Row> using SQLContext - * Dataset<Row> people = sqlContext.read().parquet("..."); - * Dataset<Row> department = sqlContext.read().parquet("..."); + * Dataset<Row> people = session.read().parquet("..."); + * Dataset<Row> department = session.read().parquet("..."); * * people.filter("age".gt(30)) * .join(department, people.col("deptId").equalTo(department("id"))) @@ -1106,7 +1106,7 @@ class Dataset[T] private[sql]( } /** - * Groups the [[Dataset]] using the specified columns, so we can run aggregation on them. + * Groups the [[Dataset]] using the specified columns, so that we can run aggregation on them. * See [[RelationalGroupedDataset]] for all the available aggregate functions. * * This is a variant of groupBy that can only group by existing columns using column names @@ -1341,7 +1341,7 @@ class Dataset[T] private[sql]( } /** - * Returns a new [[Dataset]] containing union of rows in this frame and another frame. + * Returns a new [[Dataset]] containing union of rows in this Dataset and another Dataset. * This is equivalent to `UNION ALL` in SQL. * * To do a SQL-style set union (that does deduplication of elements), use this function followed @@ -1357,7 +1357,7 @@ class Dataset[T] private[sql]( } /** - * Returns a new [[Dataset]] containing union of rows in this frame and another frame. + * Returns a new [[Dataset]] containing union of rows in this Dataset and another Dataset. * This is equivalent to `UNION ALL` in SQL. 
* * @group typedrel @@ -1366,7 +1366,7 @@ class Dataset[T] private[sql]( def union(other: Dataset[T]): Dataset[T] = unionAll(other) /** - * Returns a new [[Dataset]] containing rows only in both this frame and another frame. + * Returns a new [[Dataset]] containing rows only in both this Dataset and another Dataset. * This is equivalent to `INTERSECT` in SQL. * * Note that, equality checking is performed directly on the encoded representation of the data @@ -1380,7 +1380,7 @@ class Dataset[T] private[sql]( } /** - * Returns a new [[Dataset]] containing rows in this frame but not in another frame. + * Returns a new [[Dataset]] containing rows in this Dataset but not in another Dataset. * This is equivalent to `EXCEPT` in SQL. * * Note that, equality checking is performed directly on the encoded representation of the data @@ -1394,9 +1394,12 @@ class Dataset[T] private[sql]( } /** - * Returns a new [[Dataset]] containing rows in this frame but not in another frame. + * Returns a new [[Dataset]] containing rows in this Dataset but not in another Dataset. * This is equivalent to `EXCEPT` in SQL. * + * Note that, equality checking is performed directly on the encoded representation of the data + * and thus is not affected by a custom `equals` function defined on `T`. + * * @group typedrel * @since 2.0.0 */ |