diff options
author | Reynold Xin <rxin@databricks.com> | 2015-03-28 23:59:27 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-03-28 23:59:36 -0700 |
commit | 3db08444bcb498cdd962f317c15bf6dfb34b0de0 (patch) | |
tree | c02cd7d5e4a5264734ef2ed0b729aa07b846cc88 | |
parent | 5e04f451875102e30e174491000515c39df8260c (diff) | |
download | spark-3db08444bcb498cdd962f317c15bf6dfb34b0de0.tar.gz spark-3db08444bcb498cdd962f317c15bf6dfb34b0de0.tar.bz2 spark-3db08444bcb498cdd962f317c15bf6dfb34b0de0.zip |
[DOC] Improvements to Python docs.
Author: Reynold Xin <rxin@databricks.com>
Closes #5238 from rxin/pyspark-docs and squashes the following commits:
c285951 [Reynold Xin] Reset deprecation warning.
8c1031e [Reynold Xin] inferSchema
dd91b1a [Reynold Xin] [DOC] Improvements to Python docs.
(cherry picked from commit 5eef00d0c6c7cc5448aca7b1c2a2e289a4c43eb0)
Signed-off-by: Reynold Xin <rxin@databricks.com>
-rw-r--r-- | python/docs/index.rst | 8 | ||||
-rw-r--r-- | python/pyspark/sql/__init__.py | 14 | ||||
-rw-r--r-- | python/pyspark/sql/dataframe.py | 9 |
3 files changed, 17 insertions, 14 deletions
```diff
diff --git a/python/docs/index.rst b/python/docs/index.rst
index d150de9d5c..f7eede9c3c 100644
--- a/python/docs/index.rst
+++ b/python/docs/index.rst
@@ -29,6 +29,14 @@ Core classes:
 
     A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
 
+    :class:`pyspark.sql.SQLContext`
+
+    Main entry point for DataFrame and SQL functionality.
+
+    :class:`pyspark.sql.DataFrame`
+
+    A distributed collection of data grouped into named columns.
+
 
 Indices and tables
 ==================
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index b9ffd6945e..54a01631d8 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -19,17 +19,19 @@
 public classes of Spark SQL:
 
     - L{SQLContext}
-      Main entry point for SQL functionality.
+      Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
-      A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
-      addition to normal RDD operations, DataFrames also support SQL.
+      A distributed collection of data grouped into named columns.
     - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
-      Column is a DataFrame with a single column.
+      A column expression in a :class:`DataFrame`.
     - L{Row}
-      A Row of data returned by a Spark SQL query.
+      A row of data in a :class:`DataFrame`.
     - L{HiveContext}
-      Main entry point for accessing data stored in Apache Hive..
+      Main entry point for accessing data stored in Apache Hive.
+    - L{functions}
+      List of built-in functions available for :class:`DataFrame`.
 """
 
 from pyspark.sql.context import SQLContext, HiveContext
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index d51309f7ef..23c0e63e77 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -50,13 +50,6 @@ class DataFrame(object):
 
         ageCol = people.age
 
-    Note that the :class:`Column` type can also be manipulated
-    through its various functions::
-
-        # The following creates a new column that increases everybody's age by 10.
-        people.age + 10
-
-
     A more concrete example::
 
         # To create DataFrame using SQLContext
@@ -77,7 +70,7 @@ class DataFrame(object):
     @property
     def rdd(self):
         """
-        Return the content of the :class:`DataFrame` as an :class:`RDD`
+        Return the content of the :class:`DataFrame` as an :class:`pyspark.RDD`
         of :class:`Row` s.
         """
         if not hasattr(self, '_lazy_rdd'):
```