author    Reynold Xin <rxin@databricks.com>  2015-03-28 23:59:27 -0700
committer Reynold Xin <rxin@databricks.com>  2015-03-28 23:59:27 -0700
commit    5eef00d0c6c7cc5448aca7b1c2a2e289a4c43eb0 (patch)
tree      853e52d983b4b2e21d8bf2161121ad785e3d05ee /python
parent    f75f633b21faaf911f04aeff847f25749b1ecd89 (diff)
[DOC] Improvements to Python docs.
Author: Reynold Xin <rxin@databricks.com>

Closes #5238 from rxin/pyspark-docs and squashes the following commits:

c285951 [Reynold Xin] Reset deprecation warning.
8c1031e [Reynold Xin] inferSchema
dd91b1a [Reynold Xin] [DOC] Improvements to Python docs.
Diffstat (limited to 'python')
-rw-r--r--  python/docs/index.rst            |  8
-rw-r--r--  python/pyspark/sql/__init__.py   | 14
-rw-r--r--  python/pyspark/sql/dataframe.py  |  9

3 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/python/docs/index.rst b/python/docs/index.rst
index d150de9d5c..f7eede9c3c 100644
--- a/python/docs/index.rst
+++ b/python/docs/index.rst
@@ -29,6 +29,14 @@ Core classes:
 
     A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
 
+    :class:`pyspark.sql.SQLContext`
+
+    Main entry point for DataFrame and SQL functionality.
+
+    :class:`pyspark.sql.DataFrame`
+
+    A distributed collection of data grouped into named columns.
+
 Indices and tables
 ==================
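
The two entries added to the index mirror the usual PySpark entry flow. As a
minimal sketch of that flow (assuming a 1.3-era PySpark shell, where a
SparkContext is already bound to ``sc``; the sample data is invented for
illustration)::

    from pyspark.sql import SQLContext, Row

    # SQLContext is the main entry point; it wraps an existing SparkContext.
    sqlContext = SQLContext(sc)

    # A DataFrame is a distributed collection of data grouped into named
    # columns, here built from an RDD of Row objects.
    people = sqlContext.createDataFrame(
        sc.parallelize([Row(name='Alice', age=1), Row(name='Bob', age=2)]))
    people.printSchema()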
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index b9ffd6945e..54a01631d8 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -19,17 +19,19 @@
 public classes of Spark SQL:
 
     - L{SQLContext}
-      Main entry point for SQL functionality.
+      Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
-      A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
-      addition to normal RDD operations, DataFrames also support SQL.
+      A distributed collection of data grouped into named columns.
     - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
-      Column is a DataFrame with a single column.
+      A column expression in a :class:`DataFrame`.
     - L{Row}
-      A Row of data returned by a Spark SQL query.
+      A row of data in a :class:`DataFrame`.
     - L{HiveContext}
-      Main entry point for accessing data stored in Apache Hive..
+      Main entry point for accessing data stored in Apache Hive.
+    - L{functions}
+      List of built-in functions available for :class:`DataFrame`.
 """
 
 from pyspark.sql.context import SQLContext, HiveContext
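
To see how the reworded class descriptions relate to one another, a short
sketch against the same 1.3-era API (hedged; ``people`` is the sample
DataFrame built above)::

    from pyspark.sql import functions as F

    ageCol = people.age                   # Column: a column expression
    people.filter(ageCol > 1).show()      # columns compose into predicates

    grouped = people.groupBy('name')      # GroupedData, from DataFrame.groupBy
    grouped.count().show()                # aggregation returns a DataFrame

    people.agg(F.max(people.age)).show()  # a built-in function from L{functions}

    row = people.first()                  # Row: one row of data
    print(row.name)                       # fields are accessible by name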
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index d51309f7ef..23c0e63e77 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -50,13 +50,6 @@ class DataFrame(object):
 
         ageCol = people.age
 
-    Note that the :class:`Column` type can also be manipulated
-    through its various functions::
-
-        # The following creates a new column that increases everybody's age by 10.
-        people.age + 10
-
-
     A more concrete example::
 
         # To create DataFrame using SQLContext
@@ -77,7 +70,7 @@ class DataFrame(object):
     @property
     def rdd(self):
         """
-        Return the content of the :class:`DataFrame` as an :class:`RDD`
+        Return the content of the :class:`DataFrame` as an :class:`pyspark.RDD`
         of :class:`Row` s.
         """
         if not hasattr(self, '_lazy_rdd'):
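
The reworded ``rdd`` property simply exposes the underlying representation; a
usage sketch (again assuming the ``people`` DataFrame from above)::

    # DataFrame.rdd hands back a pyspark.RDD of Row objects, so ordinary RDD
    # transformations apply from there.
    names = people.rdd.map(lambda row: row.name)
    names.collect()  # ['Alice', 'Bob'] for the sample data above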