path: root/python/pyspark/sql/dataframe.py
author     Davies Liu <davies@databricks.com>           2015-02-24 20:51:55 -0800
committer  Michael Armbrust <michael@databricks.com>    2015-02-24 20:51:55 -0800
commit     d641fbb39c90b1d734cc55396ca43d7e98788975 (patch)
tree       d9741f7c08d6ba288b224e8c2af60fc1bdb445f3 /python/pyspark/sql/dataframe.py
parent     769e092bdc51582372093f76dbaece27149cc4ea (diff)
download   spark-d641fbb39c90b1d734cc55396ca43d7e98788975.tar.gz
           spark-d641fbb39c90b1d734cc55396ca43d7e98788975.tar.bz2
           spark-d641fbb39c90b1d734cc55396ca43d7e98788975.zip
[SPARK-5994] [SQL] Python DataFrame documentation fixes
select empty should NOT be the same as select. make sure selectExpr is behaving the same.
join param documentation
link to source doesn't work in jekyll generated file
cross reference of columns (i.e. enabling linking)
show(): move df example before df.show()
move tests in SQLContext out of docstring, otherwise doc is too long
Column.desc and .asc doesn't have any documentation
in documentation, sort functions.*)

Author: Davies Liu <davies@databricks.com>

Closes #4756 from davies/df_docs and squashes the following commits:

f30502c [Davies Liu] fix doc
32f0d46 [Davies Liu] fix DataFrame docs
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--  python/pyspark/sql/dataframe.py  56
1 file changed, 28 insertions(+), 28 deletions(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 6f746d136b..6d42410020 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -96,7 +96,7 @@ class DataFrame(object):
return self._lazy_rdd
def toJSON(self, use_unicode=False):
- """Convert a DataFrame into a MappedRDD of JSON documents; one document per row.
+ """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row.
>>> df.toJSON().first()
'{"age":2,"name":"Alice"}'
@@ -108,7 +108,7 @@ class DataFrame(object):
"""Save the contents as a Parquet file, preserving the schema.
Files that are written out using this method can be read back in as
- a DataFrame using the L{SQLContext.parquetFile} method.
+ a :class:`DataFrame` using the L{SQLContext.parquetFile} method.
>>> import tempfile, shutil
>>> parquetFile = tempfile.mkdtemp()
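(The doctest above is cut off by the diff context; a plausible round trip, assuming the `df` and `sqlCtx` fixtures the surrounding doctests use, continues along these lines:)

>>> shutil.rmtree(parquetFile)             # the target directory must not already exist
>>> df.saveAsParquetFile(parquetFile)      # write the DataFrame out as Parquet
>>> df2 = sqlCtx.parquetFile(parquetFile)  # read it back in as a DataFrame
>>> sorted(df.collect()) == sorted(df2.collect())
True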
@@ -139,7 +139,7 @@ class DataFrame(object):
self.registerTempTable(name)
def insertInto(self, tableName, overwrite=False):
- """Inserts the contents of this DataFrame into the specified table.
+ """Inserts the contents of this :class:`DataFrame` into the specified table.
Optionally overwriting any existing data.
"""
@@ -165,7 +165,7 @@ class DataFrame(object):
return jmode
def saveAsTable(self, tableName, source=None, mode="append", **options):
- """Saves the contents of the DataFrame to a data source as a table.
+ """Saves the contents of the :class:`DataFrame` to a data source as a table.
The data source is specified by the `source` and a set of `options`.
If `source` is not specified, the default data source configured by
@@ -174,12 +174,13 @@ class DataFrame(object):
Additionally, mode is used to specify the behavior of the saveAsTable operation when
table already exists in the data source. There are four modes:
- * append: Contents of this DataFrame are expected to be appended to existing table.
- * overwrite: Data in the existing table is expected to be overwritten by the contents of \
- this DataFrame.
+ * append: Contents of this :class:`DataFrame` are expected to be appended \
+ to existing table.
+ * overwrite: Data in the existing table is expected to be overwritten by \
+ the contents of this DataFrame.
* error: An exception is expected to be thrown.
- * ignore: The save operation is expected to not save the contents of the DataFrame and \
- to not change the existing table.
+ * ignore: The save operation is expected to not save the contents of the \
+ :class:`DataFrame` and to not change the existing table.
"""
if source is None:
source = self.sql_ctx.getConf("spark.sql.sources.default",
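(A hedged sketch of the modes listed above; "people_copy" is a hypothetical table name:)

>>> df.saveAsTable("people_copy")                    # default mode="append"
>>> df.saveAsTable("people_copy", mode="overwrite")  # replace the table's data
>>> df.saveAsTable("people_copy", mode="ignore")     # no-op if the table already exists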
@@ -190,7 +191,7 @@ class DataFrame(object):
self._jdf.saveAsTable(tableName, source, jmode, joptions)
def save(self, path=None, source=None, mode="append", **options):
- """Saves the contents of the DataFrame to a data source.
+ """Saves the contents of the :class:`DataFrame` to a data source.
The data source is specified by the `source` and a set of `options`.
If `source` is not specified, the default data source configured by
@@ -199,11 +200,11 @@ class DataFrame(object):
Additionally, mode is used to specify the behavior of the save operation when
data already exists in the data source. There are four modes:
- * append: Contents of this DataFrame are expected to be appended to existing data.
+ * append: Contents of this :class:`DataFrame` are expected to be appended to existing data.
* overwrite: Existing data is expected to be overwritten by the contents of this DataFrame.
* error: An exception is expected to be thrown.
- * ignore: The save operation is expected to not save the contents of the DataFrame and \
- to not change the existing data.
+ * ignore: The save operation is expected to not save the contents of \
+ the :class:`DataFrame` and to not change the existing data.
"""
if path is not None:
options["path"] = path
@@ -217,7 +218,7 @@ class DataFrame(object):
@property
def schema(self):
- """Returns the schema of this DataFrame (represented by
+ """Returns the schema of this :class:`DataFrame` (represented by
a L{StructType}).
>>> df.schema
@@ -275,12 +276,12 @@ class DataFrame(object):
"""
Print the first 20 rows.
+ >>> df
+ DataFrame[age: int, name: string]
>>> df.show()
age name
2 Alice
5 Bob
- >>> df
- DataFrame[age: int, name: string]
"""
print self._jdf.showString().encode('utf8', 'ignore')
@@ -481,8 +482,8 @@ class DataFrame(object):
def join(self, other, joinExprs=None, joinType=None):
"""
- Join with another DataFrame, using the given join expression.
- The following performs a full outer join between `df1` and `df2`::
+ Join with another :class:`DataFrame`, using the given join expression.
+ The following performs a full outer join between `df1` and `df2`.
:param other: Right side of the join
:param joinExprs: Join expression
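(The full outer join example referenced above falls outside this hunk; a sketch consistent with the signature, assuming a second DataFrame `df2` that shares a `name` column and adds `height`:)

>>> joined = df.join(df2, df.name == df2.name, "outer")  # full outer join
>>> rows = joined.select(df.name, df2.height).collect()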
@@ -582,8 +583,6 @@ class DataFrame(object):
def select(self, *cols):
""" Selecting a set of expressions.
- >>> df.select().collect()
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
>>> df.select('*').collect()
[Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
>>> df.select('name', 'age').collect()
@@ -591,8 +590,6 @@ class DataFrame(object):
>>> df.select(df.name, (df.age + 10).alias('age')).collect()
[Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
"""
- if not cols:
- cols = ["*"]
jcols = ListConverter().convert([_to_java_column(c) for c in cols],
self._sc._gateway._gateway_client)
jdf = self._jdf.select(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols))
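(With the fallback removed, an empty select() no longer silently behaves like select('*'); every column must now be requested explicitly, as the first doctest above shows:)

>>> all_columns = df.select('*')  # the empty-argument shortcut is gone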
@@ -612,7 +609,7 @@ class DataFrame(object):
def filter(self, condition):
""" Filtering rows using the given condition, which could be
- Column expression or string of SQL expression.
+ :class:`Column` expression or string of SQL expression.
where() is an alias for filter().
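(Both accepted forms of the condition, assuming the `df` fixture from the doctests:)

>>> adults = df.filter(df.age > 3)  # Column expression
>>> adults = df.filter("age > 3")   # equivalent SQL expression string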
@@ -666,7 +663,7 @@ class DataFrame(object):
return self.groupBy().agg(*exprs)
def unionAll(self, other):
- """ Return a new DataFrame containing union of rows in this
+ """ Return a new :class:`DataFrame` containing union of rows in this
frame and another frame.
This is equivalent to `UNION ALL` in SQL.
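(A short sketch; like SQL UNION ALL, duplicate rows are kept rather than deduplicated:)

>>> doubled = df.unionAll(df)  # every row of df appears twice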
@@ -919,9 +916,10 @@ class Column(object):
"""
A column in a DataFrame.
- `Column` instances can be created by::
+ :class:`Column` instances can be created by::
# 1. Select a column out of a DataFrame
+
df.colName
df["colName"]
@@ -975,7 +973,7 @@ class Column(object):
def substr(self, startPos, length):
"""
- Return a Column which is a substring of the column
+ Return a :class:`Column` which is a substring of the column
:param startPos: start position (int or Column)
:param length: length of the substring (int or Column)
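(A sketch using the doctest fixture; note that Spark's substring positions are 1-based:)

>>> initials = df.select(df.name.substr(1, 3).alias("col"))  # first three characters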
@@ -996,8 +994,10 @@ class Column(object):
__getslice__ = substr
# order
- asc = _unary_op("asc")
- desc = _unary_op("desc")
+ asc = _unary_op("asc", "Returns a sort expression based on the"
+ " ascending order of the given column name.")
+ desc = _unary_op("desc", "Returns a sort expression based on the"
+ " descending order of the given column name.")
isNull = _unary_op("isNull", "True if the current expression is null.")
isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
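(A hedged sketch of the newly documented sort expressions, used with DataFrame.sort from the same module:)

>>> oldest_first = df.sort(df.age.desc())   # descending by age
>>> alphabetical = df.sort(df.name.asc())   # ascending by name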