Diffstat (limited to 'python/pyspark/sql/session.py')
-rw-r--r--  python/pyspark/sql/session.py | 41
1 file changed, 23 insertions(+), 18 deletions(-)
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 594f9375f7..10bd89b03f 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -47,7 +47,7 @@ def _monkey_patch_RDD(sparkSession):
This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``
- :param schema: a StructType or list of names of columns
+ :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
:param samplingRatio: the sample ratio of rows used for inferring
:return: a DataFrame
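
For context, a minimal sketch of how the monkey-patched ``RDD.toDF`` shorthand documented above is typically used; it assumes an active SparkSession named ``spark`` (column types shown in comments are inferred, not declared):

    # Build an RDD of tuples and convert it with the toDF shorthand,
    # passing a list of column names as the schema.
    rdd = spark.sparkContext.parallelize([("Alice", 1), ("Bob", 2)])
    df = rdd.toDF(["name", "age"])   # equivalent to spark.createDataFrame(rdd, ["name", "age"])
    df.printSchema()                 # field types are inferred from the data
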
@@ -274,9 +274,9 @@ class SparkSession(object):
@since(2.0)
def range(self, start, end=None, step=1, numPartitions=None):
"""
- Create a :class:`DataFrame` with single LongType column named `id`,
- containing elements in a range from `start` to `end` (exclusive) with
- step value `step`.
+ Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named
+ ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
+ step value ``step``.
:param start: the start value
:param end: the end value (exclusive)
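
A short usage sketch of ``range`` as documented above, assuming a SparkSession named ``spark``:

    # Single-argument form: start defaults to 0, producing ids 0..2.
    spark.range(3).collect()        # [Row(id=0), Row(id=1), Row(id=2)]

    # Explicit start, end (exclusive) and step; the column is a LongType named 'id'.
    spark.range(1, 7, 2).collect()  # [Row(id=1), Row(id=3), Row(id=5)]
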
@@ -307,7 +307,7 @@ class SparkSession(object):
Infer schema from list of Row or tuple.
:param data: list of Row or tuple
- :return: StructType
+ :return: :class:`pyspark.sql.types.StructType`
"""
if not data:
raise ValueError("can not infer schema from empty dataset")
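
A brief sketch of the behaviour this helper backs, seen through the public API rather than the private method (``spark`` assumed to be a SparkSession):

    from pyspark.sql import Row

    # The schema is inferred from the list of Rows; the result is a StructType.
    df = spark.createDataFrame([Row(name="Alice", age=1), Row(name="Bob", age=2)])
    print(df.schema)    # StructType with a string 'name' field and a long 'age' field

    # An empty list triggers the documented error, since there is nothing to infer from:
    # spark.createDataFrame([])   # ValueError: can not infer schema from empty dataset
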
@@ -326,7 +326,7 @@ class SparkSession(object):
:param rdd: an RDD of Row or tuple
:param samplingRatio: sampling ratio, or no sampling (default)
- :return: StructType
+ :return: :class:`pyspark.sql.types.StructType`
"""
first = rdd.first()
if not first:
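
A hedged sketch of how ``samplingRatio`` comes into play when the schema is inferred from an RDD; ``spark`` and ``sc`` are assumed to exist, as in the module's doctests:

    from pyspark.sql import Row

    rdd = sc.parallelize([Row(name="Alice", age=1)] * 100)

    # Default: only the first row is used for schema inference.
    df1 = spark.createDataFrame(rdd)

    # With samplingRatio, roughly that fraction of rows is sampled to infer the
    # schema, which helps when early rows contain values of ambiguous type.
    df2 = spark.createDataFrame(rdd, samplingRatio=0.5)
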
@@ -414,28 +414,33 @@ class SparkSession(object):
from ``data``, which should be an RDD of :class:`Row`,
or :class:`namedtuple`, or :class:`dict`.
- When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or
- exception will be thrown at runtime. If the given schema is not StructType, it will be
- wrapped into a StructType as its only field, and the field name will be "value", each record
- will also be wrapped into a tuple, which can be converted to row later.
+ When ``schema`` is :class:`pyspark.sql.types.DataType` or
+ :class:`pyspark.sql.types.StringType`, it must match the
+ real data, or an exception will be thrown at runtime. If the given schema is not
+ :class:`pyspark.sql.types.StructType`, it will be wrapped into a
+ :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value",
+ each record will also be wrapped into a tuple, which can be converted to row later.
If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.
:param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean,
etc.), or :class:`list`, or :class:`pandas.DataFrame`.
- :param schema: a :class:`DataType` or a datatype string or a list of column names, default
- is None. The data type string format equals to `DataType.simpleString`, except that
- top level struct type can omit the `struct<>` and atomic types use `typeName()` as
- their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
- as a short name for IntegerType.
+ :param schema: a :class:`pyspark.sql.types.DataType` or a
+ :class:`pyspark.sql.types.StringType` or a list of
+ column names, default is ``None``. The data type string format equals to
+ :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
+ omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
+ ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
+ ``int`` as a short name for ``IntegerType``.
:param samplingRatio: the sample ratio of rows used for inferring
:return: :class:`DataFrame`
.. versionchanged:: 2.0
- The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
- StructType, it will be wrapped into a StructType and each record will also be wrapped
- into a tuple.
+ The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a
+ :class:`pyspark.sql.types.StringType` after 2.0. If it's not a
+ :class:`pyspark.sql.types.StructType`, it will be wrapped into a
+ :class:`pyspark.sql.types.StructType` and each record will also be wrapped into a tuple.
>>> l = [('Alice', 1)]
>>> spark.createDataFrame(l).collect()
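
Finally, a small sketch of the datatype-string forms described in the updated docstring (``spark`` assumed to exist; exact Row reprs depend on the Python and Spark versions):

    # A top-level struct type can be written without the surrounding struct<>:
    spark.createDataFrame([("Alice", 1)], "name: string, age: int").collect()
    # -> [Row(name='Alice', age=1)]

    # An atomic type such as "int" is wrapped into a StructType with a single
    # field named "value", and each record is wrapped into a tuple internally.
    spark.createDataFrame([1, 2, 3], "int").collect()
    # -> [Row(value=1), Row(value=2), Row(value=3)]
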