author     Davies Liu <davies.liu@gmail.com>    2014-10-07 18:09:27 -0700
committer  Josh Rosen <joshrosen@apache.org>    2014-10-07 18:09:27 -0700
commit     798ed22c289cf65f2249bf2f4250285685ca69e7 (patch)
tree       137d93c32454aaf39e6416823a8604f816f73926 /python
parent     b69c9fb6fb048509bbd8430fb697dc3a5ca4fe59 (diff)
[SPARK-3412] [PySpark] Replace Epydoc with Sphinx to generate Python API docs
Retire Epydoc, use Sphinx to generate API docs. Refine Sphinx docs, also convert some docstrings into Sphinx style.

It looks like:

![api doc](https://cloud.githubusercontent.com/assets/40902/4538272/9e2d4f10-4dec-11e4-8d96-6e45a8fe51f9.png)

Author: Davies Liu <davies.liu@gmail.com>

Closes #2689 from davies/docs and squashes the following commits:

bf4a0a5 [Davies Liu] fix links
3fb1572 [Davies Liu] fix _static in jekyll
65a287e [Davies Liu] fix scripts and logo
8524042 [Davies Liu] Merge branch 'master' of github.com:apache/spark into docs
d5b874a [Davies Liu] Merge branch 'master' of github.com:apache/spark into docs
4bc1c3c [Davies Liu] refactor
746d0b6 [Davies Liu] @param -> :param
240b393 [Davies Liu] replace epydoc with sphinx doc
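The docstring conversion itself is mechanical: Epydoc's @param fields become Sphinx :param: fields and the surrounding text is unchanged. A minimal before/after sketch (the function is illustrative, not taken verbatim from the patch; the parameter descriptions echo those in the diff below):

    # Epydoc style (what the patch removes):
    def train(data, iterations=100):
        """
        Train a model on the given data.

        @param data: The training data.
        @param iterations: The number of iterations (default: 100).
        """

    # Sphinx style (what the patch adds):
    def train(data, iterations=100):
        """
        Train a model on the given data.

        :param data: The training data.
        :param iterations: The number of iterations (default: 100).
        """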
Diffstat (limited to 'python')
-rw-r--r--  python/docs/conf.py                      12
-rw-r--r--  python/docs/index.rst                     6
-rw-r--r--  python/epydoc.conf                       38
-rw-r--r--  python/pyspark/__init__.py               26
-rw-r--r--  python/pyspark/conf.py                    8
-rw-r--r--  python/pyspark/context.py                92
-rw-r--r--  python/pyspark/mllib/classification.py   32
-rw-r--r--  python/pyspark/mllib/linalg.py            8
-rw-r--r--  python/pyspark/mllib/regression.py       18
-rw-r--r--  python/pyspark/mllib/util.py             18
-rw-r--r--  python/pyspark/rdd.py                    52
-rw-r--r--  python/pyspark/sql.py                    33
12 files changed, 150 insertions, 193 deletions
diff --git a/python/docs/conf.py b/python/docs/conf.py
index c368cf81a0..8e6324f058 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -55,9 +55,9 @@ copyright = u'2014, Author'
# built documents.
#
# The short X.Y version.
-version = '1.1'
+version = '1.2-SNAPSHOT'
# The full version, including alpha/beta/rc tags.
-release = ''
+release = '1.2-SNAPSHOT'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -102,7 +102,7 @@ pygments_style = 'sphinx'
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'default'
+html_theme = 'nature'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -121,7 +121,7 @@ html_theme = 'default'
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+html_logo = "../../docs/img/spark-logo-hd.png"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
@@ -154,10 +154,10 @@ html_static_path = ['_static']
#html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+html_domain_indices = False
# If false, no index is generated.
-#html_use_index = True
+html_use_index = False
# If true, the index is split into individual pages for each letter.
#html_split_index = False
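Taken together, the conf.py hunks above leave the Sphinx configuration with the following key settings (a condensed restatement of the diff; all other options keep their existing values):

    # python/docs/conf.py -- settings touched by this patch
    version = '1.2-SNAPSHOT'                        # short X.Y version
    release = '1.2-SNAPSHOT'                        # full version, incl. tags
    html_theme = 'nature'                           # built-in Sphinx theme
    html_logo = "../../docs/img/spark-logo-hd.png"  # sidebar logo
    html_domain_indices = False                     # no module index page
    html_use_index = False                          # no general index page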
diff --git a/python/docs/index.rst b/python/docs/index.rst
index 25b3f9bd93..d66e051b15 100644
--- a/python/docs/index.rst
+++ b/python/docs/index.rst
@@ -3,7 +3,7 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
-Welcome to PySpark API reference!
+Welcome to Spark Python API Docs!
===================================
Contents:
@@ -24,14 +24,12 @@ Core classes:
Main entry point for Spark functionality.
:class:`pyspark.RDD`
-
+
A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
Indices and tables
==================
-* :ref:`genindex`
-* :ref:`modindex`
* :ref:`search`
diff --git a/python/epydoc.conf b/python/epydoc.conf
deleted file mode 100644
index 8593e08ded..0000000000
--- a/python/epydoc.conf
+++ /dev/null
@@ -1,38 +0,0 @@
-[epydoc] # Epydoc section marker (required by ConfigParser)
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Information about the project.
-name: Spark 1.0.0 Python API Docs
-url: http://spark.apache.org
-
-# The list of modules to document. Modules can be named using
-# dotted names, module filenames, or package directory names.
-# This option may be repeated.
-modules: pyspark
-
-# Write html output to the directory "apidocs"
-output: html
-target: docs/
-
-private: no
-
-exclude: pyspark.cloudpickle pyspark.worker pyspark.join
- pyspark.java_gateway pyspark.examples pyspark.shell pyspark.tests
- pyspark.rddsampler pyspark.daemon
- pyspark.mllib.tests pyspark.shuffle
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index 1a2e774738..e39e6514d7 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -20,33 +20,21 @@ PySpark is the Python API for Spark.
Public classes:
- - L{SparkContext<pyspark.context.SparkContext>}
+ - :class:`SparkContext`:
Main entry point for Spark functionality.
- - L{RDD<pyspark.rdd.RDD>}
+ - L{RDD}
A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
- - L{Broadcast<pyspark.broadcast.Broadcast>}
+ - L{Broadcast}
A broadcast variable that gets reused across tasks.
- - L{Accumulator<pyspark.accumulators.Accumulator>}
+ - L{Accumulator}
An "add-only" shared variable that tasks can only add values to.
- - L{SparkConf<pyspark.conf.SparkConf>}
+ - L{SparkConf}
For configuring Spark.
- - L{SparkFiles<pyspark.files.SparkFiles>}
+ - L{SparkFiles}
Access files shipped with jobs.
- - L{StorageLevel<pyspark.storagelevel.StorageLevel>}
+ - L{StorageLevel}
Finer-grained cache persistence levels.
-Spark SQL:
- - L{SQLContext<pyspark.sql.SQLContext>}
- Main entry point for SQL functionality.
- - L{SchemaRDD<pyspark.sql.SchemaRDD>}
- A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
- addition to normal RDD operations, SchemaRDDs also support SQL.
- - L{Row<pyspark.sql.Row>}
- A Row of data returned by a Spark SQL query.
-
-Hive:
- - L{HiveContext<pyspark.context.HiveContext>}
- Main entry point for accessing data stored in Apache Hive..
"""
# The following block allows us to import python's random instead of mllib.random for scripts in
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index b64875a3f4..dc7cd0bce5 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -83,11 +83,11 @@ class SparkConf(object):
"""
Create a new Spark configuration.
- @param loadDefaults: whether to load values from Java system
+ :param loadDefaults: whether to load values from Java system
properties (True by default)
- @param _jvm: internal parameter used to pass a handle to the
+ :param _jvm: internal parameter used to pass a handle to the
Java VM; does not need to be set by users
- @param _jconf: Optionally pass in an existing SparkConf handle
+ :param _jconf: Optionally pass in an existing SparkConf handle
to use its parameters
"""
if _jconf:
@@ -139,7 +139,7 @@ class SparkConf(object):
"""
Set multiple parameters, passed as a list of key-value pairs.
- @param pairs: list of key-value pairs to set
+ :param pairs: list of key-value pairs to set
"""
for (k, v) in pairs:
self._jconf.set(k, v)
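For context, the setAll docstring above documents the existing API; a small usage sketch, assuming a local master (the application name and configuration values are hypothetical):

    from pyspark import SparkConf, SparkContext

    conf = (SparkConf()
            .setMaster("local[2]")
            .setAppName("conf-demo"))
    # Set multiple parameters at once, passed as a list of key-value pairs.
    conf.setAll([("spark.executor.memory", "1g"),
                 ("spark.ui.port", "4050")])
    sc = SparkContext(conf=conf)
    print(conf.toDebugString())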
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index a45d79d642..6fb30d65c5 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -73,21 +73,21 @@ class SparkContext(object):
Create a new SparkContext. At least the master and app name should be set,
either through the named parameters here or through C{conf}.
- @param master: Cluster URL to connect to
+ :param master: Cluster URL to connect to
(e.g. mesos://host:port, spark://host:port, local[4]).
- @param appName: A name for your job, to display on the cluster web UI.
- @param sparkHome: Location where Spark is installed on cluster nodes.
- @param pyFiles: Collection of .zip or .py files to send to the cluster
+ :param appName: A name for your job, to display on the cluster web UI.
+ :param sparkHome: Location where Spark is installed on cluster nodes.
+ :param pyFiles: Collection of .zip or .py files to send to the cluster
and add to PYTHONPATH. These can be paths on the local file
system or HDFS, HTTP, HTTPS, or FTP URLs.
- @param environment: A dictionary of environment variables to set on
+ :param environment: A dictionary of environment variables to set on
worker nodes.
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. Set 1 to disable batching or -1 to use an
unlimited batch size.
- @param serializer: The serializer for RDDs.
- @param conf: A L{SparkConf} object setting Spark properties.
- @param gateway: Use an existing gateway and JVM, otherwise a new JVM
+ :param serializer: The serializer for RDDs.
+ :param conf: A L{SparkConf} object setting Spark properties.
+ :param gateway: Use an existing gateway and JVM, otherwise a new JVM
will be instantiated.
@@ -417,16 +417,16 @@ class SparkContext(object):
3. If this fails, the fallback is to call 'toString' on each key and value
4. C{PickleSerializer} is used to deserialize pickled objects on the Python side
- @param path: path to sequncefile
- @param keyClass: fully qualified classname of key Writable class
+ :param path: path to sequncefile
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter:
- @param valueConverter:
- @param minSplits: minimum splits in dataset
+ :param keyConverter:
+ :param valueConverter:
+ :param minSplits: minimum splits in dataset
(default min(2, sc.defaultParallelism))
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
minSplits = minSplits or min(self.defaultParallelism, 2)
@@ -446,18 +446,18 @@ class SparkContext(object):
A Hadoop configuration can be passed in as a Python dict. This will be converted into a
Configuration in Java
- @param path: path to Hadoop file
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param path: path to Hadoop file
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
@@ -476,17 +476,17 @@ class SparkContext(object):
This will be converted into a Configuration in Java.
The mechanism is the same as for sc.sequenceFile.
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
@@ -507,18 +507,18 @@ class SparkContext(object):
A Hadoop configuration can be passed in as a Python dict. This will be converted into a
Configuration in Java.
- @param path: path to Hadoop file
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param path: path to Hadoop file
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapred.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
@@ -537,17 +537,17 @@ class SparkContext(object):
This will be converted into a Configuration in Java.
The mechanism is the same as for sc.sequenceFile.
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapred.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
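The sequenceFile parameters documented above can be exercised as follows; a minimal sketch assuming a SequenceFile with IntWritable keys and Text values already exists at the hypothetical path /tmp/ints.seq:

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "sequencefile-demo")
    rdd = sc.sequenceFile("/tmp/ints.seq",
                          keyClass="org.apache.hadoop.io.IntWritable",
                          valueClass="org.apache.hadoop.io.Text")
    print(rdd.first())   # e.g. (1, u'some value')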
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index a765b1c4f7..cd43982191 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -79,15 +79,15 @@ class LogisticRegressionWithSGD(object):
"""
Train a logistic regression model on the given data.
- @param data: The training data.
- @param iterations: The number of iterations (default: 100).
- @param step: The step parameter used in SGD
+ :param data: The training data.
+ :param iterations: The number of iterations (default: 100).
+ :param step: The step parameter used in SGD
(default: 1.0).
- @param miniBatchFraction: Fraction of data to be used for each SGD
+ :param miniBatchFraction: Fraction of data to be used for each SGD
iteration.
- @param initialWeights: The initial weights (default: None).
- @param regParam: The regularizer parameter (default: 1.0).
- @param regType: The type of regularizer used for training
+ :param initialWeights: The initial weights (default: None).
+ :param regParam: The regularizer parameter (default: 1.0).
+ :param regType: The type of regularizer used for training
our model.
:Allowed values:
@@ -151,15 +151,15 @@ class SVMWithSGD(object):
"""
Train a support vector machine on the given data.
- @param data: The training data.
- @param iterations: The number of iterations (default: 100).
- @param step: The step parameter used in SGD
+ :param data: The training data.
+ :param iterations: The number of iterations (default: 100).
+ :param step: The step parameter used in SGD
(default: 1.0).
- @param regParam: The regularizer parameter (default: 1.0).
- @param miniBatchFraction: Fraction of data to be used for each SGD
+ :param regParam: The regularizer parameter (default: 1.0).
+ :param miniBatchFraction: Fraction of data to be used for each SGD
iteration.
- @param initialWeights: The initial weights (default: None).
- @param regType: The type of regularizer used for training
+ :param initialWeights: The initial weights (default: None).
+ :param regType: The type of regularizer used for training
our model.
:Allowed values:
@@ -238,10 +238,10 @@ class NaiveBayes(object):
classification. By making every vector a 0-1 vector, it can also be
used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
- @param data: RDD of NumPy vectors, one per element, where the first
+ :param data: RDD of NumPy vectors, one per element, where the first
coordinate is the label and the rest is the feature vector
(e.g. a count vector).
- @param lambda_: The smoothing parameter
+ :param lambda_: The smoothing parameter
"""
sc = data.context
jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(data._to_java_object_rdd(), lambda_)
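The training parameters documented above map onto keyword arguments of LogisticRegressionWithSGD.train; a tiny sketch on synthetic data (labels and features chosen only for illustration):

    from pyspark import SparkContext
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.regression import LabeledPoint

    sc = SparkContext("local[2]", "classification-demo")
    data = sc.parallelize([LabeledPoint(0.0, [0.0]),
                           LabeledPoint(1.0, [1.0]),
                           LabeledPoint(1.0, [2.0])])
    model = LogisticRegressionWithSGD.train(data, iterations=100, regParam=1.0)
    print(model.predict([1.5]))   # 0 or 1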
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 51014a8ceb..24c5480b2f 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -238,8 +238,8 @@ class SparseVector(Vector):
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).
- @param size: Size of the vector.
- @param args: Non-zero entries, as a dictionary, list of tupes,
+ :param size: Size of the vector.
+ :param args: Non-zero entries, as a dictionary, list of tupes,
or two sorted lists containing indices and values.
>>> print SparseVector(4, {1: 1.0, 3: 5.5})
@@ -458,8 +458,8 @@ class Vectors(object):
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).
- @param size: Size of the vector.
- @param args: Non-zero entries, as a dictionary, list of tupes,
+ :param size: Size of the vector.
+ :param args: Non-zero entries, as a dictionary, list of tupes,
or two sorted lists containing indices and values.
>>> print Vectors.sparse(4, {1: 1.0, 3: 5.5})
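The two construction forms named in the docstring (a dictionary of entries, or two sorted parallel lists) look like this; the values are the ones used in the doctests above:

    from pyspark.mllib.linalg import SparseVector, Vectors

    v1 = SparseVector(4, {1: 1.0, 3: 5.5})       # dictionary of index -> value
    v2 = Vectors.sparse(4, [1, 3], [1.0, 5.5])   # sorted indices and values
    print(v1)   # (4,[1,3],[1.0,5.5])
    print(v2)   # (4,[1,3],[1.0,5.5])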
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 54f34a9833..12b322aaae 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -31,8 +31,8 @@ class LabeledPoint(object):
"""
The features and labels of a data point.
- @param label: Label for this data point.
- @param features: Vector of features for this point (NumPy array, list,
+ :param label: Label for this data point.
+ :param features: Vector of features for this point (NumPy array, list,
pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix)
"""
@@ -145,15 +145,15 @@ class LinearRegressionWithSGD(object):
"""
Train a linear regression model on the given data.
- @param data: The training data.
- @param iterations: The number of iterations (default: 100).
- @param step: The step parameter used in SGD
+ :param data: The training data.
+ :param iterations: The number of iterations (default: 100).
+ :param step: The step parameter used in SGD
(default: 1.0).
- @param miniBatchFraction: Fraction of data to be used for each SGD
+ :param miniBatchFraction: Fraction of data to be used for each SGD
iteration.
- @param initialWeights: The initial weights (default: None).
- @param regParam: The regularizer parameter (default: 1.0).
- @param regType: The type of regularizer used for training
+ :param initialWeights: The initial weights (default: None).
+ :param regParam: The regularizer parameter (default: 1.0).
+ :param regType: The type of regularizer used for training
our model.
:Allowed values:
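A short end-to-end sketch of LabeledPoint together with LinearRegressionWithSGD.train, using a hypothetical y = 3x dataset:

    from pyspark import SparkContext
    from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

    sc = SparkContext("local[2]", "regression-demo")
    data = sc.parallelize([LabeledPoint(0.0, [0.0]),
                           LabeledPoint(3.0, [1.0]),
                           LabeledPoint(6.0, [2.0])])
    model = LinearRegressionWithSGD.train(data, iterations=100, step=0.1)
    print(model.weights)   # should approach [3.0]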
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 8233d4e81f..1357fd4fbc 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -77,10 +77,10 @@ class MLUtils(object):
method parses each line into a LabeledPoint, where the feature
indices are converted to zero-based.
- @param sc: Spark context
- @param path: file or directory path in any Hadoop-supported file
+ :param sc: Spark context
+ :param path: file or directory path in any Hadoop-supported file
system URI
- @param numFeatures: number of features, which will be determined
+ :param numFeatures: number of features, which will be determined
from the input data if a nonpositive value
is given. This is useful when the dataset is
already split into multiple files and you
@@ -88,7 +88,7 @@ class MLUtils(object):
features may not present in certain files,
which leads to inconsistent feature
dimensions.
- @param minPartitions: min number of partitions
+ :param minPartitions: min number of partitions
@return: labeled data stored as an RDD of LabeledPoint
>>> from tempfile import NamedTemporaryFile
@@ -126,8 +126,8 @@ class MLUtils(object):
"""
Save labeled data in LIBSVM format.
- @param data: an RDD of LabeledPoint to be saved
- @param dir: directory to save the data
+ :param data: an RDD of LabeledPoint to be saved
+ :param dir: directory to save the data
>>> from tempfile import NamedTemporaryFile
>>> from fileinput import input
@@ -149,10 +149,10 @@ class MLUtils(object):
"""
Load labeled points saved using RDD.saveAsTextFile.
- @param sc: Spark context
- @param path: file or directory path in any Hadoop-supported file
+ :param sc: Spark context
+ :param path: file or directory path in any Hadoop-supported file
system URI
- @param minPartitions: min number of partitions
+ :param minPartitions: min number of partitions
@return: labeled data stored as an RDD of LabeledPoint
>>> from tempfile import NamedTemporaryFile
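A round-trip sketch for the two MLUtils helpers documented above (the output directory is hypothetical and must not already exist):

    from tempfile import mkdtemp
    from pyspark import SparkContext
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils

    sc = SparkContext("local[2]", "mlutils-demo")
    examples = sc.parallelize([LabeledPoint(1.0, [1.0, 0.0, 3.0])])
    out_dir = mkdtemp() + "/libsvm"
    MLUtils.saveAsLibSVMFile(examples, out_dir)
    loaded = MLUtils.loadLibSVMFile(sc, out_dir)
    print(loaded.first())   # LabeledPoint(1.0, (3,[0,2],[1.0,3.0]))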
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index e77669aad7..6797d50659 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -752,7 +752,7 @@ class RDD(object):
"""
Find the maximum item in this RDD.
- @param key: A function used to generate key for comparing
+ :param key: A function used to generate key for comparing
>>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
>>> rdd.max()
@@ -768,7 +768,7 @@ class RDD(object):
"""
Find the minimum item in this RDD.
- @param key: A function used to generate key for comparing
+ :param key: A function used to generate key for comparing
>>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])
>>> rdd.min()
@@ -1115,9 +1115,9 @@ class RDD(object):
converted for output using either user specified converters or, by default,
L{org.apache.spark.api.python.JavaToWritableConverter}.
- @param conf: Hadoop job configuration, passed in as a dict
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
+ :param conf: Hadoop job configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1135,16 +1135,16 @@ class RDD(object):
C{conf} is applied on top of the base Hadoop conf associated with the SparkContext
of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.
- @param path: path to Hadoop file
- @param outputFormatClass: fully qualified classname of Hadoop OutputFormat
+ :param path: path to Hadoop file
+ :param outputFormatClass: fully qualified classname of Hadoop OutputFormat
(e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.IntWritable", None by default)
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.Text", None by default)
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop job configuration, passed in as a dict (None by default)
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop job configuration, passed in as a dict (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1161,9 +1161,9 @@ class RDD(object):
converted for output using either user specified converters or, by default,
L{org.apache.spark.api.python.JavaToWritableConverter}.
- @param conf: Hadoop job configuration, passed in as a dict
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
+ :param conf: Hadoop job configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1182,17 +1182,17 @@ class RDD(object):
C{conf} is applied on top of the base Hadoop conf associated with the SparkContext
of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.
- @param path: path to Hadoop file
- @param outputFormatClass: fully qualified classname of Hadoop OutputFormat
+ :param path: path to Hadoop file
+ :param outputFormatClass: fully qualified classname of Hadoop OutputFormat
(e.g. "org.apache.hadoop.mapred.SequenceFileOutputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.IntWritable", None by default)
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.Text", None by default)
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: (None by default)
- @param compressionCodecClass: (None by default)
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: (None by default)
+ :param compressionCodecClass: (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1212,8 +1212,8 @@ class RDD(object):
1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects.
2. Keys and values of this Java RDD are converted to Writables and written out.
- @param path: path to sequence file
- @param compressionCodecClass: (None by default)
+ :param path: path to sequence file
+ :param compressionCodecClass: (None by default)
"""
pickledRDD = self._toPickleSerialization()
batched = isinstance(pickledRDD._jrdd_deserializer, BatchedSerializer)
@@ -2009,7 +2009,7 @@ class RDD(object):
of The Art Cardinality Estimation Algorithm", available
<a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
- @param relativeSD Relative accuracy. Smaller values create
+ :param relativeSD Relative accuracy. Smaller values create
counters that require more space.
It must be greater than 0.000017.
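The key argument documented for max and min behaves like the key argument of Python's built-ins; a sketch using the same values as the doctests above:

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "rdd-demo")
    rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
    print(rdd.max())          # 43.0
    print(rdd.max(key=str))   # 5.0, the largest when compared as strings
    print(rdd.min())          # 1.0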
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 114644ab8b..3d5a281239 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -15,28 +15,37 @@
# limitations under the License.
#
+"""
+public classes of Spark SQL:
+
+ - L{SQLContext}
+ Main entry point for SQL functionality.
+ - L{SchemaRDD}
+ A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
+ addition to normal RDD operations, SchemaRDDs also support SQL.
+ - L{Row}
+ A Row of data returned by a Spark SQL query.
+ - L{HiveContext}
+ Main entry point for accessing data stored in Apache Hive..
+"""
-import sys
-import types
import itertools
-import warnings
import decimal
import datetime
import keyword
import warnings
from array import array
from operator import itemgetter
+from itertools import imap
+
+from py4j.protocol import Py4JError
+from py4j.java_collections import ListConverter, MapConverter
from pyspark.rdd import RDD
from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer
from pyspark.storagelevel import StorageLevel
from pyspark.traceback_utils import SCCallSiteSync
-from itertools import chain, ifilter, imap
-
-from py4j.protocol import Py4JError
-from py4j.java_collections import ListConverter, MapConverter
-
__all__ = [
"StringType", "BinaryType", "BooleanType", "TimestampType", "DecimalType",
@@ -899,8 +908,8 @@ class SQLContext(object):
def __init__(self, sparkContext, sqlContext=None):
"""Create a new SQLContext.
- @param sparkContext: The SparkContext to wrap.
- @param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new
+ :param sparkContext: The SparkContext to wrap.
+ :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new
SQLContext in the JVM, instead we make all calls to this object.
>>> srdd = sqlCtx.inferSchema(rdd)
@@ -1325,8 +1334,8 @@ class HiveContext(SQLContext):
def __init__(self, sparkContext, hiveContext=None):
"""Create a new HiveContext.
- @param sparkContext: The SparkContext to wrap.
- @param hiveContext: An optional JVM Scala HiveContext. If set, we do not instatiate a new
+ :param sparkContext: The SparkContext to wrap.
+ :param hiveContext: An optional JVM Scala HiveContext. If set, we do not instatiate a new
HiveContext in the JVM, instead we make all calls to this object.
"""
SQLContext.__init__(self, sparkContext)
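Finally, the SQLContext docstrings above refer to the usual entry-point pattern; a minimal sketch (the table and column names are hypothetical):

    from pyspark import SparkContext
    from pyspark.sql import SQLContext, Row

    sc = SparkContext("local[2]", "sql-demo")
    sqlCtx = SQLContext(sc)
    rdd = sc.parallelize([Row(name="Alice", age=1),
                          Row(name="Bob", age=2)])
    srdd = sqlCtx.inferSchema(rdd)          # SchemaRDD with inferred schema
    srdd.registerTempTable("people")
    print(sqlCtx.sql("SELECT name FROM people WHERE age > 1").collect())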