author     Davies Liu <davies.liu@gmail.com>    2014-10-07 18:09:27 -0700
committer  Josh Rosen <joshrosen@apache.org>    2014-10-07 18:09:27 -0700
commit     798ed22c289cf65f2249bf2f4250285685ca69e7 (patch)
tree       137d93c32454aaf39e6416823a8604f816f73926 /python
parent     b69c9fb6fb048509bbd8430fb697dc3a5ca4fe59 (diff)
[SPARK-3412] [PySpark] Replace Epydoc with Sphinx to generate Python API docs
Retire Epydoc, use Sphinx to generate API docs. Refine Sphinx docs, also convert some docstrings into Sphinx style.

It looks like:

![api doc](https://cloud.githubusercontent.com/assets/40902/4538272/9e2d4f10-4dec-11e4-8d96-6e45a8fe51f9.png)

Author: Davies Liu <davies.liu@gmail.com>

Closes #2689 from davies/docs and squashes the following commits:

bf4a0a5 [Davies Liu] fix links
3fb1572 [Davies Liu] fix _static in jekyll
65a287e [Davies Liu] fix scripts and logo
8524042 [Davies Liu] Merge branch 'master' of github.com:apache/spark into docs
d5b874a [Davies Liu] Merge branch 'master' of github.com:apache/spark into docs
4bc1c3c [Davies Liu] refactor
746d0b6 [Davies Liu] @param -> :param
240b393 [Davies Liu] replace epydoc with sphinx doc
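The docstring conversion itself is mechanical: Epydoc's @param fields become Sphinx :param: fields and the surrounding text is unchanged. A minimal before/after sketch (the function is illustrative, not taken verbatim from the patch; the parameter descriptions echo those in the diff below):

    # Epydoc style (what the patch removes):
    def train(data, iterations=100):
        """
        Train a model on the given data.

        @param data: The training data.
        @param iterations: The number of iterations (default: 100).
        """

    # Sphinx style (what the patch adds):
    def train(data, iterations=100):
        """
        Train a model on the given data.

        :param data: The training data.
        :param iterations: The number of iterations (default: 100).
        """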
Diffstat (limited to 'python')
-rw-r--r--  python/docs/conf.py                      12
-rw-r--r--  python/docs/index.rst                     6
-rw-r--r--  python/epydoc.conf                       38
-rw-r--r--  python/pyspark/__init__.py               26
-rw-r--r--  python/pyspark/conf.py                    8
-rw-r--r--  python/pyspark/context.py                92
-rw-r--r--  python/pyspark/mllib/classification.py   32
-rw-r--r--  python/pyspark/mllib/linalg.py            8
-rw-r--r--  python/pyspark/mllib/regression.py       18
-rw-r--r--  python/pyspark/mllib/util.py             18
-rw-r--r--  python/pyspark/rdd.py                    52
-rw-r--r--  python/pyspark/sql.py                    33
12 files changed, 150 insertions, 193 deletions
diff --git a/python/docs/conf.py b/python/docs/conf.py
index c368cf81a0..8e6324f058 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -55,9 +55,9 @@ copyright = u'2014, Author'
# built documents.
#
# The short X.Y version.
-version = '1.1'
+version = '1.2-SNAPSHOT'
# The full version, including alpha/beta/rc tags.
-release = ''
+release = '1.2-SNAPSHOT'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -102,7 +102,7 @@ pygments_style = 'sphinx'
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'default'
+html_theme = 'nature'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -121,7 +121,7 @@ html_theme = 'default'
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+html_logo = "../../docs/img/spark-logo-hd.png"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
@@ -154,10 +154,10 @@ html_static_path = ['_static']
#html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+html_domain_indices = False
# If false, no index is generated.
-#html_use_index = True
+html_use_index = False
# If true, the index is split into individual pages for each letter.
#html_split_index = False
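Taken together, the conf.py hunks above leave the Sphinx configuration with the following key settings (a condensed restatement of the diff; all other options keep their existing values):

    # python/docs/conf.py -- settings touched by this patch
    version = '1.2-SNAPSHOT'                        # short X.Y version
    release = '1.2-SNAPSHOT'                        # full version, incl. tags
    html_theme = 'nature'                           # built-in Sphinx theme
    html_logo = "../../docs/img/spark-logo-hd.png"  # sidebar logo
    html_domain_indices = False                     # no module index page
    html_use_index = False                          # no general index page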
diff --git a/python/docs/index.rst b/python/docs/index.rst
index 25b3f9bd93..d66e051b15 100644
--- a/python/docs/index.rst
+++ b/python/docs/index.rst
@@ -3,7 +3,7 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
-Welcome to PySpark API reference!
+Welcome to Spark Python API Docs!
===================================
Contents:
@@ -24,14 +24,12 @@ Core classes:
Main entry point for Spark functionality.
:class:`pyspark.RDD`
-
+
A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
Indices and tables
==================
-* :ref:`genindex`
-* :ref:`modindex`
* :ref:`search`
diff --git a/python/epydoc.conf b/python/epydoc.conf
deleted file mode 100644
index 8593e08ded..0000000000
--- a/python/epydoc.conf
+++ /dev/null
@@ -1,38 +0,0 @@
-[epydoc] # Epydoc section marker (required by ConfigParser)
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Information about the project.
-name: Spark 1.0.0 Python API Docs
-url: http://spark.apache.org
-
-# The list of modules to document. Modules can be named using
-# dotted names, module filenames, or package directory names.
-# This option may be repeated.
-modules: pyspark
-
-# Write html output to the directory "apidocs"
-output: html
-target: docs/
-
-private: no
-
-exclude: pyspark.cloudpickle pyspark.worker pyspark.join
- pyspark.java_gateway pyspark.examples pyspark.shell pyspark.tests
- pyspark.rddsampler pyspark.daemon
- pyspark.mllib.tests pyspark.shuffle
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index 1a2e774738..e39e6514d7 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -20,33 +20,21 @@ PySpark is the Python API for Spark.
Public classes:
- - L{SparkContext<pyspark.context.SparkContext>}
+ - :class:`SparkContext`:
Main entry point for Spark functionality.
- - L{RDD<pyspark.rdd.RDD>}
+ - L{RDD}
A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
- - L{Broadcast<pyspark.broadcast.Broadcast>}
+ - L{Broadcast}
A broadcast variable that gets reused across tasks.
- - L{Accumulator<pyspark.accumulators.Accumulator>}
+ - L{Accumulator}
An "add-only" shared variable that tasks can only add values to.
- - L{SparkConf<pyspark.conf.SparkConf>}
+ - L{SparkConf}
For configuring Spark.
- - L{SparkFiles<pyspark.files.SparkFiles>}
+ - L{SparkFiles}
Access files shipped with jobs.
- - L{StorageLevel<pyspark.storagelevel.StorageLevel>}
+ - L{StorageLevel}
Finer-grained cache persistence levels.
-Spark SQL:
- - L{SQLContext<pyspark.sql.SQLContext>}
- Main entry point for SQL functionality.
- - L{SchemaRDD<pyspark.sql.SchemaRDD>}
- A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
- addition to normal RDD operations, SchemaRDDs also support SQL.
- - L{Row<pyspark.sql.Row>}
- A Row of data returned by a Spark SQL query.
-
-Hive:
- - L{HiveContext<pyspark.context.HiveContext>}
- Main entry point for accessing data stored in Apache Hive..
"""
# The following block allows us to import python's random instead of mllib.random for scripts in
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index b64875a3f4..dc7cd0bce5 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -83,11 +83,11 @@ class SparkConf(object):
"""
Create a new Spark configuration.
- @param loadDefaults: whether to load values from Java system
+ :param loadDefaults: whether to load values from Java system
properties (True by default)
- @param _jvm: internal parameter used to pass a handle to the
+ :param _jvm: internal parameter used to pass a handle to the
Java VM; does not need to be set by users
- @param _jconf: Optionally pass in an existing SparkConf handle
+ :param _jconf: Optionally pass in an existing SparkConf handle
to use its parameters
"""
if _jconf:
@@ -139,7 +139,7 @@ class SparkConf(object):
"""
Set multiple parameters, passed as a list of key-value pairs.
- @param pairs: list of key-value pairs to set
+ :param pairs: list of key-value pairs to set
"""
for (k, v) in pairs:
self._jconf.set(k, v)
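For context, the setAll docstring above documents the existing API; a small usage sketch, assuming a local master (the application name and configuration values are hypothetical):

    from pyspark import SparkConf, SparkContext

    conf = (SparkConf()
            .setMaster("local[2]")
            .setAppName("conf-demo"))
    # Set multiple parameters at once, passed as a list of key-value pairs.
    conf.setAll([("spark.executor.memory", "1g"),
                 ("spark.ui.port", "4050")])
    sc = SparkContext(conf=conf)
    print(conf.toDebugString())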
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index a45d79d642..6fb30d65c5 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -73,21 +73,21 @@ class SparkContext(object):
Create a new SparkContext. At least the master and app name should be set,
either through the named parameters here or through C{conf}.
- @param master: Cluster URL to connect to
+ :param master: Cluster URL to connect to
(e.g. mesos://host:port, spark://host:port, local[4]).
- @param appName: A name for your job, to display on the cluster web UI.
- @param sparkHome: Location where Spark is installed on cluster nodes.
- @param pyFiles: Collection of .zip or .py files to send to the cluster
+ :param appName: A name for your job, to display on the cluster web UI.
+ :param sparkHome: Location where Spark is installed on cluster nodes.
+ :param pyFiles: Collection of .zip or .py files to send to the cluster
and add to PYTHONPATH. These can be paths on the local file
system or HDFS, HTTP, HTTPS, or FTP URLs.
- @param environment: A dictionary of environment variables to set on
+ :param environment: A dictionary of environment variables to set on
worker nodes.
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. Set 1 to disable batching or -1 to use an
unlimited batch size.
- @param serializer: The serializer for RDDs.
- @param conf: A L{SparkConf} object setting Spark properties.
- @param gateway: Use an existing gateway and JVM, otherwise a new JVM
+ :param serializer: The serializer for RDDs.
+ :param conf: A L{SparkConf} object setting Spark properties.
+ :param gateway: Use an existing gateway and JVM, otherwise a new JVM
will be instantiated.
@@ -417,16 +417,16 @@ class SparkContext(object):
3. If this fails, the fallback is to call 'toString' on each key and value
4. C{PickleSerializer} is used to deserialize pickled objects on the Python side
- @param path: path to sequncefile
- @param keyClass: fully qualified classname of key Writable class
+ :param path: path to sequncefile
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter:
- @param valueConverter:
- @param minSplits: minimum splits in dataset
+ :param keyConverter:
+ :param valueConverter:
+ :param minSplits: minimum splits in dataset
(default min(2, sc.defaultParallelism))
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
minSplits = minSplits or min(self.defaultParallelism, 2)
@@ -446,18 +446,18 @@ class SparkContext(object):
A Hadoop configuration can be passed in as a Python dict. This will be converted into a
Configuration in Java
- @param path: path to Hadoop file
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param path: path to Hadoop file
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
@@ -476,17 +476,17 @@ class SparkContext(object):
This will be converted into a Configuration in Java.
The mechanism is the same as for sc.sequenceFile.
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
@@ -507,18 +507,18 @@ class SparkContext(object):
A Hadoop configuration can be passed in as a Python dict. This will be converted into a
Configuration in Java.
- @param path: path to Hadoop file
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param path: path to Hadoop file
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapred.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
@@ -537,17 +537,17 @@ class SparkContext(object):
This will be converted into a Configuration in Java.
The mechanism is the same as for sc.sequenceFile.
- @param inputFormatClass: fully qualified classname of Hadoop InputFormat
+ :param inputFormatClass: fully qualified classname of Hadoop InputFormat
(e.g. "org.apache.hadoop.mapred.TextInputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.Text")
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.LongWritable")
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop configuration, passed in as a dict
(None by default)
- @param batchSize: The number of Python objects represented as a single
+ :param batchSize: The number of Python objects represented as a single
Java object. (default sc._default_batch_size_for_serialized_input)
"""
jconf = self._dictToJavaMap(conf)
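The sequenceFile parameters documented above can be exercised as follows; a minimal sketch assuming a SequenceFile with IntWritable keys and Text values already exists at the hypothetical path /tmp/ints.seq:

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "sequencefile-demo")
    rdd = sc.sequenceFile("/tmp/ints.seq",
                          keyClass="org.apache.hadoop.io.IntWritable",
                          valueClass="org.apache.hadoop.io.Text")
    print(rdd.first())   # e.g. (1, u'some value')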
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index a765b1c4f7..cd43982191 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -79,15 +79,15 @@ class LogisticRegressionWithSGD(object):
"""
Train a logistic regression model on the given data.
- @param data: The training data.
- @param iterations: The number of iterations (default: 100).
- @param step: The step parameter used in SGD
+ :param data: The training data.
+ :param iterations: The number of iterations (default: 100).
+ :param step: The step parameter used in SGD
(default: 1.0).
- @param miniBatchFraction: Fraction of data to be used for each SGD
+ :param miniBatchFraction: Fraction of data to be used for each SGD
iteration.
- @param initialWeights: The initial weights (default: None).
- @param regParam: The regularizer parameter (default: 1.0).
- @param regType: The type of regularizer used for training
+ :param initialWeights: The initial weights (default: None).
+ :param regParam: The regularizer parameter (default: 1.0).
+ :param regType: The type of regularizer used for training
our model.
:Allowed values:
@@ -151,15 +151,15 @@ class SVMWithSGD(object):
"""
Train a support vector machine on the given data.
- @param data: The training data.
- @param iterations: The number of iterations (default: 100).
- @param step: The step parameter used in SGD
+ :param data: The training data.
+ :param iterations: The number of iterations (default: 100).
+ :param step: The step parameter used in SGD
(default: 1.0).
- @param regParam: The regularizer parameter (default: 1.0).
- @param miniBatchFraction: Fraction of data to be used for each SGD
+ :param regParam: The regularizer parameter (default: 1.0).
+ :param miniBatchFraction: Fraction of data to be used for each SGD
iteration.
- @param initialWeights: The initial weights (default: None).
- @param regType: The type of regularizer used for training
+ :param initialWeights: The initial weights (default: None).
+ :param regType: The type of regularizer used for training
our model.
:Allowed values:
@@ -238,10 +238,10 @@ class NaiveBayes(object):
classification. By making every vector a 0-1 vector, it can also be
used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
- @param data: RDD of NumPy vectors, one per element, where the first
+ :param data: RDD of NumPy vectors, one per element, where the first
coordinate is the label and the rest is the feature vector
(e.g. a count vector).
- @param lambda_: The smoothing parameter
+ :param lambda_: The smoothing parameter
"""
sc = data.context
jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(data._to_java_object_rdd(), lambda_)
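The training parameters documented above map onto keyword arguments of LogisticRegressionWithSGD.train; a tiny sketch on synthetic data (labels and features chosen only for illustration):

    from pyspark import SparkContext
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.regression import LabeledPoint

    sc = SparkContext("local[2]", "classification-demo")
    data = sc.parallelize([LabeledPoint(0.0, [0.0]),
                           LabeledPoint(1.0, [1.0]),
                           LabeledPoint(1.0, [2.0])])
    model = LogisticRegressionWithSGD.train(data, iterations=100, regParam=1.0)
    print(model.predict([1.5]))   # 0 or 1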
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 51014a8ceb..24c5480b2f 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -238,8 +238,8 @@ class SparseVector(Vector):
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).
- @param size: Size of the vector.
- @param args: Non-zero entries, as a dictionary, list of tupes,
+ :param size: Size of the vector.
+ :param args: Non-zero entries, as a dictionary, list of tupes,
or two sorted lists containing indices and values.
>>> print SparseVector(4, {1: 1.0, 3: 5.5})
@@ -458,8 +458,8 @@ class Vectors(object):
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).
- @param size: Size of the vector.
- @param args: Non-zero entries, as a dictionary, list of tupes,
+ :param size: Size of the vector.
+ :param args: Non-zero entries, as a dictionary, list of tupes,
or two sorted lists containing indices and values.
>>> print Vectors.sparse(4, {1: 1.0, 3: 5.5})
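The two construction forms named in the docstring (a dictionary of entries, or two sorted parallel lists) look like this; the values are the ones used in the doctests above:

    from pyspark.mllib.linalg import SparseVector, Vectors

    v1 = SparseVector(4, {1: 1.0, 3: 5.5})       # dictionary of index -> value
    v2 = Vectors.sparse(4, [1, 3], [1.0, 5.5])   # sorted indices and values
    print(v1)   # (4,[1,3],[1.0,5.5])
    print(v2)   # (4,[1,3],[1.0,5.5])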
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 54f34a9833..12b322aaae 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -31,8 +31,8 @@ class LabeledPoint(object):
"""
The features and labels of a data point.
- @param label: Label for this data point.
- @param features: Vector of features for this point (NumPy array, list,
+ :param label: Label for this data point.
+ :param features: Vector of features for this point (NumPy array, list,
pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix)
"""
@@ -145,15 +145,15 @@ class LinearRegressionWithSGD(object):
"""
Train a linear regression model on the given data.
- @param data: The training data.
- @param iterations: The number of iterations (default: 100).
- @param step: The step parameter used in SGD
+ :param data: The training data.
+ :param iterations: The number of iterations (default: 100).
+ :param step: The step parameter used in SGD
(default: 1.0).
- @param miniBatchFraction: Fraction of data to be used for each SGD
+ :param miniBatchFraction: Fraction of data to be used for each SGD
iteration.
- @param initialWeights: The initial weights (default: None).
- @param regParam: The regularizer parameter (default: 1.0).
- @param regType: The type of regularizer used for training
+ :param initialWeights: The initial weights (default: None).
+ :param regParam: The regularizer parameter (default: 1.0).
+ :param regType: The type of regularizer used for training
our model.
:Allowed values:
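A short end-to-end sketch of LabeledPoint together with LinearRegressionWithSGD.train, using a hypothetical y = 3x dataset:

    from pyspark import SparkContext
    from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

    sc = SparkContext("local[2]", "regression-demo")
    data = sc.parallelize([LabeledPoint(0.0, [0.0]),
                           LabeledPoint(3.0, [1.0]),
                           LabeledPoint(6.0, [2.0])])
    model = LinearRegressionWithSGD.train(data, iterations=100, step=0.1)
    print(model.weights)   # should approach [3.0]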
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 8233d4e81f..1357fd4fbc 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -77,10 +77,10 @@ class MLUtils(object):
method parses each line into a LabeledPoint, where the feature
indices are converted to zero-based.
- @param sc: Spark context
- @param path: file or directory path in any Hadoop-supported file
+ :param sc: Spark context
+ :param path: file or directory path in any Hadoop-supported file
system URI
- @param numFeatures: number of features, which will be determined
+ :param numFeatures: number of features, which will be determined
from the input data if a nonpositive value
is given. This is useful when the dataset is
already split into multiple files and you
@@ -88,7 +88,7 @@ class MLUtils(object):
features may not present in certain files,
which leads to inconsistent feature
dimensions.
- @param minPartitions: min number of partitions
+ :param minPartitions: min number of partitions
@return: labeled data stored as an RDD of LabeledPoint
>>> from tempfile import NamedTemporaryFile
@@ -126,8 +126,8 @@ class MLUtils(object):
"""
Save labeled data in LIBSVM format.
- @param data: an RDD of LabeledPoint to be saved
- @param dir: directory to save the data
+ :param data: an RDD of LabeledPoint to be saved
+ :param dir: directory to save the data
>>> from tempfile import NamedTemporaryFile
>>> from fileinput import input
@@ -149,10 +149,10 @@ class MLUtils(object):
"""
Load labeled points saved using RDD.saveAsTextFile.
- @param sc: Spark context
- @param path: file or directory path in any Hadoop-supported file
+ :param sc: Spark context
+ :param path: file or directory path in any Hadoop-supported file
system URI
- @param minPartitions: min number of partitions
+ :param minPartitions: min number of partitions
@return: labeled data stored as an RDD of LabeledPoint
>>> from tempfile import NamedTemporaryFile
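A round-trip sketch for the two MLUtils helpers documented above (the output directory is hypothetical and must not already exist):

    from tempfile import mkdtemp
    from pyspark import SparkContext
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils

    sc = SparkContext("local[2]", "mlutils-demo")
    examples = sc.parallelize([LabeledPoint(1.0, [1.0, 0.0, 3.0])])
    out_dir = mkdtemp() + "/libsvm"
    MLUtils.saveAsLibSVMFile(examples, out_dir)
    loaded = MLUtils.loadLibSVMFile(sc, out_dir)
    print(loaded.first())   # LabeledPoint(1.0, (3,[0,2],[1.0,3.0]))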
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index e77669aad7..6797d50659 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -752,7 +752,7 @@ class RDD(object):
"""
Find the maximum item in this RDD.
- @param key: A function used to generate key for comparing
+ :param key: A function used to generate key for comparing
>>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
>>> rdd.max()
@@ -768,7 +768,7 @@ class RDD(object):
"""
Find the minimum item in this RDD.
- @param key: A function used to generate key for comparing
+ :param key: A function used to generate key for comparing
>>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])
>>> rdd.min()
@@ -1115,9 +1115,9 @@ class RDD(object):
converted for output using either user specified converters or, by default,
L{org.apache.spark.api.python.JavaToWritableConverter}.
- @param conf: Hadoop job configuration, passed in as a dict
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
+ :param conf: Hadoop job configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1135,16 +1135,16 @@ class RDD(object):
C{conf} is applied on top of the base Hadoop conf associated with the SparkContext
of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.
- @param path: path to Hadoop file
- @param outputFormatClass: fully qualified classname of Hadoop OutputFormat
+ :param path: path to Hadoop file
+ :param outputFormatClass: fully qualified classname of Hadoop OutputFormat
(e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.IntWritable", None by default)
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.Text", None by default)
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: Hadoop job configuration, passed in as a dict (None by default)
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: Hadoop job configuration, passed in as a dict (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1161,9 +1161,9 @@ class RDD(object):
converted for output using either user specified converters or, by default,
L{org.apache.spark.api.python.JavaToWritableConverter}.
- @param conf: Hadoop job configuration, passed in as a dict
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
+ :param conf: Hadoop job configuration, passed in as a dict
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1182,17 +1182,17 @@ class RDD(object):
C{conf} is applied on top of the base Hadoop conf associated with the SparkContext
of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.
- @param path: path to Hadoop file
- @param outputFormatClass: fully qualified classname of Hadoop OutputFormat
+ :param path: path to Hadoop file
+ :param outputFormatClass: fully qualified classname of Hadoop OutputFormat
(e.g. "org.apache.hadoop.mapred.SequenceFileOutputFormat")
- @param keyClass: fully qualified classname of key Writable class
+ :param keyClass: fully qualified classname of key Writable class
(e.g. "org.apache.hadoop.io.IntWritable", None by default)
- @param valueClass: fully qualified classname of value Writable class
+ :param valueClass: fully qualified classname of value Writable class
(e.g. "org.apache.hadoop.io.Text", None by default)
- @param keyConverter: (None by default)
- @param valueConverter: (None by default)
- @param conf: (None by default)
- @param compressionCodecClass: (None by default)
+ :param keyConverter: (None by default)
+ :param valueConverter: (None by default)
+ :param conf: (None by default)
+ :param compressionCodecClass: (None by default)
"""
jconf = self.ctx._dictToJavaMap(conf)
pickledRDD = self._toPickleSerialization()
@@ -1212,8 +1212,8 @@ class RDD(object):
1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects.
2. Keys and values of this Java RDD are converted to Writables and written out.
- @param path: path to sequence file
- @param compressionCodecClass: (None by default)
+ :param path: path to sequence file
+ :param compressionCodecClass: (None by default)
"""
pickledRDD = self._toPickleSerialization()
batched = isinstance(pickledRDD._jrdd_deserializer, BatchedSerializer)
@@ -2009,7 +2009,7 @@ class RDD(object):
of The Art Cardinality Estimation Algorithm", available
<a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
- @param relativeSD Relative accuracy. Smaller values create
+ :param relativeSD Relative accuracy. Smaller values create
counters that require more space.
It must be greater than 0.000017.
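The key argument documented for max and min behaves like the key argument of Python's built-ins; a sketch using the same values as the doctests above:

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "rdd-demo")
    rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
    print(rdd.max())          # 43.0
    print(rdd.max(key=str))   # 5.0, the largest when compared as strings
    print(rdd.min())          # 1.0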
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 114644ab8b..3d5a281239 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -15,28 +15,37 @@
# limitations under the License.
#
+"""
+public classes of Spark SQL:
+
+ - L{SQLContext}
+ Main entry point for SQL functionality.
+ - L{SchemaRDD}
+ A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
+ addition to normal RDD operations, SchemaRDDs also support SQL.
+ - L{Row}
+ A Row of data returned by a Spark SQL query.
+ - L{HiveContext}
+ Main entry point for accessing data stored in Apache Hive..
+"""
-import sys
-import types
import itertools
-import warnings
import decimal
import datetime
import keyword
import warnings
from array import array
from operator import itemgetter
+from itertools import imap
+
+from py4j.protocol import Py4JError
+from py4j.java_collections import ListConverter, MapConverter
from pyspark.rdd import RDD
from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer
from pyspark.storagelevel import StorageLevel
from pyspark.traceback_utils import SCCallSiteSync
-from itertools import chain, ifilter, imap
-
-from py4j.protocol import Py4JError
-from py4j.java_collections import ListConverter, MapConverter
-
__all__ = [
"StringType", "BinaryType", "BooleanType", "TimestampType", "DecimalType",
@@ -899,8 +908,8 @@ class SQLContext(object):
def __init__(self, sparkContext, sqlContext=None):
"""Create a new SQLContext.
- @param sparkContext: The SparkContext to wrap.
- @param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new
+ :param sparkContext: The SparkContext to wrap.
+ :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new
SQLContext in the JVM, instead we make all calls to this object.
>>> srdd = sqlCtx.inferSchema(rdd)
@@ -1325,8 +1334,8 @@ class HiveContext(SQLContext):
def __init__(self, sparkContext, hiveContext=None):
"""Create a new HiveContext.
- @param sparkContext: The SparkContext to wrap.
- @param hiveContext: An optional JVM Scala HiveContext. If set, we do not instatiate a new
+ :param sparkContext: The SparkContext to wrap.
+ :param hiveContext: An optional JVM Scala HiveContext. If set, we do not instatiate a new
HiveContext in the JVM, instead we make all calls to this object.
"""
SQLContext.__init__(self, sparkContext)
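Finally, the SQLContext docstrings above refer to the usual entry-point pattern; a minimal sketch (the table and column names are hypothetical):

    from pyspark import SparkContext
    from pyspark.sql import SQLContext, Row

    sc = SparkContext("local[2]", "sql-demo")
    sqlCtx = SQLContext(sc)
    rdd = sc.parallelize([Row(name="Alice", age=1),
                          Row(name="Bob", age=2)])
    srdd = sqlCtx.inferSchema(rdd)          # SchemaRDD with inferred schema
    srdd.registerTempTable("people")
    print(sqlCtx.sql("SELECT name FROM people WHERE age > 1").collect())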