Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r--  python/pyspark/mllib/clustering.py         |  6
-rw-r--r--  python/pyspark/mllib/feature.py            | 24
-rw-r--r--  python/pyspark/mllib/linalg/__init__.py    | 11
-rw-r--r--  python/pyspark/mllib/linalg/distributed.py | 15
-rw-r--r--  python/pyspark/mllib/regression.py         |  2
-rw-r--r--  python/pyspark/mllib/stat/_statistics.py   |  3
-rw-r--r--  python/pyspark/mllib/tree.py               | 12
7 files changed, 39 insertions, 34 deletions
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 2036168e45..91123ace33 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -699,9 +699,9 @@ class StreamingKMeansModel(KMeansModel):
* n_t+1: New number of weights.
* a: Decay Factor, which gives the forgetfulness.
- Note that if a is set to 1, it is the weighted mean of the previous
- and new data. If it set to zero, the old centroids are completely
- forgotten.
+ .. note:: If a is set to 1, it is the weighted mean of the previous
+ and new data. If it is set to zero, the old centroids are completely
+ forgotten.
:param clusterCenters:
Initial cluster centers.
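
A minimal usage sketch of the decay behaviour described in the note above (illustrative only, not part of this patch); `sc` is assumed to be an existing SparkContext, as in the doctests elsewhere in these modules, and the centers and batch values are hypothetical:

    from pyspark.mllib.clustering import StreamingKMeansModel

    # Two initial centers with unit weights (hypothetical values).
    model = StreamingKMeansModel([[0.0, 0.0], [1.0, 1.0]], [1.0, 1.0])
    batch = sc.parallelize([[-0.5, 0.0], [0.5, 0.0], [1.5, 1.0]])

    # decayFactor=1.0: updated centers are a weighted mean of the old
    # centers and the new batch.
    model = model.update(batch, decayFactor=1.0, timeUnit="batches")

    # decayFactor=0.0: the old centroids are completely forgotten; the new
    # centers reflect the latest batch only.
    model = model.update(batch, decayFactor=0.0, timeUnit="batches")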
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 7eaa2282cb..bde0f67be7 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -114,9 +114,9 @@ class JavaVectorTransformer(JavaModelWrapper, VectorTransformer):
"""
Applies transformation on a vector or an RDD[Vector].
- Note: In Python, transform cannot currently be used within
- an RDD transformation or action.
- Call transform directly on the RDD instead.
+ .. note:: In Python, transform cannot currently be used within
+ an RDD transformation or action.
+ Call transform directly on the RDD instead.
:param vector: Vector or RDD of Vector to be transformed.
"""
@@ -139,9 +139,9 @@ class StandardScalerModel(JavaVectorTransformer):
"""
Applies standardization transformation on a vector.
- Note: In Python, transform cannot currently be used within
- an RDD transformation or action.
- Call transform directly on the RDD instead.
+ .. note:: In Python, transform cannot currently be used within
+ an RDD transformation or action.
+ Call transform directly on the RDD instead.
:param vector: Vector or RDD of Vector to be standardized.
:return: Standardized vector. If the variance of a column is
@@ -407,7 +407,7 @@ class HashingTF(object):
Maps a sequence of terms to their term frequencies using the hashing
trick.
- Note: the terms must be hashable (can not be dict/set/list...).
+   .. note:: The terms must be hashable (cannot be dict/set/list...).
:param numFeatures: number of features (default: 2^20)
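
A short usage sketch of the hashing trick described above (illustrative only, not part of this patch); numFeatures is shrunk from the 2^20 default purely for readability:

    from pyspark.mllib.feature import HashingTF

    htf = HashingTF(numFeatures=16)
    # Terms must be hashable (e.g. strings); repeated terms accumulate.
    tf_vector = htf.transform(["spark", "mllib", "spark"])
    # tf_vector is a SparseVector of size 16: count 2 in the bucket that
    # "spark" hashes to, count 1 in the bucket for "mllib".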
@@ -469,9 +469,9 @@ class IDFModel(JavaVectorTransformer):
the terms which occur in fewer than `minDocFreq`
documents will have an entry of 0.
- Note: In Python, transform cannot currently be used within
- an RDD transformation or action.
- Call transform directly on the RDD instead.
+ .. note:: In Python, transform cannot currently be used within
+ an RDD transformation or action.
+ Call transform directly on the RDD instead.
:param x: an RDD of term frequency vectors or a term frequency
vector
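
Continuing the term-frequency sketch (illustrative only, not part of this patch), the IDF model is fit on an RDD of term-frequency vectors and, per the note above, transform is called on that RDD directly; `sc` is assumed to be an existing SparkContext:

    from pyspark.mllib.feature import HashingTF, IDF

    docs = sc.parallelize([["spark", "mllib"], ["spark", "streaming"]])
    tf = HashingTF(numFeatures=16).transform(docs)   # RDD of term-frequency vectors
    tf.cache()
    idf_model = IDF(minDocFreq=1).fit(tf)
    tfidf = idf_model.transform(tf)                  # transform the RDD directly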
@@ -551,7 +551,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
"""
Transforms a word to its vector representation
- Note: local use only
+ .. note:: Local use only
:param word: a word
:return: vector representation of word(s)
@@ -570,7 +570,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
:param num: number of synonyms to find
:return: array of (word, cosineSimilarity)
- Note: local use only
+ .. note:: Local use only
"""
if not isinstance(word, basestring):
word = _convert_to_vector(word)
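
An illustrative sketch (not part of this patch) of the two "local use only" methods touched above; the toy corpus is hypothetical and `sc` is assumed to be an existing SparkContext:

    from pyspark.mllib.feature import Word2Vec

    corpus = sc.parallelize([["a", "b", "c"], ["a", "c", "d"]] * 50)
    model = Word2Vec().setVectorSize(10).setSeed(42).fit(corpus)

    vec = model.transform("a")             # vector for one word, driver-side only
    synonyms = model.findSynonyms("a", 2)  # (word, cosineSimilarity) pairs, driver-side only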
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index d37e715c8d..031f22c020 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -835,11 +835,12 @@ class SparseVector(Vector):
class Vectors(object):
"""
- Factory methods for working with vectors. Note that dense vectors
- are simply represented as NumPy array objects, so there is no need
- to covert them for use in MLlib. For sparse vectors, the factory
- methods in this class create an MLlib-compatible type, or users
- can pass in SciPy's C{scipy.sparse} column vectors.
+ Factory methods for working with vectors.
+
+ .. note:: Dense vectors are simply represented as NumPy array objects,
+   so there is no need to convert them for use in MLlib. For sparse vectors,
+ the factory methods in this class create an MLlib-compatible type, or users
+ can pass in SciPy's C{scipy.sparse} column vectors.
"""
@staticmethod
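
A brief sketch of the factory methods described above (illustrative only, not part of this patch):

    import numpy as np
    from scipy.sparse import csc_matrix
    from pyspark.mllib.linalg import Vectors

    dense = Vectors.dense([1.0, 0.0, 3.0])           # NumPy-backed dense vector
    sparse = Vectors.sparse(3, [0, 2], [1.0, 3.0])   # size, indices, values
    # A SciPy column vector is also accepted wherever MLlib expects a vector.
    scipy_col = csc_matrix(np.array([[1.0], [0.0], [3.0]]))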
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 538cada7d1..600655c912 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -171,8 +171,9 @@ class RowMatrix(DistributedMatrix):
def computeCovariance(self):
"""
Computes the covariance matrix, treating each row as an
- observation. Note that this cannot be computed on matrices
- with more than 65535 columns.
+ observation.
+
+ .. note:: This cannot be computed on matrices with more than 65535 columns.
>>> rows = sc.parallelize([[1, 2], [2, 1]])
>>> mat = RowMatrix(rows)
@@ -185,8 +186,9 @@ class RowMatrix(DistributedMatrix):
@since('2.0.0')
def computeGramianMatrix(self):
"""
- Computes the Gramian matrix `A^T A`. Note that this cannot be
- computed on matrices with more than 65535 columns.
+ Computes the Gramian matrix `A^T A`.
+
+ .. note:: This cannot be computed on matrices with more than 65535 columns.
>>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
>>> mat = RowMatrix(rows)
@@ -458,8 +460,9 @@ class IndexedRowMatrix(DistributedMatrix):
@since('2.0.0')
def computeGramianMatrix(self):
"""
- Computes the Gramian matrix `A^T A`. Note that this cannot be
- computed on matrices with more than 65535 columns.
+ Computes the Gramian matrix `A^T A`.
+
+ .. note:: This cannot be computed on matrices with more than 65535 columns.
>>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
... IndexedRow(1, [4, 5, 6])])
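
As an aside (illustrative only, not part of this patch), the Gramian `A^T A` is returned as a local n x n matrix on the driver, which is what motivates the column limit noted above; for small inputs it can be cross-checked against NumPy. `sc` is assumed to be an existing SparkContext:

    import numpy as np
    from pyspark.mllib.linalg.distributed import RowMatrix

    rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    mat = RowMatrix(sc.parallelize(rows))

    gram = mat.computeGramianMatrix()      # local DenseMatrix, 3 x 3
    local = np.array(rows)
    assert np.allclose(gram.toArray(), local.T.dot(local))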
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 705022934e..1b66f5b510 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -44,7 +44,7 @@ class LabeledPoint(object):
Vector of features for this point (NumPy array, list,
pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix).
- Note: 'label' and 'features' are accessible as class attributes.
+ .. note:: 'label' and 'features' are accessible as class attributes.
.. versionadded:: 1.0.0
"""
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index 67d5f0e44f..49b26446db 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -164,7 +164,6 @@ class Statistics(object):
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
- (Note: `observed` cannot contain negative values)
If `observed` is a matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
@@ -176,6 +175,8 @@ class Statistics(object):
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
+ .. note:: `observed` cannot contain negative values
+
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
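
Both call forms described above, sketched for illustration (not part of this patch), using non-negative observed values as the note requires:

    from pyspark.mllib.linalg import Matrices, Vectors
    from pyspark.mllib.stat import Statistics

    # Goodness-of-fit test, against the uniform distribution by default.
    observed = Vectors.dense([13.0, 47.0, 40.0])
    gof = Statistics.chiSqTest(observed)

    # Independence test on a contingency matrix (no negative entries).
    contingency = Matrices.dense(2, 2, [13.0, 47.0, 40.0, 80.0])
    independence = Statistics.chiSqTest(contingency)
    # Both results expose .statistic, .degreesOfFreedom, .pValue and .method.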
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index b3011d42e5..a6089fc8b9 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -40,9 +40,9 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable):
Predict values for a single data point or an RDD of points using
the model trained.
- Note: In Python, predict cannot currently be used within an RDD
- transformation or action.
- Call predict directly on the RDD instead.
+ .. note:: In Python, predict cannot currently be used within an RDD
+ transformation or action.
+ Call predict directly on the RDD instead.
"""
if isinstance(x, RDD):
return self.call("predict", x.map(_convert_to_vector))
@@ -85,9 +85,9 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader):
"""
Predict the label of one or more examples.
- Note: In Python, predict cannot currently be used within an RDD
- transformation or action.
- Call predict directly on the RDD instead.
+ .. note:: In Python, predict cannot currently be used within an RDD
+ transformation or action.
+ Call predict directly on the RDD instead.
:param x:
Data point (feature vector), or an RDD of data points (feature