author     Yanbo Liang <ybliang8@gmail.com>           2015-05-30 16:24:07 -0700
committer  Joseph K. Bradley <joseph@databricks.com>  2015-05-30 16:24:07 -0700
commit     1617363fbb9b22a2eb09e7bab98c8d05f9508761 (patch)
tree       e473b7f65ebfbd4faafebb919865a348ea821b3f /python
parent     a6430028ecd7a6130f1eb15af9ec00e242c46725 (diff)
[SPARK-7918] [MLLIB] MLlib Python doc parity check for evaluation and feature
Check and update the MLlib Python evaluation and feature docs so they are as complete as the Scala docs.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6461 from yanboliang/spark-7918 and squashes the following commits:

940e3f1 [Yanbo Liang] truncate too long line and remove extra sparse
a80ae58 [Yanbo Liang] MLlib Python doc parity check for evaluation and feature
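The pattern applied throughout the diff below is simple: each `:param` field moves out of the `__init__` docstring and into the class docstring, which is where the Scala API documents constructor parameters and where Sphinx and help() surface them in Python. A minimal sketch of that convention, with a hypothetical `Example` class that is not part of this patch:

    class Example(object):
        """
        One-line description of the evaluator or transformer.

        :param threshold: documented at class level, where Sphinx and
                          help(Example) pick it up (default: 0.5).

        >>> Example(0.1).threshold
        0.1
        """
        def __init__(self, threshold=0.5):
            # No :param block here any more; the class docstring carries it.
            self.threshold = float(threshold)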
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/mllib/evaluation.py  26
-rw-r--r--  python/pyspark/mllib/feature.py     49
2 files changed, 36 insertions(+), 39 deletions(-)
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index aab5e5f4b7..c5cf3a4e7f 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -27,6 +27,8 @@ class BinaryClassificationMetrics(JavaModelWrapper):
"""
Evaluator for binary classification.
+ :param scoreAndLabels: an RDD of (score, label) pairs
+
>>> scoreAndLabels = sc.parallelize([
... (0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2)
>>> metrics = BinaryClassificationMetrics(scoreAndLabels)
@@ -38,9 +40,6 @@ class BinaryClassificationMetrics(JavaModelWrapper):
"""
def __init__(self, scoreAndLabels):
- """
- :param scoreAndLabels: an RDD of (score, label) pairs
- """
sc = scoreAndLabels.ctx
sql_ctx = SQLContext(sc)
df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
@@ -76,6 +75,9 @@ class RegressionMetrics(JavaModelWrapper):
"""
Evaluator for regression.
+ :param predictionAndObservations: an RDD of (prediction,
+ observation) pairs.
+
>>> predictionAndObservations = sc.parallelize([
... (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)])
>>> metrics = RegressionMetrics(predictionAndObservations)
@@ -92,9 +94,6 @@ class RegressionMetrics(JavaModelWrapper):
"""
def __init__(self, predictionAndObservations):
- """
- :param predictionAndObservations: an RDD of (prediction, observation) pairs.
- """
sc = predictionAndObservations.ctx
sql_ctx = SQLContext(sc)
df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([
@@ -148,6 +147,8 @@ class MulticlassMetrics(JavaModelWrapper):
"""
Evaluator for multiclass classification.
+ :param predictionAndLabels: an RDD of (prediction, label) pairs.
+
>>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)])
>>> metrics = MulticlassMetrics(predictionAndLabels)
@@ -176,9 +177,6 @@ class MulticlassMetrics(JavaModelWrapper):
"""
def __init__(self, predictionAndLabels):
- """
- :param predictionAndLabels an RDD of (prediction, label) pairs.
- """
sc = predictionAndLabels.ctx
sql_ctx = SQLContext(sc)
df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
@@ -277,6 +275,9 @@ class RankingMetrics(JavaModelWrapper):
"""
Evaluator for ranking algorithms.
+ :param predictionAndLabels: an RDD of (predicted ranking,
+ ground truth set) pairs.
+
>>> predictionAndLabels = sc.parallelize([
... ([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]),
... ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]),
@@ -298,9 +299,6 @@ class RankingMetrics(JavaModelWrapper):
"""
def __init__(self, predictionAndLabels):
- """
- :param predictionAndLabels: an RDD of (predicted ranking, ground truth set) pairs.
- """
sc = predictionAndLabels.ctx
sql_ctx = SQLContext(sc)
df = sql_ctx.createDataFrame(predictionAndLabels,
@@ -347,6 +345,10 @@ class MultilabelMetrics(JavaModelWrapper):
"""
Evaluator for multilabel classification.
+ :param predictionAndLabels: an RDD of (predictions, labels) pairs,
+ both are non-null Arrays, each with
+ unique elements.
+
>>> predictionAndLabels = sc.parallelize([([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
... ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
... ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])])
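That ends the evaluation.py changes. For orientation, a minimal usage sketch of two of the evaluators whose parameters are documented above, assuming a running SparkContext named `sc` as in the doctests (the sample values are illustrative only):

    from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics

    # (score, label) pairs, as described by the new class-level :param.
    scoreAndLabels = sc.parallelize(
        [(0.1, 0.0), (0.4, 0.0), (0.6, 1.0), (0.8, 1.0)])
    bin_metrics = BinaryClassificationMetrics(scoreAndLabels)
    print(bin_metrics.areaUnderROC)   # area under the ROC curve
    print(bin_metrics.areaUnderPR)    # area under the precision-recall curve

    # (prediction, observation) pairs for regression evaluation.
    predictionAndObservations = sc.parallelize(
        [(2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)])
    reg_metrics = RegressionMetrics(predictionAndObservations)
    print(reg_metrics.rootMeanSquaredError)
    print(reg_metrics.r2)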
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index aac305db6c..da90554f41 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -68,6 +68,8 @@ class Normalizer(VectorTransformer):
For `p` = float('inf'), max(abs(vector)) will be used as norm for
normalization.
+ :param p: Normalization in L^p^ space, p = 2 by default.
+
>>> v = Vectors.dense(range(3))
>>> nor = Normalizer(1)
>>> nor.transform(v)
@@ -82,9 +84,6 @@ class Normalizer(VectorTransformer):
DenseVector([0.0, 0.5, 1.0])
"""
def __init__(self, p=2.0):
- """
- :param p: Normalization in L^p^ space, p = 2 by default.
- """
assert p >= 1.0, "p should be greater than 1.0"
self.p = float(p)
@@ -94,7 +93,7 @@ class Normalizer(VectorTransformer):
:param vector: vector or RDD of vector to be normalized.
:return: normalized vector. If the norm of the input is zero, it
- will return the input vector.
+ will return the input vector.
"""
sc = SparkContext._active_spark_context
assert sc is not None, "SparkContext should be initialized first"
@@ -164,6 +163,13 @@ class StandardScaler(object):
variance using column summary statistics on the samples in the
training set.
+ :param withMean: False by default. Centers the data with mean
+ before scaling. It will build a dense output, so this
+ does not work on sparse input and will raise an
+ exception.
+ :param withStd: True by default. Scales the data to unit
+ standard deviation.
+
>>> vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
>>> dataset = sc.parallelize(vs)
>>> standardizer = StandardScaler(True, True)
@@ -174,14 +180,6 @@ class StandardScaler(object):
DenseVector([0.7071, -0.7071, 0.7071])
"""
def __init__(self, withMean=False, withStd=True):
- """
- :param withMean: False by default. Centers the data with mean
- before scaling. It will build a dense output, so this
- does not work on sparse input and will raise an
- exception.
- :param withStd: True by default. Scales the data to unit
- standard deviation.
- """
if not (withMean or withStd):
warnings.warn("Both withMean and withStd are false. The model does nothing.")
self.withMean = withMean
@@ -193,7 +191,7 @@ class StandardScaler(object):
for later scaling.
:param data: The data used to compute the mean and variance
- to build the transformation model.
+ to build the transformation model.
:return: a StandardScalarModel
"""
dataset = dataset.map(_convert_to_vector)
@@ -223,6 +221,8 @@ class ChiSqSelector(object):
Creates a ChiSquared feature selector.
+ :param numTopFeatures: number of features that selector will select.
+
>>> data = [
... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
... LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),
@@ -236,9 +236,6 @@ class ChiSqSelector(object):
DenseVector([5.0])
"""
def __init__(self, numTopFeatures):
- """
- :param numTopFeatures: number of features that selector will select.
- """
self.numTopFeatures = int(numTopFeatures)
def fit(self, data):
@@ -246,9 +243,9 @@ class ChiSqSelector(object):
Returns a ChiSquared feature selector.
:param data: an `RDD[LabeledPoint]` containing the labeled dataset
- with categorical features. Real-valued features will be
- treated as categorical for each distinct value.
- Apply feature discretizer before using this function.
+ with categorical features. Real-valued features will be
+ treated as categorical for each distinct value.
+ Apply feature discretizer before using this function.
"""
jmodel = callMLlibFunc("fitChiSqSelector", self.numTopFeatures, data)
return ChiSqSelectorModel(jmodel)
@@ -263,15 +260,14 @@ class HashingTF(object):
Note: the terms must be hashable (can not be dict/set/list...).
+ :param numFeatures: number of features (default: 2^20)
+
>>> htf = HashingTF(100)
>>> doc = "a a b b c d".split(" ")
>>> htf.transform(doc)
SparseVector(100, {...})
"""
def __init__(self, numFeatures=1 << 20):
- """
- :param numFeatures: number of features (default: 2^20)
- """
self.numFeatures = numFeatures
def indexOf(self, term):
@@ -311,7 +307,7 @@ class IDFModel(JavaVectorTransformer):
Call transform directly on the RDD instead.
:param x: an RDD of term frequency vectors or a term frequency
- vector
+ vector
:return: an RDD of TF-IDF vectors or a TF-IDF vector
"""
if isinstance(x, RDD):
@@ -342,6 +338,9 @@ class IDF(object):
`minDocFreq`). For terms that are not in at least `minDocFreq`
documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+ :param minDocFreq: minimum of documents in which a term
+ should appear for filtering
+
>>> n = 4
>>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)),
... Vectors.dense([0.0, 1.0, 2.0, 3.0]),
@@ -362,10 +361,6 @@ class IDF(object):
SparseVector(4, {1: 0.0, 3: 0.5754})
"""
def __init__(self, minDocFreq=0):
- """
- :param minDocFreq: minimum of documents in which a term
- should appear for filtering
- """
self.minDocFreq = minDocFreq
def fit(self, dataset):
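To round out the feature.py changes, a minimal TF-IDF sketch using HashingTF and IDF as documented above. It assumes an existing SparkContext `sc`; the sample documents are illustrative only:

    from pyspark.mllib.feature import HashingTF, IDF

    # Hash each term into a 100-dimensional term-frequency vector
    # (numFeatures is documented at class level and defaults to 2^20).
    documents = sc.parallelize(["a a b b c d".split(" "),
                                "b c d e".split(" ")])
    htf = HashingTF(100)
    tf = htf.transform(documents)      # RDD of SparseVectors
    tf.cache()

    # minDocFreq filters out terms appearing in too few documents.
    idfModel = IDF(minDocFreq=1).fit(tf)
    tfidf = idfModel.transform(tf)
    print(tfidf.collect())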