author     Joseph K. Bradley <joseph@databricks.com>   2015-02-25 16:13:17 -0800
committer  Xiangrui Meng <meng@databricks.com>         2015-02-25 16:13:17 -0800
commit     d20559b157743981b9c09e286f2aaff8cbefab59 (patch)
tree       6d92015c1ae6b05c725860685351f86b8c4ed6af /python
parent     46a044a36a2aff1306f7f677e952ce253ddbefac (diff)
download   spark-d20559b157743981b9c09e286f2aaff8cbefab59.tar.gz
           spark-d20559b157743981b9c09e286f2aaff8cbefab59.tar.bz2
           spark-d20559b157743981b9c09e286f2aaff8cbefab59.zip
[SPARK-5974] [SPARK-5980] [mllib] [python] [docs] Update ML guide with save/load, Python GBT
* Add GradientBoostedTrees Python examples to ML guide
  * I ran these in the pyspark shell, and they worked.
* Add save/load to examples in ML guide
* Added note to python docs about predict,transform not working within RDD actions,transformations in some cases (See SPARK-5981)

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #4750 from jkbradley/SPARK-5974 and squashes the following commits:

c410e38 [Joseph K. Bradley] Added note to LabeledPoint about attributes
bcae18b [Joseph K. Bradley] Added import of models for save/load examples in ml guide. Fixed line length for tree.py, feature.py (but not other ML Pyspark files yet).
6d81c3e [Joseph K. Bradley] completed python GBT examples
9903309 [Joseph K. Bradley] Added note to python docs about predict,transform not working within RDD actions,transformations in some cases
c7dfad8 [Joseph K. Bradley] Added model save/load to ML guide. Added GBT examples to ML guide
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/mllib/feature.py    |  67
-rw-r--r--  python/pyspark/mllib/regression.py |   7
-rw-r--r--  python/pyspark/mllib/tree.py       | 156
3 files changed, 141 insertions(+), 89 deletions(-)
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 10df628806..0ffe092a07 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -58,7 +58,8 @@ class Normalizer(VectorTransformer):
For any 1 <= `p` < float('inf'), normalizes samples using
sum(abs(vector) :sup:`p`) :sup:`(1/p)` as norm.
- For `p` = float('inf'), max(abs(vector)) will be used as norm for normalization.
+ For `p` = float('inf'), max(abs(vector)) will be used as norm for
+ normalization.
>>> v = Vectors.dense(range(3))
>>> nor = Normalizer(1)
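As a quick illustration of the norms described above, a minimal sketch
(assuming an active SparkContext, as the surrounding doctests do):

    from pyspark.mllib.feature import Normalizer
    from pyspark.mllib.linalg import Vectors

    v = Vectors.dense([0.0, 1.0, 2.0])
    l1 = Normalizer(p=1.0)             # divides by sum(abs(v)) = 3.0
    linf = Normalizer(p=float("inf"))  # divides by max(abs(v)) = 2.0
    print(l1.transform(v))    # [0.0, 0.333..., 0.666...]
    print(linf.transform(v))  # [0.0, 0.5, 1.0]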
@@ -120,9 +121,14 @@ class StandardScalerModel(JavaVectorTransformer):
"""
Applies standardization transformation on a vector.
+ Note: In Python, transform cannot currently be used within
+ an RDD transformation or action.
+ Call transform directly on the RDD instead.
+
:param vector: Vector or RDD of Vector to be standardized.
- :return: Standardized vector. If the variance of a column is zero,
- it will return default `0.0` for the column with zero variance.
+ :return: Standardized vector. If the variance of a column is
+ zero, it will return `0.0` by default for that column.
"""
return JavaVectorTransformer.transform(self, vector)
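To make the note above concrete, a hedged sketch of the supported call
pattern (the names `sc` and `data` are assumed for illustration):

    from pyspark.mllib.feature import StandardScaler
    from pyspark.mllib.linalg import Vectors

    data = sc.parallelize([Vectors.dense([1.0, 2.0]),
                           Vectors.dense([3.0, 4.0])])
    model = StandardScaler(withMean=True, withStd=True).fit(data)

    # Supported: call transform directly on the RDD.
    scaled = model.transform(data)

    # Not currently supported in Python (see SPARK-5981):
    # scaled = data.map(lambda v: model.transform(v))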
@@ -148,9 +154,10 @@ class StandardScaler(object):
"""
:param withMean: False by default. Centers the data with mean
before scaling. It will build a dense output, so this
- does not work on sparse input and will raise an exception.
- :param withStd: True by default. Scales the data to unit standard
- deviation.
+ does not work on sparse input and will raise an
+ exception.
+ :param withStd: True by default. Scales the data to unit
+ standard deviation.
"""
if not (withMean or withStd):
warnings.warn("Both withMean and withStd are false. The model does nothing.")
@@ -159,10 +166,11 @@ class StandardScaler(object):
def fit(self, dataset):
"""
- Computes the mean and variance and stores as a model to be used for later scaling.
+ Computes the mean and variance and stores them as a model to be
+ used for later scaling.
- :param data: The data used to compute the mean and variance to build
- the transformation model.
+ :param dataset: The data used to compute the mean and variance
+ to build the transformation model.
:return: a StandardScalerModel
"""
dataset = dataset.map(_convert_to_vector)
@@ -174,7 +182,8 @@ class HashingTF(object):
"""
.. note:: Experimental
- Maps a sequence of terms to their term frequencies using the hashing trick.
+ Maps a sequence of terms to their term frequencies using the hashing
+ trick.
Note: the terms must be hashable (cannot be dict/set/list...).
@@ -195,8 +204,9 @@ class HashingTF(object):
def transform(self, document):
"""
- Transforms the input document (list of terms) to term frequency vectors,
- or transform the RDD of document to RDD of term frequency vectors.
+ Transforms the input document (list of terms) to term frequency
+ vectors, or transforms an RDD of documents to an RDD of term
+ frequency vectors.
"""
if isinstance(document, RDD):
return document.map(self.transform)
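A small sketch of the hashing trick in action (the feature dimension
of 100 is arbitrary and purely illustrative):

    from pyspark.mllib.feature import HashingTF

    htf = HashingTF(numFeatures=100)
    # A single document is a list of hashable terms; the result is a
    # SparseVector of counts indexed by hash(term) % numFeatures.
    tf = htf.transform("a b a c".split(" "))
    print(tf)  # SparseVector(100, {...: 2.0, ...: 1.0, ...: 1.0})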
@@ -220,7 +230,12 @@ class IDFModel(JavaVectorTransformer):
the terms which occur in fewer than `minDocFreq`
documents will have an entry of 0.
- :param x: an RDD of term frequency vectors or a term frequency vector
+ Note: In Python, transform cannot currently be used within
+ an RDD transformation or action.
+ Call transform directly on the RDD instead.
+
+ :param x: an RDD of term frequency vectors or a term frequency
+ vector
:return: an RDD of TF-IDF vectors or a TF-IDF vector
"""
if isinstance(x, RDD):
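Putting IDF together with HashingTF, a minimal end-to-end TF-IDF
sketch (assuming `sc`; the corpus and minDocFreq are illustrative):

    from pyspark.mllib.feature import HashingTF, IDF

    docs = sc.parallelize([["a", "b", "a"], ["a", "c"], ["b", "c", "c"]])
    tf = HashingTF(numFeatures=100).transform(docs)
    tf.cache()                    # fit and transform both traverse tf
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)     # called on the RDD directly, per the note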
@@ -241,9 +256,9 @@ class IDF(object):
of documents that contain term `t`.
This implementation supports filtering out terms which do not appear
- in a minimum number of documents (controlled by the variable `minDocFreq`).
- For terms that are not in at least `minDocFreq` documents, the IDF is
- found as 0, resulting in TF-IDFs of 0.
+ in a minimum number of documents (controlled by the variable
+ `minDocFreq`). For terms that are not in at least `minDocFreq`
+ documents, the IDF is found as 0, resulting in TF-IDFs of 0.
>>> n = 4
>>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)),
@@ -325,15 +340,16 @@ class Word2Vec(object):
The vector representation can be used as features in
natural language processing and machine learning algorithms.
- We used skip-gram model in our implementation and hierarchical softmax
- method to train the model. The variable names in the implementation
- matches the original C implementation.
+ We use the skip-gram model in our implementation and the
+ hierarchical softmax method to train it. The variable names in the
+ implementation match the original C implementation.
- For original C implementation, see https://code.google.com/p/word2vec/
+ For original C implementation,
+ see https://code.google.com/p/word2vec/
For research papers, see
Efficient Estimation of Word Representations in Vector Space
- and
- Distributed Representations of Words and Phrases and their Compositionality.
+ and Distributed Representations of Words and Phrases and their
+ Compositionality.
>>> sentence = "a b " * 100 + "a c " * 10
>>> localDoc = [sentence, sentence]
@@ -374,15 +390,16 @@ class Word2Vec(object):
def setNumPartitions(self, numPartitions):
"""
- Sets number of partitions (default: 1). Use a small number for accuracy.
+ Sets number of partitions (default: 1). Use a small number for
+ accuracy.
"""
self.numPartitions = numPartitions
return self
def setNumIterations(self, numIterations):
"""
- Sets number of iterations (default: 1), which should be smaller than or equal to number of
- partitions.
+ Sets number of iterations (default: 1), which should be smaller
+ than or equal to the number of partitions.
"""
self.numIterations = numIterations
return self
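The setters above chain, so a typical training run can be sketched as
follows (corpus contents and parameter values are illustrative only):

    from pyspark.mllib.feature import Word2Vec

    corpus = sc.parallelize([["a", "b", "c"]] * 50)  # RDD of token lists
    model = (Word2Vec()
             .setVectorSize(10)
             .setNumPartitions(1)  # small, per the accuracy note above
             .setNumIterations(1)  # <= number of partitions
             .fit(corpus))
    print(model.findSynonyms("a", 2))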
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 21751cc68f..66617abb85 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -31,8 +31,11 @@ class LabeledPoint(object):
The features and labels of a data point.
:param label: Label for this data point.
- :param features: Vector of features for this point (NumPy array, list,
- pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix)
+ :param features: Vector of features for this point (NumPy array,
+ list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
+ column matrix)
+
+ Note: 'label' and 'features' are accessible as class attributes.
"""
def __init__(self, label, features):
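The attribute access called out in the new note, as a minimal sketch:

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.linalg import Vectors

    lp = LabeledPoint(1.0, Vectors.dense([0.0, 2.0]))
    print(lp.label)     # 1.0
    print(lp.features)  # DenseVector([0.0, 2.0])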
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 02d551b87d..73618f0449 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -33,6 +33,10 @@ class TreeEnsembleModel(JavaModelWrapper):
"""
Predict values for a single data point or an RDD of points using
the trained model.
+
+ Note: In Python, predict cannot currently be used within an RDD
+ transformation or action.
+ Call predict directly on the RDD instead.
"""
if isinstance(x, RDD):
return self.call("predict", x.map(_convert_to_vector))
@@ -48,7 +52,8 @@ class TreeEnsembleModel(JavaModelWrapper):
def totalNumNodes(self):
"""
- Get total number of nodes, summed over all trees in the ensemble.
+ Get total number of nodes, summed over all trees in the
+ ensemble.
"""
return self.call("totalNumNodes")
@@ -71,6 +76,10 @@ class DecisionTreeModel(JavaModelWrapper):
"""
Predict the label of one or more examples.
+ Note: In Python, predict cannot currently be used within an RDD
+ transformation or action.
+ Call predict directly on the RDD instead.
+
:param x: Data point (feature vector),
or an RDD of data points (feature vectors).
"""
@@ -99,7 +108,8 @@ class DecisionTree(object):
"""
.. note:: Experimental
- Learning algorithm for a decision tree model for classification or regression.
+ Learning algorithm for a decision tree model for classification or
+ regression.
"""
@classmethod
@@ -176,17 +186,17 @@ class DecisionTree(object):
:param data: Training data: RDD of LabeledPoint.
Labels are real numbers.
- :param categoricalFeaturesInfo: Map from categorical feature index
- to number of categories.
- Any feature not in this map
- is treated as continuous.
+ :param categoricalFeaturesInfo: Map from categorical feature
+ index to number of categories.
+ Any feature not in this map is treated as continuous.
:param impurity: Supported values: "variance"
:param maxDepth: Max depth of tree.
- E.g., depth 0 means 1 leaf node.
- Depth 1 means 1 internal node + 2 leaf nodes.
- :param maxBins: Number of bins used for finding splits at each node.
- :param minInstancesPerNode: Min number of instances required at child
- nodes to create the parent split
+ E.g., depth 0 means 1 leaf node.
+ Depth 1 means 1 internal node + 2 leaf nodes.
+ :param maxBins: Number of bins used for finding splits at each
+ node.
+ :param minInstancesPerNode: Min number of instances required at
+ child nodes to create the parent split
:param minInfoGain: Min info gain required to create a split
:return: DecisionTreeModel
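A hedged sketch of a trainRegressor call using these parameters (toy
data, assuming an active SparkContext `sc`):

    from pyspark.mllib.tree import DecisionTree
    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(0.0, [0.0]),
                           LabeledPoint(1.0, [1.0]),
                           LabeledPoint(3.0, [2.0])])
    model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo={},
                                        impurity="variance", maxDepth=2)
    print(model.predict([1.0]))  # single data point; fine at the driver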
@@ -229,7 +239,8 @@ class RandomForest(object):
"""
.. note:: Experimental
- Learning algorithm for a random forest model for classification or regression.
+ Learning algorithm for a random forest model for classification or
+ regression.
"""
supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
@@ -256,26 +267,33 @@ class RandomForest(object):
Method to train a decision tree model for binary or multiclass
classification.
- :param data: Training dataset: RDD of LabeledPoint. Labels should take
- values {0, 1, ..., numClasses-1}.
+ :param data: Training dataset: RDD of LabeledPoint. Labels
+ should take values {0, 1, ..., numClasses-1}.
:param numClasses: number of classes for classification.
- :param categoricalFeaturesInfo: Map storing arity of categorical features.
- E.g., an entry (n -> k) indicates that feature n is categorical
- with k categories indexed from 0: {0, 1, ..., k-1}.
+ :param categoricalFeaturesInfo: Map storing arity of categorical
+ features. E.g., an entry (n -> k) indicates that
+ feature n is categorical with k categories indexed
+ from 0: {0, 1, ..., k-1}.
:param numTrees: Number of trees in the random forest.
- :param featureSubsetStrategy: Number of features to consider for splits at
- each node.
- Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
- If "auto" is set, this parameter is set based on numTrees:
- if numTrees == 1, set to "all";
- if numTrees > 1 (forest) set to "sqrt".
- :param impurity: Criterion used for information gain calculation.
+ :param featureSubsetStrategy: Number of features to consider for
+ splits at each node.
+ Supported: "auto" (default), "all", "sqrt", "log2",
+ "onethird".
+ If "auto" is set, this parameter is set based on
+ numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest) set to "sqrt".
+ :param impurity: Criterion used for information gain
+ calculation.
Supported values: "gini" (recommended) or "entropy".
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
- depth 1 means 1 internal node + 2 leaf nodes. (default: 4)
- :param maxBins: maximum number of bins used for splitting features
+ :param maxDepth: Maximum depth of the tree.
+ E.g., depth 0 means 1 leaf node; depth 1 means
+ 1 internal node + 2 leaf nodes. (default: 4)
+ :param maxBins: maximum number of bins used for splitting
+ features
(default: 100)
- :param seed: Random seed for bootstrapping and choosing feature subsets.
+ :param seed: Random seed for bootstrapping and choosing feature
+ subsets.
:return: RandomForestModel that can be used for prediction
Example usage:
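The doctest is elided by the diff context; purely as an illustration,
a hedged sketch of the call on toy data:

    from pyspark.mllib.tree import RandomForest
    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
                           LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])
    model = RandomForest.trainClassifier(data, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, seed=42)
    print(model.numTrees())  # 3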
@@ -337,19 +355,24 @@ class RandomForest(object):
{0, 1, ..., k-1}.
:param numTrees: Number of trees in the random forest.
:param featureSubsetStrategy: Number of features to consider for
- splits at each node.
- Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
- If "auto" is set, this parameter is set based on numTrees:
- if numTrees == 1, set to "all";
- if numTrees > 1 (forest) set to "onethird" for regression.
- :param impurity: Criterion used for information gain calculation.
- Supported values: "variance".
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
- leaf node; depth 1 means 1 internal node + 2 leaf nodes.
- (default: 4)
- :param maxBins: maximum number of bins used for splitting features
- (default: 100)
- :param seed: Random seed for bootstrapping and choosing feature subsets.
+ splits at each node.
+ Supported: "auto" (default), "all", "sqrt", "log2",
+ "onethird".
+ If "auto" is set, this parameter is set based on
+ numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest) set to "onethird" for
+ regression.
+ :param impurity: Criterion used for information gain
+ calculation.
+ Supported values: "variance".
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
+ 1 leaf node; depth 1 means 1 internal node + 2 leaf
+ nodes. (default: 4)
+ :param maxBins: maximum number of bins used for splitting
+ features (default: 100)
+ :param seed: Random seed for bootstrapping and choosing feature
+ subsets.
:return: RandomForestModel that can be used for prediction
Example usage:
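Again, the doctest is elided here; an illustrative sketch only:

    from pyspark.mllib.tree import RandomForest
    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(float(x), [float(x)])
                           for x in range(6)])
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
                                        numTrees=3, seed=42)
    print(model.totalNumNodes())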
@@ -395,7 +418,8 @@ class GradientBoostedTrees(object):
"""
.. note:: Experimental
- Learning algorithm for a gradient boosted trees model for classification or regression.
+ Learning algorithm for a gradient boosted trees model for
+ classification or regression.
"""
@classmethod
@@ -411,24 +435,29 @@ class GradientBoostedTrees(object):
def trainClassifier(cls, data, categoricalFeaturesInfo,
loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
"""
- Method to train a gradient-boosted trees model for classification.
+ Method to train a gradient-boosted trees model for
+ classification.
- :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}.
+ :param data: Training dataset: RDD of LabeledPoint.
+ Labels should take values {0, 1}.
:param categoricalFeaturesInfo: Map storing arity of categorical
features. E.g., an entry (n -> k) indicates that feature
n is categorical with k categories indexed from 0:
{0, 1, ..., k-1}.
- :param loss: Loss function used for minimization during gradient boosting.
- Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}.
+ :param loss: Loss function used for minimization during gradient
+ boosting. Supported: {"logLoss" (default),
+ "leastSquaresError", "leastAbsoluteError"}.
:param numIterations: Number of iterations of boosting.
(default: 100)
- :param learningRate: Learning rate for shrinking the contribution of each estimator.
- The learning rate should be between in the interval (0, 1]
- (default: 0.1)
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
- leaf node; depth 1 means 1 internal node + 2 leaf nodes.
- (default: 3)
- :return: GradientBoostedTreesModel that can be used for prediction
+ :param learningRate: Learning rate for shrinking the
+ contribution of each estimator. The learning rate
+ should be in the interval (0, 1].
+ (default: 0.1)
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
+ 1 leaf node; depth 1 means 1 internal node + 2 leaf
+ nodes. (default: 3)
+ :return: GradientBoostedTreesModel that can be used for
+ prediction
Example usage:
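The doctest is elided by the diff context; as an illustration, a
hedged sketch of trainClassifier on toy data:

    from pyspark.mllib.tree import GradientBoostedTrees
    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
                           LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])
    model = GradientBoostedTrees.trainClassifier(
        data, categoricalFeaturesInfo={}, numIterations=10)
    print(model.predict([2.5]))  # single point; fine at the driver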
@@ -472,17 +501,20 @@ class GradientBoostedTrees(object):
features. E.g., an entry (n -> k) indicates that feature
n is categorical with k categories indexed from 0:
{0, 1, ..., k-1}.
- :param loss: Loss function used for minimization during gradient boosting.
- Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}.
+ :param loss: Loss function used for minimization during gradient
+ boosting. Supported: {"logLoss" (default),
+ "leastSquaresError", "leastAbsoluteError"}.
:param numIterations: Number of iterations of boosting.
(default: 100)
- :param learningRate: Learning rate for shrinking the contribution of each estimator.
- The learning rate should be between in the interval (0, 1]
- (default: 0.1)
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
- leaf node; depth 1 means 1 internal node + 2 leaf nodes.
- (default: 3)
- :return: GradientBoostedTreesModel that can be used for prediction
+ :param learningRate: Learning rate for shrinking the
+ contribution of each estimator. The learning rate
+ should be in the interval (0, 1].
+ (default: 0.1)
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
+ 1 leaf node; depth 1 means 1 internal node + 2 leaf
+ nodes. (default: 3)
+ :return: GradientBoostedTreesModel that can be used for
+ prediction
Example usage:
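The regression doctest is likewise elided; a final illustrative
sketch, tying in the predict-on-RDD pattern from the note above:

    from pyspark.mllib.tree import GradientBoostedTrees
    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(float(x), [float(x)])
                           for x in range(6)])
    model = GradientBoostedTrees.trainRegressor(
        data, categoricalFeaturesInfo={}, numIterations=10, maxDepth=2)
    # Predict on the RDD directly, per the TreeEnsembleModel.predict note.
    predictions = model.predict(data.map(lambda lp: lp.features))
    print(predictions.collect())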