path: root/python/pyspark/ml
Diffstat (limited to 'python/pyspark/ml')
-rw-r--r--  python/pyspark/ml/classification.py    45
-rw-r--r--  python/pyspark/ml/clustering.py          8
-rwxr-xr-x  python/pyspark/ml/feature.py            13
-rw-r--r--  python/pyspark/ml/linalg/__init__.py    11
-rw-r--r--  python/pyspark/ml/regression.py         32
5 files changed, 56 insertions, 53 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 83e1e89347..8054a34db3 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -440,9 +440,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
.. seealso:: `Wikipedia reference \
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
- Note: This ignores instance weights (setting all to 1.0) from
- `LogisticRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LogisticRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("roc")
@@ -453,9 +453,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
Computes the area under the receiver operating characteristic
(ROC) curve.
- Note: This ignores instance weights (setting all to 1.0) from
- `LogisticRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LogisticRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("areaUnderROC")
@@ -467,9 +467,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
containing two fields recall, precision with (0.0, 1.0) prepended
to it.
- Note: This ignores instance weights (setting all to 1.0) from
- `LogisticRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LogisticRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("pr")
@@ -480,9 +480,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
Returns a dataframe with two fields (threshold, F-Measure) curve
with beta = 1.0.
- Note: This ignores instance weights (setting all to 1.0) from
- `LogisticRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LogisticRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("fMeasureByThreshold")
@@ -494,9 +494,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
Every possible probability obtained in transforming the dataset
are used as thresholds used in calculating the precision.
- Note: This ignores instance weights (setting all to 1.0) from
- `LogisticRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LogisticRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("precisionByThreshold")
@@ -508,9 +508,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
Every possible probability obtained in transforming the dataset
are used as thresholds used in calculating the recall.
- Note: This ignores instance weights (setting all to 1.0) from
- `LogisticRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LogisticRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("recallByThreshold")
@@ -695,9 +695,9 @@ class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel
where gain is scaled by the number of instances passing through node
- Normalize importances for tree to sum to 1.
- Note: Feature importance for single decision trees can have high variance due to
- correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
- to determine feature importance instead.
+ .. note:: Feature importance for single decision trees can have high variance due to
+ correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
+ to determine feature importance instead.
"""
return self._call_java("featureImportances")
@@ -839,7 +839,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
`Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
learning algorithm for classification.
It supports binary labels, as well as both continuous and categorical features.
- Note: Multiclass labels are not currently supported.
The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
@@ -851,6 +850,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
- We expect to implement TreeBoost in the future:
`SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_
+ .. note:: Multiclass labels are not currently supported.
+
>>> from numpy import allclose
>>> from pyspark.ml.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
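The summary accessors documented above are reached through LogisticRegressionModel.summary. As a rough, illustrative sketch (not part of this patch; the toy data and the local SparkSession are assumptions):

    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.classification import LogisticRegression

    spark = SparkSession.builder.master("local[2]").getOrCreate()
    df = spark.createDataFrame(
        [(0.0, Vectors.dense(0.0, 1.0)),
         (0.0, Vectors.dense(0.1, 0.8)),
         (1.0, Vectors.dense(1.0, 0.0)),
         (1.0, Vectors.dense(0.9, 0.2))],
        ["label", "features"])

    model = LogisticRegression(maxIter=10).fit(df)
    summary = model.summary  # BinaryLogisticRegressionSummary for binary labels

    # Each accessor below ignores LogisticRegression.weightCol, as the notes above state.
    print(summary.areaUnderROC)
    summary.roc.show()                  # DataFrame of (FPR, TPR) points
    summary.pr.show()                   # DataFrame of (recall, precision) points
    summary.fMeasureByThreshold.show()  # (threshold, F-Measure) with beta = 1.0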
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index e58ec1e7ac..b29b5ac70e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -155,7 +155,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
While this process is generally guaranteed to converge, it is not guaranteed
to find a global optimum.
- Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+ .. note:: For high-dimensional data (with many features), this algorithm may perform poorly.
This is due to high-dimensional data (a) making it difficult to cluster at all
(based on statistical/theoretical arguments) and (b) numerical issues with
Gaussian distributions.
@@ -749,9 +749,9 @@ class DistributedLDAModel(LDAModel, JavaMLReadable, JavaMLWritable):
If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may
be saved checkpoint files. This method is provided so that users can manage those files.
- Note that removing the checkpoints can cause failures if a partition is lost and is needed
- by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up the
- checkpoints when this model and derivative data go out of scope.
+ .. note:: Removing the checkpoints can cause failures if a partition is lost and is needed
+ by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up
+ the checkpoints when this model and derivative data go out of scope.
:return List of checkpoint files from training
"""
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 635cf13045..40b63d4d31 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -742,8 +742,8 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)
- Note that since zero values will probably be transformed to non-zero values, output of the
- transformer will be DenseVector even for sparse input.
+ .. note:: Since zero values will probably be transformed to non-zero values, output of the
+ transformer will be DenseVector even for sparse input.
>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
@@ -1014,9 +1014,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
:py:attr:`dropLast`) because it makes the vector entries sum up to
one, and hence linearly dependent.
So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
- Note that this is different from scikit-learn's OneHotEncoder,
- which keeps all categories.
- The output vectors are sparse.
+
+ .. note:: This is different from scikit-learn's OneHotEncoder,
+ which keeps all categories. The output vectors are sparse.
.. seealso::
@@ -1698,7 +1698,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
A feature transformer that filters out stop words from input.
- Note: null values from input array are preserved unless adding null to stopWords explicitly.
+
+ .. note:: null values from input array are preserved unless adding null to stopWords explicitly.
>>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"])
>>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"])
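The MinMaxScaler and StopWordsRemover behaviours noted above can be checked directly; an illustrative sketch (not part of this patch, toy data assumed):

    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import MinMaxScaler, StopWordsRemover

    spark = SparkSession.builder.master("local[2]").getOrCreate()

    # MinMaxScaler: zeros are generally rescaled to non-zero values, so the
    # output column holds DenseVector even when the input vector is sparse.
    df = spark.createDataFrame(
        [(Vectors.sparse(3, [0], [1.0]),), (Vectors.dense(2.0, 3.0, 4.0),)], ["a"])
    scaler = MinMaxScaler(inputCol="a", outputCol="scaled")
    scaler.fit(df).transform(df).show(truncate=False)

    # StopWordsRemover: None entries pass through unless None is listed in stopWords.
    tokens = spark.createDataFrame([(["a", None, "b"],)], ["text"])
    remover = StopWordsRemover(inputCol="text", outputCol="clean", stopWords=["b"])
    remover.transform(tokens).show(truncate=False)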
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index a5df727fdb..1705c156ce 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -746,11 +746,12 @@ class SparseVector(Vector):
class Vectors(object):
"""
- Factory methods for working with vectors. Note that dense vectors
- are simply represented as NumPy array objects, so there is no need
- to covert them for use in MLlib. For sparse vectors, the factory
- methods in this class create an MLlib-compatible type, or users
- can pass in SciPy's C{scipy.sparse} column vectors.
+ Factory methods for working with vectors.
+
+ .. note:: Dense vectors are simply represented as NumPy array objects,
+ so there is no need to convert them for use in MLlib. For sparse vectors,
+ the factory methods in this class create an MLlib-compatible type, or users
+ can pass in SciPy's C{scipy.sparse} column vectors.
"""
@staticmethod
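As a quick reference for the reworded Vectors docstring, dense vectors wrap NumPy arrays while sparse vectors use an MLlib-specific type; an illustrative snippet (not part of the patch):

    from pyspark.ml.linalg import Vectors

    dense = Vectors.dense([1.0, 0.0, 3.0])
    sparse = Vectors.sparse(3, [0, 2], [1.0, 3.0])  # size, indices, values

    print(type(dense))        # DenseVector, backed by a NumPy array
    print(dense.toArray())    # array([1., 0., 3.])
    print(sparse.toArray())   # densified copy: array([1., 0., 3.])
    print(dense.dot(sparse))  # 10.0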
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 385391ba53..b42e807069 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -245,9 +245,9 @@ class LinearRegressionSummary(JavaWrapper):
.. seealso:: `Wikipedia explain variation \
<http://en.wikipedia.org/wiki/Explained_variation>`_
- Note: This ignores instance weights (setting all to 1.0) from
- `LinearRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LinearRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("explainedVariance")
@@ -259,9 +259,9 @@ class LinearRegressionSummary(JavaWrapper):
corresponding to the expected value of the absolute error
loss or l1-norm loss.
- Note: This ignores instance weights (setting all to 1.0) from
- `LinearRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LinearRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("meanAbsoluteError")
@@ -273,9 +273,9 @@ class LinearRegressionSummary(JavaWrapper):
corresponding to the expected value of the squared error
loss or quadratic loss.
- Note: This ignores instance weights (setting all to 1.0) from
- `LinearRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LinearRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("meanSquaredError")
@@ -286,9 +286,9 @@ class LinearRegressionSummary(JavaWrapper):
Returns the root mean squared error, which is defined as the
square root of the mean squared error.
- Note: This ignores instance weights (setting all to 1.0) from
- `LinearRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LinearRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("rootMeanSquaredError")
@@ -301,9 +301,9 @@ class LinearRegressionSummary(JavaWrapper):
.. seealso:: `Wikipedia coefficient of determination \
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`
- Note: This ignores instance weights (setting all to 1.0) from
- `LinearRegression.weightCol`. This will change in later Spark
- versions.
+ .. note:: This ignores instance weights (setting all to 1.0) from
+ `LinearRegression.weightCol`. This will change in later Spark
+ versions.
"""
return self._call_java("r2")
@@ -822,7 +822,7 @@ class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReada
where gain is scaled by the number of instances passing through node
- Normalize importances for tree to sum to 1.
- Note: Feature importance for single decision trees can have high variance due to
+ .. note:: Feature importance for single decision trees can have high variance due to
correlated predictor variables. Consider using a :py:class:`RandomForestRegressor`
to determine feature importance instead.
"""