Diffstat (limited to 'python/pyspark/ml')
-rw-r--r--  python/pyspark/ml/classification.py  45
-rw-r--r--  python/pyspark/ml/clustering.py        8
-rwxr-xr-x  python/pyspark/ml/feature.py          13
-rw-r--r--  python/pyspark/ml/linalg/__init__.py  11
-rw-r--r--  python/pyspark/ml/regression.py       32
5 files changed, 56 insertions, 53 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 83e1e89347..8054a34db3 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -440,9 +440,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         .. seealso:: `Wikipedia reference \
         <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LogisticRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LogisticRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("roc")

@@ -453,9 +453,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         Computes the area under the receiver operating characteristic
         (ROC) curve.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LogisticRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LogisticRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("areaUnderROC")

@@ -467,9 +467,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         containing two fields recall, precision with (0.0, 1.0) prepended
         to it.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LogisticRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LogisticRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("pr")

@@ -480,9 +480,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         Returns a dataframe with two fields (threshold, F-Measure) curve
         with beta = 1.0.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LogisticRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LogisticRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("fMeasureByThreshold")

@@ -494,9 +494,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         Every possible probability obtained in transforming the dataset
         are used as thresholds used in calculating the precision.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LogisticRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LogisticRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("precisionByThreshold")

@@ -508,9 +508,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         Every possible probability obtained in transforming the dataset
         are used as thresholds used in calculating the recall.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LogisticRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LogisticRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("recallByThreshold")

@@ -695,9 +695,9 @@ class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel
               where gain is scaled by the number of instances passing through node
            - Normalize importances for tree to sum to 1.

-        Note: Feature importance for single decision trees can have high variance due to
-              correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
-              to determine feature importance instead.
+        .. note:: Feature importance for single decision trees can have high variance due to
+            correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
+            to determine feature importance instead.
         """
         return self._call_java("featureImportances")

@@ -839,7 +839,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
     learning algorithm for classification.
     It supports binary labels, as well as both continuous and categorical features.
-    Note: Multiclass labels are not currently supported.

     The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting."
     1999.
@@ -851,6 +850,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     - We expect to implement TreeBoost in the future:
       `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_

+    .. note:: Multiclass labels are not currently supported.
+
     >>> from numpy import allclose
     >>> from pyspark.ml.linalg import Vectors
     >>> from pyspark.ml.feature import StringIndexer
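For context on the summary API these notes document, a minimal sketch of reading the affected metrics from a training summary. It assumes an active SparkSession named `spark` (as in the doctests above); the data values are illustrative only.

    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.linalg import Vectors

    # Toy binary-label data; real use would have many more rows.
    df = spark.createDataFrame([
        (0.0, Vectors.dense(0.0, 1.0)),
        (1.0, Vectors.dense(1.0, 0.0)),
    ], ["label", "features"])

    model = LogisticRegression(maxIter=5).fit(df)
    summary = model.summary  # BinaryLogisticRegressionSummary for binary labels

    # Per the notes above, these metrics currently ignore weightCol.
    print(summary.areaUnderROC)
    summary.roc.show()
    summary.pr.show()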
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index e58ec1e7ac..b29b5ac70e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -155,7 +155,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     While this process is generally guaranteed to converge, it is not guaranteed
     to find a global optimum.

-    Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+    .. note:: For high-dimensional data (with many features), this algorithm may perform poorly.
           This is due to high-dimensional data (a) making it difficult to cluster at all (based
           on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.

@@ -749,9 +749,9 @@ class DistributedLDAModel(LDAModel, JavaMLReadable, JavaMLWritable):
         If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may
         be saved checkpoint files.  This method is provided so that users can manage those files.

-        Note that removing the checkpoints can cause failures if a partition is lost and is needed
-        by certain :py:class:`DistributedLDAModel` methods.  Reference counting will clean up the
-        checkpoints when this model and derivative data go out of scope.
+        .. note:: Removing the checkpoints can cause failures if a partition is lost and is needed
+            by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up
+            the checkpoints when this model and derivative data go out of scope.

         :return  List of checkpoint files from training
         """
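The GaussianMixture caveat above is about dimensionality rather than API shape; a minimal low-dimensional sketch, again assuming a live `spark` session and made-up values:

    from pyspark.ml.clustering import GaussianMixture
    from pyspark.ml.linalg import Vectors

    # Two well-separated 1-D clusters; EM converges easily here,
    # unlike the high-dimensional case the note warns about.
    df = spark.createDataFrame([
        (Vectors.dense(-1.0),), (Vectors.dense(-0.9),),
        (Vectors.dense(5.0),), (Vectors.dense(5.1),),
    ], ["features"])

    model = GaussianMixture(k=2, seed=42).fit(df)
    print(model.weights)       # mixing weights of the two Gaussians
    model.gaussiansDF.show()   # per-component mean and covariance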
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 635cf13045..40b63d4d31 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -742,8 +742,8 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav

     For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)

-    Note that since zero values will probably be transformed to non-zero values, output of the
-    transformer will be DenseVector even for sparse input.
+    .. note:: Since zero values will probably be transformed to non-zero values, output of the
+        transformer will be DenseVector even for sparse input.

     >>> from pyspark.ml.linalg import Vectors
     >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
@@ -1014,9 +1014,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
     :py:attr:`dropLast`) because it makes the vector entries sum up to one,
     and hence linearly dependent.
     So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
-    Note that this is different from scikit-learn's OneHotEncoder,
-    which keeps all categories.
-    The output vectors are sparse.
+
+    .. note:: This is different from scikit-learn's OneHotEncoder,
+        which keeps all categories. The output vectors are sparse.

     .. seealso::

@@ -1698,7 +1698,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
 class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     A feature transformer that filters out stop words from input.
-    Note: null values from input array are preserved unless adding null to stopWords explicitly.
+
+    .. note:: null values from input array are preserved unless adding null to stopWords explicitly.

     >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"])
     >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"])
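To make the scikit-learn comparison in the OneHotEncoder note concrete, a small sketch (assuming `spark`; the category indices are illustrative):

    from pyspark.ml.feature import OneHotEncoder

    df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["categoryIndex"])
    encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")

    # With the default dropLast=True, the last category (2.0) encodes as an
    # all-zero sparse vector; scikit-learn's OneHotEncoder would keep it.
    encoder.transform(df).show()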
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index a5df727fdb..1705c156ce 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -746,11 +746,12 @@ class SparseVector(Vector):

 class Vectors(object):

     """
-    Factory methods for working with vectors. Note that dense vectors
-    are simply represented as NumPy array objects, so there is no need
-    to covert them for use in MLlib. For sparse vectors, the factory
-    methods in this class create an MLlib-compatible type, or users
-    can pass in SciPy's C{scipy.sparse} column vectors.
+    Factory methods for working with vectors.
+
+    .. note:: Dense vectors are simply represented as NumPy array objects,
+        so there is no need to covert them for use in MLlib. For sparse vectors,
+        the factory methods in this class create an MLlib-compatible type, or users
+        can pass in SciPy's C{scipy.sparse} column vectors.
     """

     @staticmethod
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 385391ba53..b42e807069 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -245,9 +245,9 @@ class LinearRegressionSummary(JavaWrapper):
         .. seealso:: `Wikipedia explain variation \
         <http://en.wikipedia.org/wiki/Explained_variation>`_

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LinearRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LinearRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("explainedVariance")

@@ -259,9 +259,9 @@ class LinearRegressionSummary(JavaWrapper):
         corresponding to the expected value of the absolute error loss or
         l1-norm loss.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LinearRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LinearRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("meanAbsoluteError")

@@ -273,9 +273,9 @@ class LinearRegressionSummary(JavaWrapper):
         corresponding to the expected value of the squared error loss or
         quadratic loss.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LinearRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LinearRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("meanSquaredError")

@@ -286,9 +286,9 @@ class LinearRegressionSummary(JavaWrapper):
         Returns the root mean squared error, which is defined as the
         square root of the mean squared error.

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LinearRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LinearRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("rootMeanSquaredError")

@@ -301,9 +301,9 @@ class LinearRegressionSummary(JavaWrapper):
         .. seealso:: `Wikipedia coefficient of determination \
         <http://en.wikipedia.org/wiki/Coefficient_of_determination>`

-        Note: This ignores instance weights (setting all to 1.0) from
-        `LinearRegression.weightCol`. This will change in later Spark
-        versions.
+        .. note:: This ignores instance weights (setting all to 1.0) from
+            `LinearRegression.weightCol`. This will change in later Spark
+            versions.
         """
         return self._call_java("r2")

@@ -822,7 +822,7 @@ class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReada
               where gain is scaled by the number of instances passing through node
            - Normalize importances for tree to sum to 1.

-        Note: Feature importance for single decision trees can have high variance due to
+        .. note:: Feature importance for single decision trees can have high variance due to
               correlated predictor variables. Consider using a :py:class:`RandomForestRegressor`
               to determine feature importance instead.
         """
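Finally, tying the Vectors factory methods and the LinearRegressionSummary notes together, a minimal sketch (assuming `spark`; toy data):

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.regression import LinearRegression

    # Dense and sparse vectors are interchangeable as feature columns.
    df = spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (2.0, Vectors.sparse(1, [0], [2.0])),
        (3.0, Vectors.dense(3.0)),
    ], ["label", "features"])

    model = LinearRegression().fit(df)
    summary = model.summary

    # As the notes above state, these ignore LinearRegression.weightCol for now.
    print(summary.rootMeanSquaredError, summary.meanAbsoluteError, summary.r2)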