From b1bc5ebdd52ed12aea3fdc7b8f2fa2d00ea09c6b Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 19 May 2016 04:48:36 +0200 Subject: [DOC][MINOR] ml.feature Scala and Python API sync ## What changes were proposed in this pull request? I reviewed Scala and Python APIs for ml.feature and corrected discrepancies. ## How was this patch tested? Built docs locally, ran style checks Author: Bryan Cutler Closes #13159 from BryanCutler/ml.feature-api-sync. --- python/pyspark/ml/feature.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'python') diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 983b6a5301..497f2ad68e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -352,7 +352,7 @@ class CountVectorizerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by CountVectorizer. + Model fitted by :py:class:`CountVectorizer`. .. versionadded:: 1.6.0 """ @@ -609,7 +609,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab """ minDocFreq = Param(Params._dummy(), "minDocFreq", - "minimum of documents in which a term should appear for filtering", + "minimum number of documents in which a term should appear for filtering", typeConverter=TypeConverters.toInt) @keyword_only @@ -655,7 +655,7 @@ class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by IDF. + Model fitted by :py:class:`IDF`. .. versionadded:: 1.4.0 """ @@ -1302,7 +1302,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)", typeConverter=TypeConverters.toInt) - gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") + gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens " + + "(False)") pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing", typeConverter=TypeConverters.toString) toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " + @@ -1549,7 +1550,7 @@ class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by StandardScaler. + Model fitted by :py:class:`StandardScaler`. .. versionadded:: 1.4.0 """ @@ -1641,7 +1642,7 @@ class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by StringIndexer. + Model fitted by :py:class:`StringIndexer`. .. versionadded:: 1.4.0 """ @@ -1907,7 +1908,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja """ .. note:: Experimental - Class for indexing categorical feature columns in a dataset of [[Vector]]. + Class for indexing categorical feature columns in a dataset of `Vector`. This has 2 usage modes: - Automatically identify categorical features (default behavior) @@ -2023,7 +2024,17 @@ class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by VectorIndexer. + Model fitted by :py:class:`VectorIndexer`. + + Transform categorical features to use 0-based indices instead of their original values. + - Categorical features are mapped to indices. + - Continuous features (columns) are left unchanged. + + This also appends metadata to the output column, marking features as Numeric (continuous), + Nominal (categorical), or Binary (either continuous or categorical). + Non-ML metadata is not carried over from the input to the output column. + + This maintains vector sparsity. .. versionadded:: 1.4.0 """ @@ -2296,7 +2307,7 @@ class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by Word2Vec. + Model fitted by :py:class:`Word2Vec`. .. versionadded:: 1.4.0 """ @@ -2327,7 +2338,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab """ .. note:: Experimental - PCA trains a model to project vectors to a low-dimensional space using PCA. + PCA trains a model to project vectors to a lower dimensional space of the + top :py:attr:`k` principal components. >>> from pyspark.ml.linalg import Vectors >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), @@ -2401,7 +2413,7 @@ class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by PCA. + Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space. .. versionadded:: 1.5.0 """ @@ -2532,7 +2544,8 @@ class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by :py:class:`RFormula`. + Model fitted by :py:class:`RFormula`. Fitting is required to determine the + factor levels of formula terms. .. versionadded:: 1.5.0 """ @@ -2624,7 +2637,7 @@ class ChiSqSelectorModel(JavaModel, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental - Model fitted by ChiSqSelector. + Model fitted by :py:class:`ChiSqSelector`. .. versionadded:: 2.0.0 """ -- cgit v1.2.3