about summary refs log tree commit diff
diff options
context:
space:
mode:
authorBryan Cutler <cutlerb@gmail.com>2016-05-19 04:48:36 +0200
committerNick Pentreath <nickp@za.ibm.com>2016-05-19 04:48:36 +0200
commitb1bc5ebdd52ed12aea3fdc7b8f2fa2d00ea09c6b (patch)
treecb5de53388d4a136fafd7eec3fc877f472c7098a
parent4987f39ac7a694e1c8b8b82246eb4fbd863201c4 (diff)
downloadspark-b1bc5ebdd52ed12aea3fdc7b8f2fa2d00ea09c6b.tar.gz
spark-b1bc5ebdd52ed12aea3fdc7b8f2fa2d00ea09c6b.tar.bz2
spark-b1bc5ebdd52ed12aea3fdc7b8f2fa2d00ea09c6b.zip
[DOC][MINOR] ml.feature Scala and Python API sync
## What changes were proposed in this pull request?

I reviewed Scala and Python APIs for ml.feature and corrected discrepancies.

## How was this patch tested?

Built docs locally, ran style checks

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #13159 from BryanCutler/ml.feature-api-sync.
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala4
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala5
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala4
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala3
-rwxr-xr-xpython/pyspark/ml/feature.py39
5 files changed, 36 insertions, 19 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index f85f4c65af..08beda6d75 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -38,12 +38,12 @@ import org.apache.spark.sql.types.StructType
private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol {
/**
- * The minimum of documents in which a term should appear.
+ * The minimum number of documents in which a term should appear.
* Default: 0
* @group param
*/
final val minDocFreq = new IntParam(
- this, "minDocFreq", "minimum of documents in which a term should appear for filtering")
+ this, "minDocFreq", "minimum number of documents in which a term should appear for filtering")
setDefault(minDocFreq -> 0)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 141d3b924b..dbbaa5aa46 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -53,7 +53,8 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC
/**
* :: Experimental ::
- * PCA trains a model to project vectors to a low-dimensional space using PCA.
+ * PCA trains a model to project vectors to a lower dimensional space of the top [[PCA!.k]]
+ * principal components.
*/
@Experimental
class PCA (override val uid: String) extends Estimator[PCAModel] with PCAParams
@@ -106,7 +107,7 @@ object PCA extends DefaultParamsReadable[PCA] {
/**
* :: Experimental ::
- * Model fitted by [[PCA]].
+ * Model fitted by [[PCA]]. Transforms vectors to a lower dimensional space.
*
* @param pc A principal components Matrix. Each column is one principal component.
* @param explainedVariance A vector of proportions of variance explained by
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index c0feaa01fc..2916b6d9df 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -194,7 +194,9 @@ object RFormula extends DefaultParamsReadable[RFormula] {
/**
* :: Experimental ::
- * A fitted RFormula. Fitting is required to determine the factor levels of formula terms.
+ * Model fitted by [[RFormula]]. Fitting is required to determine the factor levels of
+ * formula terms.
+ *
* @param resolvedFormula the fitted R formula.
* @param pipelineModel the fitted feature model, including factor to index mappings.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 2bc9d225ac..d814528ec4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -240,7 +240,8 @@ object VectorIndexer extends DefaultParamsReadable[VectorIndexer] {
/**
* :: Experimental ::
- * Transform categorical features to use 0-based indices instead of their original values.
+ * Model fitted by [[VectorIndexer]]. Transform categorical features to use 0-based indices
+ * instead of their original values.
* - Categorical features are mapped to indices.
* - Continuous features (columns) are left unchanged.
* This also appends metadata to the output column, marking features as Numeric (continuous),
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 983b6a5301..497f2ad68e 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -352,7 +352,7 @@ class CountVectorizerModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by CountVectorizer.
+ Model fitted by :py:class:`CountVectorizer`.
.. versionadded:: 1.6.0
"""
@@ -609,7 +609,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
"""
minDocFreq = Param(Params._dummy(), "minDocFreq",
- "minimum of documents in which a term should appear for filtering",
+ "minimum number of documents in which a term should appear for filtering",
typeConverter=TypeConverters.toInt)
@keyword_only
@@ -655,7 +655,7 @@ class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by IDF.
+ Model fitted by :py:class:`IDF`.
.. versionadded:: 1.4.0
"""
@@ -1302,7 +1302,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)",
typeConverter=TypeConverters.toInt)
- gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
+ gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens " +
+ "(False)")
pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing",
typeConverter=TypeConverters.toString)
toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " +
@@ -1549,7 +1550,7 @@ class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by StandardScaler.
+ Model fitted by :py:class:`StandardScaler`.
.. versionadded:: 1.4.0
"""
@@ -1641,7 +1642,7 @@ class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by StringIndexer.
+ Model fitted by :py:class:`StringIndexer`.
.. versionadded:: 1.4.0
"""
@@ -1907,7 +1908,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
"""
.. note:: Experimental
- Class for indexing categorical feature columns in a dataset of [[Vector]].
+ Class for indexing categorical feature columns in a dataset of `Vector`.
This has 2 usage modes:
- Automatically identify categorical features (default behavior)
@@ -2023,7 +2024,17 @@ class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by VectorIndexer.
+ Model fitted by :py:class:`VectorIndexer`.
+
+ Transform categorical features to use 0-based indices instead of their original values.
+ - Categorical features are mapped to indices.
+ - Continuous features (columns) are left unchanged.
+
+ This also appends metadata to the output column, marking features as Numeric (continuous),
+ Nominal (categorical), or Binary (either continuous or categorical).
+ Non-ML metadata is not carried over from the input to the output column.
+
+ This maintains vector sparsity.
.. versionadded:: 1.4.0
"""
@@ -2296,7 +2307,7 @@ class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by Word2Vec.
+ Model fitted by :py:class:`Word2Vec`.
.. versionadded:: 1.4.0
"""
@@ -2327,7 +2338,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
"""
.. note:: Experimental
- PCA trains a model to project vectors to a low-dimensional space using PCA.
+ PCA trains a model to project vectors to a lower dimensional space of the
+ top :py:attr:`k` principal components.
>>> from pyspark.ml.linalg import Vectors
>>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
@@ -2401,7 +2413,7 @@ class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by PCA.
+ Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space.
.. versionadded:: 1.5.0
"""
@@ -2532,7 +2544,8 @@ class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by :py:class:`RFormula`.
+ Model fitted by :py:class:`RFormula`. Fitting is required to determine the
+ factor levels of formula terms.
.. versionadded:: 1.5.0
"""
@@ -2624,7 +2637,7 @@ class ChiSqSelectorModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
- Model fitted by ChiSqSelector.
+ Model fitted by :py:class:`ChiSqSelector`.
.. versionadded:: 2.0.0
"""