aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author: Yanbo Liang <ybliang8@gmail.com> 2015-09-09 18:02:33 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-09-09 18:02:33 -0700
commit: 56a0fe5c6e4ae2929c48fae2d6225558d020e5f9 (patch)
tree: e02635afeb23bcee675e2d3273a9b34ae20b2567 /python
parent: 45de518742446ddfbd4816c9d0f8501139f9bc2d (diff)
download: spark-56a0fe5c6e4ae2929c48fae2d6225558d020e5f9.tar.gz
spark-56a0fe5c6e4ae2929c48fae2d6225558d020e5f9.tar.bz2
spark-56a0fe5c6e4ae2929c48fae2d6225558d020e5f9.zip
[SPARK-9772] [PYSPARK] [ML] Add Python API for ml.feature.VectorSlicer
Add Python API for ml.feature.VectorSlicer. Author: Yanbo Liang <ybliang8@gmail.com> Closes #8102 from yanboliang/SPARK-9772.
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/ml/feature.py 95
1 files changed, 90 insertions, 5 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8c26cfbd5a..1c423486be 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -27,11 +27,11 @@ from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector
__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
- 'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion',
- 'RegexTokenizer', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
- 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
- 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel',
- 'StopWordsRemover']
+ 'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
+ 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
+ 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
+ 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
+ 'Word2Vec', 'Word2VecModel']
@inherit_doc
@@ -1299,6 +1299,91 @@ class VectorIndexerModel(JavaModel):
@inherit_doc
+class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
+ """
+ .. note:: Experimental
+
+ This class takes a feature vector and outputs a new feature vector with a subarray
+ of the original features.
+
+ The subset of features can be specified with either indices (`setIndices()`)
+ or names (`setNames()`). At least one feature must be selected. Duplicate features
+ are not allowed, so there can be no overlap between selected indices and names.
+
+ The output vector will order features with the selected indices first (in the order given),
+ followed by the selected names (in the order given).
+
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> df = sqlContext.createDataFrame([
+ ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
+ ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
+ ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
+ >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
+ >>> vs.transform(df).head().sliced
+ DenseVector([2.3, 1.0])
+ """
+
+ # class-level placeholders for the generated doc; __init__ rebinds both to the instance
+ indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
+ "a vector column. There can be no overlap with names.")
+ names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
+ "a vector column. These names must be specified by ML " +
+ "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
+ "indices.")
+
+ @keyword_only
+ def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
+ """
+ __init__(self, inputCol=None, outputCol=None, indices=None, names=None)
+ """
+ super(VectorSlicer, self).__init__()
+ self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
+ self.indices = Param(self, "indices", "An array of indices to select features from " +
+ "a vector column. There can be no overlap with names.")
+ self.names = Param(self, "names", "An array of feature names to select features from " +
+ "a vector column. These names must be specified by ML " +
+ "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
+ "with indices.")
+ kwargs = self.__init__._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
+ """
+ setParams(self, inputCol=None, outputCol=None, indices=None, names=None)
+ Sets params for this VectorSlicer.
+ """
+ kwargs = self.setParams._input_kwargs
+ return self._set(**kwargs)
+
+ def setIndices(self, value):
+ """
+ Sets the value of :py:attr:`indices`.
+ """
+ self._paramMap[self.indices] = value
+ return self
+
+ def getIndices(self):
+ """
+ Gets the value of :py:attr:`indices` or its default value.
+ """
+ return self.getOrDefault(self.indices)
+
+ def setNames(self, value):
+ """
+ Sets the value of :py:attr:`names`.
+ """
+ self._paramMap[self.names] = value
+ return self
+
+ def getNames(self):
+ """
+ Gets the value of :py:attr:`names` or its default value.
+ """
+ return self.getOrDefault(self.names)
+
+
+@inherit_doc
@ignore_unicode_prefix
class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol):
"""