diff options
author | Yanbo Liang <ybliang8@gmail.com> | 2015-09-10 20:43:38 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-09-10 20:43:38 -0700 |
commit | a140dd77c62255d6f7f6817a2517d47feb8540d4 (patch) | |
tree | 2fb9533b8e36ab564baeb1ec0aa3e368906e9fa0 /python/pyspark/ml/feature.py | |
parent | 339a527141984bfb182862b0987d3c4690c9ede1 (diff) | |
download | spark-a140dd77c62255d6f7f6817a2517d47feb8540d4.tar.gz spark-a140dd77c62255d6f7f6817a2517d47feb8540d4.tar.bz2 spark-a140dd77c62255d6f7f6817a2517d47feb8540d4.zip |
[SPARK-10027] [ML] [PySpark] Add Python API missing methods for ml.feature
Missing method of ml.feature are listed here:
```StringIndexer``` lacks of parameter ```handleInvalid```.
```StringIndexerModel``` lacks of method ```labels```.
```VectorIndexerModel``` lacks of methods ```numFeatures``` and ```categoryMaps```.
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #8313 from yanboliang/spark-10027.
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rw-r--r-- | python/pyspark/ml/feature.py | 31 |
1 files changed, 26 insertions, 5 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 1c423486be..71dc636b83 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -920,7 +920,7 @@ class StandardScalerModel(JavaModel): @inherit_doc -class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): +class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid): """ .. note:: Experimental @@ -943,19 +943,20 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): """ @keyword_only - def __init__(self, inputCol=None, outputCol=None): + def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"): """ - __init__(self, inputCol=None, outputCol=None) + __init__(self, inputCol=None, outputCol=None, handleInvalid="error") """ super(StringIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid) + self._setDefault(handleInvalid="error") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only - def setParams(self, inputCol=None, outputCol=None): + def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"): """ - setParams(self, inputCol=None, outputCol=None) + setParams(self, inputCol=None, outputCol=None, handleInvalid="error") Sets params for this StringIndexer. """ kwargs = self.setParams._input_kwargs @@ -1235,6 +1236,10 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> model = indexer.fit(df) >>> model.transform(df).head().indexed DenseVector([1.0, 0.0]) + >>> model.numFeatures + 2 + >>> model.categoryMaps + {0: {0.0: 0, -1.0: 1}} >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test DenseVector([0.0, 1.0]) >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"} @@ -1297,6 +1302,22 @@ class VectorIndexerModel(JavaModel): Model fitted by VectorIndexer. """ + @property + def numFeatures(self): + """ + Number of features, i.e., length of Vectors which this transforms. + """ + return self._call_java("numFeatures") + + @property + def categoryMaps(self): + """ + Feature value index. Keys are categorical feature indices (column indices). + Values are maps from original features values to 0-based category indices. + If a feature is not in this map, it is treated as continuous. + """ + return self._call_java("javaCategoryMaps") + @inherit_doc class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): |