author     Yanbo Liang <ybliang8@gmail.com>       2015-08-31 15:50:41 -0700
committer  Xiangrui Meng <meng@databricks.com>    2015-08-31 15:50:41 -0700
commit     5b3245d6dff65972fc39c73f90d5cbdf84d19129 (patch)
tree       bd7f4c94c9fe954ab6047671fe1723a17542cf2f /python
parent     23e39cc7b1bb7f1087c4706234c9b5165a571357 (diff)
[SPARK-8472] [ML] [PySpark] Python API for DCT
Add Python API for ml.feature.DCT.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8485 from yanboliang/spark-8472.
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/ml/feature.py  65
1 file changed, 64 insertions, 1 deletion
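As a side note on the doctest values in the new class below: the transformer computes the orthonormal ("unitary") DCT-II, so the numbers 10.969..., -0.707..., -2.041... can be reproduced outside Spark. A minimal sketch, assuming NumPy and SciPy are available (neither is used by this patch; they appear here only to illustrate the scaling the docstring describes):

# Illustrative sketch only -- not part of this patch.
# Reproduce the scaled (orthonormal) DCT-II values from the DCT doctest below
# with SciPy, and show that the orthonormal DCT-III inverts the transform.
import numpy as np
from scipy.fftpack import dct

vec = np.array([5.0, 8.0, 6.0])

# norm='ortho' applies the unitary scaling ("scaled DCT-II") described
# in the class docstring.
forward = dct(vec, type=2, norm='ortho')
print(forward)    # ~ [10.9697, -0.7071, -2.0412], matching the doctest

# The orthonormal DCT-III is the exact inverse of the orthonormal DCT-II,
# which corresponds to what DCT(inverse=True) returns in the doctest.
recovered = dct(forward, type=3, norm='ortho')
print(recovered)  # ~ [5.0, 8.0, 6.0]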
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 04b2b2ccc9..59300a6078 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,7 +26,7 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
 from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
-__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
+__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
            'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
            'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
            'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
@@ -167,6 +167,69 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
@inherit_doc
+class DCT(JavaTransformer, HasInputCol, HasOutputCol):
+    """
+    A feature transformer that takes the 1D discrete cosine transform
+    of a real vector. No zero padding is performed on the input vector.
+    It returns a real vector of the same length representing the DCT.
+    The return vector is scaled such that the transform matrix is
+    unitary (aka scaled DCT-II).
+
+    More information can be found on
+    `Wikipedia <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
+    >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
+    >>> df2 = dct.transform(df1)
+    >>> df2.head().resultVec
+    DenseVector([10.969..., -0.707..., -2.041...])
+    >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2)
+    >>> df3.head().origVec
+    DenseVector([5.0, 8.0, 6.0])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
+                    "default False.")
+
+    @keyword_only
+    def __init__(self, inverse=False, inputCol=None, outputCol=None):
+        """
+        __init__(self, inverse=False, inputCol=None, outputCol=None)
+        """
+        super(DCT, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
+        self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " +
+                             "default False.")
+        self._setDefault(inverse=False)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, inverse=False, inputCol=None, outputCol=None):
+        """
+        setParams(self, inverse=False, inputCol=None, outputCol=None)
+        Sets params for this DCT.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setInverse(self, value):
+        """
+        Sets the value of :py:attr:`inverse`.
+        """
+        self._paramMap[self.inverse] = value
+        return self
+
+    def getInverse(self):
+        """
+        Gets the value of inverse or its default value.
+        """
+        return self.getOrDefault(self.inverse)
+
+
+@inherit_doc
 class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
     """
     Outputs the Hadamard product (i.e., the element-wise product) of each input vector