aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-08-31 16:11:27 -0700
committerXiangrui Meng <meng@databricks.com>2015-08-31 16:11:27 -0700
commit52ea399e6ee37b7c44aae7709863e006fca88906 (patch)
tree0f0ee9374a05f019c69da8b762f5d44ca4777617 /python
parentfe16fd0b8b717f01151bc659ec3299dab091c97a (diff)
downloadspark-52ea399e6ee37b7c44aae7709863e006fca88906.tar.gz
spark-52ea399e6ee37b7c44aae7709863e006fca88906.tar.bz2
spark-52ea399e6ee37b7c44aae7709863e006fca88906.zip
[SPARK-10355] [ML] [PySpark] Add Python API for SQLTransformer
Add Python API for SQLTransformer. Author: Yanbo Liang <ybliang8@gmail.com>. Closes #8527 from yanboliang/spark-10355.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/ml/feature.py57
1 file changed, 54 insertions, 3 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 59300a6078..0626281e20 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -28,9 +28,9 @@ from pyspark.mllib.linalg import _convert_to_vector
__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
- 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
- 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
- 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+ 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer',
+ 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec',
+ 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
@inherit_doc
@@ -744,6 +744,57 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
@inherit_doc
class SQLTransformer(JavaTransformer):
    """
    Implements the transformations which are defined by a SQL statement.
    Currently we only support SQL syntax like ``SELECT ... FROM __THIS__``,
    where ``__THIS__`` represents the underlying table of the input dataset.

    >>> df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
    >>> sqlTrans = SQLTransformer(
    ...     statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    >>> sqlTrans.transform(df).head()
    Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)
    """

    # a placeholder to make it appear in the generated doc
    statement = Param(Params._dummy(), "statement", "SQL statement")

    @keyword_only
    def __init__(self, statement=None):
        """
        __init__(self, statement=None)
        """
        super(SQLTransformer, self).__init__()
        # Bind to the JVM-side transformer; all param values are pushed
        # through to this Java object when transform() is called.
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
        self.statement = Param(self, "statement", "SQL statement")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, statement=None):
        """
        setParams(self, statement=None)
        Sets params for this SQLTransformer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setStatement(self, value):
        """
        Sets the value of :py:attr:`statement`.
        """
        # Use the Params._set API (as setParams does) instead of mutating
        # self._paramMap directly, so param bookkeeping stays consistent.
        return self._set(statement=value)

    def getStatement(self):
        """
        Gets the value of statement or its default value.
        """
        return self.getOrDefault(self.statement)
+
+
+@inherit_doc
class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
"""
Standardizes features by removing the mean and scaling to unit variance using column summary