aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-12-07 23:50:57 -0800
committerXiangrui Meng <meng@databricks.com>2015-12-07 23:50:57 -0800
commit4a39b5a1bee28cec792d509654f6236390cafdcb (patch)
tree1637657b13ee5294d74abf8f3f2f4c3f5bf9ba86
parent7d05a624510f7299b3dd07f87c203db1ff7caa3e (diff)
downloadspark-4a39b5a1bee28cec792d509654f6236390cafdcb.tar.gz
spark-4a39b5a1bee28cec792d509654f6236390cafdcb.tar.bz2
spark-4a39b5a1bee28cec792d509654f6236390cafdcb.zip
[SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code
Add ```SQLTransformer``` user guide, example code and make Scala API doc more clear. Author: Yanbo Liang <ybliang8@gmail.com> Closes #10006 from yanboliang/spark-11958.
-rw-r--r--docs/ml-features.md59
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java59
-rw-r--r--examples/src/main/python/ml/sql_transformer.py40
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala45
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala11
5 files changed, 212 insertions, 2 deletions
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 5105a948fe..f85e0d56d2 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -756,6 +756,65 @@ for more details on the API.
</div>
</div>
+## SQLTransformer
+
+`SQLTransformer` implements the transformations which are defined by a SQL statement.
+Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."`
+where `"__THIS__"` represents the underlying table of the input dataset.
+The select clause specifies the fields, constants, and expressions to display in
+the output; it can be any select clause that Spark SQL supports. Users can also
+use Spark SQL built-in functions and UDFs to operate on these selected columns.
+For example, `SQLTransformer` supports statements like:
+
+* `SELECT a, a + b AS a_b FROM __THIS__`
+* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ WHERE a > 5`
+* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
+
+**Examples**
+
+Assume that we have the following DataFrame with columns `id`, `v1` and `v2`:
+
+~~~~
+ id | v1 | v2
+----|-----|-----
+ 0 | 1.0 | 3.0
+ 2 | 2.0 | 5.0
+~~~~
+
+This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`:
+
+~~~~
+ id | v1 | v2 | v3 | v4
+----|-----|-----|-----|-----
+ 0 | 1.0 | 3.0 | 4.0 | 3.0
+ 2 | 2.0 | 5.0 | 7.0 |10.0
+~~~~
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [SQLTransformer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+Refer to the [SQLTransformer Java docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+Refer to the [SQLTransformer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more details on the API.
+
+{% include_example python/ml/sql_transformer.py %}
+</div>
+</div>
+
## VectorAssembler
`VectorAssembler` is a transformer that combines a given list of columns into a single vector
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
new file mode 100644
index 0000000000..d55c70796a
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.SQLTransformer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaSQLTransformerExample {
+  /**
+   * Builds a two-row DataFrame with columns id, v1 and v2, runs it through a
+   * SQLTransformer whose statement appends v3 = v1 + v2 and v4 = v1 * v2,
+   * prints the transformed DataFrame, and then stops the Spark context.
+   *
+   * @param args command-line arguments (unused)
+   */
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaSQLTransformerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0, 1.0, 3.0),
+      RowFactory.create(2, 2.0, 5.0)
+    ));
+    StructType schema = new StructType(new StructField [] {
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
+    });
+    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+    // "__THIS__" is the placeholder for the table backing the input DataFrame.
+    SQLTransformer sqlTrans = new SQLTransformer().setStatement(
+      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");
+
+    sqlTrans.transform(df).show();
+    // $example off$
+
+    // Shut down the Spark context once the example has finished.
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py
new file mode 100644
index 0000000000..9575d728d8
--- /dev/null
+++ b/examples/src/main/python/ml/sql_transformer.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.ml.feature import SQLTransformer
+# $example off$
+from pyspark.sql import SQLContext
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="SQLTransformerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext.createDataFrame([
+ (0, 1.0, 3.0),
+ (2, 2.0, 5.0)
+ ], ["id", "v1", "v2"])
+ sqlTrans = SQLTransformer(
+ statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
+ sqlTrans.transform(df).show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
new file mode 100644
index 0000000000..014abd1fdb
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.SQLTransformer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+
+object SQLTransformerExample {
+  /**
+   * Builds a two-row DataFrame with columns id, v1 and v2, runs it through a
+   * [[SQLTransformer]] whose statement appends v3 = v1 + v2 and v4 = v1 * v2,
+   * prints the transformed DataFrame, and then stops the Spark context.
+   *
+   * @param args command-line arguments (unused)
+   */
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("SQLTransformerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val df = sqlContext.createDataFrame(
+      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
+
+    // "__THIS__" is the placeholder for the table backing the input DataFrame.
+    val sqlTrans = new SQLTransformer().setStatement(
+      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
+
+    sqlTrans.transform(df).show()
+    // $example off$
+
+    // Shut down the Spark context once the example has finished.
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
index 3a735017ba..c09f4d076c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -27,9 +27,16 @@ import org.apache.spark.sql.types.StructType
/**
* :: Experimental ::
- * Implements the transforms which are defined by SQL statement.
- * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
+ * Implements the transformations which are defined by a SQL statement.
+ * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
* where '__THIS__' represents the underlying table of the input dataset.
+ * The select clause specifies the fields, constants, and expressions to display in
+ * the output; it can be any select clause that Spark SQL supports. Users can also
+ * use Spark SQL built-in functions and UDFs to operate on these selected columns.
+ * For example, [[SQLTransformer]] supports statements like:
+ * - SELECT a, a + b AS a_b FROM __THIS__
+ * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ WHERE a > 5
+ * - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
*/
@Experimental
@Since("1.6.0")