aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorchie8842 <hayashidac@nttdata.co.jp>2016-11-08 13:45:37 +0000
committerSean Owen <sowen@cloudera.com>2016-11-08 13:45:37 +0000
commitee2e741ac16b01d9cae0eadd35af774547bbd415 (patch)
tree792d6d1460e93d7ab1e991d5df355df7c43c6819
parentc291bd2745a8a2e4ba91d8697879eb8da10287e2 (diff)
downloadspark-ee2e741ac16b01d9cae0eadd35af774547bbd415.tar.gz
spark-ee2e741ac16b01d9cae0eadd35af774547bbd415.tar.bz2
spark-ee2e741ac16b01d9cae0eadd35af774547bbd415.zip
[SPARK-13770][DOCUMENTATION][ML] Document the ML feature Interaction
I created Scala and Java example and added documentation. Author: chie8842 <hayashidac@nttdata.co.jp> Closes #15658 from hayashidac/SPARK-13770.
-rw-r--r--docs/ml-features.md52
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java88
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/InteractionExample.scala68
3 files changed, 208 insertions, 0 deletions
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 352887d3ba..903177210d 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -729,6 +729,58 @@ for more details on the API.
</div>
</div>
+## Interaction
+
+`Interaction` is a `Transformer` which takes vector or double-valued columns, and generates a single vector column that contains the product of all combinations of one value from each input column.
+
+For example, if you have 2 vector type columns each of which has 3 dimensions as input columns, then you'll get a 9-dimensional vector as the output column.
+
+**Examples**
+
+Assume that we have the following DataFrame with the columns "id1", "vec1", and "vec2":
+
+~~~~
+ id1|vec1 |vec2
+ ---|--------------|--------------
+ 1 |[1.0,2.0,3.0] |[8.0,4.0,5.0]
+ 2 |[4.0,3.0,8.0] |[7.0,9.0,8.0]
+ 3 |[6.0,1.0,9.0] |[2.0,3.0,6.0]
+ 4 |[10.0,8.0,6.0]|[9.0,4.0,5.0]
+ 5 |[9.0,2.0,7.0] |[10.0,7.0,3.0]
+ 6 |[1.0,1.0,4.0] |[2.0,8.0,4.0]
+~~~~
+
+Applying `Interaction` with those input columns,
+the output column `interactedCol` contains:
+
+~~~~
+ id1|vec1 |vec2 |interactedCol
+ ---|--------------|--------------|------------------------------------------------------
+ 1 |[1.0,2.0,3.0] |[8.0,4.0,5.0] |[8.0,4.0,5.0,16.0,8.0,10.0,24.0,12.0,15.0]
+ 2 |[4.0,3.0,8.0] |[7.0,9.0,8.0] |[56.0,72.0,64.0,42.0,54.0,48.0,112.0,144.0,128.0]
+ 3 |[6.0,1.0,9.0] |[2.0,3.0,6.0] |[36.0,54.0,108.0,6.0,9.0,18.0,54.0,81.0,162.0]
+ 4 |[10.0,8.0,6.0]|[9.0,4.0,5.0] |[360.0,160.0,200.0,288.0,128.0,160.0,216.0,96.0,120.0]
+ 5 |[9.0,2.0,7.0] |[10.0,7.0,3.0]|[450.0,315.0,135.0,100.0,70.0,30.0,350.0,245.0,105.0]
+ 6 |[1.0,1.0,4.0] |[2.0,8.0,4.0] |[12.0,48.0,24.0,12.0,48.0,24.0,48.0,192.0,96.0]
+~~~~
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [Interaction Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Interaction)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/InteractionExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+Refer to the [Interaction Java docs](api/java/org/apache/spark/ml/feature/Interaction.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaInteractionExample.java %}
+</div>
+</div>
## Normalizer
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java
new file mode 100644
index 0000000000..4213c05703
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.ml.feature.Interaction;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import java.util.Arrays;
+import java.util.List;
+
+// $example on$
+// $example off$
+
+/**
+ * Demonstrates the Interaction transformer on an integer column and two 3-dimensional
+ * vector columns assembled with VectorAssembler.
+ */
+public class JavaInteractionExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession.builder().appName("JavaInteractionExample").getOrCreate();
+
+    // $example on$
+    // Each row: "id1" followed by the three components of vec1 and the three of vec2.
+    List<Row> rows = Arrays.asList(
+      RowFactory.create(1, 1, 2, 3, 8, 4, 5),
+      RowFactory.create(2, 4, 3, 8, 7, 9, 8),
+      RowFactory.create(3, 6, 1, 9, 2, 3, 6),
+      RowFactory.create(4, 10, 8, 6, 9, 4, 5),
+      RowFactory.create(5, 9, 2, 7, 10, 7, 3),
+      RowFactory.create(6, 1, 1, 4, 2, 8, 4)
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id7", DataTypes.IntegerType, false, Metadata.empty())
+    });
+    Dataset<Row> df = spark.createDataFrame(rows, schema);
+
+    // Pack "id2".."id4" into vec1, then "id5".."id7" into vec2, keeping only the needed columns.
+    Dataset<Row> withVec1 = new VectorAssembler()
+      .setInputCols(new String[]{"id2", "id3", "id4"})
+      .setOutputCol("vec1")
+      .transform(df);
+    Dataset<Row> withVectors = new VectorAssembler()
+      .setInputCols(new String[]{"id5", "id6", "id7"})
+      .setOutputCol("vec2")
+      .transform(withVec1)
+      .select("id1", "vec1", "vec2");
+
+    // interactedCol holds the products of one value taken from each of id1, vec1 and vec2.
+    Dataset<Row> interacted = new Interaction()
+      .setInputCols(new String[]{"id1", "vec1", "vec2"})
+      .setOutputCol("interactedCol")
+      .transform(withVectors);
+
+    interacted.show(false);
+    // $example off$
+
+    spark.stop();
+  }
+}
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/InteractionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/InteractionExample.scala
new file mode 100644
index 0000000000..8113c992b1
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/InteractionExample.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Interaction
+import org.apache.spark.ml.feature.VectorAssembler
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/** Demonstrates Interaction on an integer column and two assembled 3-dimensional vectors. */
+object InteractionExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder.appName("InteractionExample").getOrCreate()
+
+    // $example on$
+    // Each row: "id1" followed by the three components of vec1 and the three of vec2.
+    val rows = Seq(
+      (1, 1, 2, 3, 8, 4, 5),
+      (2, 4, 3, 8, 7, 9, 8),
+      (3, 6, 1, 9, 2, 3, 6),
+      (4, 10, 8, 6, 9, 4, 5),
+      (5, 9, 2, 7, 10, 7, 3),
+      (6, 1, 1, 4, 2, 8, 4)
+    )
+
+    val df = spark.createDataFrame(rows).toDF("id1", "id2", "id3", "id4", "id5", "id6", "id7")
+
+    // Pack "id2".."id4" into vec1, then "id5".."id7" into vec2, keeping only the needed columns.
+    val withVec1 = new VectorAssembler()
+      .setInputCols(Array("id2", "id3", "id4"))
+      .setOutputCol("vec1")
+      .transform(df)
+    val withVectors = new VectorAssembler()
+      .setInputCols(Array("id5", "id6", "id7"))
+      .setOutputCol("vec2")
+      .transform(withVec1)
+      .select("id1", "vec1", "vec2")
+
+    // interactedCol holds the products of one value taken from each of id1, vec1 and vec2.
+    val interacted = new Interaction()
+      .setInputCols(Array("id1", "vec1", "vec2"))
+      .setOutputCol("interactedCol")
+      .transform(withVectors)
+
+    interacted.show(truncate = false)
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println