From 34777184cd8cab61e1dd25d0a4d5e738880a57b2 Mon Sep 17 00:00:00 2001
From: Yunni
Date: Sat, 3 Dec 2016 16:58:15 -0800
Subject: [SPARK-18081][ML][DOCS] Add user guide for Locality Sensitive Hashing(LSH)

## What changes were proposed in this pull request?
The user guide for LSH is added to ml-features.md, with several Scala/Java examples in spark-examples.

## How was this patch tested?
Doc has been generated through Jekyll, and checked through manual inspection.

Author: Yunni
Author: Yun Ni
Author: Joseph K. Bradley
Author: Yun Ni

Closes #15795 from Yunni/SPARK-18081-lsh-guide.
---
 .../ml/JavaBucketedRandomProjectionLSHExample.java | 98 ++++++++++++++++++++++
 .../spark/examples/ml/JavaMinHashLSHExample.java   | 70 ++++++++++++++++
 2 files changed, 168 insertions(+)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java

(limited to 'examples/src/main/java')

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java
new file mode 100644
index 0000000000..ca3ee5a285
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.feature.BucketedRandomProjectionLSH;
+import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaBucketedRandomProjectionLSHExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaBucketedRandomProjectionLSHExample")
+      .getOrCreate();
+
+    // $example on$
+    List<Row> dataA = Arrays.asList(
+      RowFactory.create(0, Vectors.dense(1.0, 1.0)),
+      RowFactory.create(1, Vectors.dense(1.0, -1.0)),
+      RowFactory.create(2, Vectors.dense(-1.0, -1.0)),
+      RowFactory.create(3, Vectors.dense(-1.0, 1.0))
+    );
+
+    List<Row> dataB = Arrays.asList(
+      RowFactory.create(4, Vectors.dense(1.0, 0.0)),
+      RowFactory.create(5, Vectors.dense(-1.0, 0.0)),
+      RowFactory.create(6, Vectors.dense(0.0, 1.0)),
+      RowFactory.create(7, Vectors.dense(0.0, -1.0))
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("keys", new VectorUDT(), false, Metadata.empty())
+    });
+    Dataset<Row> dfA = spark.createDataFrame(dataA, schema);
+    Dataset<Row> dfB = spark.createDataFrame(dataB, schema);
+
+    Vector key = Vectors.dense(1.0, 0.0);
+
+    BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH()
+      .setBucketLength(2.0)
+      .setNumHashTables(3)
+      .setInputCol("keys")
+      .setOutputCol("values");
+
+    BucketedRandomProjectionLSHModel model = mh.fit(dfA);
+
+    // Feature Transformation
+    model.transform(dfA).show();
+    // Cache the transformed columns
+    Dataset<Row> transformedA = model.transform(dfA).cache();
+    Dataset<Row> transformedB = model.transform(dfB).cache();
+
+    // Approximate similarity join
+    model.approxSimilarityJoin(dfA, dfB, 1.5).show();
+    model.approxSimilarityJoin(transformedA, transformedB, 1.5).show();
+    // Self Join
+    model.approxSimilarityJoin(dfA, dfA, 2.5).filter("datasetA.id < datasetB.id").show();
+
+    // Approximate nearest neighbor search
+    model.approxNearestNeighbors(dfA, key, 2).show();
+    model.approxNearestNeighbors(transformedA, key, 2).show();
+    // $example off$
+
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java
new file mode 100644
index 0000000000..9dbbf6d117
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.feature.MinHashLSH;
+import org.apache.spark.ml.feature.MinHashLSHModel;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaMinHashLSHExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaMinHashLSHExample")
+      .getOrCreate();
+
+    // $example on$
+    List<Row> data = Arrays.asList(
+      RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
+      RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
+      RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("keys", new VectorUDT(), false, Metadata.empty())
+    });
+    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
+    MinHashLSH mh = new MinHashLSH()
+      .setNumHashTables(1)
+      .setInputCol("keys")
+      .setOutputCol("values");
+
+    MinHashLSHModel model = mh.fit(dataFrame);
+    model.transform(dataFrame).show();
+    // $example off$
+
+    spark.stop();
+  }
+}
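For readers who want to take the MinHash example one step further by hand, the snippet below sketches an approximate similarity join on the same model. It is illustrative only and not part of the patch above: it assumes it is placed inside main() of JavaMinHashLSHExample after model and dataFrame are defined, that "import static org.apache.spark.sql.functions.col;" is added to the imports, that the join reports its Jaccard distance in the default output column "distCol", and that the 0.6 threshold is an arbitrary choice for this toy data.

    // Illustrative sketch only -- not part of the committed example above.
    // Self-join the MinHash dataset and keep pairs whose approximate Jaccard
    // distance is below 0.6 (arbitrary threshold for this toy data). With only
    // one hash table configured, some qualifying pairs may be missed.
    model.approxSimilarityJoin(dataFrame, dataFrame, 0.6)
      .filter("datasetA.id < datasetB.id")  // keep each unordered pair once
      .select(col("datasetA.id").alias("idA"),
        col("datasetB.id").alias("idB"),
        col("distCol").alias("jaccardDistance"))  // "distCol" is the assumed default name
      .show();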