diff options
author | Yunni <Euler57721@gmail.com> | 2016-12-03 16:58:15 -0800 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2016-12-03 16:58:15 -0800 |
commit | 34777184cd8cab61e1dd25d0a4d5e738880a57b2 (patch) | |
tree | 5e67658f2c8cc889ba02978963f4c4993f44bd88 /examples/src/main/java/org/apache | |
parent | 4a3c09601ba69f7d49d1946bb6f20f5cfe453031 (diff) | |
download | spark-34777184cd8cab61e1dd25d0a4d5e738880a57b2.tar.gz spark-34777184cd8cab61e1dd25d0a4d5e738880a57b2.tar.bz2 spark-34777184cd8cab61e1dd25d0a4d5e738880a57b2.zip |
[SPARK-18081][ML][DOCS] Add user guide for Locality Sensitive Hashing (LSH)
## What changes were proposed in this pull request?
A user guide for LSH is added to ml-features.md, along with several Scala/Java examples in spark-examples.
## How was this patch tested?
The docs were generated with Jekyll and checked by manual inspection.
Author: Yunni <Euler57721@gmail.com>
Author: Yun Ni <yunn@uber.com>
Author: Joseph K. Bradley <joseph@databricks.com>
Author: Yun Ni <Euler57721@gmail.com>
Closes #15795 from Yunni/SPARK-18081-lsh-guide.
Diffstat (limited to 'examples/src/main/java/org/apache')
-rw-r--r-- | examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java | 98 | ||||
-rw-r--r-- | examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java | 70 |
2 files changed, 168 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java new file mode 100644 index 0000000000..ca3ee5a285 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.sql.SparkSession; + +// $example on$ +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.ml.feature.BucketedRandomProjectionLSH; +import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel; +import org.apache.spark.ml.linalg.Vector; +import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.ml.linalg.VectorUDT; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off$ + +public class JavaBucketedRandomProjectionLSHExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaBucketedRandomProjectionLSHExample") + .getOrCreate(); + + // $example on$ + List<Row> dataA = Arrays.asList( + RowFactory.create(0, Vectors.dense(1.0, 1.0)), + RowFactory.create(1, Vectors.dense(1.0, -1.0)), + RowFactory.create(2, Vectors.dense(-1.0, -1.0)), + RowFactory.create(3, Vectors.dense(-1.0, 1.0)) + ); + + List<Row> dataB = Arrays.asList( + RowFactory.create(4, Vectors.dense(1.0, 0.0)), + RowFactory.create(5, Vectors.dense(-1.0, 0.0)), + RowFactory.create(6, Vectors.dense(0.0, 1.0)), + RowFactory.create(7, Vectors.dense(0.0, -1.0)) + ); + + StructType schema = new StructType(new StructField[]{ + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("keys", new VectorUDT(), false, Metadata.empty()) + }); + Dataset<Row> dfA = spark.createDataFrame(dataA, schema); + Dataset<Row> dfB = spark.createDataFrame(dataB, schema); + + Vector key = Vectors.dense(1.0, 0.0); + + BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH() + .setBucketLength(2.0) + .setNumHashTables(3) + .setInputCol("keys") + .setOutputCol("values"); + 
+ BucketedRandomProjectionLSHModel model = mh.fit(dfA); + + // Feature Transformation + model.transform(dfA).show(); + // Cache the transformed columns + Dataset<Row> transformedA = model.transform(dfA).cache(); + Dataset<Row> transformedB = model.transform(dfB).cache(); + + // Approximate similarity join + model.approxSimilarityJoin(dfA, dfB, 1.5).show(); + model.approxSimilarityJoin(transformedA, transformedB, 1.5).show(); + // Self Join + model.approxSimilarityJoin(dfA, dfA, 2.5).filter("datasetA.id < datasetB.id").show(); + + // Approximate nearest neighbor search + model.approxNearestNeighbors(dfA, key, 2).show(); + model.approxNearestNeighbors(transformedA, key, 2).show(); + // $example off$ + + spark.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java new file mode 100644 index 0000000000..9dbbf6d117 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.sql.SparkSession; + +// $example on$ +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.ml.feature.MinHashLSH; +import org.apache.spark.ml.feature.MinHashLSHModel; +import org.apache.spark.ml.linalg.VectorUDT; +import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off$ + +public class JavaMinHashLSHExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaMinHashLSHExample") + .getOrCreate(); + + // $example on$ + List<Row> data = Arrays.asList( + RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0})) + ); + + StructType schema = new StructType(new StructField[]{ + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("keys", new VectorUDT(), false, Metadata.empty()) + }); + Dataset<Row> dataFrame = spark.createDataFrame(data, schema); + + MinHashLSH mh = new MinHashLSH() + .setNumHashTables(1) + .setInputCol("keys") + .setOutputCol("values"); + + MinHashLSHModel model = mh.fit(dataFrame); + model.transform(dataFrame).show(); + // $example off$ + + spark.stop(); + } +} |