aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main
diff options
context:
space:
mode:
author Reza Zadeh <rizlar@gmail.com> 2014-10-07 16:40:16 -0700
committer Xiangrui Meng <meng@databricks.com> 2014-10-07 16:40:16 -0700
commit 3d7b36e0de26049e8b36b6705d8ff4224bde9eb1 (patch)
tree d1bbe86cfbb9af7e39e42780a11146ebc017be8d /examples/src/main
parent 446063eca98ae56d1ac61415f4c6e89699b8db02 (diff)
download spark-3d7b36e0de26049e8b36b6705d8ff4224bde9eb1.tar.gz
spark-3d7b36e0de26049e8b36b6705d8ff4224bde9eb1.tar.bz2
spark-3d7b36e0de26049e8b36b6705d8ff4224bde9eb1.zip
[SPARK-3790][MLlib] CosineSimilarity Example
Provide example for `RowMatrix.columnSimilarity()` Author: Reza Zadeh <rizlar@gmail.com> Closes #2622 from rezazadeh/dimsumexample and squashes the following commits: 8f20b82 [Reza Zadeh] update comment 379066d [Reza Zadeh] cache rows 792b81c [Reza Zadeh] Address review comments e573c7a [Reza Zadeh] Average absolute error b15685f [Reza Zadeh] Use scopt. Distribute evaluation. eca3dfd [Reza Zadeh] Documentation ac96fb2 [Reza Zadeh] Compute approximation error, add command line. 4533579 [Reza Zadeh] CosineSimilarity Example
Diffstat (limited to 'examples/src/main')
-rw-r--r-- examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala 107
1 files changed, 107 insertions, 0 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
new file mode 100644
index 0000000000..6a3b0241ce
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+import scopt.OptionParser
+
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}
+import org.apache.spark.{SparkConf, SparkContext}
+
+/**
+ * Compute the similar columns of a matrix, using cosine similarity.
+ *
+ * The input matrix must be stored in row-oriented dense format, one line per row with its entries
+ * separated by whitespace. For example,
+ * {{{
+ * 0.5 1.0
+ * 2.0 3.0
+ * 4.0 5.0
+ * }}}
+ * represents a 3-by-2 matrix, whose first row is (0.5, 1.0).
+ *
+ * Example invocation:
+ *
+ * bin/run-example mllib.CosineSimilarity \
+ * --threshold 0.1 data/mllib/sample_svm_data.txt
+ */
+object CosineSimilarity {
+  /** Command-line settings: input path and the DIMSUM similarity threshold. */
+  case class Params(inputFile: String = null, threshold: Double = 0.1)
+
+  /** Parses `args` and runs the example; exits with status 1 when parsing fails. */
+  def main(args: Array[String]): Unit = {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("CosineSimilarity") {
+      head("CosineSimilarity: an example app.")
+      opt[Double]("threshold")
+        .required()
+        .text("threshold similarity: to tradeoff computation vs quality estimate")
+        .action((x, c) => c.copy(threshold = x))
+      arg[String]("<inputFile>")
+        .required()
+        .text("input file, one row per line, space-separated")
+        .action((x, c) => c.copy(inputFile = x))
+      note(
+        """
+          |For example, the following command runs this app on a dataset:
+          |
+          | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \
+          | examplesjar.jar \
+          | --threshold 0.1 data/mllib/sample_svm_data.txt
+        """.stripMargin)
+    }
+
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case None => sys.exit(1)
+    }
+  }
+
+  /** Prints the mean absolute error of the DIMSUM estimate against the exact similarities. */
+  def run(params: Params): Unit = {
+    val conf = new SparkConf().setAppName("CosineSimilarity")
+    val sc = new SparkContext(conf)
+    try {
+      // Split on whitespace runs so padded or tab-separated rows parse too.
+      val rows = sc.textFile(params.inputFile).map { line =>
+        Vectors.dense(line.trim.split("\\s+").map(_.toDouble))
+      }.cache()
+      val mat = new RowMatrix(rows)
+
+      // Compute similar columns perfectly, with brute force.
+      val exact = mat.columnSimilarities()
+      // Compute similar columns with estimation using DIMSUM
+      val approx = mat.columnSimilarities(params.threshold)
+
+      val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) }
+      val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) }
+      // Pairs missing from the estimate count as an estimate of zero.
+      val meanAbsoluteError = exactEntries.leftOuterJoin(approxEntries).values.map {
+        case (u, approxOpt) => math.abs(u - approxOpt.getOrElse(0.0))
+      }.mean()
+
+      println(s"Average absolute error in estimate is: $meanAbsoluteError")
+    } finally {
+      // Release the context even when loading or the computation fails.
+      sc.stop()
+    }
+  }
+}