aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
author: Reza Zadeh <reza@databricks.com> 2015-10-26 22:00:24 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-10-26 22:00:24 -0700
commit: 8b292b19c9b3aaaa51b919a12132e099e5be832d (patch)
tree: 21eb6092735c74f891051793236781b28a6f7fa9 /mllib
parent: 3cac6614a4fe60b1446bf704d0a35787d385fb86 (diff)
download: spark-8b292b19c9b3aaaa51b919a12132e099e5be832d.tar.gz
spark-8b292b19c9b3aaaa51b919a12132e099e5be832d.tar.bz2
spark-8b292b19c9b3aaaa51b919a12132e099e5be832d.zip
[SPARK-10654][MLLIB] Add columnSimilarities to IndexedRowMatrix
Add columnSimilarities to IndexedRowMatrix by delegating to functionality already in RowMatrix.

With a test.

Author: Reza Zadeh <reza@databricks.com>

Closes #8792 from rezazadeh/colsims.
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala | 13
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala | 12
2 files changed, 25 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
index e6af0c0ec7..976299124c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
@@ -68,6 +68,19 @@ class IndexedRowMatrix @Since("1.0.0") (
nRows
}
+
+ /**
+  * Brute-force cosine similarity between every pair of columns of this matrix, computed
+  * as normalized dot products on the row matrix obtained by dropping the row indices.
+  *
+  * @return An n x n sparse upper-triangular matrix of column cosine similarities.
+  */
+ @Since("1.6.0")
+ def columnSimilarities(): CoordinateMatrix = {
+   val withoutIndices = toRowMatrix()
+   withoutIndices.columnSimilarities()
+ }
+
/**
* Drops row indices and converts this matrix to a
* [[org.apache.spark.mllib.linalg.distributed.RowMatrix]].
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
index 0ecb7a221a..6de6cf2fa8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
@@ -153,6 +153,18 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
}
}
+ test("similar columns") {
+   // Reference values come from the Gramian: cos(i, j) = G_ij / sqrt(G_ii * G_jj).
+   val mat = new IndexedRowMatrix(indexedRows)
+   val gramian = mat.computeGramianMatrix().toBreeze.toDenseMatrix
+   val similarities = mat.columnSimilarities().toBreeze()
+   for (i <- 0 until n; j <- i + 1 until n) {
+     val normProduct = math.sqrt(gramian(i, i) * gramian(j, j))
+     val expected = gramian(i, j) / normProduct
+     assert(math.abs(similarities(i, j) - expected) < 1e-6)
+   }
+ }
+
def closeToZero(G: BDM[Double]): Boolean = {
  G.valuesIterator.foldLeft(0.0)((acc, v) => acc + math.abs(v)) < 1e-6 // sum of |entries|
}