aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2015-04-28 21:49:53 -0700
committerXiangrui Meng <meng@databricks.com>2015-04-28 21:49:53 -0700
commit5ef006fc4d010905e02cb905c9115b95ba55282b (patch)
treeff6245ceb5226d8b4801b429aa45f987a76570d4 /mllib
parenta8aeadb7d4a2dc308a75a50fdd8065f9a32ef336 (diff)
downloadspark-5ef006fc4d010905e02cb905c9115b95ba55282b.tar.gz
spark-5ef006fc4d010905e02cb905c9115b95ba55282b.tar.bz2
spark-5ef006fc4d010905e02cb905c9115b95ba55282b.zip
[SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
Add `compressed` to `Vector` with some other methods: `numActives`, `numNonzeros`, `toSparse`, and `toDense`.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #5756 from mengxr/SPARK-6756 and squashes the following commits:

8d4ecbd [Xiangrui Meng] address comment and add mima excludes
da54179 [Xiangrui Meng] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala93
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala44
2 files changed, 137 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 34833e90d4..188d1e542b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -116,6 +116,40 @@ sealed trait Vector extends Serializable {
* with type `Double`.
*/
private[spark] def foreachActive(f: (Int, Double) => Unit)
+
+ /**
+ * Number of active entries. An "active entry" is an element which is explicitly stored,
+ * regardless of its value. Note that inactive entries have value 0.
+ *
+ * @return the count of explicitly stored entries, including any stored zeros
+ */
+ def numActives: Int
+
+ /**
+ * Number of nonzero elements. This scans all active values and counts the nonzeros.
+ *
+ * @return the number of active entries whose value is not zero
+ */
+ def numNonzeros: Int
+
+ /**
+ * Converts this vector to a sparse vector with all explicit zeros removed.
+ *
+ * @return a [[SparseVector]] that stores only the nonzero entries of this vector
+ */
+ def toSparse: SparseVector
+
+  /**
+   * Converts this vector to dense form.
+   *
+   * @return a [[DenseVector]] built from the array returned by `toArray`
+   */
+  def toDense: DenseVector = {
+    val allEntries = toArray
+    new DenseVector(allEntries)
+  }
+
+  /**
+   * Picks the representation of this vector, dense or sparse, that takes less storage.
+   *
+   * @return the result of `toSparse` when the sparse form is estimated to be strictly
+   *         smaller, otherwise the result of `toDense`
+   */
+  def compressed: Vector = {
+    val nonzeroCount = numNonzeros
+    // Storage estimate: a dense vector needs 8 * size + 8 bytes, while a sparse vector
+    // needs 12 * nnz + 20 bytes. Sparse is smaller when
+    //   12 * nnz + 20 < 8 * size + 8  <=>  1.5 * (nnz + 1.0) < size.
+    val sparseIsSmaller = 1.5 * (nonzeroCount + 1.0) < size
+    if (sparseIsSmaller) toSparse else toDense
+  }
}
/**
@@ -525,6 +559,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
}
result
}
+
+ /** A dense vector reports its full `size` as the number of active entries. */
+ override def numActives: Int = size
+
+  /**
+   * Counts the entries of `values` that differ from 0.0.
+   *
+   * @return the number of nonzero entries in this dense vector
+   */
+  override def numNonzeros: Int = {
+    // Hand-written loop: same result as values.count(_ != 0.0) but faster.
+    val len = values.length
+    var count = 0
+    var pos = 0
+    while (pos < len) {
+      if (values(pos) != 0.0) {
+        count += 1
+      }
+      pos += 1
+    }
+    count
+  }
+
+  /**
+   * Builds a sparse vector that stores only the nonzero entries of this dense vector.
+   *
+   * @return a new [[SparseVector]] of the same `size` holding the nonzero entries
+   */
+  override def toSparse: SparseVector = {
+    // First pass sizes the output arrays; second pass fills them.
+    val nonzeroCount = numNonzeros
+    val keptIndices = new Array[Int](nonzeroCount)
+    val keptValues = new Array[Double](nonzeroCount)
+    var next = 0
+    foreachActive { (index, value) =>
+      // Same predicate as numNonzeros, so exactly `nonzeroCount` slots get written.
+      if (value != 0.0) {
+        keptIndices(next) = index
+        keptValues(next) = value
+        next += 1
+      }
+    }
+    new SparseVector(size, keptIndices, keptValues)
+  }
}
object DenseVector {
@@ -602,6 +664,37 @@ class SparseVector(
}
result
}
+
+ /** The active entries of a sparse vector are its stored values, so this is `values.length`. */
+ override def numActives: Int = values.length
+
+  /**
+   * Counts the stored values that differ from 0.0; explicitly stored zeros are not counted.
+   *
+   * @return the number of nonzero stored values
+   */
+  override def numNonzeros: Int = {
+    val stored = values.length
+    var count = 0
+    var pos = 0
+    while (pos < stored) {
+      if (values(pos) != 0.0) {
+        count += 1
+      }
+      pos += 1
+    }
+    count
+  }
+
+  /**
+   * Returns a sparse vector without explicitly stored zeros.
+   *
+   * @return this same instance when every stored value is nonzero; otherwise a new
+   *         [[SparseVector]] of the same `size` holding only the nonzero entries
+   */
+  override def toSparse: SparseVector = {
+    val nonzeroCount = numNonzeros
+    if (nonzeroCount != numActives) {
+      // At least one stored zero: copy the nonzero entries into right-sized arrays.
+      val keptIndices = new Array[Int](nonzeroCount)
+      val keptValues = new Array[Double](nonzeroCount)
+      var next = 0
+      foreachActive { (index, value) =>
+        if (value != 0.0) {
+          keptIndices(next) = index
+          keptValues(next) = value
+          next += 1
+        }
+      }
+      new SparseVector(size, keptIndices, keptValues)
+    } else {
+      // Nothing to strip, so no copy is made.
+      this
+    }
+  }
}
object SparseVector {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index 2839c4c289..24755e9ff4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
}
+
+  // Checks that numActives counts every stored entry while numNonzeros skips zeros.
+  test("Vector numActives and numNonzeros") {
+    // Dense: all four positions are active; two of them hold zeros.
+    val dv = Vectors.dense(0.0, 2.0, 3.0, 0.0)
+    assert(dv.numActives === 4)
+    assert(dv.numNonzeros === 2)
+
+    // Sparse: three entries are stored, one of which is an explicit zero.
+    val sv = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
+    assert(sv.numActives === 3)
+    assert(sv.numNonzeros === 2)
+  }
+
+  // Round-trips a dense and a sparse vector through toDense / toSparse.
+  test("Vector toSparse and toDense") {
+    // Dense input with zeros at both ends.
+    val denseVec = Vectors.dense(0.0, 2.0, 3.0, 0.0)
+    assert(denseVec.toDense === denseVec)
+    val denseAsSparse = denseVec.toSparse
+    // The two zeros must not be stored in the sparse form.
+    assert(denseAsSparse.numActives === 2)
+    assert(denseAsSparse === denseVec)
+
+    // Sparse input carrying one explicitly stored zero.
+    val sparseVec = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
+    assert(sparseVec.toDense === sparseVec)
+    val sparseCleaned = sparseVec.toSparse
+    // The explicit zero at index 0 must have been dropped.
+    assert(sparseCleaned.numActives === 2)
+    assert(sparseCleaned === sparseVec)
+  }
+
+  // Each cast doubles as an assertion on which representation `compressed` selected.
+  test("Vector.compressed") {
+    // Dense, 3 nonzeros out of 4: stays dense.
+    val mostlyFullDense = Vectors.dense(1.0, 2.0, 3.0, 0.0)
+    val mostlyFullDenseC = mostlyFullDense.compressed.asInstanceOf[DenseVector]
+    assert(mostlyFullDenseC === mostlyFullDense)
+
+    // Dense, 1 nonzero out of 4: becomes sparse with a single stored entry.
+    val mostlyEmptyDense = Vectors.dense(0.0, 2.0, 0.0, 0.0)
+    val mostlyEmptyDenseC = mostlyEmptyDense.compressed.asInstanceOf[SparseVector]
+    assert(mostlyEmptyDense === mostlyEmptyDenseC)
+    assert(mostlyEmptyDenseC.numActives === 1)
+
+    // Sparse with an explicit zero: stays sparse and the stored zero is dropped.
+    val sparseWithZero = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0))
+    val sparseWithZeroC = sparseWithZero.compressed.asInstanceOf[SparseVector]
+    assert(sparseWithZero === sparseWithZeroC)
+    assert(sparseWithZeroC.numActives === 1)
+
+    // Sparse, 3 nonzeros out of 4: becomes dense.
+    val mostlyFullSparse = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
+    val mostlyFullSparseC = mostlyFullSparse.compressed.asInstanceOf[DenseVector]
+    assert(mostlyFullSparse === mostlyFullSparseC)
+  }
}