From c684e5f9ff193ac78d48f214d7f8b87df227c8a6 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 12 Mar 2015 01:40:40 -0700 Subject: [SPARK-5186][branch-1.2] Vector.hashCode is not efficient Backport hhbyyh 's hasCode implementation to branch-1.2. The old implementation causes performance issues with PySpark, which calls hashCode (https://issues.apache.org/jira/browse/SPARK-6288). Author: Yuhao Yang Closes #4985 from mengxr/SPARK-5186-1.2 and squashes the following commits: 155e559 [Yuhao Yang] backport SPARK-5186 --- .../org/apache/spark/mllib/linalg/Vectors.scala | 55 ++++++++++++++++++++-- .../apache/spark/mllib/linalg/VectorsSuite.scala | 18 +++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 47d1a76fa3..451fe9122b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -51,13 +51,35 @@ sealed trait Vector extends Serializable { override def equals(other: Any): Boolean = { other match { - case v: Vector => - util.Arrays.equals(this.toArray, v.toArray) + case v2: Vector => { + if (this.size != v2.size) return false + (this, v2) match { + case (s1: SparseVector, s2: SparseVector) => + Vectors.equals(s1.indices, s1.values, s2.indices, s2.values) + case (s1: SparseVector, d1: DenseVector) => + Vectors.equals(s1.indices, s1.values, 0 until d1.size, d1.values) + case (d1: DenseVector, s1: SparseVector) => + Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values) + case (_, _) => util.Arrays.equals(this.toArray, v2.toArray) + } + } case _ => false } } - override def hashCode(): Int = util.Arrays.hashCode(this.toArray) + override def hashCode(): Int = { + var result: Int = size + 31 + this.foreachActive { case (index, value) => + // ignore explict 0 for comparison between sparse and dense + if (value != 0) { + result = 31 * result + index + // refer to {@link java.util.Arrays.equals} for hash algorithm + val bits = java.lang.Double.doubleToLongBits(value) + result = 31 * result + (bits ^ (bits >>> 32)).toInt + } + } + return result + } /** * Converts the instance to a breeze vector. @@ -312,6 +334,33 @@ object Vectors { math.pow(sum, 1.0 / p) } } + + /** + * Check equality between sparse/dense vectors + */ + private[mllib] def equals( + v1Indices: IndexedSeq[Int], + v1Values: Array[Double], + v2Indices: IndexedSeq[Int], + v2Values: Array[Double]): Boolean = { + val v1Size = v1Values.size + val v2Size = v2Values.size + var k1 = 0 + var k2 = 0 + var allEqual = true + while (allEqual) { + while (k1 < v1Size && v1Values(k1) == 0) k1 += 1 + while (k2 < v2Size && v2Values(k2) == 0) k2 += 1 + + if (k1 >= v1Size || k2 >= v2Size) { + return k1 >= v1Size && k2 >= v2Size // check end alignment + } + allEqual = v1Indices(k1) == v2Indices(k2) && v1Values(k1) == v2Values(k2) + k1 += 1 + k2 += 1 + } + allEqual + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index f99f014509..b9a8aecc2d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -87,6 +87,24 @@ class VectorsSuite extends FunSuite { } } + test("vectors equals with explicit 0") { + val dv1 = Vectors.dense(Array(0, 0.9, 0, 0.8, 0)) + val sv1 = Vectors.sparse(5, Array(1, 3), Array(0.9, 0.8)) + val sv2 = Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(0, 0.9, 0, 0.8, 0)) + + val vectors = Seq(dv1, sv1, sv2) + for (v <- vectors; u <- vectors) { + assert(v === u) + assert(v.## === u.##) + } + + val another = Vectors.sparse(5, Array(0, 1, 3), Array(0, 0.9, 0.2)) + for (v <- vectors) { + assert(v != another) + assert(v.## != another.##) + } + } + test("indexing dense vectors") { val vec = Vectors.dense(1.0, 2.0, 3.0, 4.0) assert(vec(0) === 1.0) -- cgit v1.2.3