aboutsummaryrefslogtreecommitdiff
path: root/mllib-local/src/main/scala/org/apache/spark
diff options
context:
space:
mode:
Diffstat (limited to 'mllib-local/src/main/scala/org/apache/spark')
-rw-r--r--mllib-local/src/main/scala/org/apache/spark/ml/DummyTesting.scala23
-rw-r--r--mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala723
-rw-r--r--mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala1026
-rw-r--r--mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala736
4 files changed, 2485 insertions, 23 deletions
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/DummyTesting.scala b/mllib-local/src/main/scala/org/apache/spark/ml/DummyTesting.scala
deleted file mode 100644
index 6b3268cdfa..0000000000
--- a/mllib-local/src/main/scala/org/apache/spark/ml/DummyTesting.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml
-
-// This is a private class testing if the new build works. To be removed soon.
-private[ml] object DummyTesting {
- private[ml] def add10(input: Double): Double = input + 10
-}
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala
new file mode 100644
index 0000000000..41b0c6c89a
--- /dev/null
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala
@@ -0,0 +1,723 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.linalg
+
+import com.github.fommil.netlib.{BLAS => NetlibBLAS, F2jBLAS}
+import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
+
+/**
+ * BLAS routines for MLlib's vectors and matrices.
+ */
+private[spark] object BLAS extends Serializable {
+
+ @transient private var _f2jBLAS: NetlibBLAS = _
+ @transient private var _nativeBLAS: NetlibBLAS = _
+
+ // For level-1 routines, we use Java implementation.
+ private def f2jBLAS: NetlibBLAS = {
+ if (_f2jBLAS == null) {
+ _f2jBLAS = new F2jBLAS
+ }
+ _f2jBLAS
+ }
+
+ /**
+ * y += a * x
+ */
+ def axpy(a: Double, x: Vector, y: Vector): Unit = {
+ require(x.size == y.size)
+ y match {
+ case dy: DenseVector =>
+ x match {
+ case sx: SparseVector =>
+ axpy(a, sx, dy)
+ case dx: DenseVector =>
+ axpy(a, dx, dy)
+ case _ =>
+ throw new UnsupportedOperationException(
+ s"axpy doesn't support x type ${x.getClass}.")
+ }
+ case _ =>
+ throw new IllegalArgumentException(
+ s"axpy only supports adding to a dense vector but got type ${y.getClass}.")
+ }
+ }
+
+ /**
+ * y += a * x
+ */
+ private def axpy(a: Double, x: DenseVector, y: DenseVector): Unit = {
+ val n = x.size
+ f2jBLAS.daxpy(n, a, x.values, 1, y.values, 1)
+ }
+
+ /**
+ * y += a * x
+ */
+ private def axpy(a: Double, x: SparseVector, y: DenseVector): Unit = {
+ val xValues = x.values
+ val xIndices = x.indices
+ val yValues = y.values
+ val nnz = xIndices.length
+
+ if (a == 1.0) {
+ var k = 0
+ while (k < nnz) {
+ yValues(xIndices(k)) += xValues(k)
+ k += 1
+ }
+ } else {
+ var k = 0
+ while (k < nnz) {
+ yValues(xIndices(k)) += a * xValues(k)
+ k += 1
+ }
+ }
+ }
+
+ /** Y += a * x */
+ private[spark] def axpy(a: Double, X: DenseMatrix, Y: DenseMatrix): Unit = {
+ require(X.numRows == Y.numRows && X.numCols == Y.numCols, "Dimension mismatch: " +
+ s"size(X) = ${(X.numRows, X.numCols)} but size(Y) = ${(Y.numRows, Y.numCols)}.")
+ f2jBLAS.daxpy(X.numRows * X.numCols, a, X.values, 1, Y.values, 1)
+ }
+
+ /**
+ * dot(x, y)
+ */
+ def dot(x: Vector, y: Vector): Double = {
+ require(x.size == y.size,
+ "BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes:" +
+ " x.size = " + x.size + ", y.size = " + y.size)
+ (x, y) match {
+ case (dx: DenseVector, dy: DenseVector) =>
+ dot(dx, dy)
+ case (sx: SparseVector, dy: DenseVector) =>
+ dot(sx, dy)
+ case (dx: DenseVector, sy: SparseVector) =>
+ dot(sy, dx)
+ case (sx: SparseVector, sy: SparseVector) =>
+ dot(sx, sy)
+ case _ =>
+ throw new IllegalArgumentException(s"dot doesn't support (${x.getClass}, ${y.getClass}).")
+ }
+ }
+
+ /**
+ * dot(x, y)
+ */
+ private def dot(x: DenseVector, y: DenseVector): Double = {
+ val n = x.size
+ f2jBLAS.ddot(n, x.values, 1, y.values, 1)
+ }
+
+ /**
+ * dot(x, y)
+ */
+ private def dot(x: SparseVector, y: DenseVector): Double = {
+ val xValues = x.values
+ val xIndices = x.indices
+ val yValues = y.values
+ val nnz = xIndices.length
+
+ var sum = 0.0
+ var k = 0
+ while (k < nnz) {
+ sum += xValues(k) * yValues(xIndices(k))
+ k += 1
+ }
+ sum
+ }
+
+ /**
+ * dot(x, y)
+ */
+ private def dot(x: SparseVector, y: SparseVector): Double = {
+ val xValues = x.values
+ val xIndices = x.indices
+ val yValues = y.values
+ val yIndices = y.indices
+ val nnzx = xIndices.length
+ val nnzy = yIndices.length
+
+ var kx = 0
+ var ky = 0
+ var sum = 0.0
+ // y catching x
+ while (kx < nnzx && ky < nnzy) {
+ val ix = xIndices(kx)
+ while (ky < nnzy && yIndices(ky) < ix) {
+ ky += 1
+ }
+ if (ky < nnzy && yIndices(ky) == ix) {
+ sum += xValues(kx) * yValues(ky)
+ ky += 1
+ }
+ kx += 1
+ }
+ sum
+ }
+
+ /**
+ * y = x
+ */
+ def copy(x: Vector, y: Vector): Unit = {
+ val n = y.size
+ require(x.size == n)
+ y match {
+ case dy: DenseVector =>
+ x match {
+ case sx: SparseVector =>
+ val sxIndices = sx.indices
+ val sxValues = sx.values
+ val dyValues = dy.values
+ val nnz = sxIndices.length
+
+ var i = 0
+ var k = 0
+ while (k < nnz) {
+ val j = sxIndices(k)
+ while (i < j) {
+ dyValues(i) = 0.0
+ i += 1
+ }
+ dyValues(i) = sxValues(k)
+ i += 1
+ k += 1
+ }
+ while (i < n) {
+ dyValues(i) = 0.0
+ i += 1
+ }
+ case dx: DenseVector =>
+ Array.copy(dx.values, 0, dy.values, 0, n)
+ }
+ case _ =>
+ throw new IllegalArgumentException(s"y must be dense in copy but got ${y.getClass}")
+ }
+ }
+
+ /**
+ * x = a * x
+ */
+ def scal(a: Double, x: Vector): Unit = {
+ x match {
+ case sx: SparseVector =>
+ f2jBLAS.dscal(sx.values.length, a, sx.values, 1)
+ case dx: DenseVector =>
+ f2jBLAS.dscal(dx.values.length, a, dx.values, 1)
+ case _ =>
+ throw new IllegalArgumentException(s"scal doesn't support vector type ${x.getClass}.")
+ }
+ }
+
+ // For level-3 routines, we use the native BLAS.
+ private def nativeBLAS: NetlibBLAS = {
+ if (_nativeBLAS == null) {
+ _nativeBLAS = NativeBLAS
+ }
+ _nativeBLAS
+ }
+
+ /**
+ * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR.
+ *
+ * @param U the upper triangular part of the matrix in a [[DenseVector]](column major)
+ */
+ def spr(alpha: Double, v: Vector, U: DenseVector): Unit = {
+ spr(alpha, v, U.values)
+ }
+
+ /**
+ * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR.
+ *
+ * @param U the upper triangular part of the matrix packed in an array (column major)
+ */
+ def spr(alpha: Double, v: Vector, U: Array[Double]): Unit = {
+ val n = v.size
+ v match {
+ case DenseVector(values) =>
+ NativeBLAS.dspr("U", n, alpha, values, 1, U)
+ case SparseVector(size, indices, values) =>
+ val nnz = indices.length
+ var colStartIdx = 0
+ var prevCol = 0
+ var col = 0
+ var j = 0
+ var i = 0
+ var av = 0.0
+ while (j < nnz) {
+ col = indices(j)
+ // Skip empty columns.
+ colStartIdx += (col - prevCol) * (col + prevCol + 1) / 2
+ col = indices(j)
+ av = alpha * values(j)
+ i = 0
+ while (i <= j) {
+ U(colStartIdx + indices(i)) += av * values(i)
+ i += 1
+ }
+ j += 1
+ prevCol = col
+ }
+ }
+ }
+
+ /**
+ * A := alpha * x * x^T^ + A
+ * @param alpha a real scalar that will be multiplied to x * x^T^.
+ * @param x the vector x that contains the n elements.
+ * @param A the symmetric matrix A. Size of n x n.
+ */
+ def syr(alpha: Double, x: Vector, A: DenseMatrix) {
+ val mA = A.numRows
+ val nA = A.numCols
+ require(mA == nA, s"A is not a square matrix (and hence is not symmetric). A: $mA x $nA")
+ require(mA == x.size, s"The size of x doesn't match the rank of A. A: $mA x $nA, x: ${x.size}")
+
+ x match {
+ case dv: DenseVector => syr(alpha, dv, A)
+ case sv: SparseVector => syr(alpha, sv, A)
+ case _ =>
+ throw new IllegalArgumentException(s"syr doesn't support vector type ${x.getClass}.")
+ }
+ }
+
+ private def syr(alpha: Double, x: DenseVector, A: DenseMatrix) {
+ val nA = A.numRows
+ val mA = A.numCols
+
+ nativeBLAS.dsyr("U", x.size, alpha, x.values, 1, A.values, nA)
+
+ // Fill lower triangular part of A
+ var i = 0
+ while (i < mA) {
+ var j = i + 1
+ while (j < nA) {
+ A(j, i) = A(i, j)
+ j += 1
+ }
+ i += 1
+ }
+ }
+
+ private def syr(alpha: Double, x: SparseVector, A: DenseMatrix) {
+ val mA = A.numCols
+ val xIndices = x.indices
+ val xValues = x.values
+ val nnz = xValues.length
+ val Avalues = A.values
+
+ var i = 0
+ while (i < nnz) {
+ val multiplier = alpha * xValues(i)
+ val offset = xIndices(i) * mA
+ var j = 0
+ while (j < nnz) {
+ Avalues(xIndices(j) + offset) += multiplier * xValues(j)
+ j += 1
+ }
+ i += 1
+ }
+ }
+
+ /**
+ * C := alpha * A * B + beta * C
+ * @param alpha a scalar to scale the multiplication A * B.
+ * @param A the matrix A that will be left multiplied to B. Size of m x k.
+ * @param B the matrix B that will be left multiplied by A. Size of k x n.
+ * @param beta a scalar that can be used to scale matrix C.
+ * @param C the resulting matrix C. Size of m x n. C.isTransposed must be false.
+ */
+ def gemm(
+ alpha: Double,
+ A: Matrix,
+ B: DenseMatrix,
+ beta: Double,
+ C: DenseMatrix): Unit = {
+ require(!C.isTransposed,
+ "The matrix C cannot be the product of a transpose() call. C.isTransposed must be false.")
+ if (alpha == 0.0 && beta == 1.0) {
+ // gemm: alpha is equal to 0 and beta is equal to 1. Returning C.
+ return
+ } else if (alpha == 0.0) {
+ f2jBLAS.dscal(C.values.length, beta, C.values, 1)
+ } else {
+ A match {
+ case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C)
+ case dense: DenseMatrix => gemm(alpha, dense, B, beta, C)
+ case _ =>
+ throw new IllegalArgumentException(s"gemm doesn't support matrix type ${A.getClass}.")
+ }
+ }
+ }
+
+ /**
+ * C := alpha * A * B + beta * C
+ * For `DenseMatrix` A.
+ */
+ private def gemm(
+ alpha: Double,
+ A: DenseMatrix,
+ B: DenseMatrix,
+ beta: Double,
+ C: DenseMatrix): Unit = {
+ val tAstr = if (A.isTransposed) "T" else "N"
+ val tBstr = if (B.isTransposed) "T" else "N"
+ val lda = if (!A.isTransposed) A.numRows else A.numCols
+ val ldb = if (!B.isTransposed) B.numRows else B.numCols
+
+ require(A.numCols == B.numRows,
+ s"The columns of A don't match the rows of B. A: ${A.numCols}, B: ${B.numRows}")
+ require(A.numRows == C.numRows,
+ s"The rows of C don't match the rows of A. C: ${C.numRows}, A: ${A.numRows}")
+ require(B.numCols == C.numCols,
+ s"The columns of C don't match the columns of B. C: ${C.numCols}, A: ${B.numCols}")
+ nativeBLAS.dgemm(tAstr, tBstr, A.numRows, B.numCols, A.numCols, alpha, A.values, lda,
+ B.values, ldb, beta, C.values, C.numRows)
+ }
+
+ /**
+ * C := alpha * A * B + beta * C
+ * For `SparseMatrix` A.
+ */
+ private def gemm(
+ alpha: Double,
+ A: SparseMatrix,
+ B: DenseMatrix,
+ beta: Double,
+ C: DenseMatrix): Unit = {
+ val mA: Int = A.numRows
+ val nB: Int = B.numCols
+ val kA: Int = A.numCols
+ val kB: Int = B.numRows
+
+ require(kA == kB, s"The columns of A don't match the rows of B. A: $kA, B: $kB")
+ require(mA == C.numRows, s"The rows of C don't match the rows of A. C: ${C.numRows}, A: $mA")
+ require(nB == C.numCols,
+ s"The columns of C don't match the columns of B. C: ${C.numCols}, A: $nB")
+
+ val Avals = A.values
+ val Bvals = B.values
+ val Cvals = C.values
+ val ArowIndices = A.rowIndices
+ val AcolPtrs = A.colPtrs
+
+ // Slicing is easy in this case. This is the optimal multiplication setting for sparse matrices
+ if (A.isTransposed) {
+ var colCounterForB = 0
+ if (!B.isTransposed) { // Expensive to put the check inside the loop
+ while (colCounterForB < nB) {
+ var rowCounterForA = 0
+ val Cstart = colCounterForB * mA
+ val Bstart = colCounterForB * kA
+ while (rowCounterForA < mA) {
+ var i = AcolPtrs(rowCounterForA)
+ val indEnd = AcolPtrs(rowCounterForA + 1)
+ var sum = 0.0
+ while (i < indEnd) {
+ sum += Avals(i) * Bvals(Bstart + ArowIndices(i))
+ i += 1
+ }
+ val Cindex = Cstart + rowCounterForA
+ Cvals(Cindex) = beta * Cvals(Cindex) + sum * alpha
+ rowCounterForA += 1
+ }
+ colCounterForB += 1
+ }
+ } else {
+ while (colCounterForB < nB) {
+ var rowCounterForA = 0
+ val Cstart = colCounterForB * mA
+ while (rowCounterForA < mA) {
+ var i = AcolPtrs(rowCounterForA)
+ val indEnd = AcolPtrs(rowCounterForA + 1)
+ var sum = 0.0
+ while (i < indEnd) {
+ sum += Avals(i) * B(ArowIndices(i), colCounterForB)
+ i += 1
+ }
+ val Cindex = Cstart + rowCounterForA
+ Cvals(Cindex) = beta * Cvals(Cindex) + sum * alpha
+ rowCounterForA += 1
+ }
+ colCounterForB += 1
+ }
+ }
+ } else {
+ // Scale matrix first if `beta` is not equal to 1.0
+ if (beta != 1.0) {
+ f2jBLAS.dscal(C.values.length, beta, C.values, 1)
+ }
+ // Perform matrix multiplication and add to C. The rows of A are multiplied by the columns of
+ // B, and added to C.
+ var colCounterForB = 0 // the column to be updated in C
+ if (!B.isTransposed) { // Expensive to put the check inside the loop
+ while (colCounterForB < nB) {
+ var colCounterForA = 0 // The column of A to multiply with the row of B
+ val Bstart = colCounterForB * kB
+ val Cstart = colCounterForB * mA
+ while (colCounterForA < kA) {
+ var i = AcolPtrs(colCounterForA)
+ val indEnd = AcolPtrs(colCounterForA + 1)
+ val Bval = Bvals(Bstart + colCounterForA) * alpha
+ while (i < indEnd) {
+ Cvals(Cstart + ArowIndices(i)) += Avals(i) * Bval
+ i += 1
+ }
+ colCounterForA += 1
+ }
+ colCounterForB += 1
+ }
+ } else {
+ while (colCounterForB < nB) {
+ var colCounterForA = 0 // The column of A to multiply with the row of B
+ val Cstart = colCounterForB * mA
+ while (colCounterForA < kA) {
+ var i = AcolPtrs(colCounterForA)
+ val indEnd = AcolPtrs(colCounterForA + 1)
+ val Bval = B(colCounterForA, colCounterForB) * alpha
+ while (i < indEnd) {
+ Cvals(Cstart + ArowIndices(i)) += Avals(i) * Bval
+ i += 1
+ }
+ colCounterForA += 1
+ }
+ colCounterForB += 1
+ }
+ }
+ }
+ }
+
+ /**
+ * y := alpha * A * x + beta * y
+ * @param alpha a scalar to scale the multiplication A * x.
+ * @param A the matrix A that will be left multiplied to x. Size of m x n.
+ * @param x the vector x that will be left multiplied by A. Size of n x 1.
+ * @param beta a scalar that can be used to scale vector y.
+ * @param y the resulting vector y. Size of m x 1.
+ */
+ def gemv(
+ alpha: Double,
+ A: Matrix,
+ x: Vector,
+ beta: Double,
+ y: DenseVector): Unit = {
+ require(A.numCols == x.size,
+ s"The columns of A don't match the number of elements of x. A: ${A.numCols}, x: ${x.size}")
+ require(A.numRows == y.size,
+ s"The rows of A don't match the number of elements of y. A: ${A.numRows}, y:${y.size}")
+ if (alpha == 0.0 && beta == 1.0) {
+ // gemv: alpha is equal to 0 and beta is equal to 1. Returning y.
+ return
+ } else if (alpha == 0.0) {
+ scal(beta, y)
+ } else {
+ (A, x) match {
+ case (smA: SparseMatrix, dvx: DenseVector) =>
+ gemv(alpha, smA, dvx, beta, y)
+ case (smA: SparseMatrix, svx: SparseVector) =>
+ gemv(alpha, smA, svx, beta, y)
+ case (dmA: DenseMatrix, dvx: DenseVector) =>
+ gemv(alpha, dmA, dvx, beta, y)
+ case (dmA: DenseMatrix, svx: SparseVector) =>
+ gemv(alpha, dmA, svx, beta, y)
+ case _ =>
+ throw new IllegalArgumentException(s"gemv doesn't support running on matrix type " +
+ s"${A.getClass} and vector type ${x.getClass}.")
+ }
+ }
+ }
+
+ /**
+ * y := alpha * A * x + beta * y
+ * For `DenseMatrix` A and `DenseVector` x.
+ */
+ private def gemv(
+ alpha: Double,
+ A: DenseMatrix,
+ x: DenseVector,
+ beta: Double,
+ y: DenseVector): Unit = {
+ val tStrA = if (A.isTransposed) "T" else "N"
+ val mA = if (!A.isTransposed) A.numRows else A.numCols
+ val nA = if (!A.isTransposed) A.numCols else A.numRows
+ nativeBLAS.dgemv(tStrA, mA, nA, alpha, A.values, mA, x.values, 1, beta,
+ y.values, 1)
+ }
+
+ /**
+ * y := alpha * A * x + beta * y
+ * For `DenseMatrix` A and `SparseVector` x.
+ */
+ private def gemv(
+ alpha: Double,
+ A: DenseMatrix,
+ x: SparseVector,
+ beta: Double,
+ y: DenseVector): Unit = {
+ val mA: Int = A.numRows
+ val nA: Int = A.numCols
+
+ val Avals = A.values
+
+ val xIndices = x.indices
+ val xNnz = xIndices.length
+ val xValues = x.values
+ val yValues = y.values
+
+ if (A.isTransposed) {
+ var rowCounterForA = 0
+ while (rowCounterForA < mA) {
+ var sum = 0.0
+ var k = 0
+ while (k < xNnz) {
+ sum += xValues(k) * Avals(xIndices(k) + rowCounterForA * nA)
+ k += 1
+ }
+ yValues(rowCounterForA) = sum * alpha + beta * yValues(rowCounterForA)
+ rowCounterForA += 1
+ }
+ } else {
+ var rowCounterForA = 0
+ while (rowCounterForA < mA) {
+ var sum = 0.0
+ var k = 0
+ while (k < xNnz) {
+ sum += xValues(k) * Avals(xIndices(k) * mA + rowCounterForA)
+ k += 1
+ }
+ yValues(rowCounterForA) = sum * alpha + beta * yValues(rowCounterForA)
+ rowCounterForA += 1
+ }
+ }
+ }
+
+ /**
+ * y := alpha * A * x + beta * y
+ * For `SparseMatrix` A and `SparseVector` x.
+ */
+ private def gemv(
+ alpha: Double,
+ A: SparseMatrix,
+ x: SparseVector,
+ beta: Double,
+ y: DenseVector): Unit = {
+ val xValues = x.values
+ val xIndices = x.indices
+ val xNnz = xIndices.length
+
+ val yValues = y.values
+
+ val mA: Int = A.numRows
+ val nA: Int = A.numCols
+
+ val Avals = A.values
+ val Arows = if (!A.isTransposed) A.rowIndices else A.colPtrs
+ val Acols = if (!A.isTransposed) A.colPtrs else A.rowIndices
+
+ if (A.isTransposed) {
+ var rowCounter = 0
+ while (rowCounter < mA) {
+ var i = Arows(rowCounter)
+ val indEnd = Arows(rowCounter + 1)
+ var sum = 0.0
+ var k = 0
+ while (k < xNnz && i < indEnd) {
+ if (xIndices(k) == Acols(i)) {
+ sum += Avals(i) * xValues(k)
+ i += 1
+ }
+ k += 1
+ }
+ yValues(rowCounter) = sum * alpha + beta * yValues(rowCounter)
+ rowCounter += 1
+ }
+ } else {
+ if (beta != 1.0) scal(beta, y)
+
+ var colCounterForA = 0
+ var k = 0
+ while (colCounterForA < nA && k < xNnz) {
+ if (xIndices(k) == colCounterForA) {
+ var i = Acols(colCounterForA)
+ val indEnd = Acols(colCounterForA + 1)
+
+ val xTemp = xValues(k) * alpha
+ while (i < indEnd) {
+ val rowIndex = Arows(i)
+ yValues(Arows(i)) += Avals(i) * xTemp
+ i += 1
+ }
+ k += 1
+ }
+ colCounterForA += 1
+ }
+ }
+ }
+
+ /**
+ * y := alpha * A * x + beta * y
+ * For `SparseMatrix` A and `DenseVector` x.
+ */
+ private def gemv(
+ alpha: Double,
+ A: SparseMatrix,
+ x: DenseVector,
+ beta: Double,
+ y: DenseVector): Unit = {
+ val xValues = x.values
+ val yValues = y.values
+ val mA: Int = A.numRows
+ val nA: Int = A.numCols
+
+ val Avals = A.values
+ val Arows = if (!A.isTransposed) A.rowIndices else A.colPtrs
+ val Acols = if (!A.isTransposed) A.colPtrs else A.rowIndices
+ // Slicing is easy in this case. This is the optimal multiplication setting for sparse matrices
+ if (A.isTransposed) {
+ var rowCounter = 0
+ while (rowCounter < mA) {
+ var i = Arows(rowCounter)
+ val indEnd = Arows(rowCounter + 1)
+ var sum = 0.0
+ while (i < indEnd) {
+ sum += Avals(i) * xValues(Acols(i))
+ i += 1
+ }
+ yValues(rowCounter) = beta * yValues(rowCounter) + sum * alpha
+ rowCounter += 1
+ }
+ } else {
+ if (beta != 1.0) scal(beta, y)
+ // Perform matrix-vector multiplication and add to y
+ var colCounterForA = 0
+ while (colCounterForA < nA) {
+ var i = Acols(colCounterForA)
+ val indEnd = Acols(colCounterForA + 1)
+ val xVal = xValues(colCounterForA) * alpha
+ while (i < indEnd) {
+ val rowIndex = Arows(i)
+ yValues(rowIndex) += Avals(i) * xVal
+ i += 1
+ }
+ colCounterForA += 1
+ }
+ }
+ }
+}
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
new file mode 100644
index 0000000000..baa04fb0fd
--- /dev/null
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ -0,0 +1,1026 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.linalg
+
+import java.util.{Arrays, Random}
+
+import scala.collection.mutable.{ArrayBuffer, ArrayBuilder => MArrayBuilder, HashSet => MHashSet}
+
+import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM}
+import com.github.fommil.netlib.BLAS.{getInstance => blas}
+
+/**
+ * Trait for a local matrix.
+ */
+sealed trait Matrix extends Serializable {
+
+ /** Number of rows. */
+ def numRows: Int
+
+ /** Number of columns. */
+ def numCols: Int
+
+ /** Flag that keeps track whether the matrix is transposed or not. False by default. */
+ val isTransposed: Boolean = false
+
+ /** Converts to a dense array in column major. */
+ def toArray: Array[Double] = {
+ val newArray = new Array[Double](numRows * numCols)
+ foreachActive { (i, j, v) =>
+ newArray(j * numRows + i) = v
+ }
+ newArray
+ }
+
+ /**
+ * Returns an iterator of column vectors.
+ * This operation could be expensive, depending on the underlying storage.
+ */
+ def colIter: Iterator[Vector]
+
+ /**
+ * Returns an iterator of row vectors.
+ * This operation could be expensive, depending on the underlying storage.
+ */
+ def rowIter: Iterator[Vector] = this.transpose.colIter
+
+ /** Converts to a breeze matrix. */
+ private[ml] def toBreeze: BM[Double]
+
+ /** Gets the (i, j)-th element. */
+ def apply(i: Int, j: Int): Double
+
+ /** Return the index for the (i, j)-th element in the backing array. */
+ private[ml] def index(i: Int, j: Int): Int
+
+ /** Update element at (i, j) */
+ private[ml] def update(i: Int, j: Int, v: Double): Unit
+
+ /** Get a deep copy of the matrix. */
+ def copy: Matrix
+
+ /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */
+ def transpose: Matrix
+
+ /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */
+ def multiply(y: DenseMatrix): DenseMatrix = {
+ val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols)
+ BLAS.gemm(1.0, this, y, 0.0, C)
+ C
+ }
+
+ /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. */
+ def multiply(y: DenseVector): DenseVector = {
+ multiply(y.asInstanceOf[Vector])
+ }
+
+ /** Convenience method for `Matrix`-`Vector` multiplication. */
+ def multiply(y: Vector): DenseVector = {
+ val output = new DenseVector(new Array[Double](numRows))
+ BLAS.gemv(1.0, this, y, 0.0, output)
+ output
+ }
+
+ /** A human readable representation of the matrix */
+ override def toString: String = toBreeze.toString()
+
+ /** A human readable representation of the matrix with maximum lines and width */
+ def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth)
+
+ /**
+ * Map the values of this matrix using a function. Generates a new matrix. Performs the
+ * function on only the backing array. For example, an operation such as addition or
+ * subtraction will only be performed on the non-zero values in a `SparseMatrix`.
+ */
+ private[spark] def map(f: Double => Double): Matrix
+
+ /**
+ * Update all the values of this matrix using the function f. Performed in-place on the
+ * backing array. For example, an operation such as addition or subtraction will only be
+ * performed on the non-zero values in a `SparseMatrix`.
+ */
+ private[ml] def update(f: Double => Double): Matrix
+
+ /**
+ * Applies a function `f` to all the active elements of dense and sparse matrix. The ordering
+ * of the elements are not defined.
+ *
+ * @param f the function takes three parameters where the first two parameters are the row
+ * and column indices respectively with the type `Int`, and the final parameter is the
+ * corresponding value in the matrix with type `Double`.
+ */
+ private[spark] def foreachActive(f: (Int, Int, Double) => Unit)
+
+ /**
+ * Find the number of non-zero active values.
+ */
+ def numNonzeros: Int
+
+ /**
+ * Find the number of values stored explicitly. These values can be zero as well.
+ */
+ def numActives: Int
+}
+
+/**
+ * Column-major dense matrix.
+ * The entry values are stored in a single array of doubles with columns listed in sequence.
+ * For example, the following matrix
+ * {{{
+ * 1.0 2.0
+ * 3.0 4.0
+ * 5.0 6.0
+ * }}}
+ * is stored as `[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]`.
+ *
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param values matrix entries in column major if not transposed or in row major otherwise
+ * @param isTransposed whether the matrix is transposed. If true, `values` stores the matrix in
+ * row major.
+ */
+class DenseMatrix (
+ val numRows: Int,
+ val numCols: Int,
+ val values: Array[Double],
+ override val isTransposed: Boolean) extends Matrix {
+
+ require(values.length == numRows * numCols, "The number of values supplied doesn't match the " +
+ s"size of the matrix! values.length: ${values.length}, numRows * numCols: ${numRows * numCols}")
+
+ /**
+ * Column-major dense matrix.
+ * The entry values are stored in a single array of doubles with columns listed in sequence.
+ * For example, the following matrix
+ * {{{
+ * 1.0 2.0
+ * 3.0 4.0
+ * 5.0 6.0
+ * }}}
+ * is stored as `[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]`.
+ *
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param values matrix entries in column major
+ */
+ def this(numRows: Int, numCols: Int, values: Array[Double]) =
+ this(numRows, numCols, values, false)
+
+ override def equals(o: Any): Boolean = o match {
+ case m: Matrix => toBreeze == m.toBreeze
+ case _ => false
+ }
+
+ override def hashCode: Int = {
+ Seq(numRows, numCols, toArray).##
+ }
+
+ private[ml] def toBreeze: BM[Double] = {
+ if (!isTransposed) {
+ new BDM[Double](numRows, numCols, values)
+ } else {
+ val breezeMatrix = new BDM[Double](numCols, numRows, values)
+ breezeMatrix.t
+ }
+ }
+
+ private[ml] def apply(i: Int): Double = values(i)
+
+ override def apply(i: Int, j: Int): Double = values(index(i, j))
+
+ private[ml] def index(i: Int, j: Int): Int = {
+ require(i >= 0 && i < numRows, s"Expected 0 <= i < $numRows, got i = $i.")
+ require(j >= 0 && j < numCols, s"Expected 0 <= j < $numCols, got j = $j.")
+ if (!isTransposed) i + numRows * j else j + numCols * i
+ }
+
+ private[ml] def update(i: Int, j: Int, v: Double): Unit = {
+ values(index(i, j)) = v
+ }
+
+ override def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone())
+
+ private[spark] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f),
+ isTransposed)
+
+ private[ml] def update(f: Double => Double): DenseMatrix = {
+ val len = values.length
+ var i = 0
+ while (i < len) {
+ values(i) = f(values(i))
+ i += 1
+ }
+ this
+ }
+
+ override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed)
+
+ private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
+ if (!isTransposed) {
+ // outer loop over columns
+ var j = 0
+ while (j < numCols) {
+ var i = 0
+ val indStart = j * numRows
+ while (i < numRows) {
+ f(i, j, values(indStart + i))
+ i += 1
+ }
+ j += 1
+ }
+ } else {
+ // outer loop over rows
+ var i = 0
+ while (i < numRows) {
+ var j = 0
+ val indStart = i * numCols
+ while (j < numCols) {
+ f(i, j, values(indStart + j))
+ j += 1
+ }
+ i += 1
+ }
+ }
+ }
+
+ override def numNonzeros: Int = values.count(_ != 0)
+
+ override def numActives: Int = values.length
+
+ /**
+ * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed
+ * set to false.
+ */
+ def toSparse: SparseMatrix = {
+ val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble
+ val colPtrs: Array[Int] = new Array[Int](numCols + 1)
+ val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt
+ var nnz = 0
+ var j = 0
+ while (j < numCols) {
+ var i = 0
+ while (i < numRows) {
+ val v = values(index(i, j))
+ if (v != 0.0) {
+ rowIndices += i
+ spVals += v
+ nnz += 1
+ }
+ i += 1
+ }
+ j += 1
+ colPtrs(j) = nnz
+ }
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result())
+ }
+
+ override def colIter: Iterator[Vector] = {
+ if (isTransposed) {
+ Iterator.tabulate(numCols) { j =>
+ val col = new Array[Double](numRows)
+ blas.dcopy(numRows, values, j, numCols, col, 0, 1)
+ new DenseVector(col)
+ }
+ } else {
+ Iterator.tabulate(numCols) { j =>
+ new DenseVector(values.slice(j * numRows, (j + 1) * numRows))
+ }
+ }
+ }
+}
+
+/**
+ * Factory methods for [[org.apache.spark.ml.linalg.DenseMatrix]].
+ */
+object DenseMatrix {
+
+ /**
+ * Generate a `DenseMatrix` consisting of zeros.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @return `DenseMatrix` with size `numRows` x `numCols` and values of zeros
+ */
+ def zeros(numRows: Int, numCols: Int): DenseMatrix = {
+ require(numRows.toLong * numCols <= Int.MaxValue,
+ s"$numRows x $numCols dense matrix is too large to allocate")
+ new DenseMatrix(numRows, numCols, new Array[Double](numRows * numCols))
+ }
+
+ /**
+ * Generate a `DenseMatrix` consisting of ones.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @return `DenseMatrix` with size `numRows` x `numCols` and values of ones
+ */
+ def ones(numRows: Int, numCols: Int): DenseMatrix = {
+ require(numRows.toLong * numCols <= Int.MaxValue,
+ s"$numRows x $numCols dense matrix is too large to allocate")
+ new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(1.0))
+ }
+
+ /**
+ * Generate an Identity Matrix in `DenseMatrix` format.
+ * @param n number of rows and columns of the matrix
+ * @return `DenseMatrix` with size `n` x `n` and values of ones on the diagonal
+ */
+ def eye(n: Int): DenseMatrix = {
+ val identity = DenseMatrix.zeros(n, n)
+ var i = 0
+ while (i < n) {
+ identity.update(i, i, 1.0)
+ i += 1
+ }
+ identity
+ }
+
+ /**
+ * Generate a `DenseMatrix` consisting of `i.i.d.` uniform random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param rng a random number generator
+ * @return `DenseMatrix` with size `numRows` x `numCols` and values in U(0, 1)
+ */
+ def rand(numRows: Int, numCols: Int, rng: Random): DenseMatrix = {
+ require(numRows.toLong * numCols <= Int.MaxValue,
+ s"$numRows x $numCols dense matrix is too large to allocate")
+ new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(rng.nextDouble()))
+ }
+
+ /**
+ * Generate a `DenseMatrix` consisting of `i.i.d.` gaussian random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param rng a random number generator
+ * @return `DenseMatrix` with size `numRows` x `numCols` and values in N(0, 1)
+ */
+ def randn(numRows: Int, numCols: Int, rng: Random): DenseMatrix = {
+ require(numRows.toLong * numCols <= Int.MaxValue,
+ s"$numRows x $numCols dense matrix is too large to allocate")
+ new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(rng.nextGaussian()))
+ }
+
+ /**
+ * Generate a diagonal matrix in `DenseMatrix` format from the supplied values.
+ * @param vector a `Vector` that will form the values on the diagonal of the matrix
+ * @return Square `DenseMatrix` with size `values.length` x `values.length` and `values`
+ * on the diagonal
+ */
+ def diag(vector: Vector): DenseMatrix = {
+ val n = vector.size
+ val matrix = DenseMatrix.zeros(n, n)
+ val values = vector.toArray
+ var i = 0
+ while (i < n) {
+ matrix.update(i, i, values(i))
+ i += 1
+ }
+ matrix
+ }
+}
+
+/**
+ * Column-major sparse matrix.
+ * The entry values are stored in Compressed Sparse Column (CSC) format.
+ * For example, the following matrix
+ * {{{
+ * 1.0 0.0 4.0
+ * 0.0 3.0 5.0
+ * 2.0 0.0 6.0
+ * }}}
+ * is stored as `values: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]`,
+ * `rowIndices=[0, 2, 1, 0, 1, 2]`, `colPointers=[0, 2, 3, 6]`.
+ *
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param colPtrs the index corresponding to the start of a new column (if not transposed)
+ * @param rowIndices the row index of the entry (if not transposed). They must be in strictly
+ * increasing order for each column
+ * @param values nonzero matrix entries in column major (if not transposed)
+ * @param isTransposed whether the matrix is transposed. If true, the matrix can be considered
+ * Compressed Sparse Row (CSR) format, where `colPtrs` behaves as rowPtrs,
+ * and `rowIndices` behave as colIndices, and `values` are stored in row major.
+ */
+class SparseMatrix (
+ val numRows: Int,
+ val numCols: Int,
+ val colPtrs: Array[Int],
+ val rowIndices: Array[Int],
+ val values: Array[Double],
+ override val isTransposed: Boolean) extends Matrix {
+
+ require(values.length == rowIndices.length, "The number of row indices and values don't match! " +
+ s"values.length: ${values.length}, rowIndices.length: ${rowIndices.length}")
+ // The Or statement is for the case when the matrix is transposed
+ require(colPtrs.length == numCols + 1 || colPtrs.length == numRows + 1, "The length of the " +
+ "column indices should be the number of columns + 1. Currently, colPointers.length: " +
+ s"${colPtrs.length}, numCols: $numCols")
+ require(values.length == colPtrs.last, "The last value of colPtrs must equal the number of " +
+ s"elements. values.length: ${values.length}, colPtrs.last: ${colPtrs.last}")
+
+ /**
+ * Column-major sparse matrix.
+ * The entry values are stored in Compressed Sparse Column (CSC) format.
+ * For example, the following matrix
+ * {{{
+ * 1.0 0.0 4.0
+ * 0.0 3.0 5.0
+ * 2.0 0.0 6.0
+ * }}}
+ * is stored as `values: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]`,
+ * `rowIndices=[0, 2, 1, 0, 1, 2]`, `colPointers=[0, 2, 3, 6]`.
+ *
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param colPtrs the index corresponding to the start of a new column
+ * @param rowIndices the row index of the entry. They must be in strictly increasing
+ * order for each column
+ * @param values non-zero matrix entries in column major
+ */
+ def this(
+ numRows: Int,
+ numCols: Int,
+ colPtrs: Array[Int],
+ rowIndices: Array[Int],
+ values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false)
+
+ override def equals(o: Any): Boolean = o match {
+ case m: Matrix => toBreeze == m.toBreeze
+ case _ => false
+ }
+
+ private[ml] def toBreeze: BM[Double] = {
+ if (!isTransposed) {
+ new BSM[Double](values, numRows, numCols, colPtrs, rowIndices)
+ } else {
+ val breezeMatrix = new BSM[Double](values, numCols, numRows, colPtrs, rowIndices)
+ breezeMatrix.t
+ }
+ }
+
+ override def apply(i: Int, j: Int): Double = {
+ val ind = index(i, j)
+ if (ind < 0) 0.0 else values(ind)
+ }
+
+ private[ml] def index(i: Int, j: Int): Int = {
+ require(i >= 0 && i < numRows, s"Expected 0 <= i < $numRows, got i = $i.")
+ require(j >= 0 && j < numCols, s"Expected 0 <= j < $numCols, got j = $j.")
+ if (!isTransposed) {
+ Arrays.binarySearch(rowIndices, colPtrs(j), colPtrs(j + 1), i)
+ } else {
+ Arrays.binarySearch(rowIndices, colPtrs(i), colPtrs(i + 1), j)
+ }
+ }
+
+ private[ml] def update(i: Int, j: Int, v: Double): Unit = {
+ val ind = index(i, j)
+ if (ind < 0) {
+ throw new NoSuchElementException("The given row and column indices correspond to a zero " +
+ "value. Only non-zero elements in Sparse Matrices can be updated.")
+ } else {
+ values(ind) = v
+ }
+ }
+
+ override def copy: SparseMatrix = {
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.clone())
+ }
+
+ private[spark] def map(f: Double => Double) =
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.map(f), isTransposed)
+
+ private[ml] def update(f: Double => Double): SparseMatrix = {
+ val len = values.length
+ var i = 0
+ while (i < len) {
+ values(i) = f(values(i))
+ i += 1
+ }
+ this
+ }
+
+ override def transpose: SparseMatrix =
+ new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed)
+
+ private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
+ if (!isTransposed) {
+ var j = 0
+ while (j < numCols) {
+ var idx = colPtrs(j)
+ val idxEnd = colPtrs(j + 1)
+ while (idx < idxEnd) {
+ f(rowIndices(idx), j, values(idx))
+ idx += 1
+ }
+ j += 1
+ }
+ } else {
+ var i = 0
+ while (i < numRows) {
+ var idx = colPtrs(i)
+ val idxEnd = colPtrs(i + 1)
+ while (idx < idxEnd) {
+ val j = rowIndices(idx)
+ f(i, j, values(idx))
+ idx += 1
+ }
+ i += 1
+ }
+ }
+ }
+
+ /**
+ * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed
+ * set to false.
+ */
+ def toDense: DenseMatrix = {
+ new DenseMatrix(numRows, numCols, toArray)
+ }
+
+ override def numNonzeros: Int = values.count(_ != 0)
+
+ override def numActives: Int = values.length
+
+ override def colIter: Iterator[Vector] = {
+ if (isTransposed) {
+ val indicesArray = Array.fill(numCols)(MArrayBuilder.make[Int])
+ val valuesArray = Array.fill(numCols)(MArrayBuilder.make[Double])
+ var i = 0
+ while (i < numRows) {
+ var k = colPtrs(i)
+ val rowEnd = colPtrs(i + 1)
+ while (k < rowEnd) {
+ val j = rowIndices(k)
+ indicesArray(j) += i
+ valuesArray(j) += values(k)
+ k += 1
+ }
+ i += 1
+ }
+ Iterator.tabulate(numCols) { j =>
+ val ii = indicesArray(j).result()
+ val vv = valuesArray(j).result()
+ new SparseVector(numRows, ii, vv)
+ }
+ } else {
+ Iterator.tabulate(numCols) { j =>
+ val colStart = colPtrs(j)
+ val colEnd = colPtrs(j + 1)
+ val ii = rowIndices.slice(colStart, colEnd)
+ val vv = values.slice(colStart, colEnd)
+ new SparseVector(numRows, ii, vv)
+ }
+ }
+ }
+}
+
+/**
+ * Factory methods for [[org.apache.spark.ml.linalg.SparseMatrix]].
+ */
+object SparseMatrix {
+
+ /**
+ * Generate a `SparseMatrix` from Coordinate List (COO) format. Input must be an array of
+ * (i, j, value) tuples. Entries that have duplicate values of i and j are
+ * added together. Tuples where value is equal to zero will be omitted.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param entries Array of (i, j, value) tuples
+ * @return The corresponding `SparseMatrix`
+ */
+ def fromCOO(numRows: Int, numCols: Int, entries: Iterable[(Int, Int, Double)]): SparseMatrix = {
+ val sortedEntries = entries.toSeq.sortBy(v => (v._2, v._1))
+ val numEntries = sortedEntries.size
+ if (sortedEntries.nonEmpty) {
+ // Since the entries are sorted by column index, we only need to check the first and the last.
+ for (col <- Seq(sortedEntries.head._2, sortedEntries.last._2)) {
+ require(col >= 0 && col < numCols, s"Column index out of range [0, $numCols): $col.")
+ }
+ }
+ val colPtrs = new Array[Int](numCols + 1)
+ val rowIndices = MArrayBuilder.make[Int]
+ rowIndices.sizeHint(numEntries)
+ val values = MArrayBuilder.make[Double]
+ values.sizeHint(numEntries)
+ var nnz = 0
+ var prevCol = 0
+ var prevRow = -1
+ var prevVal = 0.0
+ // Append a dummy entry to include the last one at the end of the loop.
+ (sortedEntries.view :+ (numRows, numCols, 1.0)).foreach { case (i, j, v) =>
+ if (v != 0) {
+ if (i == prevRow && j == prevCol) {
+ prevVal += v
+ } else {
+ if (prevVal != 0) {
+ require(prevRow >= 0 && prevRow < numRows,
+ s"Row index out of range [0, $numRows): $prevRow.")
+ nnz += 1
+ rowIndices += prevRow
+ values += prevVal
+ }
+ prevRow = i
+ prevVal = v
+ while (prevCol < j) {
+ colPtrs(prevCol + 1) = nnz
+ prevCol += 1
+ }
+ }
+ }
+ }
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), values.result())
+ }
+
+ /**
+ * Generate an Identity Matrix in `SparseMatrix` format.
+ * @param n number of rows and columns of the matrix
+ * @return `SparseMatrix` with size `n` x `n` and values of ones on the diagonal
+ */
+ def speye(n: Int): SparseMatrix = {
+ new SparseMatrix(n, n, (0 to n).toArray, (0 until n).toArray, Array.fill(n)(1.0))
+ }
+
+ /**
+ * Generates the skeleton of a random `SparseMatrix` with a given random number generator.
+ * The values of the matrix returned are undefined.
+ */
+ private def genRandMatrix(
+ numRows: Int,
+ numCols: Int,
+ density: Double,
+ rng: Random): SparseMatrix = {
+ require(numRows > 0, s"numRows must be greater than 0 but got $numRows")
+ require(numCols > 0, s"numCols must be greater than 0 but got $numCols")
+ require(density >= 0.0 && density <= 1.0,
+ s"density must be a double in the range 0.0 <= d <= 1.0. Currently, density: $density")
+ val size = numRows.toLong * numCols
+ val expected = size * density
+ assert(expected < Int.MaxValue,
+ "The expected number of nonzeros cannot be greater than Int.MaxValue.")
+ val nnz = math.ceil(expected).toInt
+ if (density == 0.0) {
+ new SparseMatrix(numRows, numCols, new Array[Int](numCols + 1), Array[Int](), Array[Double]())
+ } else if (density == 1.0) {
+ val colPtrs = Array.tabulate(numCols + 1)(j => j * numRows)
+ val rowIndices = Array.tabulate(size.toInt)(idx => idx % numRows)
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices, new Array[Double](numRows * numCols))
+ } else if (density < 0.34) {
+ // draw-by-draw, expected number of iterations is less than 1.5 * nnz
+ val entries = MHashSet[(Int, Int)]()
+ while (entries.size < nnz) {
+ entries += ((rng.nextInt(numRows), rng.nextInt(numCols)))
+ }
+ SparseMatrix.fromCOO(numRows, numCols, entries.map(v => (v._1, v._2, 1.0)))
+ } else {
+ // selection-rejection method
+ var idx = 0L
+ var numSelected = 0
+ var j = 0
+ val colPtrs = new Array[Int](numCols + 1)
+ val rowIndices = new Array[Int](nnz)
+ while (j < numCols && numSelected < nnz) {
+ var i = 0
+ while (i < numRows && numSelected < nnz) {
+ if (rng.nextDouble() < 1.0 * (nnz - numSelected) / (size - idx)) {
+ rowIndices(numSelected) = i
+ numSelected += 1
+ }
+ i += 1
+ idx += 1
+ }
+ colPtrs(j + 1) = numSelected
+ j += 1
+ }
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices, new Array[Double](nnz))
+ }
+ }
+
+ /**
+ * Generate a `SparseMatrix` consisting of `i.i.d`. uniform random numbers. The number of non-zero
+ * elements equal the ceiling of `numRows` x `numCols` x `density`
+ *
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param density the desired density for the matrix
+ * @param rng a random number generator
+ * @return `SparseMatrix` with size `numRows` x `numCols` and values in U(0, 1)
+ */
+ def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = {
+ val mat = genRandMatrix(numRows, numCols, density, rng)
+ mat.update(i => rng.nextDouble())
+ }
+
+ /**
+ * Generate a `SparseMatrix` consisting of `i.i.d`. gaussian random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param density the desired density for the matrix
+ * @param rng a random number generator
+ * @return `SparseMatrix` with size `numRows` x `numCols` and values in N(0, 1)
+ */
+ def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = {
+ val mat = genRandMatrix(numRows, numCols, density, rng)
+ mat.update(i => rng.nextGaussian())
+ }
+
+ /**
+ * Generate a diagonal matrix in `SparseMatrix` format from the supplied values.
+ * @param vector a `Vector` that will form the values on the diagonal of the matrix
+ * @return Square `SparseMatrix` with size `values.length` x `values.length` and non-zero
+ * `values` on the diagonal
+ */
+ def spdiag(vector: Vector): SparseMatrix = {
+ val n = vector.size
+ vector match {
+ case sVec: SparseVector =>
+ SparseMatrix.fromCOO(n, n, sVec.indices.zip(sVec.values).map(v => (v._1, v._1, v._2)))
+ case dVec: DenseVector =>
+ val entries = dVec.values.zipWithIndex
+ val nnzVals = entries.filter(v => v._1 != 0.0)
+ SparseMatrix.fromCOO(n, n, nnzVals.map(v => (v._2, v._2, v._1)))
+ }
+ }
+}
+
+/**
+ * Factory methods for [[org.apache.spark.ml.linalg.Matrix]].
+ */
+object Matrices {
+
+ /**
+ * Creates a column-major dense matrix.
+ *
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param values matrix entries in column major
+ */
+ def dense(numRows: Int, numCols: Int, values: Array[Double]): Matrix = {
+ new DenseMatrix(numRows, numCols, values)
+ }
+
+ /**
+ * Creates a column-major sparse matrix in Compressed Sparse Column (CSC) format.
+ *
+ * @param numRows number of rows
+ * @param numCols number of columns
+ * @param colPtrs the index corresponding to the start of a new column
+ * @param rowIndices the row index of the entry
+ * @param values non-zero matrix entries in column major
+ */
+ def sparse(
+ numRows: Int,
+ numCols: Int,
+ colPtrs: Array[Int],
+ rowIndices: Array[Int],
+ values: Array[Double]): Matrix = {
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
+ }
+
+ /**
+ * Creates a Matrix instance from a breeze matrix.
+ * @param breeze a breeze matrix
+ * @return a Matrix instance
+ */
+ private[ml] def fromBreeze(breeze: BM[Double]): Matrix = {
+ breeze match {
+ case dm: BDM[Double] =>
+ new DenseMatrix(dm.rows, dm.cols, dm.data, dm.isTranspose)
+ case sm: BSM[Double] =>
+ // Spark-11507. work around breeze issue 479.
+ val mat = if (sm.colPtrs.last != sm.data.length) {
+ val matCopy = sm.copy
+ matCopy.compact()
+ matCopy
+ } else {
+ sm
+ }
+ // There is no isTranspose flag for sparse matrices in Breeze
+ new SparseMatrix(mat.rows, mat.cols, mat.colPtrs, mat.rowIndices, mat.data)
+ case _ =>
+ throw new UnsupportedOperationException(
+ s"Do not support conversion from type ${breeze.getClass.getName}.")
+ }
+ }
+
+ /**
+ * Generate a `Matrix` consisting of zeros.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @return `Matrix` with size `numRows` x `numCols` and values of zeros
+ */
+ def zeros(numRows: Int, numCols: Int): Matrix = DenseMatrix.zeros(numRows, numCols)
+
+ /**
+ * Generate a `DenseMatrix` consisting of ones.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @return `Matrix` with size `numRows` x `numCols` and values of ones
+ */
+ def ones(numRows: Int, numCols: Int): Matrix = DenseMatrix.ones(numRows, numCols)
+
+ /**
+ * Generate a dense Identity Matrix in `Matrix` format.
+ * @param n number of rows and columns of the matrix
+ * @return `Matrix` with size `n` x `n` and values of ones on the diagonal
+ */
+ def eye(n: Int): Matrix = DenseMatrix.eye(n)
+
+ /**
+ * Generate a sparse Identity Matrix in `Matrix` format.
+ * @param n number of rows and columns of the matrix
+ * @return `Matrix` with size `n` x `n` and values of ones on the diagonal
+ */
+ def speye(n: Int): Matrix = SparseMatrix.speye(n)
+
+ /**
+ * Generate a `DenseMatrix` consisting of `i.i.d.` uniform random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param rng a random number generator
+ * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1)
+ */
+ def rand(numRows: Int, numCols: Int, rng: Random): Matrix =
+ DenseMatrix.rand(numRows, numCols, rng)
+
+ /**
+ * Generate a `SparseMatrix` consisting of `i.i.d.` gaussian random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param density the desired density for the matrix
+ * @param rng a random number generator
+ * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1)
+ */
+ def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix =
+ SparseMatrix.sprand(numRows, numCols, density, rng)
+
+ /**
+ * Generate a `DenseMatrix` consisting of `i.i.d.` gaussian random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param rng a random number generator
+ * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1)
+ */
+ def randn(numRows: Int, numCols: Int, rng: Random): Matrix =
+ DenseMatrix.randn(numRows, numCols, rng)
+
+ /**
+ * Generate a `SparseMatrix` consisting of `i.i.d.` gaussian random numbers.
+ * @param numRows number of rows of the matrix
+ * @param numCols number of columns of the matrix
+ * @param density the desired density for the matrix
+ * @param rng a random number generator
+ * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1)
+ */
+ def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix =
+ SparseMatrix.sprandn(numRows, numCols, density, rng)
+
+ /**
+ * Generate a diagonal matrix in `Matrix` format from the supplied values.
+ * @param vector a `Vector` that will form the values on the diagonal of the matrix
+ * @return Square `Matrix` with size `values.length` x `values.length` and `values`
+ * on the diagonal
+ */
+ def diag(vector: Vector): Matrix = DenseMatrix.diag(vector)
+
+ /**
+ * Horizontally concatenate a sequence of matrices. The returned matrix will be in the format
+ * the matrices are supplied in. Supplying a mix of dense and sparse matrices will result in
+ * a sparse matrix. If the Array is empty, an empty `DenseMatrix` will be returned.
+ * @param matrices array of matrices
+ * @return a single `Matrix` composed of the matrices that were horizontally concatenated
+ */
+ def horzcat(matrices: Array[Matrix]): Matrix = {
+ if (matrices.isEmpty) {
+ return new DenseMatrix(0, 0, Array[Double]())
+ } else if (matrices.length == 1) {
+ return matrices(0)
+ }
+ val numRows = matrices(0).numRows
+ var hasSparse = false
+ var numCols = 0
+ matrices.foreach { mat =>
+ require(numRows == mat.numRows, "The number of rows of the matrices in this sequence, " +
+ "don't match!")
+ mat match {
+ case sparse: SparseMatrix => hasSparse = true
+ case dense: DenseMatrix => // empty on purpose
+ case _ => throw new IllegalArgumentException("Unsupported matrix format. Expected " +
+ s"SparseMatrix or DenseMatrix. Instead got: ${mat.getClass}")
+ }
+ numCols += mat.numCols
+ }
+ if (!hasSparse) {
+ new DenseMatrix(numRows, numCols, matrices.flatMap(_.toArray))
+ } else {
+ var startCol = 0
+ val entries: Array[(Int, Int, Double)] = matrices.flatMap { mat =>
+ val nCols = mat.numCols
+ mat match {
+ case spMat: SparseMatrix =>
+ val data = new Array[(Int, Int, Double)](spMat.values.length)
+ var cnt = 0
+ spMat.foreachActive { (i, j, v) =>
+ data(cnt) = (i, j + startCol, v)
+ cnt += 1
+ }
+ startCol += nCols
+ data
+ case dnMat: DenseMatrix =>
+ val data = new ArrayBuffer[(Int, Int, Double)]()
+ dnMat.foreachActive { (i, j, v) =>
+ if (v != 0.0) {
+ data.append((i, j + startCol, v))
+ }
+ }
+ startCol += nCols
+ data
+ }
+ }
+ SparseMatrix.fromCOO(numRows, numCols, entries)
+ }
+ }
+
+ /**
+ * Vertically concatenate a sequence of matrices. The returned matrix will be in the format
+ * the matrices are supplied in. Supplying a mix of dense and sparse matrices will result in
+ * a sparse matrix. If the Array is empty, an empty `DenseMatrix` will be returned.
+ * @param matrices array of matrices
+ * @return a single `Matrix` composed of the matrices that were vertically concatenated
+ */
+ def vertcat(matrices: Array[Matrix]): Matrix = {
+ if (matrices.isEmpty) {
+ return new DenseMatrix(0, 0, Array[Double]())
+ } else if (matrices.length == 1) {
+ return matrices(0)
+ }
+ val numCols = matrices(0).numCols
+ var hasSparse = false
+ var numRows = 0
+ matrices.foreach { mat =>
+ require(numCols == mat.numCols, "The number of rows of the matrices in this sequence, " +
+ "don't match!")
+ mat match {
+ case sparse: SparseMatrix => hasSparse = true
+ case dense: DenseMatrix => // empty on purpose
+ case _ => throw new IllegalArgumentException("Unsupported matrix format. Expected " +
+ s"SparseMatrix or DenseMatrix. Instead got: ${mat.getClass}")
+ }
+ numRows += mat.numRows
+ }
+ if (!hasSparse) {
+ val allValues = new Array[Double](numRows * numCols)
+ var startRow = 0
+ matrices.foreach { mat =>
+ var j = 0
+ val nRows = mat.numRows
+ mat.foreachActive { (i, j, v) =>
+ val indStart = j * numRows + startRow
+ allValues(indStart + i) = v
+ }
+ startRow += nRows
+ }
+ new DenseMatrix(numRows, numCols, allValues)
+ } else {
+ var startRow = 0
+ val entries: Array[(Int, Int, Double)] = matrices.flatMap { mat =>
+ val nRows = mat.numRows
+ mat match {
+ case spMat: SparseMatrix =>
+ val data = new Array[(Int, Int, Double)](spMat.values.length)
+ var cnt = 0
+ spMat.foreachActive { (i, j, v) =>
+ data(cnt) = (i + startRow, j, v)
+ cnt += 1
+ }
+ startRow += nRows
+ data
+ case dnMat: DenseMatrix =>
+ val data = new ArrayBuffer[(Int, Int, Double)]()
+ dnMat.foreachActive { (i, j, v) =>
+ if (v != 0.0) {
+ data.append((i + startRow, j, v))
+ }
+ }
+ startRow += nRows
+ data
+ }
+ }
+ SparseMatrix.fromCOO(numRows, numCols, entries)
+ }
+ }
+}
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
new file mode 100644
index 0000000000..fd4ce9adb8
--- /dev/null
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
@@ -0,0 +1,736 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.linalg
+
+import java.lang.{Double => JavaDouble, Integer => JavaInteger, Iterable => JavaIterable}
+import java.util
+
+import scala.annotation.varargs
+import scala.collection.JavaConverters._
+
+import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
+import org.json4s.DefaultFormats
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods.{compact, parse => parseJson, render}
+
+/**
+ * Represents a numeric vector, whose index type is Int and value type is Double.
+ *
+ * Note: Users should not implement this interface.
+ */
+sealed trait Vector extends Serializable {
+
+ /**
+ * Size of the vector.
+ */
+ def size: Int
+
+ /**
+ * Converts the instance to a double array.
+ */
+ def toArray: Array[Double]
+
+ override def equals(other: Any): Boolean = {
+ other match {
+ case v2: Vector =>
+ if (this.size != v2.size) return false
+ (this, v2) match {
+ case (s1: SparseVector, s2: SparseVector) =>
+ Vectors.equals(s1.indices, s1.values, s2.indices, s2.values)
+ case (s1: SparseVector, d1: DenseVector) =>
+ Vectors.equals(s1.indices, s1.values, 0 until d1.size, d1.values)
+ case (d1: DenseVector, s1: SparseVector) =>
+ Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values)
+ case (_, _) => util.Arrays.equals(this.toArray, v2.toArray)
+ }
+ case _ => false
+ }
+ }
+
+ /**
+ * Returns a hash code value for the vector. The hash code is based on its size and its first 128
+ * nonzero entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]].
+ */
+ override def hashCode(): Int = {
+ // This is a reference implementation. It calls return in foreachActive, which is slow.
+ // Subclasses should override it with optimized implementation.
+ var result: Int = 31 + size
+ var nnz = 0
+ this.foreachActive { (index, value) =>
+ if (nnz < Vectors.MAX_HASH_NNZ) {
+ // ignore explicit 0 for comparison between sparse and dense
+ if (value != 0) {
+ result = 31 * result + index
+ val bits = java.lang.Double.doubleToLongBits(value)
+ result = 31 * result + (bits ^ (bits >>> 32)).toInt
+ nnz += 1
+ }
+ } else {
+ return result
+ }
+ }
+ result
+ }
+
+ /**
+ * Converts the instance to a breeze vector.
+ */
+ private[spark] def toBreeze: BV[Double]
+
+ /**
+ * Gets the value of the ith element.
+ * @param i index
+ */
+ def apply(i: Int): Double = toBreeze(i)
+
+ /**
+ * Makes a deep copy of this vector.
+ */
+ def copy: Vector = {
+ throw new NotImplementedError(s"copy is not implemented for ${this.getClass}.")
+ }
+
+ /**
+ * Applies a function `f` to all the active elements of dense and sparse vector.
+ *
+ * @param f the function takes two parameters where the first parameter is the index of
+ * the vector with type `Int`, and the second parameter is the corresponding value
+ * with type `Double`.
+ */
+ def foreachActive(f: (Int, Double) => Unit): Unit
+
+ /**
+ * Number of active entries. An "active entry" is an element which is explicitly stored,
+ * regardless of its value. Note that inactive entries have value 0.
+ */
+ def numActives: Int
+
+ /**
+ * Number of nonzero elements. This scans all active values and count nonzeros.
+ */
+ def numNonzeros: Int
+
+ /**
+ * Converts this vector to a sparse vector with all explicit zeros removed.
+ */
+ def toSparse: SparseVector
+
+ /**
+ * Converts this vector to a dense vector.
+ */
+ def toDense: DenseVector = new DenseVector(this.toArray)
+
+ /**
+ * Returns a vector in either dense or sparse format, whichever uses less storage.
+ */
+ def compressed: Vector = {
+ val nnz = numNonzeros
+ // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
+ if (1.5 * (nnz + 1.0) < size) {
+ toSparse
+ } else {
+ toDense
+ }
+ }
+
+ /**
+ * Find the index of a maximal element. Returns the first maximal element in case of a tie.
+ * Returns -1 if vector has length 0.
+ */
+ def argmax: Int
+
+ /**
+ * Converts the vector to a JSON string.
+ */
+ def toJson: String
+}
+
+/**
+ * Factory methods for [[org.apache.spark.ml.linalg.Vector]].
+ * We don't use the name `Vector` because Scala imports
+ * [[scala.collection.immutable.Vector]] by default.
+ */
+object Vectors {
+
+ /**
+ * Creates a dense vector from its values.
+ */
+ @varargs
+ def dense(firstValue: Double, otherValues: Double*): Vector =
+ new DenseVector((firstValue +: otherValues).toArray)
+
+ // A dummy implicit is used to avoid signature collision with the one generated by @varargs.
+ /**
+ * Creates a dense vector from a double array.
+ */
+ def dense(values: Array[Double]): Vector = new DenseVector(values)
+
+ /**
+ * Creates a sparse vector providing its index array and value array.
+ *
+ * @param size vector size.
+ * @param indices index array, must be strictly increasing.
+ * @param values value array, must have the same length as indices.
+ */
+ def sparse(size: Int, indices: Array[Int], values: Array[Double]): Vector =
+ new SparseVector(size, indices, values)
+
+ /**
+ * Creates a sparse vector using unordered (index, value) pairs.
+ *
+ * @param size vector size.
+ * @param elements vector elements in (index, value) pairs.
+ */
+ def sparse(size: Int, elements: Seq[(Int, Double)]): Vector = {
+ require(size > 0, "The size of the requested sparse vector must be greater than 0.")
+
+ val (indices, values) = elements.sortBy(_._1).unzip
+ var prev = -1
+ indices.foreach { i =>
+ require(prev < i, s"Found duplicate indices: $i.")
+ prev = i
+ }
+ require(prev < size, s"You may not write an element to index $prev because the declared " +
+ s"size of your vector is $size")
+
+ new SparseVector(size, indices.toArray, values.toArray)
+ }
+
+ /**
+ * Creates a sparse vector using unordered (index, value) pairs in a Java friendly way.
+ *
+ * @param size vector size.
+ * @param elements vector elements in (index, value) pairs.
+ */
+ def sparse(size: Int, elements: JavaIterable[(JavaInteger, JavaDouble)]): Vector = {
+ sparse(size, elements.asScala.map { case (i, x) =>
+ (i.intValue(), x.doubleValue())
+ }.toSeq)
+ }
+
+ /**
+ * Creates a vector of all zeros.
+ *
+ * @param size vector size
+ * @return a zero vector
+ */
+ def zeros(size: Int): Vector = {
+ new DenseVector(new Array[Double](size))
+ }
+
+ /**
+ * Parses the JSON representation of a vector into a [[Vector]].
+ */
+ def fromJson(json: String): Vector = {
+ implicit val formats = DefaultFormats
+ val jValue = parseJson(json)
+ (jValue \ "type").extract[Int] match {
+ case 0 => // sparse
+ val size = (jValue \ "size").extract[Int]
+ val indices = (jValue \ "indices").extract[Seq[Int]].toArray
+ val values = (jValue \ "values").extract[Seq[Double]].toArray
+ sparse(size, indices, values)
+ case 1 => // dense
+ val values = (jValue \ "values").extract[Seq[Double]].toArray
+ dense(values)
+ case _ =>
+ throw new IllegalArgumentException(s"Cannot parse $json into a vector.")
+ }
+ }
+
+ /**
+ * Creates a vector instance from a breeze vector.
+ */
+ private[spark] def fromBreeze(breezeVector: BV[Double]): Vector = {
+ breezeVector match {
+ case v: BDV[Double] =>
+ if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
+ new DenseVector(v.data)
+ } else {
+ new DenseVector(v.toArray) // Can't use underlying array directly, so make a new one
+ }
+ case v: BSV[Double] =>
+ if (v.index.length == v.used) {
+ new SparseVector(v.length, v.index, v.data)
+ } else {
+ new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
+ }
+ case v: BV[_] =>
+ sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
+ }
+ }
+
+ /**
+ * Returns the p-norm of this vector.
+ * @param vector input vector.
+ * @param p norm.
+ * @return norm in L^p^ space.
+ */
+ def norm(vector: Vector, p: Double): Double = {
+ require(p >= 1.0, "To compute the p-norm of the vector, we require that you specify a p>=1. " +
+ s"You specified p=$p.")
+ val values = vector match {
+ case DenseVector(vs) => vs
+ case SparseVector(n, ids, vs) => vs
+ case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
+ }
+ val size = values.length
+
+ if (p == 1) {
+ var sum = 0.0
+ var i = 0
+ while (i < size) {
+ sum += math.abs(values(i))
+ i += 1
+ }
+ sum
+ } else if (p == 2) {
+ var sum = 0.0
+ var i = 0
+ while (i < size) {
+ sum += values(i) * values(i)
+ i += 1
+ }
+ math.sqrt(sum)
+ } else if (p == Double.PositiveInfinity) {
+ var max = 0.0
+ var i = 0
+ while (i < size) {
+ val value = math.abs(values(i))
+ if (value > max) max = value
+ i += 1
+ }
+ max
+ } else {
+ var sum = 0.0
+ var i = 0
+ while (i < size) {
+ sum += math.pow(math.abs(values(i)), p)
+ i += 1
+ }
+ math.pow(sum, 1.0 / p)
+ }
+ }
+
+ /**
+ * Returns the squared distance between two Vectors.
+ * @param v1 first Vector.
+ * @param v2 second Vector.
+ * @return squared distance between two Vectors.
+ */
+ def sqdist(v1: Vector, v2: Vector): Double = {
+ require(v1.size == v2.size, s"Vector dimensions do not match: Dim(v1)=${v1.size} and Dim(v2)" +
+ s"=${v2.size}.")
+ var squaredDistance = 0.0
+ (v1, v2) match {
+ case (v1: SparseVector, v2: SparseVector) =>
+ val v1Values = v1.values
+ val v1Indices = v1.indices
+ val v2Values = v2.values
+ val v2Indices = v2.indices
+ val nnzv1 = v1Indices.length
+ val nnzv2 = v2Indices.length
+
+ var kv1 = 0
+ var kv2 = 0
+ while (kv1 < nnzv1 || kv2 < nnzv2) {
+ var score = 0.0
+
+ if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) {
+ score = v1Values(kv1)
+ kv1 += 1
+ } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) {
+ score = v2Values(kv2)
+ kv2 += 1
+ } else {
+ score = v1Values(kv1) - v2Values(kv2)
+ kv1 += 1
+ kv2 += 1
+ }
+ squaredDistance += score * score
+ }
+
+ case (v1: SparseVector, v2: DenseVector) =>
+ squaredDistance = sqdist(v1, v2)
+
+ case (v1: DenseVector, v2: SparseVector) =>
+ squaredDistance = sqdist(v2, v1)
+
+ case (DenseVector(vv1), DenseVector(vv2)) =>
+ var kv = 0
+ val sz = vv1.length
+ while (kv < sz) {
+ val score = vv1(kv) - vv2(kv)
+ squaredDistance += score * score
+ kv += 1
+ }
+ case _ =>
+ throw new IllegalArgumentException("Do not support vector type " + v1.getClass +
+ " and " + v2.getClass)
+ }
+ squaredDistance
+ }
+
+ /**
+ * Returns the squared distance between DenseVector and SparseVector.
+ */
+ private[ml] def sqdist(v1: SparseVector, v2: DenseVector): Double = {
+ var kv1 = 0
+ var kv2 = 0
+ val indices = v1.indices
+ var squaredDistance = 0.0
+ val nnzv1 = indices.length
+ val nnzv2 = v2.size
+ var iv1 = if (nnzv1 > 0) indices(kv1) else -1
+
+ while (kv2 < nnzv2) {
+ var score = 0.0
+ if (kv2 != iv1) {
+ score = v2(kv2)
+ } else {
+ score = v1.values(kv1) - v2(kv2)
+ if (kv1 < nnzv1 - 1) {
+ kv1 += 1
+ iv1 = indices(kv1)
+ }
+ }
+ squaredDistance += score * score
+ kv2 += 1
+ }
+ squaredDistance
+ }
+
+ /**
+ * Check equality between sparse/dense vectors
+ */
+ private[ml] def equals(
+ v1Indices: IndexedSeq[Int],
+ v1Values: Array[Double],
+ v2Indices: IndexedSeq[Int],
+ v2Values: Array[Double]): Boolean = {
+ val v1Size = v1Values.length
+ val v2Size = v2Values.length
+ var k1 = 0
+ var k2 = 0
+ var allEqual = true
+ while (allEqual) {
+ while (k1 < v1Size && v1Values(k1) == 0) k1 += 1
+ while (k2 < v2Size && v2Values(k2) == 0) k2 += 1
+
+ if (k1 >= v1Size || k2 >= v2Size) {
+ return k1 >= v1Size && k2 >= v2Size // check end alignment
+ }
+ allEqual = v1Indices(k1) == v2Indices(k2) && v1Values(k1) == v2Values(k2)
+ k1 += 1
+ k2 += 1
+ }
+ allEqual
+ }
+
+ /** Max number of nonzero entries used in computing hash code. */
+ private[linalg] val MAX_HASH_NNZ = 128
+}
+
+/**
+ * A dense vector represented by a value array.
+ */
+class DenseVector (val values: Array[Double]) extends Vector {
+
+ override def size: Int = values.length
+
+ override def toString: String = values.mkString("[", ",", "]")
+
+ override def toArray: Array[Double] = values
+
+ private[spark] override def toBreeze: BV[Double] = new BDV[Double](values)
+
+ override def apply(i: Int): Double = values(i)
+
+ override def copy: DenseVector = {
+ new DenseVector(values.clone())
+ }
+
+ override def foreachActive(f: (Int, Double) => Unit): Unit = {
+ var i = 0
+ val localValuesSize = values.length
+ val localValues = values
+
+ while (i < localValuesSize) {
+ f(i, localValues(i))
+ i += 1
+ }
+ }
+
+ override def hashCode(): Int = {
+ var result: Int = 31 + size
+ var i = 0
+ val end = values.length
+ var nnz = 0
+ while (i < end && nnz < Vectors.MAX_HASH_NNZ) {
+ val v = values(i)
+ if (v != 0.0) {
+ result = 31 * result + i
+ val bits = java.lang.Double.doubleToLongBits(values(i))
+ result = 31 * result + (bits ^ (bits >>> 32)).toInt
+ nnz += 1
+ }
+ i += 1
+ }
+ result
+ }
+
+ override def numActives: Int = size
+
+ override def numNonzeros: Int = {
+ // same as values.count(_ != 0.0) but faster
+ var nnz = 0
+ values.foreach { v =>
+ if (v != 0.0) {
+ nnz += 1
+ }
+ }
+ nnz
+ }
+
+ override def toSparse: SparseVector = {
+ val nnz = numNonzeros
+ val ii = new Array[Int](nnz)
+ val vv = new Array[Double](nnz)
+ var k = 0
+ foreachActive { (i, v) =>
+ if (v != 0) {
+ ii(k) = i
+ vv(k) = v
+ k += 1
+ }
+ }
+ new SparseVector(size, ii, vv)
+ }
+
+ override def argmax: Int = {
+ if (size == 0) {
+ -1
+ } else {
+ var maxIdx = 0
+ var maxValue = values(0)
+ var i = 1
+ while (i < size) {
+ if (values(i) > maxValue) {
+ maxIdx = i
+ maxValue = values(i)
+ }
+ i += 1
+ }
+ maxIdx
+ }
+ }
+
+ override def toJson: String = {
+ val jValue = ("type" -> 1) ~ ("values" -> values.toSeq)
+ compact(render(jValue))
+ }
+}
+
+object DenseVector {
+
+ /** Extracts the value array from a dense vector. */
+ def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values)
+}
+
+/**
+ * A sparse vector represented by an index array and an value array.
+ *
+ * @param size size of the vector.
+ * @param indices index array, assume to be strictly increasing.
+ * @param values value array, must have the same length as the index array.
+ */
+class SparseVector (
+ override val size: Int,
+ val indices: Array[Int],
+ val values: Array[Double]) extends Vector {
+
+ require(indices.length == values.length, "Sparse vectors require that the dimension of the" +
+ s" indices match the dimension of the values. You provided ${indices.length} indices and " +
+ s" ${values.length} values.")
+ require(indices.length <= size, s"You provided ${indices.length} indices and values, " +
+ s"which exceeds the specified vector size ${size}.")
+
+ override def toString: String =
+ s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})"
+
+ override def toArray: Array[Double] = {
+ val data = new Array[Double](size)
+ var i = 0
+ val nnz = indices.length
+ while (i < nnz) {
+ data(indices(i)) = values(i)
+ i += 1
+ }
+ data
+ }
+
+ override def copy: SparseVector = {
+ new SparseVector(size, indices.clone(), values.clone())
+ }
+
+ private[spark] override def toBreeze: BV[Double] = new BSV[Double](indices, values, size)
+
+ override def foreachActive(f: (Int, Double) => Unit): Unit = {
+ var i = 0
+ val localValuesSize = values.length
+ val localIndices = indices
+ val localValues = values
+
+ while (i < localValuesSize) {
+ f(localIndices(i), localValues(i))
+ i += 1
+ }
+ }
+
+ override def hashCode(): Int = {
+ var result: Int = 31 + size
+ val end = values.length
+ var k = 0
+ var nnz = 0
+ while (k < end && nnz < Vectors.MAX_HASH_NNZ) {
+ val v = values(k)
+ if (v != 0.0) {
+ val i = indices(k)
+ result = 31 * result + i
+ val bits = java.lang.Double.doubleToLongBits(v)
+ result = 31 * result + (bits ^ (bits >>> 32)).toInt
+ nnz += 1
+ }
+ k += 1
+ }
+ result
+ }
+
+ override def numActives: Int = values.length
+
+ override def numNonzeros: Int = {
+ var nnz = 0
+ values.foreach { v =>
+ if (v != 0.0) {
+ nnz += 1
+ }
+ }
+ nnz
+ }
+
+ override def toSparse: SparseVector = {
+ val nnz = numNonzeros
+ if (nnz == numActives) {
+ this
+ } else {
+ val ii = new Array[Int](nnz)
+ val vv = new Array[Double](nnz)
+ var k = 0
+ foreachActive { (i, v) =>
+ if (v != 0.0) {
+ ii(k) = i
+ vv(k) = v
+ k += 1
+ }
+ }
+ new SparseVector(size, ii, vv)
+ }
+ }
+
+ override def argmax: Int = {
+ if (size == 0) {
+ -1
+ } else {
+ // Find the max active entry.
+ var maxIdx = indices(0)
+ var maxValue = values(0)
+ var maxJ = 0
+ var j = 1
+ val na = numActives
+ while (j < na) {
+ val v = values(j)
+ if (v > maxValue) {
+ maxValue = v
+ maxIdx = indices(j)
+ maxJ = j
+ }
+ j += 1
+ }
+
+ // If the max active entry is nonpositive and there exists inactive ones, find the first zero.
+ if (maxValue <= 0.0 && na < size) {
+ if (maxValue == 0.0) {
+ // If there exists an inactive entry before maxIdx, find it and return its index.
+ if (maxJ < maxIdx) {
+ var k = 0
+ while (k < maxJ && indices(k) == k) {
+ k += 1
+ }
+ maxIdx = k
+ }
+ } else {
+ // If the max active value is negative, find and return the first inactive index.
+ var k = 0
+ while (k < na && indices(k) == k) {
+ k += 1
+ }
+ maxIdx = k
+ }
+ }
+
+ maxIdx
+ }
+ }
+
+ /**
+ * Create a slice of this vector based on the given indices.
+ * @param selectedIndices Unsorted list of indices into the vector.
+ * This does NOT do bound checking.
+ * @return New SparseVector with values in the order specified by the given indices.
+ *
+ * NOTE: The API needs to be discussed before making this public.
+ * Also, if we have a version assuming indices are sorted, we should optimize it.
+ */
+ private[spark] def slice(selectedIndices: Array[Int]): SparseVector = {
+ var currentIdx = 0
+ val (sliceInds, sliceVals) = selectedIndices.flatMap { origIdx =>
+ val iIdx = java.util.Arrays.binarySearch(this.indices, origIdx)
+ val i_v = if (iIdx >= 0) {
+ Iterator((currentIdx, this.values(iIdx)))
+ } else {
+ Iterator()
+ }
+ currentIdx += 1
+ i_v
+ }.unzip
+ new SparseVector(selectedIndices.length, sliceInds.toArray, sliceVals.toArray)
+ }
+
+ override def toJson: String = {
+ val jValue = ("type" -> 0) ~
+ ("size" -> size) ~
+ ("indices" -> indices.toSeq) ~
+ ("values" -> values.toSeq)
+ compact(render(jValue))
+ }
+}
+
+object SparseVector {
+ def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] =
+ Some((sv.size, sv.indices, sv.values))
+}