author     sethah <seth.hendrickson16@gmail.com>        2017-03-24 20:32:42 +0000
committer  DB Tsai <dbtsai@dbtsai.com>                   2017-03-24 20:32:42 +0000
commit     e8810b73c495b6d437dd3b9bb334762126b3c063 (patch)
tree       36f7d9e0f7f05088161a653bf6abd43a14b4d903 /mllib-local
parent     707e501832fa7adde0a884c528a7352983d83520 (diff)
[SPARK-17471][ML] Add compressed method to ML matrices
## What changes were proposed in this pull request?

This patch adds a `compressed` method to the ML `Matrix` class, which returns the minimal storage representation of the matrix, either sparse or dense. Because the space occupied by a sparse matrix depends on its layout (i.e. column major or row major), this method must consider both cases. It may also be useful to force the layout to be column or row major beforehand, so an overload is added which takes a `columnMajor: Boolean` parameter.

The compressed implementation relies upon two new abstract methods, `toDense(columnMajor: Boolean)` and `toSparse(columnMajor: Boolean)`, similar to the compressed method implemented in the `Vector` class. These methods also allow the layout of the resulting matrix to be specified via the `columnMajor` parameter. More detail on the new methods is given below.

## How was this patch tested?

Added many new unit tests.

## New methods (summary, not exhaustive list)

**Matrix trait**

- `private[ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix` (abstract) - converts the matrix (either sparse or dense) to dense format
- `private[ml] def toSparseMatrix(columnMajor: Boolean): SparseMatrix` (abstract) - converts the matrix (either sparse or dense) to sparse format
- `def toDense: DenseMatrix = toDense(true)` - converts the matrix (either sparse or dense) to dense format in column major layout
- `def toSparse: SparseMatrix = toSparse(true)` - converts the matrix (either sparse or dense) to sparse format in column major layout
- `def compressed: Matrix` - finds the minimum space representation of this matrix, considering both column and row major layouts, and converts it
- `def compressed(columnMajor: Boolean): Matrix` - finds the minimum space representation of this matrix considering only column OR row major layout, and converts it

**DenseMatrix class**

- `private[ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix` - converts the dense matrix to a dense matrix, optionally changing the layout (data is NOT duplicated if the layouts are the same)
- `private[ml] def toSparseMatrix(columnMajor: Boolean): SparseMatrix` - converts the dense matrix to a sparse matrix, using the specified layout

**SparseMatrix class**

- `private[ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix` - converts the sparse matrix to a dense matrix, using the specified layout
- `private[ml] def toSparseMatrix(columnMajor: Boolean): SparseMatrix` - converts the sparse matrix to a sparse matrix. If the sparse matrix contains any explicit zeros, they are removed. If the layout requested does not match the current layout, data is copied to a new representation. If the layouts match and no explicit zeros exist, the current matrix is returned.

Author: sethah <seth.hendrickson16@gmail.com>

Closes #15628 from sethah/matrix_compress.
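For readers skimming the diff, here is a minimal usage sketch of the new public conversion API (illustrative only; the example matrix mirrors `dm1` from the new "compressed dense" test below, and the expected results follow the size rules described in this commit message):

```scala
import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, SparseMatrix}

// 3 x 4 column-major dense matrix with only 2 nonzero entries.
val dm = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0))

// Force a particular target format and layout.
val csc: SparseMatrix = dm.toSparseColMajor // sparse, column major (CSC)
val csr: SparseMatrix = dm.toSparseRowMajor // sparse, row major (CSR)
val back: DenseMatrix = csc.toDense         // dense, keeping csc's column-major layout

// Or let the matrix pick the smallest representation. With 2 of 12 entries
// nonzero and numRows < numCols, the expected result is a row-major SparseMatrix.
val smallest: Matrix = dm.compressed
val smallestCsc: Matrix = dm.compressedColMajor // smallest among column-major formats
```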
Diffstat (limited to 'mllib-local')
-rw-r--r--  mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala       274
-rw-r--r--  mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala  420
-rw-r--r--  mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala     5
3 files changed, 654 insertions, 45 deletions
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
index d9ffdeb797..07f3bc2728 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ -44,6 +44,12 @@ sealed trait Matrix extends Serializable {
@Since("2.0.0")
val isTransposed: Boolean = false
+ /** Indicates whether the values backing this matrix are arranged in column major order. */
+ private[ml] def isColMajor: Boolean = !isTransposed
+
+ /** Indicates whether the values backing this matrix are arranged in row major order. */
+ private[ml] def isRowMajor: Boolean = isTransposed
+
/** Converts to a dense array in column major. */
@Since("2.0.0")
def toArray: Array[Double] = {
@@ -148,7 +154,8 @@ sealed trait Matrix extends Serializable {
* and column indices respectively with the type `Int`, and the final parameter is the
* corresponding value in the matrix with type `Double`.
*/
- private[spark] def foreachActive(f: (Int, Int, Double) => Unit)
+ @Since("2.2.0")
+ def foreachActive(f: (Int, Int, Double) => Unit): Unit
/**
* Find the number of non-zero active values.
@@ -161,6 +168,116 @@ sealed trait Matrix extends Serializable {
*/
@Since("2.0.0")
def numActives: Int
+
+ /**
+ * Converts this matrix to a sparse matrix.
+ *
+ * @param colMajor Whether the values of the resulting sparse matrix should be in column major
+ * or row major order. If `false`, resulting matrix will be row major.
+ */
+ private[ml] def toSparseMatrix(colMajor: Boolean): SparseMatrix
+
+ /**
+ * Converts this matrix to a sparse matrix in column major order.
+ */
+ @Since("2.2.0")
+ def toSparseColMajor: SparseMatrix = toSparseMatrix(colMajor = true)
+
+ /**
+ * Converts this matrix to a sparse matrix in row major order.
+ */
+ @Since("2.2.0")
+ def toSparseRowMajor: SparseMatrix = toSparseMatrix(colMajor = false)
+
+ /**
+ * Converts this matrix to a sparse matrix while maintaining the layout of the current matrix.
+ */
+ @Since("2.2.0")
+ def toSparse: SparseMatrix = toSparseMatrix(colMajor = isColMajor)
+
+ /**
+ * Converts this matrix to a dense matrix.
+ *
+ * @param colMajor Whether the values of the resulting dense matrix should be in column major
+ * or row major order. If `false`, resulting matrix will be row major.
+ */
+ private[ml] def toDenseMatrix(colMajor: Boolean): DenseMatrix
+
+ /**
+ * Converts this matrix to a dense matrix while maintaining the layout of the current matrix.
+ */
+ @Since("2.2.0")
+ def toDense: DenseMatrix = toDenseMatrix(colMajor = isColMajor)
+
+ /**
+ * Converts this matrix to a dense matrix in row major order.
+ */
+ @Since("2.2.0")
+ def toDenseRowMajor: DenseMatrix = toDenseMatrix(colMajor = false)
+
+ /**
+ * Converts this matrix to a dense matrix in column major order.
+ */
+ @Since("2.2.0")
+ def toDenseColMajor: DenseMatrix = toDenseMatrix(colMajor = true)
+
+ /**
+ * Returns a matrix in dense or sparse column major format, whichever uses less storage.
+ */
+ @Since("2.2.0")
+ def compressedColMajor: Matrix = {
+ if (getDenseSizeInBytes <= getSparseSizeInBytes(colMajor = true)) {
+ this.toDenseColMajor
+ } else {
+ this.toSparseColMajor
+ }
+ }
+
+ /**
+ * Returns a matrix in dense or sparse row major format, whichever uses less storage.
+ */
+ @Since("2.2.0")
+ def compressedRowMajor: Matrix = {
+ if (getDenseSizeInBytes <= getSparseSizeInBytes(colMajor = false)) {
+ this.toDenseRowMajor
+ } else {
+ this.toSparseRowMajor
+ }
+ }
+
+ /**
+ * Returns a matrix in dense column major, dense row major, sparse row major, or sparse column
+ * major format, whichever uses less storage. When dense representation is optimal, it maintains
+ * the current layout order.
+ */
+ @Since("2.2.0")
+ def compressed: Matrix = {
+ val cscSize = getSparseSizeInBytes(colMajor = true)
+ val csrSize = getSparseSizeInBytes(colMajor = false)
+ if (getDenseSizeInBytes <= math.min(cscSize, csrSize)) {
+ // dense matrix size is the same for column major and row major, so maintain current layout
+ this.toDense
+ } else if (cscSize <= csrSize) {
+ this.toSparseColMajor
+ } else {
+ this.toSparseRowMajor
+ }
+ }
+
+ /** Gets the size of the dense representation of this `Matrix`. */
+ private[ml] def getDenseSizeInBytes: Long = {
+ Matrices.getDenseSize(numCols, numRows)
+ }
+
+ /** Gets the size of the minimal sparse representation of this `Matrix`. */
+ private[ml] def getSparseSizeInBytes(colMajor: Boolean): Long = {
+ val nnz = numNonzeros
+ val numPtrs = if (colMajor) numCols + 1L else numRows + 1L
+ Matrices.getSparseSize(nnz, numPtrs)
+ }
+
+ /** Gets the current size in bytes of this `Matrix`. Useful for testing. */
+ private[ml] def getSizeInBytes: Long
}
/**
@@ -258,7 +375,7 @@ class DenseMatrix @Since("2.0.0") (
override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed)
- private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
+ override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
if (!isTransposed) {
// outer loop over columns
var j = 0
@@ -291,31 +408,49 @@ class DenseMatrix @Since("2.0.0") (
override def numActives: Int = values.length
/**
- * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed
- * set to false.
+ * Generate a `SparseMatrix` from the given `DenseMatrix`.
+ *
+ * @param colMajor Whether the resulting `SparseMatrix` values will be in column major order.
*/
- @Since("2.0.0")
- def toSparse: SparseMatrix = {
- val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble
- val colPtrs: Array[Int] = new Array[Int](numCols + 1)
- val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt
- var nnz = 0
- var j = 0
- while (j < numCols) {
- var i = 0
- while (i < numRows) {
- val v = values(index(i, j))
- if (v != 0.0) {
- rowIndices += i
- spVals += v
- nnz += 1
+ private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = {
+ if (!colMajor) this.transpose.toSparseColMajor.transpose
+ else {
+ val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble
+ val colPtrs: Array[Int] = new Array[Int](numCols + 1)
+ val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt
+ var nnz = 0
+ var j = 0
+ while (j < numCols) {
+ var i = 0
+ while (i < numRows) {
+ val v = values(index(i, j))
+ if (v != 0.0) {
+ rowIndices += i
+ spVals += v
+ nnz += 1
+ }
+ i += 1
}
- i += 1
+ j += 1
+ colPtrs(j) = nnz
}
- j += 1
- colPtrs(j) = nnz
+ new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result())
+ }
+ }
+
+ /**
+ * Generate a `DenseMatrix` from this `DenseMatrix`.
+ *
+ * @param colMajor Whether the resulting `DenseMatrix` values will be in column major order.
+ */
+ private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = {
+ if (isRowMajor && colMajor) {
+ new DenseMatrix(numRows, numCols, this.toArray, isTransposed = false)
+ } else if (isColMajor && !colMajor) {
+ new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true)
+ } else {
+ this
}
- new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result())
}
override def colIter: Iterator[Vector] = {
@@ -331,6 +466,8 @@ class DenseMatrix @Since("2.0.0") (
}
}
}
+
+ private[ml] def getSizeInBytes: Long = Matrices.getDenseSize(numCols, numRows)
}
/**
@@ -560,7 +697,7 @@ class SparseMatrix @Since("2.0.0") (
override def transpose: SparseMatrix =
new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed)
- private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
+ override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
if (!isTransposed) {
var j = 0
while (j < numCols) {
@@ -587,18 +724,67 @@ class SparseMatrix @Since("2.0.0") (
}
}
+ override def numNonzeros: Int = values.count(_ != 0)
+
+ override def numActives: Int = values.length
+
/**
- * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed
- * set to false.
+ * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they
+ * exist.
+ *
+ * @param colMajor Whether or not the resulting `SparseMatrix` values are in column major
+ * order.
*/
- @Since("2.0.0")
- def toDense: DenseMatrix = {
- new DenseMatrix(numRows, numCols, toArray)
+ private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = {
+ if (isColMajor && !colMajor) {
+ // it is col major and we want row major, use breeze to remove explicit zeros
+ val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t
+ Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix]
+ } else if (isRowMajor && colMajor) {
+ // it is row major and we want col major, use breeze to remove explicit zeros
+ val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]]
+ Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix]
+ } else {
+ val nnz = numNonzeros
+ if (nnz != numActives) {
+ // remove explicit zeros
+ val rr = new Array[Int](nnz)
+ val vv = new Array[Double](nnz)
+ val numPtrs = if (isRowMajor) numRows else numCols
+ val cc = new Array[Int](numPtrs + 1)
+ var nzIdx = 0
+ var j = 0
+ while (j < numPtrs) {
+ var idx = colPtrs(j)
+ val idxEnd = colPtrs(j + 1)
+ cc(j) = nzIdx
+ while (idx < idxEnd) {
+ if (values(idx) != 0.0) {
+ vv(nzIdx) = values(idx)
+ rr(nzIdx) = rowIndices(idx)
+ nzIdx += 1
+ }
+ idx += 1
+ }
+ j += 1
+ }
+ cc(j) = nnz
+ new SparseMatrix(numRows, numCols, cc, rr, vv, isTransposed = isTransposed)
+ } else {
+ this
+ }
+ }
}
- override def numNonzeros: Int = values.count(_ != 0)
-
- override def numActives: Int = values.length
+ /**
+ * Generate a `DenseMatrix` from the given `SparseMatrix`.
+ *
+ * @param colMajor Whether the resulting `DenseMatrix` values are in column major order.
+ */
+ private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = {
+ if (colMajor) new DenseMatrix(numRows, numCols, this.toArray)
+ else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true)
+ }
override def colIter: Iterator[Vector] = {
if (isTransposed) {
@@ -631,6 +817,8 @@ class SparseMatrix @Since("2.0.0") (
}
}
}
+
+ private[ml] def getSizeInBytes: Long = Matrices.getSparseSize(numActives, colPtrs.length)
}
/**
@@ -1079,4 +1267,26 @@ object Matrices {
SparseMatrix.fromCOO(numRows, numCols, entries)
}
}
+
+ private[ml] def getSparseSize(numActives: Long, numPtrs: Long): Long = {
+ /*
+ Sparse matrices store two int arrays, one double array, two ints, and one boolean:
+ 8 * values.length + 4 * rowIndices.length + 4 * colPtrs.length + arrayHeader * 3 + 2 * 4 + 1
+ */
+ val doubleBytes = java.lang.Double.BYTES
+ val intBytes = java.lang.Integer.BYTES
+ val arrayHeader = 12L
+ doubleBytes * numActives + intBytes * numActives + intBytes * numPtrs + arrayHeader * 3L + 9L
+ }
+
+ private[ml] def getDenseSize(numCols: Long, numRows: Long): Long = {
+ /*
+ Dense matrices store one double array, two ints, and one boolean:
+ 8 * values.length + arrayHeader + 2 * 4 + 1
+ */
+ val doubleBytes = java.lang.Double.BYTES
+ val arrayHeader = 12L
+ doubleBytes * numCols * numRows + arrayHeader + 9L
+ }
+
}
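As a sanity check on the two size helpers added above, here is a small standalone sketch (not part of the patch) that reproduces the byte counts with the same constants, including the 4 x 4, 7-nonzero tie-breaking case exercised in the new tests:

```scala
// Estimates mirroring Matrices.getDenseSize / Matrices.getSparseSize above:
// 8 bytes per double, 4 per int, a 12-byte array header per array,
// plus 2 ints and 1 boolean (9 bytes) for the matrix fields.
def denseSize(numRows: Long, numCols: Long): Long =
  8L * numRows * numCols + 12L + 9L

def sparseSize(nnz: Long, numPtrs: Long): Long =
  8L * nnz + 4L * nnz + 4L * numPtrs + 3L * 12L + 9L

// A 4 x 4 matrix with 7 nonzeros costs the same either way, which is why
// `compressed` breaks ties in favor of dense.
denseSize(4, 4)      // 128 + 12 + 9          = 149
sparseSize(7, 4 + 1) // 56 + 28 + 20 + 36 + 9 = 149
```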
diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala
index 9c0aa73938..9f82020868 100644
--- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala
+++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala
@@ -160,22 +160,416 @@ class MatricesSuite extends SparkMLFunSuite {
assert(sparseMat.values(2) === 10.0)
}
- test("toSparse, toDense") {
- val m = 3
- val n = 2
- val values = Array(1.0, 2.0, 4.0, 5.0)
- val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0)
- val colPtrs = Array(0, 2, 4)
- val rowIndices = Array(0, 1, 1, 2)
+ test("dense to dense") {
+ /*
+ dm1 = 4.0 2.0 -8.0
+ -1.0 7.0 4.0
+
+ dm2 = 5.0 -9.0 4.0
+ 1.0 -3.0 -8.0
+ */
+ val dm1 = new DenseMatrix(2, 3, Array(4.0, -1.0, 2.0, 7.0, -8.0, 4.0))
+ val dm2 = new DenseMatrix(2, 3, Array(5.0, -9.0, 4.0, 1.0, -3.0, -8.0), isTransposed = true)
+
+ val dm8 = dm1.toDenseColMajor
+ assert(dm8 === dm1)
+ assert(dm8.isColMajor)
+ assert(dm8.values.equals(dm1.values))
+
+ val dm5 = dm2.toDenseColMajor
+ assert(dm5 === dm2)
+ assert(dm5.isColMajor)
+ assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0))
+
+ val dm4 = dm1.toDenseRowMajor
+ assert(dm4 === dm1)
+ assert(dm4.isRowMajor)
+ assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0))
+
+ val dm6 = dm2.toDenseRowMajor
+ assert(dm6 === dm2)
+ assert(dm6.isRowMajor)
+ assert(dm6.values.equals(dm2.values))
+
+ val dm3 = dm1.toDense
+ assert(dm3 === dm1)
+ assert(dm3.isColMajor)
+ assert(dm3.values.equals(dm1.values))
+
+ val dm9 = dm2.toDense
+ assert(dm9 === dm2)
+ assert(dm9.isRowMajor)
+ assert(dm9.values.equals(dm2.values))
+ }
- val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values)
- val deMat1 = new DenseMatrix(m, n, allValues)
+ test("dense to sparse") {
+ /*
+ dm1 = 0.0 4.0 5.0
+ 0.0 2.0 0.0
+
+ dm2 = 0.0 4.0 5.0
+ 0.0 2.0 0.0
- val spMat2 = deMat1.toSparse
- val deMat2 = spMat1.toDense
+ dm3 = 0.0 0.0 0.0
+ 0.0 0.0 0.0
+ */
+ val dm1 = new DenseMatrix(2, 3, Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0))
+ val dm2 = new DenseMatrix(2, 3, Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0), isTransposed = true)
+ val dm3 = new DenseMatrix(2, 3, Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0))
+
+ val sm1 = dm1.toSparseColMajor
+ assert(sm1 === dm1)
+ assert(sm1.isColMajor)
+ assert(sm1.values === Array(4.0, 2.0, 5.0))
+
+ val sm3 = dm2.toSparseColMajor
+ assert(sm3 === dm2)
+ assert(sm3.isColMajor)
+ assert(sm3.values === Array(4.0, 2.0, 5.0))
+
+ val sm5 = dm3.toSparseColMajor
+ assert(sm5 === dm3)
+ assert(sm5.values === Array.empty[Double])
+ assert(sm5.isColMajor)
+
+ val sm2 = dm1.toSparseRowMajor
+ assert(sm2 === dm1)
+ assert(sm2.isRowMajor)
+ assert(sm2.values === Array(4.0, 5.0, 2.0))
+
+ val sm4 = dm2.toSparseRowMajor
+ assert(sm4 === dm2)
+ assert(sm4.isRowMajor)
+ assert(sm4.values === Array(4.0, 5.0, 2.0))
+
+ val sm6 = dm3.toSparseRowMajor
+ assert(sm6 === dm3)
+ assert(sm6.values === Array.empty[Double])
+ assert(sm6.isRowMajor)
+
+ val sm7 = dm1.toSparse
+ assert(sm7 === dm1)
+ assert(sm7.values === Array(4.0, 2.0, 5.0))
+ assert(sm7.isColMajor)
+
+ val sm10 = dm2.toSparse
+ assert(sm10 === dm2)
+ assert(sm10.values === Array(4.0, 5.0, 2.0))
+ assert(sm10.isRowMajor)
+ }
+
+ test("sparse to sparse") {
+ /*
+ sm1 = sm2 = sm3 = sm4 = 0.0 4.0 5.0
+ 0.0 2.0 0.0
+ smZeros = 0.0 0.0 0.0
+ 0.0 0.0 0.0
+ */
+ val sm1 = new SparseMatrix(2, 3, Array(0, 0, 2, 3), Array(0, 1, 0), Array(4.0, 2.0, 5.0))
+ val sm2 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 1), Array(4.0, 5.0, 2.0),
+ isTransposed = true)
+ val sm3 = new SparseMatrix(2, 3, Array(0, 0, 2, 4), Array(0, 1, 0, 1),
+ Array(4.0, 2.0, 5.0, 0.0))
+ val sm4 = new SparseMatrix(2, 3, Array(0, 2, 4), Array(1, 2, 1, 2),
+ Array(4.0, 5.0, 2.0, 0.0), isTransposed = true)
+ val smZeros = new SparseMatrix(2, 3, Array(0, 2, 4, 6), Array(0, 1, 0, 1, 0, 1),
+ Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0))
+
+ val sm6 = sm1.toSparseColMajor
+ assert(sm6 === sm1)
+ assert(sm6.isColMajor)
+ assert(sm6.values.equals(sm1.values))
+
+ val sm7 = sm2.toSparseColMajor
+ assert(sm7 === sm2)
+ assert(sm7.isColMajor)
+ assert(sm7.values === Array(4.0, 2.0, 5.0))
+
+ val sm16 = sm3.toSparseColMajor
+ assert(sm16 === sm3)
+ assert(sm16.isColMajor)
+ assert(sm16.values === Array(4.0, 2.0, 5.0))
+
+ val sm14 = sm4.toSparseColMajor
+ assert(sm14 === sm4)
+ assert(sm14.values === Array(4.0, 2.0, 5.0))
+ assert(sm14.isColMajor)
+
+ val sm15 = smZeros.toSparseColMajor
+ assert(sm15 === smZeros)
+ assert(sm15.values === Array.empty[Double])
+ assert(sm15.isColMajor)
+
+ val sm5 = sm1.toSparseRowMajor
+ assert(sm5 === sm1)
+ assert(sm5.isRowMajor)
+ assert(sm5.values === Array(4.0, 5.0, 2.0))
+
+ val sm8 = sm2.toSparseRowMajor
+ assert(sm8 === sm2)
+ assert(sm8.isRowMajor)
+ assert(sm8.values.equals(sm2.values))
+
+ val sm10 = sm3.toSparseRowMajor
+ assert(sm10 === sm3)
+ assert(sm10.values === Array(4.0, 5.0, 2.0))
+ assert(sm10.isRowMajor)
+
+ val sm11 = sm4.toSparseRowMajor
+ assert(sm11 === sm4)
+ assert(sm11.values === Array(4.0, 5.0, 2.0))
+ assert(sm11.isRowMajor)
+
+ val sm17 = smZeros.toSparseRowMajor
+ assert(sm17 === smZeros)
+ assert(sm17.values === Array.empty[Double])
+ assert(sm17.isRowMajor)
+
+ val sm9 = sm3.toSparse
+ assert(sm9 === sm3)
+ assert(sm9.values === Array(4.0, 2.0, 5.0))
+ assert(sm9.isColMajor)
+
+ val sm12 = sm4.toSparse
+ assert(sm12 === sm4)
+ assert(sm12.values === Array(4.0, 5.0, 2.0))
+ assert(sm12.isRowMajor)
+
+ val sm13 = smZeros.toSparse
+ assert(sm13 === smZeros)
+ assert(sm13.values === Array.empty[Double])
+ assert(sm13.isColMajor)
+ }
+
+ test("sparse to dense") {
+ /*
+ sm1 = sm2 = 0.0 4.0 5.0
+ 0.0 2.0 0.0
+
+ sm3 = 0.0 0.0 0.0
+ 0.0 0.0 0.0
+ */
+ val sm1 = new SparseMatrix(2, 3, Array(0, 0, 2, 3), Array(0, 1, 0), Array(4.0, 2.0, 5.0))
+ val sm2 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 1), Array(4.0, 5.0, 2.0),
+ isTransposed = true)
+ val sm3 = new SparseMatrix(2, 3, Array(0, 0, 0, 0), Array.empty[Int], Array.empty[Double])
+
+ val dm6 = sm1.toDenseColMajor
+ assert(dm6 === sm1)
+ assert(dm6.isColMajor)
+ assert(dm6.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0))
+
+ val dm7 = sm2.toDenseColMajor
+ assert(dm7 === sm2)
+ assert(dm7.isColMajor)
+ assert(dm7.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0))
+
+ val dm2 = sm1.toDenseRowMajor
+ assert(dm2 === sm1)
+ assert(dm2.isRowMajor)
+ assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0))
+
+ val dm4 = sm2.toDenseRowMajor
+ assert(dm4 === sm2)
+ assert(dm4.isRowMajor)
+ assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0))
+
+ val dm1 = sm1.toDense
+ assert(dm1 === sm1)
+ assert(dm1.isColMajor)
+ assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0))
+
+ val dm3 = sm2.toDense
+ assert(dm3 === sm2)
+ assert(dm3.isRowMajor)
+ assert(dm3.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0))
+
+ val dm5 = sm3.toDense
+ assert(dm5 === sm3)
+ assert(dm5.isColMajor)
+ assert(dm5.values === Array.fill(6)(0.0))
+ }
+
+ test("compressed dense") {
+ /*
+ dm1 = 1.0 0.0 0.0 0.0
+ 1.0 0.0 0.0 0.0
+ 0.0 0.0 0.0 0.0
+
+ dm2 = 1.0 1.0 0.0 0.0
+ 0.0 0.0 0.0 0.0
+ 0.0 0.0 0.0 0.0
+ */
+ // this should compress to a sparse matrix
+ val dm1 = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0))
+
+ // optimal compression layout is row major since numRows < numCols
+ val cm1 = dm1.compressed.asInstanceOf[SparseMatrix]
+ assert(cm1 === dm1)
+ assert(cm1.isRowMajor)
+ assert(cm1.getSizeInBytes < dm1.getSizeInBytes)
+
+ // force compressed column major
+ val cm2 = dm1.compressedColMajor.asInstanceOf[SparseMatrix]
+ assert(cm2 === dm1)
+ assert(cm2.isColMajor)
+ assert(cm2.getSizeInBytes < dm1.getSizeInBytes)
+
+ // optimal compression layout for transpose is column major
+ val dm2 = dm1.transpose
+ val cm3 = dm2.compressed.asInstanceOf[SparseMatrix]
+ assert(cm3 === dm2)
+ assert(cm3.isColMajor)
+ assert(cm3.getSizeInBytes < dm2.getSizeInBytes)
+
+ /*
+ dm3 = 1.0 1.0 1.0 0.0
+ 1.0 1.0 0.0 0.0
+ 1.0 1.0 0.0 0.0
+
+ dm4 = 1.0 1.0 1.0 1.0
+ 1.0 1.0 1.0 0.0
+ 0.0 0.0 0.0 0.0
+ */
+ // this should compress to a dense matrix
+ val dm3 = new DenseMatrix(3, 4, Array.fill(7)(1.0) ++ Array.fill(5)(0.0))
+ val dm4 = new DenseMatrix(3, 4, Array.fill(7)(1.0) ++ Array.fill(5)(0.0), isTransposed = true)
+
+ val cm4 = dm3.compressed.asInstanceOf[DenseMatrix]
+ assert(cm4 === dm3)
+ assert(cm4.isColMajor)
+ assert(cm4.values.equals(dm3.values))
+ assert(cm4.getSizeInBytes === dm3.getSizeInBytes)
+
+ // force compressed row major
+ val cm5 = dm3.compressedRowMajor.asInstanceOf[DenseMatrix]
+ assert(cm5 === dm3)
+ assert(cm5.isRowMajor)
+ assert(cm5.getSizeInBytes === dm3.getSizeInBytes)
+
+ val cm6 = dm4.compressed.asInstanceOf[DenseMatrix]
+ assert(cm6 === dm4)
+ assert(cm6.isRowMajor)
+ assert(cm6.values.equals(dm4.values))
+ assert(cm6.getSizeInBytes === dm4.getSizeInBytes)
+
+ val cm7 = dm4.compressedColMajor.asInstanceOf[DenseMatrix]
+ assert(cm7 === dm4)
+ assert(cm7.isColMajor)
+ assert(cm7.getSizeInBytes === dm4.getSizeInBytes)
+
+ // this has the same size sparse or dense
+ val dm5 = new DenseMatrix(4, 4, Array.fill(7)(1.0) ++ Array.fill(9)(0.0))
+ // should choose dense to break ties
+ val cm8 = dm5.compressed.asInstanceOf[DenseMatrix]
+ assert(cm8.getSizeInBytes === dm5.toSparseColMajor.getSizeInBytes)
+ }
- assert(spMat1.asBreeze === spMat2.asBreeze)
- assert(deMat1.asBreeze === deMat2.asBreeze)
+ test("compressed sparse") {
+ /*
+ sm1 = 0.0 -1.0
+ 0.0 0.0
+ 0.0 0.0
+ 0.0 0.0
+
+ sm2 = 0.0 0.0 0.0 0.0
+ -1.0 0.0 0.0 0.0
+ */
+ // these should compress to sparse matrices
+ val sm1 = new SparseMatrix(4, 2, Array(0, 0, 1), Array(0), Array(-1.0))
+ val sm2 = sm1.transpose
+
+ val cm1 = sm1.compressed.asInstanceOf[SparseMatrix]
+ // optimal is column major
+ assert(cm1 === sm1)
+ assert(cm1.isColMajor)
+ assert(cm1.values.equals(sm1.values))
+ assert(cm1.getSizeInBytes === sm1.getSizeInBytes)
+
+ val cm2 = sm1.compressedRowMajor.asInstanceOf[SparseMatrix]
+ assert(cm2 === sm1)
+ assert(cm2.isRowMajor)
+ // forced to be row major, so we have increased the size
+ assert(cm2.getSizeInBytes > sm1.getSizeInBytes)
+ assert(cm2.getSizeInBytes < sm1.toDense.getSizeInBytes)
+
+ val cm9 = sm1.compressedColMajor.asInstanceOf[SparseMatrix]
+ assert(cm9 === sm1)
+ assert(cm9.values.equals(sm1.values))
+ assert(cm9.getSizeInBytes === sm1.getSizeInBytes)
+
+ val cm3 = sm2.compressed.asInstanceOf[SparseMatrix]
+ assert(cm3 === sm2)
+ assert(cm3.isRowMajor)
+ assert(cm3.values.equals(sm2.values))
+ assert(cm3.getSizeInBytes === sm2.getSizeInBytes)
+
+ val cm8 = sm2.compressedColMajor.asInstanceOf[SparseMatrix]
+ assert(cm8 === sm2)
+ assert(cm8.isColMajor)
+ // forced to be col major, so we have increased the size
+ assert(cm8.getSizeInBytes > sm2.getSizeInBytes)
+ assert(cm8.getSizeInBytes < sm2.toDense.getSizeInBytes)
+
+ val cm10 = sm2.compressedRowMajor.asInstanceOf[SparseMatrix]
+ assert(cm10 === sm2)
+ assert(cm10.isRowMajor)
+ assert(cm10.values.equals(sm2.values))
+ assert(cm10.getSizeInBytes === sm2.getSizeInBytes)
+
+
+ /*
+ sm3 = 0.0 -1.0
+ 2.0 3.0
+ -4.0 9.0
+ */
+ // this should compress to a dense matrix
+ val sm3 = new SparseMatrix(3, 2, Array(0, 2, 5), Array(1, 2, 0, 1, 2),
+ Array(2.0, -4.0, -1.0, 3.0, 9.0))
+
+ // dense is optimal, and maintains column major
+ val cm4 = sm3.compressed.asInstanceOf[DenseMatrix]
+ assert(cm4 === sm3)
+ assert(cm4.isColMajor)
+ assert(cm4.getSizeInBytes < sm3.getSizeInBytes)
+
+ val cm5 = sm3.compressedRowMajor.asInstanceOf[DenseMatrix]
+ assert(cm5 === sm3)
+ assert(cm5.isRowMajor)
+ assert(cm5.getSizeInBytes < sm3.getSizeInBytes)
+
+ val cm11 = sm3.compressedColMajor.asInstanceOf[DenseMatrix]
+ assert(cm11 === sm3)
+ assert(cm11.isColMajor)
+ assert(cm11.getSizeInBytes < sm3.getSizeInBytes)
+
+ /*
+ sm4 = 1.0 0.0 0.0 ...
+
+ sm5 = 1.0
+ 0.0
+ 0.0
+ ...
+ */
+ val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(1.0))
+ val cm6 = sm4.compressed.asInstanceOf[SparseMatrix]
+ assert(cm6 === sm4)
+ assert(cm6.isColMajor)
+ assert(cm6.getSizeInBytes <= sm4.getSizeInBytes)
+
+ val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(1.0),
+ isTransposed = true)
+ val cm7 = sm5.compressed.asInstanceOf[SparseMatrix]
+ assert(cm7 === sm5)
+ assert(cm7.isRowMajor)
+ assert(cm7.getSizeInBytes <= sm5.getSizeInBytes)
+
+ // this has the same size sparse or dense
+ val sm6 = new SparseMatrix(4, 4, Array(0, 4, 7, 7, 7), Array(0, 1, 2, 3, 0, 1, 2),
+ Array.fill(7)(1.0))
+ // should choose dense to break ties
+ val cm12 = sm6.compressed.asInstanceOf[DenseMatrix]
+ assert(cm12.getSizeInBytes === sm6.getSizeInBytes)
}
test("map, update") {
diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala
index ea22c2787f..dfbdaf19d3 100644
--- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala
+++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala
@@ -336,6 +336,11 @@ class VectorsSuite extends SparkMLFunSuite {
val sv1 = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
val sv1c = sv1.compressed.asInstanceOf[DenseVector]
assert(sv1 === sv1c)
+
+ val sv2 = Vectors.sparse(Int.MaxValue, Array(0), Array(3.4))
+ val sv2c = sv2.compressed.asInstanceOf[SparseVector]
+ assert(sv2c === sv2)
+ assert(sv2c.numActives === 1)
}
test("SparseVector.slice") {