aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
authorBurak Yavuz <brkyvz@gmail.com>2015-01-28 10:06:37 -0800
committerXiangrui Meng <meng@databricks.com>2015-01-28 10:06:37 -0800
commiteeb53bf90e93b298eff48387d2e9ad699b52d001 (patch)
treed73dadc6186e24e0f93efe4c3fc71facc898c8d4 /mllib/src/test
parent622ff09d036b40caa4c177508e8a948beccfd88f (diff)
downloadspark-eeb53bf90e93b298eff48387d2e9ad699b52d001.tar.gz
spark-eeb53bf90e93b298eff48387d2e9ad699b52d001.tar.bz2
spark-eeb53bf90e93b298eff48387d2e9ad699b52d001.zip
[SPARK-3974][MLlib] Distributed Block Matrix Abstractions
This pull request includes the abstractions for the distributed BlockMatrix representation. `BlockMatrix` will allow users to store very large matrices in small blocks of local matrices. Specific partitioners, such as `RowBasedPartitioner` and `ColumnBasedPartitioner`, are implemented in order to optimize addition and multiplication operations that will be added in a following PR. This work is based on the ml-matrix repo developed at the AMPLab at UC Berkeley, CA. https://github.com/amplab/ml-matrix Additional thanks to rezazadeh, shivaram, and mengxr for guidance on the design. Author: Burak Yavuz <brkyvz@gmail.com> Author: Xiangrui Meng <meng@databricks.com> Author: Burak Yavuz <brkyvz@dn51t42l.sunet> Author: Burak Yavuz <brkyvz@dn51t4rd.sunet> Author: Burak Yavuz <brkyvz@dn0a221430.sunet> Closes #3200 from brkyvz/SPARK-3974 and squashes the following commits: a8eace2 [Burak Yavuz] Merge pull request #2 from mengxr/brkyvz-SPARK-3974 feb32a7 [Xiangrui Meng] update tests e1d3ee8 [Xiangrui Meng] minor updates 24ec7b8 [Xiangrui Meng] update grid partitioner 5eecd48 [Burak Yavuz] fixed gridPartitioner and added tests 140f20e [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into SPARK-3974 1694c9e [Burak Yavuz] almost finished addressing comments f9d664b [Burak Yavuz] updated API and modified partitioning scheme eebbdf7 [Burak Yavuz] preliminary changes addressing code review 1a63b20 [Burak Yavuz] [SPARK-3974] Remove setPartition method. Isn't required 1e8bb2a [Burak Yavuz] [SPARK-3974] Change return type of cache and persist 239ab4b [Burak Yavuz] [SPARK-3974] Addressed @jkbradley's comments ba414d2 [Burak Yavuz] [SPARK-3974] fixed frobenius norm ab6cde0 [Burak Yavuz] [SPARK-3974] Modifications cleaning code up, making size calculation more robust 9ae85aa [Burak Yavuz] [SPARK-3974] Made partitioner a variable inside BlockMatrix instead of a constructor variable d033861 [Burak Yavuz] [SPARK-3974] Removed SubMatrixInfo and added constructor without partitioner 49b9586 [Burak Yavuz] [SPARK-3974] Updated testing utils from master 645afbe [Burak Yavuz] [SPARK-3974] Pull latest master b05aabb [Burak Yavuz] [SPARK-3974] Updated tests to reflect changes 19c17e8 [Burak Yavuz] [SPARK-3974] Changed blockIdRow and blockIdCol 589fbb6 [Burak Yavuz] [SPARK-3974] Code review feedback addressed aa8f086 [Burak Yavuz] [SPARK-3974] Additional comments added f378e16 [Burak Yavuz] [SPARK-3974] Block Matrix Abstractions ready b693209 [Burak Yavuz] Ready for Pull request
Diffstat (limited to 'mllib/src/test')
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala135
1 files changed, 135 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
new file mode 100644
index 0000000000..05efbc8e8d
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg.distributed
+
+import scala.util.Random
+
+import breeze.linalg.{DenseMatrix => BDM}
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
+
+ val m = 5
+ val n = 4
+ val rowPerPart = 2
+ val colPerPart = 2
+ val numPartitions = 3
+ var gridBasedMat: BlockMatrix = _
+
+ override def beforeAll() {
+ super.beforeAll()
+
+ val blocks: Seq[((Int, Int), Matrix)] = Seq(
+ ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))),
+ ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))),
+ ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))),
+ ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))),
+ ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0))))
+
+ gridBasedMat = new BlockMatrix(sc.parallelize(blocks, numPartitions), rowPerPart, colPerPart)
+ }
+
+ test("size") {
+ assert(gridBasedMat.numRows() === m)
+ assert(gridBasedMat.numCols() === n)
+ }
+
+ test("grid partitioner") {
+ val random = new Random()
+ // This should generate a 4x4 grid of 1x2 blocks.
+ val part0 = GridPartitioner(4, 7, suggestedNumPartitions = 12)
+ val expected0 = Array(
+ Array(0, 0, 4, 4, 8, 8, 12),
+ Array(1, 1, 5, 5, 9, 9, 13),
+ Array(2, 2, 6, 6, 10, 10, 14),
+ Array(3, 3, 7, 7, 11, 11, 15))
+ for (i <- 0 until 4; j <- 0 until 7) {
+ assert(part0.getPartition((i, j)) === expected0(i)(j))
+ assert(part0.getPartition((i, j, random.nextInt())) === expected0(i)(j))
+ }
+
+ intercept[IllegalArgumentException] {
+ part0.getPartition((-1, 0))
+ }
+
+ intercept[IllegalArgumentException] {
+ part0.getPartition((4, 0))
+ }
+
+ intercept[IllegalArgumentException] {
+ part0.getPartition((0, -1))
+ }
+
+ intercept[IllegalArgumentException] {
+ part0.getPartition((0, 7))
+ }
+
+ val part1 = GridPartitioner(2, 2, suggestedNumPartitions = 5)
+ val expected1 = Array(
+ Array(0, 2),
+ Array(1, 3))
+ for (i <- 0 until 2; j <- 0 until 2) {
+ assert(part1.getPartition((i, j)) === expected1(i)(j))
+ assert(part1.getPartition((i, j, random.nextInt())) === expected1(i)(j))
+ }
+
+ val part2 = GridPartitioner(2, 2, suggestedNumPartitions = 5)
+ assert(part0 !== part2)
+ assert(part1 === part2)
+
+ val part3 = new GridPartitioner(2, 3, rowsPerPart = 1, colsPerPart = 2)
+ val expected3 = Array(
+ Array(0, 0, 2),
+ Array(1, 1, 3))
+ for (i <- 0 until 2; j <- 0 until 3) {
+ assert(part3.getPartition((i, j)) === expected3(i)(j))
+ assert(part3.getPartition((i, j, random.nextInt())) === expected3(i)(j))
+ }
+
+ val part4 = GridPartitioner(2, 3, rowsPerPart = 1, colsPerPart = 2)
+ assert(part3 === part4)
+
+ intercept[IllegalArgumentException] {
+ new GridPartitioner(2, 2, rowsPerPart = 0, colsPerPart = 1)
+ }
+
+ intercept[IllegalArgumentException] {
+ GridPartitioner(2, 2, rowsPerPart = 1, colsPerPart = 0)
+ }
+
+ intercept[IllegalArgumentException] {
+ GridPartitioner(2, 2, suggestedNumPartitions = 0)
+ }
+ }
+
+ test("toBreeze and toLocalMatrix") {
+ val expected = BDM(
+ (1.0, 0.0, 0.0, 0.0),
+ (0.0, 2.0, 1.0, 0.0),
+ (3.0, 1.0, 1.0, 0.0),
+ (0.0, 1.0, 2.0, 1.0),
+ (0.0, 0.0, 1.0, 5.0))
+
+ val dense = Matrices.fromBreeze(expected).asInstanceOf[DenseMatrix]
+ assert(gridBasedMat.toLocalMatrix() === dense)
+ assert(gridBasedMat.toBreeze() === expected)
+ }
+}