author     Mike Dusenberry <mwdusenb@us.ibm.com>  2015-08-05 07:40:50 -0700
committer  Xiangrui Meng <meng@databricks.com>    2015-08-05 07:40:50 -0700
commit     34dcf10104460816382908b2b8eeb6c925e862bf (patch)
tree       a4767e939dd4a33e2d3eda23696ffcf03f55c4fa /mllib
parent     519cf6d3f764a977770266784d6902fe205a070f (diff)
[SPARK-6486] [MLLIB] [PYTHON] Add BlockMatrix to PySpark.
mengxr This adds `BlockMatrix` to PySpark. I have the conversions to `IndexedRowMatrix` and `CoordinateMatrix` ready as well, so once PR #7554 is completed (which relies on PR #7746), this PR can be finished.

Author: Mike Dusenberry <mwdusenb@us.ibm.com>

Closes #7761 from dusenberrymw/SPARK-6486_Add_BlockMatrix_to_PySpark and squashes the following commits:

27195c2 [Mike Dusenberry] Adding one more check to _convert_to_matrix_block_tuple, and a few minor documentation changes.
ae50883 [Mike Dusenberry] Minor update: BlockMatrix should inherit from DistributedMatrix.
b8acc1c [Mike Dusenberry] Moving BlockMatrix to pyspark.mllib.linalg.distributed, updating the logic to match that of the other distributed matrices, adding conversions, and adding documentation.
c014002 [Mike Dusenberry] Using properties for better documentation.
3bda6ab [Mike Dusenberry] Adding documentation.
8fb3095 [Mike Dusenberry] Small cleanup.
e17af2e [Mike Dusenberry] Adding BlockMatrix to PySpark.
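For context, a minimal sketch of the Python-side API this commit introduces, assuming an active SparkContext `sc`; a BlockMatrix is built from an RDD of ((blockRowIndex, blockColIndex), sub-matrix) tuples, matching the tuple shape unpacked in the Scala wrapper below:

from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix

# An RDD of ((blockRowIndex, blockColIndex), sub-matrix) tuples.
blocks = sc.parallelize([
    ((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])),
])

# A 6x2 BlockMatrix composed of 3x2 blocks; total dimensions are
# inferred from the blocks when numRows/numCols are not given.
mat = BlockMatrix(blocks, rowsPerBlock=3, colsPerBlock=2)
assert mat.numRows() == 6 and mat.numCols() == 2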
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 25
1 file changed, 25 insertions(+), 0 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index d2b3fae381..f585aacd45 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1129,6 +1129,21 @@ private[python] class PythonMLLibAPI extends Serializable {
}
/**
+ * Wrapper around BlockMatrix constructor.
+ */
+ def createBlockMatrix(blocks: DataFrame, rowsPerBlock: Int, colsPerBlock: Int,
+ numRows: Long, numCols: Long): BlockMatrix = {
+ // We use DataFrames for serialization of sub-matrix blocks from
+ // Python, so map each Row in the DataFrame back to a
+ // ((blockRowIndex, blockColIndex), sub-matrix) tuple.
+ val blockTuples = blocks.map {
+ case Row(Row(blockRowIndex: Long, blockColIndex: Long), subMatrix: Matrix) =>
+ ((blockRowIndex.toInt, blockColIndex.toInt), subMatrix)
+ }
+ new BlockMatrix(blockTuples, rowsPerBlock, colsPerBlock, numRows, numCols)
+ }
+
+ /**
* Return the rows of an IndexedRowMatrix.
*/
def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = {
@@ -1147,6 +1162,16 @@ private[python] class PythonMLLibAPI extends Serializable {
val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext)
sqlContext.createDataFrame(coordinateMatrix.entries)
}
+
+ /**
+ * Return the sub-matrix blocks of a BlockMatrix.
+ */
+ def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = {
+ // We use DataFrames for serialization of sub-matrix blocks to
+ // Python, so return a DataFrame.
+ val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext)
+ sqlContext.createDataFrame(blockMatrix.blocks)
+ }
}
/**
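Reading the blocks back on the Python side goes through the getMatrixBlocks wrapper above, which serializes each sub-matrix block to Python via a DataFrame. A sketch, assuming the `mat` object from the earlier example:

# mat.blocks is the RDD of ((blockRowIndex, blockColIndex), sub-matrix)
# tuples, deserialized from the DataFrame returned by getMatrixBlocks.
for ((i, j), sub) in mat.blocks.collect():
    print(i, j, sub.numRows, sub.numCols)

Using DataFrames as the serialization channel keeps BlockMatrix consistent with how the other distributed matrices (IndexedRowMatrix, CoordinateMatrix) already cross the Python/JVM boundary in this file.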