author     Mike Dusenberry <mwdusenb@us.ibm.com>  2015-08-05 07:40:50 -0700
committer  Xiangrui Meng <meng@databricks.com>    2015-08-05 07:40:50 -0700
commit     34dcf10104460816382908b2b8eeb6c925e862bf (patch)
tree       a4767e939dd4a33e2d3eda23696ffcf03f55c4fa /mllib
parent     519cf6d3f764a977770266784d6902fe205a070f (diff)
[SPARK-6486] [MLLIB] [PYTHON] Add BlockMatrix to PySpark.
mengxr This adds `BlockMatrix` to PySpark. I have the conversions to `IndexedRowMatrix` and `CoordinateMatrix` ready as well, so once PR #7554 is completed (which relies on PR #7746), this PR can be finished.

Author: Mike Dusenberry <mwdusenb@us.ibm.com>

Closes #7761 from dusenberrymw/SPARK-6486_Add_BlockMatrix_to_PySpark and squashes the following commits:

27195c2 [Mike Dusenberry] Adding one more check to _convert_to_matrix_block_tuple, and a few minor documentation changes.
ae50883 [Mike Dusenberry] Minor update: BlockMatrix should inherit from DistributedMatrix.
b8acc1c [Mike Dusenberry] Moving BlockMatrix to pyspark.mllib.linalg.distributed, updating the logic to match that of the other distributed matrices, adding conversions, and adding documentation.
c014002 [Mike Dusenberry] Using properties for better documentation.
3bda6ab [Mike Dusenberry] Adding documentation.
8fb3095 [Mike Dusenberry] Small cleanup.
e17af2e [Mike Dusenberry] Adding BlockMatrix to PySpark.
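For context, a minimal sketch of the Python-side API this commit introduces, assuming an active SparkContext `sc`; a BlockMatrix is built from an RDD of ((blockRowIndex, blockColIndex), sub-matrix) tuples, matching the tuple shape unpacked in the Scala wrapper below:

from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix

# An RDD of ((blockRowIndex, blockColIndex), sub-matrix) tuples.
blocks = sc.parallelize([
    ((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])),
])

# A 6x2 BlockMatrix composed of 3x2 blocks; total dimensions are
# inferred from the blocks when numRows/numCols are not given.
mat = BlockMatrix(blocks, rowsPerBlock=3, colsPerBlock=2)
assert mat.numRows() == 6 and mat.numCols() == 2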
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 25
1 file changed, 25 insertions(+), 0 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index d2b3fae381..f585aacd45 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1129,6 +1129,21 @@ private[python] class PythonMLLibAPI extends Serializable {
}
/**
+ * Wrapper around BlockMatrix constructor.
+ */
+ def createBlockMatrix(blocks: DataFrame, rowsPerBlock: Int, colsPerBlock: Int,
+ numRows: Long, numCols: Long): BlockMatrix = {
+ // We use DataFrames for serialization of sub-matrix blocks from
+ // Python, so map each Row in the DataFrame back to a
+ // ((blockRowIndex, blockColIndex), sub-matrix) tuple.
+ val blockTuples = blocks.map {
+ case Row(Row(blockRowIndex: Long, blockColIndex: Long), subMatrix: Matrix) =>
+ ((blockRowIndex.toInt, blockColIndex.toInt), subMatrix)
+ }
+ new BlockMatrix(blockTuples, rowsPerBlock, colsPerBlock, numRows, numCols)
+ }
+
+ /**
* Return the rows of an IndexedRowMatrix.
*/
def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = {
@@ -1147,6 +1162,16 @@ private[python] class PythonMLLibAPI extends Serializable {
val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext)
sqlContext.createDataFrame(coordinateMatrix.entries)
}
+
+ /**
+ * Return the sub-matrix blocks of a BlockMatrix.
+ */
+ def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = {
+ // We use DataFrames for serialization of sub-matrix blocks to
+ // Python, so return a DataFrame.
+ val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext)
+ sqlContext.createDataFrame(blockMatrix.blocks)
+ }
}
/**
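Reading the blocks back on the Python side goes through the getMatrixBlocks wrapper above, which serializes each sub-matrix block to Python via a DataFrame. A sketch, assuming the `mat` object from the earlier example:

# mat.blocks is the RDD of ((blockRowIndex, blockColIndex), sub-matrix)
# tuples, deserialized from the DataFrame returned by getMatrixBlocks.
for ((i, j), sub) in mat.blocks.collect():
    print(i, j, sub.numRows, sub.numCols)

Using DataFrames as the serialization channel keeps BlockMatrix consistent with how the other distributed matrices (IndexedRowMatrix, CoordinateMatrix) already cross the Python/JVM boundary in this file.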