aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib/tests.py
diff options
context:
space:
mode:
authorMechCoder <manojkumarsivaraj334@gmail.com>2015-04-09 23:10:13 -0700
committerXiangrui Meng <meng@databricks.com>2015-04-09 23:10:13 -0700
commite2360810f50de77f79d372cc9b46db117d451cfc (patch)
tree7a0d651141c4149fe36dd2276c66f5ab2214185f /python/pyspark/mllib/tests.py
parentb5c51c8df480f1a82a82e4d597d8eea631bffb4e (diff)
downloadspark-e2360810f50de77f79d372cc9b46db117d451cfc.tar.gz
spark-e2360810f50de77f79d372cc9b46db117d451cfc.tar.bz2
spark-e2360810f50de77f79d372cc9b46db117d451cfc.zip
[SPARK-6577] [MLlib] [PySpark] SparseMatrix should be supported in PySpark
Supporting of SparseMatrix in PySpark. Author: MechCoder <manojkumarsivaraj334@gmail.com> Closes #5355 from MechCoder/spark-6577 and squashes the following commits: 7492190 [MechCoder] More readable code for densifying ea2c54b [MechCoder] Check bounds for indexing 454ef2c [MechCoder] Made the following changes 1. Used convert_to_array for array conversion. 2. Used F order for toArray 3. Minor improvements in speed. db76caf [MechCoder] Add support for CSR matrix 29653e7 [MechCoder] Renamed indices to rowIndices and indptr to colPtrs b6384fe [MechCoder] [SPARK-6577] SparseMatrix should be supported in PySpark
Diffstat (limited to 'python/pyspark/mllib/tests.py')
-rw-r--r--python/pyspark/mllib/tests.py52
1 file changed, 50 insertions, 2 deletions
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 61ef398487..3b40158c12 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -24,7 +24,7 @@ import sys
import tempfile
import array as pyarray
-from numpy import array, array_equal
+from numpy import array, array_equal, zeros
from py4j.protocol import Py4JJavaError
if sys.version_info[:2] <= (2, 6):
@@ -38,7 +38,7 @@ else:
from pyspark.mllib.common import _to_java_object_rdd
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
- DenseMatrix, Vectors, Matrices
+ DenseMatrix, SparseMatrix, Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.stat import Statistics
@@ -144,6 +144,54 @@ class VectorTests(PySparkTestCase):
for j in range(2):
self.assertEquals(mat[i, j], expected[i][j])
+ def test_sparse_matrix(self):
+ # Test sparse matrix creation.
+ sm1 = SparseMatrix(
+ 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
+ self.assertEquals(sm1.numRows, 3)
+ self.assertEquals(sm1.numCols, 4)
+ self.assertEquals(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
+ self.assertEquals(sm1.rowIndices.tolist(), [1, 2, 1, 2])
+ self.assertEquals(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
+
+ # Test indexing
+ expected = [
+ [0, 0, 0, 0],
+ [1, 0, 4, 0],
+ [2, 0, 5, 0]]
+
+ for i in range(3):
+ for j in range(4):
+ self.assertEquals(expected[i][j], sm1[i, j])
+ self.assertTrue(array_equal(sm1.toArray(), expected))
+
+ # Test conversion to dense and sparse.
+ smnew = sm1.toDense().toSparse()
+ self.assertEquals(sm1.numRows, smnew.numRows)
+ self.assertEquals(sm1.numCols, smnew.numCols)
+ self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
+ self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
+ self.assertTrue(array_equal(sm1.values, smnew.values))
+
+ sm1t = SparseMatrix(
+ 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
+ isTransposed=True)
+ self.assertEquals(sm1t.numRows, 3)
+ self.assertEquals(sm1t.numCols, 4)
+ self.assertEquals(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
+ self.assertEquals(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
+ self.assertEquals(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])
+
+ expected = [
+ [3, 2, 0, 0],
+ [0, 0, 4, 0],
+ [9, 0, 8, 0]]
+
+ for i in range(3):
+ for j in range(4):
+ self.assertEquals(expected[i][j], sm1t[i, j])
+ self.assertTrue(array_equal(sm1t.toArray(), expected))
+
class ListTests(PySparkTestCase):