aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib/tests.py
diff options
context:
space:
mode:
authorMechCoder <manojkumarsivaraj334@gmail.com>2015-04-09 23:10:13 -0700
committerXiangrui Meng <meng@databricks.com>2015-04-09 23:10:13 -0700
commite2360810f50de77f79d372cc9b46db117d451cfc (patch)
tree7a0d651141c4149fe36dd2276c66f5ab2214185f /python/pyspark/mllib/tests.py
parentb5c51c8df480f1a82a82e4d597d8eea631bffb4e (diff)
downloadspark-e2360810f50de77f79d372cc9b46db117d451cfc.tar.gz
spark-e2360810f50de77f79d372cc9b46db117d451cfc.tar.bz2
spark-e2360810f50de77f79d372cc9b46db117d451cfc.zip
[SPARK-6577] [MLlib] [PySpark] SparseMatrix should be supported in PySpark
Supporting of SparseMatrix in PySpark. Author: MechCoder <manojkumarsivaraj334@gmail.com> Closes #5355 from MechCoder/spark-6577 and squashes the following commits: 7492190 [MechCoder] More readable code for densifying ea2c54b [MechCoder] Check bounds for indexing 454ef2c [MechCoder] Made the following changes 1. Used convert_to_array for array conversion. 2. Used F order for toArray 3. Minor improvements in speed. db76caf [MechCoder] Add support for CSR matrix 29653e7 [MechCoder] Renamed indices to rowIndices and indptr to colPtrs b6384fe [MechCoder] [SPARK-6577] SparseMatrix should be supported in PySpark
Diffstat (limited to 'python/pyspark/mllib/tests.py')
-rw-r--r--python/pyspark/mllib/tests.py52
1 file changed, 50 insertions, 2 deletions
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 61ef398487..3b40158c12 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -24,7 +24,7 @@ import sys
import tempfile
import array as pyarray
-from numpy import array, array_equal
+from numpy import array, array_equal, zeros
from py4j.protocol import Py4JJavaError
if sys.version_info[:2] <= (2, 6):
@@ -38,7 +38,7 @@ else:
from pyspark.mllib.common import _to_java_object_rdd
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
- DenseMatrix, Vectors, Matrices
+ DenseMatrix, SparseMatrix, Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.stat import Statistics
@@ -144,6 +144,54 @@ class VectorTests(PySparkTestCase):
for j in range(2):
self.assertEquals(mat[i, j], expected[i][j])
+ def test_sparse_matrix(self):
+ # Test sparse matrix creation.
+ sm1 = SparseMatrix(
+ 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
+ self.assertEquals(sm1.numRows, 3)
+ self.assertEquals(sm1.numCols, 4)
+ self.assertEquals(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
+ self.assertEquals(sm1.rowIndices.tolist(), [1, 2, 1, 2])
+ self.assertEquals(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
+
+ # Test indexing
+ expected = [
+ [0, 0, 0, 0],
+ [1, 0, 4, 0],
+ [2, 0, 5, 0]]
+
+ for i in range(3):
+ for j in range(4):
+ self.assertEquals(expected[i][j], sm1[i, j])
+ self.assertTrue(array_equal(sm1.toArray(), expected))
+
+ # Test conversion to dense and sparse.
+ smnew = sm1.toDense().toSparse()
+ self.assertEquals(sm1.numRows, smnew.numRows)
+ self.assertEquals(sm1.numCols, smnew.numCols)
+ self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
+ self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
+ self.assertTrue(array_equal(sm1.values, smnew.values))
+
+ sm1t = SparseMatrix(
+ 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
+ isTransposed=True)
+ self.assertEquals(sm1t.numRows, 3)
+ self.assertEquals(sm1t.numCols, 4)
+ self.assertEquals(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
+ self.assertEquals(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
+ self.assertEquals(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])
+
+ expected = [
+ [3, 2, 0, 0],
+ [0, 0, 4, 0],
+ [9, 0, 8, 0]]
+
+ for i in range(3):
+ for j in range(4):
+ self.assertEquals(expected[i][j], sm1t[i, j])
+ self.assertTrue(array_equal(sm1t.toArray(), expected))
+
class ListTests(PySparkTestCase):