author     zero323 <zero323@users.noreply.github.com>    2016-10-03 17:57:54 -0700
committer  Joseph K. Bradley <joseph@databricks.com>     2016-10-03 17:57:54 -0700
commit     d8399b600cef706c22d381b01fab19c610db439a (patch)
tree       55a0a1ab872fd290d60e156fc3920bb04924327a
parent     1f31bdaef670dd43999613deae3620f4ddcd1fbf (diff)
[SPARK-17587][PYTHON][MLLIB] SparseVector __getitem__ should follow __getitem__ contract
## What changes were proposed in this pull request?

Replaces `ValueError` with `IndexError` when the index passed to `ml` / `mllib` `SparseVector.__getitem__` is out of range. This ensures correct iteration behavior. Also replaces `ValueError` with `IndexError` for `DenseMatrix` and `SparseMatrix` in `ml` / `mllib`.

## How was this patch tested?

PySpark `ml` / `mllib` unit tests, plus additional unit tests proving that the problem has been resolved.

Author: zero323 <zero323@users.noreply.github.com>

Closes #15144 from zero323/SPARK-17587.
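For context, here is a minimal standalone sketch (hypothetical `BadVector` / `GoodVector` classes, not PySpark code) of the contract being fixed: Python's fallback iteration protocol calls `__getitem__` with 0, 1, 2, ... and treats `IndexError` as the end of the sequence, while any other exception, including `ValueError`, propagates and aborts the loop.

```python
class BadVector(object):
    """Raises ValueError past the end, as SparseVector did before this patch."""
    def __init__(self, values):
        self._values = values

    def __getitem__(self, index):
        if index >= len(self._values):
            raise ValueError("Index %d out of bounds." % index)
        return self._values[index]


class GoodVector(BadVector):
    """Raises IndexError past the end, as the sequence protocol expects."""
    def __getitem__(self, index):
        if index >= len(self._values):
            raise IndexError("Index %d out of bounds." % index)
        return self._values[index]


# Iteration stops cleanly when __getitem__ raises IndexError.
print(list(GoodVector([1.0, 0.0, 2.0])))   # [1.0, 0.0, 2.0]

# The ValueError variant does not terminate iteration; it escapes instead.
# list(BadVector([1.0, 0.0, 2.0]))         # raises ValueError mid-iteration
```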
-rw-r--r--  python/pyspark/ml/linalg/__init__.py     10
-rwxr-xr-x  python/pyspark/ml/tests.py               16
-rw-r--r--  python/pyspark/mllib/linalg/__init__.py  10
-rw-r--r--  python/pyspark/mllib/tests.py            16

4 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index 05c0ac862f..a5df727fdb 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -713,7 +713,7 @@ class SparseVector(Vector):
"Indices must be of type integer, got type %s" % type(index))
if index >= self.size or index < -self.size:
- raise ValueError("Index %d out of bounds." % index)
+ raise IndexError("Index %d out of bounds." % index)
if index < 0:
index += self.size
@@ -960,10 +960,10 @@ class DenseMatrix(Matrix):
def __getitem__(self, indices):
i, j = indices
if i < 0 or i >= self.numRows:
- raise ValueError("Row index %d is out of range [0, %d)"
+ raise IndexError("Row index %d is out of range [0, %d)"
% (i, self.numRows))
if j >= self.numCols or j < 0:
- raise ValueError("Column index %d is out of range [0, %d)"
+ raise IndexError("Column index %d is out of range [0, %d)"
% (j, self.numCols))
if self.isTransposed:
@@ -1090,10 +1090,10 @@ class SparseMatrix(Matrix):
def __getitem__(self, indices):
i, j = indices
if i < 0 or i >= self.numRows:
- raise ValueError("Row index %d is out of range [0, %d)"
+ raise IndexError("Row index %d is out of range [0, %d)"
% (i, self.numRows))
if j < 0 or j >= self.numCols:
- raise ValueError("Column index %d is out of range [0, %d)"
+ raise IndexError("Column index %d is out of range [0, %d)"
% (j, self.numCols))
# If a CSR matrix is given, then the row index should be searched
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 6886ed321e..e233549850 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -1316,7 +1316,7 @@ class VectorTests(MLlibTestCase):
self.assertEqual(sv[-3], 0.)
self.assertEqual(sv[-5], 0.)
for ind in [5, -6]:
- self.assertRaises(ValueError, sv.__getitem__, ind)
+ self.assertRaises(IndexError, sv.__getitem__, ind)
for ind in [7.8, '1']:
self.assertRaises(TypeError, sv.__getitem__, ind)
@@ -1324,11 +1324,15 @@ class VectorTests(MLlibTestCase):
self.assertEqual(zeros[0], 0.0)
self.assertEqual(zeros[3], 0.0)
for ind in [4, -5]:
- self.assertRaises(ValueError, zeros.__getitem__, ind)
+ self.assertRaises(IndexError, zeros.__getitem__, ind)
empty = SparseVector(0, {})
for ind in [-1, 0, 1]:
- self.assertRaises(ValueError, empty.__getitem__, ind)
+ self.assertRaises(IndexError, empty.__getitem__, ind)
+
+ def test_sparse_vector_iteration(self):
+ self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
+ self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0])
def test_matrix_indexing(self):
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
@@ -1337,6 +1341,9 @@ class VectorTests(MLlibTestCase):
for j in range(2):
self.assertEqual(mat[i, j], expected[i][j])
+ for i, j in [(-1, 0), (4, 1), (3, 4)]:
+ self.assertRaises(IndexError, mat.__getitem__, (i, j))
+
def test_repr_dense_matrix(self):
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
self.assertTrue(
@@ -1408,6 +1415,9 @@ class VectorTests(MLlibTestCase):
self.assertEqual(expected[i][j], sm1[i, j])
self.assertTrue(array_equal(sm1.toArray(), expected))
+ for i, j in [(-1, 1), (4, 3), (3, 5)]:
+ self.assertRaises(IndexError, sm1.__getitem__, (i, j))
+
# Test conversion to dense and sparse.
smnew = sm1.toDense().toSparse()
self.assertEqual(sm1.numRows, smnew.numRows)
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 9672dbde82..d37e715c8d 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -802,7 +802,7 @@ class SparseVector(Vector):
"Indices must be of type integer, got type %s" % type(index))
if index >= self.size or index < -self.size:
- raise ValueError("Index %d out of bounds." % index)
+ raise IndexError("Index %d out of bounds." % index)
if index < 0:
index += self.size
@@ -1115,10 +1115,10 @@ class DenseMatrix(Matrix):
def __getitem__(self, indices):
i, j = indices
if i < 0 or i >= self.numRows:
- raise ValueError("Row index %d is out of range [0, %d)"
+ raise IndexError("Row index %d is out of range [0, %d)"
% (i, self.numRows))
if j >= self.numCols or j < 0:
- raise ValueError("Column index %d is out of range [0, %d)"
+ raise IndexError("Column index %d is out of range [0, %d)"
% (j, self.numCols))
if self.isTransposed:
@@ -1245,10 +1245,10 @@ class SparseMatrix(Matrix):
def __getitem__(self, indices):
i, j = indices
if i < 0 or i >= self.numRows:
- raise ValueError("Row index %d is out of range [0, %d)"
+ raise IndexError("Row index %d is out of range [0, %d)"
% (i, self.numRows))
if j < 0 or j >= self.numCols:
- raise ValueError("Column index %d is out of range [0, %d)"
+ raise IndexError("Column index %d is out of range [0, %d)"
% (j, self.numCols))
# If a CSR matrix is given, then the row index should be searched
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 3f3dfd186c..c519883cdd 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -260,7 +260,7 @@ class VectorTests(MLlibTestCase):
self.assertEqual(sv[-3], 0.)
self.assertEqual(sv[-5], 0.)
for ind in [5, -6]:
- self.assertRaises(ValueError, sv.__getitem__, ind)
+ self.assertRaises(IndexError, sv.__getitem__, ind)
for ind in [7.8, '1']:
self.assertRaises(TypeError, sv.__getitem__, ind)
@@ -268,11 +268,15 @@ class VectorTests(MLlibTestCase):
self.assertEqual(zeros[0], 0.0)
self.assertEqual(zeros[3], 0.0)
for ind in [4, -5]:
- self.assertRaises(ValueError, zeros.__getitem__, ind)
+ self.assertRaises(IndexError, zeros.__getitem__, ind)
empty = SparseVector(0, {})
for ind in [-1, 0, 1]:
- self.assertRaises(ValueError, empty.__getitem__, ind)
+ self.assertRaises(IndexError, empty.__getitem__, ind)
+
+ def test_sparse_vector_iteration(self):
+ self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
+ self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0])
def test_matrix_indexing(self):
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
@@ -281,6 +285,9 @@ class VectorTests(MLlibTestCase):
for j in range(2):
self.assertEqual(mat[i, j], expected[i][j])
+ for i, j in [(-1, 0), (4, 1), (3, 4)]:
+ self.assertRaises(IndexError, mat.__getitem__, (i, j))
+
def test_repr_dense_matrix(self):
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
self.assertTrue(
@@ -352,6 +359,9 @@ class VectorTests(MLlibTestCase):
self.assertEqual(expected[i][j], sm1[i, j])
self.assertTrue(array_equal(sm1.toArray(), expected))
+ for i, j in [(-1, 1), (4, 3), (3, 5)]:
+ self.assertRaises(IndexError, sm1.__getitem__, (i, j))
+
# Test conversion to dense and sparse.
smnew = sm1.toDense().toSparse()
self.assertEqual(sm1.numRows, smnew.numRows)
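A brief usage sketch of the post-patch behavior, mirroring the new unit tests above (assumes a PySpark build that includes this change):

```python
from pyspark.ml.linalg import SparseVector

sv = SparseVector(5, [0, 3], [1.0, 2.0])

# Iteration now terminates cleanly, because __getitem__ signals the end
# of the vector with IndexError as the sequence protocol expects.
print(list(sv))          # [1.0, 0.0, 0.0, 2.0, 0.0]

# Out-of-range access raises IndexError instead of ValueError.
try:
    sv[5]
except IndexError as e:
    print("IndexError:", e)
```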