diff options
author | Davies Liu <davies@databricks.com> | 2014-11-24 16:37:14 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-11-24 16:37:14 -0800 |
commit | b660de7a9cbdea3df4a37fbcf60c1c33c71782b8 (patch) | |
tree | 34bec7b4e6789d63846f7519e1c9d51351033b61 /python/pyspark/mllib/tests.py | |
parent | cb0e9b0980f38befe88bf52aa037fe33262730f7 (diff) | |
download | spark-b660de7a9cbdea3df4a37fbcf60c1c33c71782b8.tar.gz spark-b660de7a9cbdea3df4a37fbcf60c1c33c71782b8.tar.bz2 spark-b660de7a9cbdea3df4a37fbcf60c1c33c71782b8.zip |
[SPARK-4562] [MLlib] speedup vector
This PR changes the underlying array of DenseVector to numpy.ndarray to avoid the conversion, because most users will be using numpy.array.
It also improves the serialization of DenseVector.
Before this change:
trial | trainingTime | testTime
-------|--------|--------
0 | 5.126 | 1.786
1 |2.698 |1.693
After the change:
trial | trainingTime | testTime
-------|--------|--------
0 |4.692 |0.554
1 |2.307 |0.525
This could partially fix the performance regression during tests.
Author: Davies Liu <davies@databricks.com>
Closes #3420 from davies/ser2 and squashes the following commits:
0e1e6f3 [Davies Liu] fix tests
426f5db [Davies Liu] impove toArray()
44707ec [Davies Liu] add name for ISO-8859-1
fa7d791 [Davies Liu] address comments
1cfb137 [Davies Liu] handle zero sparse vector
2548ee2 [Davies Liu] fix tests
9e6389d [Davies Liu] bugfix
470f702 [Davies Liu] speed up DenseMatrix
f0d3c40 [Davies Liu] speedup SparseVector
ef6ce70 [Davies Liu] speed up dense vector
Diffstat (limited to 'python/pyspark/mllib/tests.py')
-rw-r--r-- | python/pyspark/mllib/tests.py | 6 |
1 files changed, 5 insertions, 1 deletions
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 9fa4d6f6a2..8332f8e061 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -33,7 +33,8 @@ if sys.version_info[:2] <= (2, 6): else: import unittest -from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector +from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\ + DenseMatrix from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.random import RandomRDDs from pyspark.mllib.stat import Statistics @@ -62,6 +63,7 @@ def _squared_distance(a, b): class VectorTests(PySparkTestCase): def _test_serialize(self, v): + self.assertEqual(v, ser.loads(ser.dumps(v))) jvec = self.sc._jvm.SerDe.loads(bytearray(ser.dumps(v))) nv = ser.loads(str(self.sc._jvm.SerDe.dumps(jvec))) self.assertEqual(v, nv) @@ -75,6 +77,8 @@ class VectorTests(PySparkTestCase): self._test_serialize(DenseVector(array([1., 2., 3., 4.]))) self._test_serialize(DenseVector(pyarray.array('d', range(10)))) self._test_serialize(SparseVector(4, {1: 1, 3: 2})) + self._test_serialize(SparseVector(3, {})) + self._test_serialize(DenseMatrix(2, 3, range(6))) def test_dot(self): sv = SparseVector(4, {1: 1, 3: 2}) |