From 6c6f32574023b8e43a24f2081ff17e6e446de2f3 Mon Sep 17 00:00:00 2001 From: freeman Date: Mon, 5 Jan 2015 13:10:59 -0800 Subject: [SPARK-5089][PYSPARK][MLLIB] Fix vector convert This is a small change addressing a potentially significant bug in how PySpark + MLlib handles non-float64 numpy arrays. The automatic conversion to `DenseVector` that occurs when passing RDDs to MLlib algorithms in PySpark should automatically upcast to float64s, but currently this wasn't actually happening. As a result, non-float64 would be silently parsed inappropriately during SerDe, yielding erroneous results when running, for example, KMeans. The PR includes the fix, as well as a new test for the correct conversion behavior. davies Author: freeman Closes #3902 from freeman-lab/fix-vector-convert and squashes the following commits: 764db47 [freeman] Add a test for proper conversion behavior 704f97e [freeman] Return array after changing type --- python/pyspark/mllib/linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/pyspark/mllib/linalg.py') diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index f7aa2b0cb0..4f8491f43e 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -178,7 +178,7 @@ class DenseVector(Vector): elif not isinstance(ar, np.ndarray): ar = np.array(ar, dtype=np.float64) if ar.dtype != np.float64: - ar.astype(np.float64) + ar = ar.astype(np.float64) self.array = ar def __reduce__(self): -- cgit v1.2.3