From df360917990ad95dde3c8e016ec42507d1566355 Mon Sep 17 00:00:00 2001
From: Sandeep
Date: Tue, 15 Apr 2014 00:19:43 -0700
Subject: SPARK-1426: Make MLlib work with NumPy versions older than 1.7

Currently it requires NumPy 1.7 due to using the copyto method
(http://docs.scipy.org/doc/numpy/reference/generated/numpy.copyto.html)
for extracting data out of an array. Replace it with a fallback

Author: Sandeep

Closes #391 from techaddict/1426 and squashes the following commits:

d365962 [Sandeep] SPARK-1426: Make MLlib work with NumPy versions older than 1.7 Currently it requires NumPy 1.7 due to using the copyto method (http://docs.scipy.org/doc/numpy/reference/generated/numpy.copyto.html) for extracting data out of an array. Replace it with a fallback
---
 docs/mllib-guide.md              |  9 ++++-----
 docs/python-programming-guide.md |  6 +++---
 python/pyspark/mllib/__init__.py |  6 +++---
 python/pyspark/mllib/_common.py  | 11 ++++++-----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index a5e0cc5080..eff856104c 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -11,7 +11,7 @@ namely, binary classification, regression, clustering and collaborative
 filtering, as well as an underlying gradient descent optimization primitive.
 
 # Available Methods
-The following links provide a detailed explanation of the methods and usage examples for each of them: 
+The following links provide a detailed explanation of the methods and usage examples for each of them:
 
 * Classification and Regression
   * Binary Classification
@@ -33,10 +33,9 @@ The following links provide a detailed explanation of the methods and usage exam
 
 # Dependencies
 MLlib uses the [jblas](https://github.com/mikiobraun/jblas) linear algebra library, which itself
-depends on native Fortran routines. You may need to install the 
+depends on native Fortran routines. You may need to install the
 [gfortran runtime library](https://github.com/mikiobraun/jblas/wiki/Missing-Libraries)
-if it is not already present on your nodes. MLlib will throw a linking error if it cannot 
+if it is not already present on your nodes. MLlib will throw a linking error if it cannot
 detect these libraries automatically.
 
-To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer.
-
+To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 or newer.
diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
index 888631e702..39de603b29 100644
--- a/docs/python-programming-guide.md
+++ b/docs/python-programming-guide.md
@@ -100,8 +100,8 @@ $ MASTER=local[4] ./bin/pyspark
 
 ## IPython
 
-It is also possible to launch PySpark in [IPython](http://ipython.org), the 
-enhanced Python interpreter. PySpark works with IPython 1.0.0 and later. To 
+It is also possible to launch PySpark in [IPython](http://ipython.org), the
+enhanced Python interpreter. PySpark works with IPython 1.0.0 and later. To
 use IPython, set the `IPYTHON` variable to `1` when running `bin/pyspark`:
 
 {% highlight bash %}
@@ -153,7 +153,7 @@ Many of the methods also contain [doctests](http://docs.python.org/2/library/doc
 # Libraries
 
 [MLlib](mllib-guide.html) is also available in PySpark. To use it, you'll need
-[NumPy](http://www.numpy.org) version 1.7 or newer. The [MLlib guide](mllib-guide.html) contains
+[NumPy](http://www.numpy.org) version 1.4 or newer. The [MLlib guide](mllib-guide.html) contains
 some example applications.
 # Where to Go from Here
diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index 538ff26ce7..4149f54931 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -19,8 +19,8 @@
 Python bindings for MLlib.
 """
 
-# MLlib currently needs and NumPy 1.7+, so complain if lower
+# MLlib currently needs and NumPy 1.4+, so complain if lower
 import numpy
-if numpy.version.version < '1.7':
-    raise Exception("MLlib requires NumPy 1.7+")
+if numpy.version.version < '1.4':
+    raise Exception("MLlib requires NumPy 1.4+")
diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py
index 7ef251d24c..e19f5d2aaa 100644
--- a/python/pyspark/mllib/_common.py
+++ b/python/pyspark/mllib/_common.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 
-from numpy import ndarray, copyto, float64, int64, int32, ones, array_equal, array, dot, shape, complex, issubdtype
+from numpy import ndarray, float64, int64, int32, ones, array_equal, array, dot, shape, complex, issubdtype
 from pyspark import SparkContext, RDD
 
 import numpy as np
@@ -72,8 +72,8 @@ def _serialize_double_vector(v):
     header = ndarray(shape=[2], buffer=ba, dtype="int64")
     header[0] = 1
     header[1] = length
-    copyto(ndarray(shape=[length], buffer=ba, offset=16,
-           dtype="float64"), v)
+    arr_mid = ndarray(shape=[length], buffer=ba, offset=16, dtype="float64")
+    arr_mid[...] = v
     return ba
 
 def _deserialize_double_vector(ba):
@@ -112,8 +112,9 @@ def _serialize_double_matrix(m):
         header[0] = 2
         header[1] = rows
         header[2] = cols
-        copyto(ndarray(shape=[rows, cols], buffer=ba, offset=24,
-               dtype="float64", order='C'), m)
+        arr_mid = ndarray(shape=[rows, cols], buffer=ba, offset=24,
+                          dtype="float64", order='C')
+        arr_mid[...] = m
         return ba
     else:
         raise TypeError("_serialize_double_matrix called on a "
-- 
cgit v1.2.3
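
A minimal, self-contained sketch of the fallback the patch adopts: instead of numpy.copyto (added in NumPy 1.7), build an ndarray view over the preallocated bytearray and copy with slice assignment, which older NumPy releases also support. The serialize_vector name and the round-trip check below are illustrative only, not PySpark API.

    # Sketch of the copyto fallback: an ndarray view over a preallocated
    # bytearray plus slice assignment, which works on NumPy releases that
    # predate numpy.copyto (added in 1.7).
    import numpy as np
    from numpy import ndarray, float64

    def serialize_vector(v):
        # Illustrative stand-in for _serialize_double_vector.
        v = np.asarray(v, dtype=float64)
        length = v.shape[0]
        ba = bytearray(16 + 8 * length)   # 16-byte header + 8 bytes per double
        header = ndarray(shape=[2], buffer=ba, dtype="int64")
        header[0] = 1                     # type tag: 1 = dense vector
        header[1] = length
        body = ndarray(shape=[length], buffer=ba, offset=16, dtype="float64")
        body[...] = v                     # fallback for numpy.copyto(body, v)
        return ba

    if __name__ == "__main__":
        ba = serialize_vector([1.0, 2.0, 3.0])
        out = ndarray(shape=[3], buffer=ba, offset=16, dtype="float64")
        assert np.array_equal(out, [1.0, 2.0, 3.0])

On NumPy 1.7 and later the same slice assignment behaves like copyto, so the fallback works across versions rather than only on older releases.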