author    Nick Pentreath <nickp@za.ibm.com>    2016-06-30 17:52:15 -0700
committer Joseph K. Bradley <joseph@databricks.com>    2016-06-30 17:52:15 -0700
commit    dab10516138867b7c4fc6d42168497e82853b539 (patch)
tree      ed85182e87f912dcad810d9564cc7db47a7b2727 /python/pyspark
parent    85f2303ecadd9bf6d9694a2743dda075654c5ccf (diff)
[SPARK-16328][ML][MLLIB][PYSPARK] Add 'asML' and 'fromML' conversion methods to PySpark linalg
The move to `ml.linalg` created `asML`/`fromML` utility methods in Scala/Java for converting between representations. These are missing in Python; this PR adds them.

## How was this patch tested?

New doctests.

Author: Nick Pentreath <nickp@za.ibm.com>

Closes #13997 from MLnick/SPARK-16328-python-linalg-convert.
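For illustration, a minimal sketch of the round-trip these methods enable (variable names are my own; the `asML()`/`fromML()` calls and types mirror the new tests in the diff below):

```python
# Sketch of converting between pyspark.mllib.linalg and pyspark.ml.linalg types.
# Runs locally without a SparkContext; names other than the API calls are arbitrary.
from pyspark.mllib.linalg import Vectors as MLlibVectors, Matrices as MLlibMatrices
from pyspark.ml import linalg as newlinalg

# mllib -> ml: asML() wraps the same underlying arrays (references, not a data copy)
mllib_vec = MLlibVectors.sparse(4, {1: 1.0, 3: 5.5})
ml_vec = mllib_vec.asML()                 # pyspark.ml.linalg.SparseVector

# ml -> mllib: Vectors.fromML() / Matrices.fromML() dispatch on the input type
back = MLlibVectors.fromML(ml_vec)        # pyspark.mllib.linalg.SparseVector
assert back == mllib_vec

ml_mat = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
mllib_mat = MLlibMatrices.fromML(ml_mat)  # pyspark.mllib.linalg.DenseMatrix
assert mllib_mat.asML() == ml_mat
```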
Diffstat (limited to 'python/pyspark')
-rw-r--r--  python/pyspark/mllib/linalg/__init__.py | 99
-rw-r--r--  python/pyspark/mllib/tests.py            | 69
2 files changed, 168 insertions(+), 0 deletions(-)
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 3a345b2b56..15dc53a959 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -39,6 +39,7 @@ else:
import numpy as np
from pyspark import since
+from pyspark.ml import linalg as newlinalg
from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
IntegerType, ByteType, BooleanType
@@ -247,6 +248,15 @@ class Vector(object):
"""
raise NotImplementedError
+ def asML(self):
+ """
+ Convert this vector to the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :return: :py:class:`pyspark.ml.linalg.Vector`
+ """
+ raise NotImplementedError
+
class DenseVector(Vector):
"""
@@ -408,6 +418,17 @@ class DenseVector(Vector):
"""
return self.array
+ def asML(self):
+ """
+ Convert this vector to the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :return: :py:class:`pyspark.ml.linalg.DenseVector`
+
+ .. versionadded:: 2.0.0
+ """
+ return newlinalg.DenseVector(self.array)
+
@property
def values(self):
"""
@@ -737,6 +758,17 @@ class SparseVector(Vector):
arr[self.indices] = self.values
return arr
+ def asML(self):
+ """
+ Convert this vector to the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :return: :py:class:`pyspark.ml.linalg.SparseVector`
+
+ .. versionadded:: 2.0.0
+ """
+ return newlinalg.SparseVector(self.size, self.indices, self.values)
+
def __len__(self):
return self.size
@@ -846,6 +878,24 @@ class Vectors(object):
return DenseVector(elements)
@staticmethod
+ def fromML(vec):
+ """
+ Convert a vector from the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :param vec: a :py:class:`pyspark.ml.linalg.Vector`
+ :return: a :py:class:`pyspark.mllib.linalg.Vector`
+
+ .. versionadded:: 2.0.0
+ """
+ if isinstance(vec, newlinalg.DenseVector):
+ return DenseVector(vec.array)
+ elif isinstance(vec, newlinalg.SparseVector):
+ return SparseVector(vec.size, vec.indices, vec.values)
+ else:
+ raise TypeError("Unsupported vector type %s" % type(vec))
+
+ @staticmethod
def stringify(vector):
"""
Converts a vector into a string, which can be recognized by
@@ -945,6 +995,13 @@ class Matrix(object):
"""
raise NotImplementedError
+ def asML(self):
+ """
+ Convert this matrix to the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+ """
+ raise NotImplementedError
+
@staticmethod
def _convert_to_array(array_like, dtype):
"""
@@ -1044,6 +1101,17 @@ class DenseMatrix(Matrix):
return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
+ def asML(self):
+ """
+ Convert this matrix to the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :return: :py:class:`pyspark.ml.linalg.DenseMatrix`
+
+ .. versionadded:: 2.0.0
+ """
+ return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed)
+
def __getitem__(self, indices):
i, j = indices
if i < 0 or i >= self.numRows:
@@ -1216,6 +1284,18 @@ class SparseMatrix(Matrix):
densevals = np.ravel(self.toArray(), order='F')
return DenseMatrix(self.numRows, self.numCols, densevals)
+ def asML(self):
+ """
+ Convert this matrix to the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :return: :py:class:`pyspark.ml.linalg.SparseMatrix`
+
+ .. versionadded:: 2.0.0
+ """
+ return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices,
+ self.values, self.isTransposed)
+
# TODO: More efficient implementation:
def __eq__(self, other):
return np.all(self.toArray() == other.toArray())
@@ -1236,6 +1316,25 @@ class Matrices(object):
"""
return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
+ @staticmethod
+ def fromML(mat):
+ """
+ Convert a matrix from the new mllib-local representation.
+ This does NOT copy the data; it copies references.
+
+ :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
+ :return: a :py:class:`pyspark.mllib.linalg.Matrix`
+
+ .. versionadded:: 2.0.0
+ """
+ if isinstance(mat, newlinalg.DenseMatrix):
+ return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)
+ elif isinstance(mat, newlinalg.SparseMatrix):
+ return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,
+ mat.values, mat.isTransposed)
+ else:
+ raise TypeError("Unsupported matrix type %s" % type(mat))
+
class QRDecomposition(object):
"""
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 74cf7bb8ea..72fa8b5f3d 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -49,6 +49,7 @@ else:
import unittest
from pyspark import SparkContext
+import pyspark.ml.linalg as newlinalg
from pyspark.mllib.common import _to_java_object_rdd
from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
@@ -423,6 +424,74 @@ class VectorTests(MLlibTestCase):
tmp = SparseVector(4, [0, 2], [3, 0])
self.assertEqual(tmp.numNonzeros(), 1)
+ def test_ml_mllib_vector_conversion(self):
+ # to ml
+ # dense
+ mllibDV = Vectors.dense([1, 2, 3])
+ mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
+ mlDV2 = mllibDV.asML()
+ self.assertEqual(mlDV2, mlDV1)
+ # sparse
+ mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
+ mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
+ mlSV2 = mllibSV.asML()
+ self.assertEqual(mlSV2, mlSV1)
+ # from ml
+ # dense
+ mllibDV1 = Vectors.dense([1, 2, 3])
+ mlDV = newlinalg.Vectors.dense([1, 2, 3])
+ mllibDV2 = Vectors.fromML(mlDV)
+ self.assertEqual(mllibDV1, mllibDV2)
+ # sparse
+ mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
+ mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
+ mllibSV2 = Vectors.fromML(mlSV)
+ self.assertEqual(mllibSV1, mllibSV2)
+
+ def test_ml_mllib_matrix_conversion(self):
+ # to ml
+ # dense
+ mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
+ mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
+ mlDM2 = mllibDM.asML()
+ self.assertEqual(mlDM2, mlDM1)
+ # transposed
+ mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
+ mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
+ mlDMt2 = mllibDMt.asML()
+ self.assertEqual(mlDMt2, mlDMt1)
+ # sparse
+ mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+ mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+ mlSM2 = mllibSM.asML()
+ self.assertEqual(mlSM2, mlSM1)
+ # transposed
+ mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+ mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+ mlSMt2 = mllibSMt.asML()
+ self.assertEqual(mlSMt2, mlSMt1)
+ # from ml
+ # dense
+ mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
+ mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
+ mllibDM2 = Matrices.fromML(mlDM)
+ self.assertEqual(mllibDM1, mllibDM2)
+ # transposed
+ mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
+ mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
+ mllibDMt2 = Matrices.fromML(mlDMt)
+ self.assertEqual(mllibDMt1, mllibDMt2)
+ # sparse
+ mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+ mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+ mllibSM2 = Matrices.fromML(mlSM)
+ self.assertEqual(mllibSM1, mllibSM2)
+ # transposed
+ mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+ mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+ mllibSMt2 = Matrices.fromML(mlSMt)
+ self.assertEqual(mllibSMt1, mllibSMt2)
+
class ListTests(MLlibTestCase):