aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib/linalg.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/mllib/linalg.py')
-rw-r--r--python/pyspark/mllib/linalg.py245
1 files changed, 245 insertions, 0 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
new file mode 100644
index 0000000000..0aa3a51de7
--- /dev/null
+++ b/python/pyspark/mllib/linalg.py
@@ -0,0 +1,245 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+MLlib utilities for linear algebra. For dense vectors, MLlib
+uses the NumPy C{array} type, so you can simply pass NumPy arrays
+around. For sparse vectors, users can construct a L{SparseVector}
+object from MLlib or pass SciPy C{scipy.sparse} column vectors if
+SciPy is available in their environment.
+"""
+
+from numpy import array, array_equal, ndarray, float64, int32
+
+
+class SparseVector(object):
+ """
+ A simple sparse vector class for passing data to MLlib. Users may
+ alternatively pass SciPy's {scipy.sparse} data types.
+ """
+
+ def __init__(self, size, *args):
+ """
+ Create a sparse vector, using either a dictionary, a list of
+ (index, value) pairs, or two separate arrays of indices and
+ values (sorted by index).
+
+ @param size: Size of the vector.
+ @param args: Non-zero entries, as a dictionary, list of tupes,
+ or two sorted lists containing indices and values.
+
+ >>> print SparseVector(4, {1: 1.0, 3: 5.5})
+ [1: 1.0, 3: 5.5]
+ >>> print SparseVector(4, [(1, 1.0), (3, 5.5)])
+ [1: 1.0, 3: 5.5]
+ >>> print SparseVector(4, [1, 3], [1.0, 5.5])
+ [1: 1.0, 3: 5.5]
+ """
+ assert type(size) == int, "first argument must be an int"
+ self.size = size
+ assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
+ if len(args) == 1:
+ pairs = args[0]
+ if type(pairs) == dict:
+ pairs = pairs.items()
+ pairs = sorted(pairs)
+ self.indices = array([p[0] for p in pairs], dtype=int32)
+ self.values = array([p[1] for p in pairs], dtype=float64)
+ else:
+ assert len(args[0]) == len(args[1]), "index and value arrays not same length"
+ self.indices = array(args[0], dtype=int32)
+ self.values = array(args[1], dtype=float64)
+ for i in xrange(len(self.indices) - 1):
+ if self.indices[i] >= self.indices[i + 1]:
+ raise TypeError("indices array must be sorted")
+
+ def dot(self, other):
+ """
+ Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
+
+ >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
+ >>> a.dot(a)
+ 25.0
+ >>> a.dot(array([1., 2., 3., 4.]))
+ 22.0
+ >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
+ >>> a.dot(b)
+ 0.0
+ >>> a.dot(array([[1, 1], [2, 2], [3, 3], [4, 4]]))
+ array([ 22., 22.])
+ """
+ if type(other) == ndarray:
+ if other.ndim == 1:
+ result = 0.0
+ for i in xrange(len(self.indices)):
+ result += self.values[i] * other[self.indices[i]]
+ return result
+ elif other.ndim == 2:
+ results = [self.dot(other[:,i]) for i in xrange(other.shape[1])]
+ return array(results)
+ else:
+ raise Exception("Cannot call dot with %d-dimensional array" % other.ndim)
+ else:
+ result = 0.0
+ i, j = 0, 0
+ while i < len(self.indices) and j < len(other.indices):
+ if self.indices[i] == other.indices[j]:
+ result += self.values[i] * other.values[j]
+ i += 1
+ j += 1
+ elif self.indices[i] < other.indices[j]:
+ i += 1
+ else:
+ j += 1
+ return result
+
+ def squared_distance(self, other):
+ """
+ Squared distance from a SparseVector or 1-dimensional NumPy array.
+
+ >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
+ >>> a.squared_distance(a)
+ 0.0
+ >>> a.squared_distance(array([1., 2., 3., 4.]))
+ 11.0
+ >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
+ >>> a.squared_distance(b)
+ 30.0
+ >>> b.squared_distance(a)
+ 30.0
+ """
+ if type(other) == ndarray:
+ if other.ndim == 1:
+ result = 0.0
+ j = 0 # index into our own array
+ for i in xrange(other.shape[0]):
+ if j < len(self.indices) and self.indices[j] == i:
+ diff = self.values[j] - other[i]
+ result += diff * diff
+ j += 1
+ else:
+ result += other[i] * other[i]
+ return result
+ else:
+ raise Exception("Cannot call squared_distance with %d-dimensional array" %
+ other.ndim)
+ else:
+ result = 0.0
+ i, j = 0, 0
+ while i < len(self.indices) and j < len(other.indices):
+ if self.indices[i] == other.indices[j]:
+ diff = self.values[i] - other.values[j]
+ result += diff * diff
+ i += 1
+ j += 1
+ elif self.indices[i] < other.indices[j]:
+ result += self.values[i] * self.values[i]
+ i += 1
+ else:
+ result += other.values[j] * other.values[j]
+ j += 1
+ while i < len(self.indices):
+ result += self.values[i] * self.values[i]
+ i += 1
+ while j < len(other.indices):
+ result += other.values[j] * other.values[j]
+ j += 1
+ return result
+
+ def __str__(self):
+ inds = self.indices
+ vals = self.values
+ entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))])
+ return "[" + entries + "]"
+
+ def __repr__(self):
+ inds = self.indices
+ vals = self.values
+ entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))])
+ return "SparseVector({0}, {{{1}}})".format(self.size, entries)
+
+ def __eq__(self, other):
+ """
+ Test SparseVectors for equality.
+
+ >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)])
+ >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
+ >>> v1 == v2
+ True
+ >>> v1 != v2
+ False
+ """
+
+ return (isinstance(other, self.__class__)
+ and other.size == self.size
+ and array_equal(other.indices, self.indices)
+ and array_equal(other.values, self.values))
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+
+
+class Vectors(object):
+ """
+ Factory methods for working with vectors. Note that dense vectors
+ are simply represented as NumPy array objects, so there is no need
+ to covert them for use in MLlib. For sparse vectors, the factory
+ methods in this class create an MLlib-compatible type, or users
+ can pass in SciPy's C{scipy.sparse} column vectors.
+ """
+
+ @staticmethod
+ def sparse(size, *args):
+ """
+ Create a sparse vector, using either a dictionary, a list of
+ (index, value) pairs, or two separate arrays of indices and
+ values (sorted by index).
+
+ @param size: Size of the vector.
+ @param args: Non-zero entries, as a dictionary, list of tupes,
+ or two sorted lists containing indices and values.
+
+ >>> print Vectors.sparse(4, {1: 1.0, 3: 5.5})
+ [1: 1.0, 3: 5.5]
+ >>> print Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
+ [1: 1.0, 3: 5.5]
+ >>> print Vectors.sparse(4, [1, 3], [1.0, 5.5])
+ [1: 1.0, 3: 5.5]
+ """
+ return SparseVector(size, *args)
+
+ @staticmethod
+ def dense(elements):
+ """
+ Create a dense vector of 64-bit floats from a Python list. Always
+ returns a NumPy array.
+
+ >>> Vectors.dense([1, 2, 3])
+ array([ 1., 2., 3.])
+ """
+ return array(elements, dtype=float64)
+
+
+def _test():
+ import doctest
+ (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
+ if failure_count:
+ exit(-1)
+
+if __name__ == "__main__":
+ _test()