From 189df165bb7cb8bc8ede48d0e7f8d8b5cd31d299 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 4 Jun 2014 12:56:56 -0700
Subject: [SPARK-1752][MLLIB] Standardize text format for vectors and labeled
 points

We should standardize the text format used to represent vectors and labeled points. The proposed formats are the following:

1. dense vector: `[v0,v1,..]`
2. sparse vector: `(size,[i0,i1],[v0,v1])`
3. labeled point: `(label,vector)`

where "(..)" indicates a tuple and "[...]" indicate an array. `loadLabeledPoints` is added to pyspark's `MLUtils`. I didn't add `loadVectors` to pyspark because `RDD.saveAsTextFile` cannot stringify dense vectors in the proposed format automatically.

`MLUtils#saveLabeledData` and `MLUtils#loadLabeledData` are deprecated. Users should use `RDD#saveAsTextFile` and `MLUtils#loadLabeledPoints` instead. In Scala, `MLUtils#loadLabeledPoints` is compatible with the format used by `MLUtils#loadLabeledData`.

CC: @mateiz, @srowen

Author: Xiangrui Meng <meng@databricks.com>

Closes #685 from mengxr/labeled-io and squashes the following commits:

2d1116a [Xiangrui Meng] make loadLabeledData/saveLabeledData deprecated since 1.0.1
297be75 [Xiangrui Meng] change LabeledPoint.parse to LabeledPointParser.parse to maintain binary compatibility
d6b1473 [Xiangrui Meng] Merge branch 'master' into labeled-io
56746ea [Xiangrui Meng] replace # by .
623a5f0 [Xiangrui Meng] merge master
f06d5ba [Xiangrui Meng] add docs and minor updates
640fe0c [Xiangrui Meng] throw SparkException
5bcfbc4 [Xiangrui Meng] update test to add scientific notations
e86bf38 [Xiangrui Meng] remove NumericTokenizer
050fca4 [Xiangrui Meng] use StringTokenizer
6155b75 [Xiangrui Meng] merge master
f644438 [Xiangrui Meng] remove parse methods based on eval from pyspark
a41675a [Xiangrui Meng] python loadLabeledPoint uses Scala's implementation
ce9a475 [Xiangrui Meng] add deserialize_labeled_point to pyspark with tests
e9fcd49 [Xiangrui Meng] add serializeLabeledPoint and tests
aea4ae3 [Xiangrui Meng] minor updates
810d6df [Xiangrui Meng] update tokenizer/parser implementation
7aac03a [Xiangrui Meng] remove Scala parsers
c1885c1 [Xiangrui Meng] add headers and minor changes
b0c50cb [Xiangrui Meng] add customized parser
d731817 [Xiangrui Meng] style update
63dc396 [Xiangrui Meng] add loadLabeledPoints to pyspark
ea122b5 [Xiangrui Meng] Merge branch 'master' into labeled-io
cd6c78f [Xiangrui Meng] add __str__ and parse to LabeledPoint
a7a178e [Xiangrui Meng] add stringify to pyspark's Vectors
5c2dbfa [Xiangrui Meng] add parse to pyspark's Vectors
7853f88 [Xiangrui Meng] update pyspark's SparseVector.__str__
e761d32 [Xiangrui Meng] make LabelPoint.parse compatible with the dense format used before v1.0 and deprecate loadLabeledData and saveLabeledData
9e63a02 [Xiangrui Meng] add loadVectors and loadLabeledPoints
19aa523 [Xiangrui Meng] update toString and add parsers for Vectors and LabeledPoint
---
 python/pyspark/mllib/linalg.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

(limited to 'python/pyspark/mllib/linalg.py')

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 2766842720..db39ed0acd 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -43,11 +43,11 @@ class SparseVector(object):
                or two sorted lists containing indices and values.
 
         >>> print SparseVector(4, {1: 1.0, 3: 5.5})
-        [1: 1.0, 3: 5.5]
+        (4,[1,3],[1.0,5.5])
         >>> print SparseVector(4, [(1, 1.0), (3, 5.5)])
-        [1: 1.0, 3: 5.5]
+        (4,[1,3],[1.0,5.5])
         >>> print SparseVector(4, [1, 3], [1.0, 5.5])
-        [1: 1.0, 3: 5.5]
+        (4,[1,3],[1.0,5.5])
         """
         self.size = int(size)
         assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
@@ -160,10 +160,9 @@ class SparseVector(object):
             return result
 
     def __str__(self):
-        inds = self.indices
-        vals = self.values
-        entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))])
-        return "[" + entries + "]"
+        inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
+        vals = "[" + ",".join([str(v) for v in self.values]) + "]"
+        return "(" + ",".join((str(self.size), inds, vals)) + ")"
 
     def __repr__(self):
         inds = self.indices
@@ -213,11 +212,11 @@ class Vectors(object):
                      or two sorted lists containing indices and values.
 
         >>> print Vectors.sparse(4, {1: 1.0, 3: 5.5})
-        [1: 1.0, 3: 5.5]
+        (4,[1,3],[1.0,5.5])
         >>> print Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
-        [1: 1.0, 3: 5.5]
+        (4,[1,3],[1.0,5.5])
         >>> print Vectors.sparse(4, [1, 3], [1.0, 5.5])
-        [1: 1.0, 3: 5.5]
+        (4,[1,3],[1.0,5.5])
         """
         return SparseVector(size, *args)
 
@@ -232,6 +231,21 @@ class Vectors(object):
         """
         return array(elements, dtype=float64)
 
+    @staticmethod
+    def stringify(vector):
+        """
+        Converts a vector into a string, which can be recognized by
+        Vectors.parse().
+
+        >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0]))
+        '(2,[1],[1.0])'
+        >>> Vectors.stringify(Vectors.dense([0.0, 1.0]))
+        '[0.0,1.0]'
+        """
+        if type(vector) == SparseVector:
+            return str(vector)
+        else:
+            return "[" + ",".join([str(v) for v in vector]) + "]"
 
 def _test():
     import doctest
-- 
cgit v1.2.3