From 189df165bb7cb8bc8ede48d0e7f8d8b5cd31d299 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 4 Jun 2014 12:56:56 -0700 Subject: [SPARK-1752][MLLIB] Standardize text format for vectors and labeled points We should standardize the text format used to represent vectors and labeled points. The proposed formats are the following: 1. dense vector: `[v0,v1,..]` 2. sparse vector: `(size,[i0,i1],[v0,v1])` 3. labeled point: `(label,vector)` where "(..)" indicates a tuple and "[...]" indicate an array. `loadLabeledPoints` is added to pyspark's `MLUtils`. I didn't add `loadVectors` to pyspark because `RDD.saveAsTextFile` cannot stringify dense vectors in the proposed format automatically. `MLUtils#saveLabeledData` and `MLUtils#loadLabeledData` are deprecated. Users should use `RDD#saveAsTextFile` and `MLUtils#loadLabeledPoints` instead. In Scala, `MLUtils#loadLabeledPoints` is compatible with the format used by `MLUtils#loadLabeledData`. CC: @mateiz, @srowen Author: Xiangrui Meng Closes #685 from mengxr/labeled-io and squashes the following commits: 2d1116a [Xiangrui Meng] make loadLabeledData/saveLabeledData deprecated since 1.0.1 297be75 [Xiangrui Meng] change LabeledPoint.parse to LabeledPointParser.parse to maintain binary compatibility d6b1473 [Xiangrui Meng] Merge branch 'master' into labeled-io 56746ea [Xiangrui Meng] replace # by . 623a5f0 [Xiangrui Meng] merge master f06d5ba [Xiangrui Meng] add docs and minor updates 640fe0c [Xiangrui Meng] throw SparkException 5bcfbc4 [Xiangrui Meng] update test to add scientific notations e86bf38 [Xiangrui Meng] remove NumericTokenizer 050fca4 [Xiangrui Meng] use StringTokenizer 6155b75 [Xiangrui Meng] merge master f644438 [Xiangrui Meng] remove parse methods based on eval from pyspark a41675a [Xiangrui Meng] python loadLabeledPoint uses Scala's implementation ce9a475 [Xiangrui Meng] add deserialize_labeled_point to pyspark with tests e9fcd49 [Xiangrui Meng] add serializeLabeledPoint and tests aea4ae3 [Xiangrui Meng] minor updates 810d6df [Xiangrui Meng] update tokenizer/parser implementation 7aac03a [Xiangrui Meng] remove Scala parsers c1885c1 [Xiangrui Meng] add headers and minor changes b0c50cb [Xiangrui Meng] add customized parser d731817 [Xiangrui Meng] style update 63dc396 [Xiangrui Meng] add loadLabeledPoints to pyspark ea122b5 [Xiangrui Meng] Merge branch 'master' into labeled-io cd6c78f [Xiangrui Meng] add __str__ and parse to LabeledPoint a7a178e [Xiangrui Meng] add stringify to pyspark's Vectors 5c2dbfa [Xiangrui Meng] add parse to pyspark's Vectors 7853f88 [Xiangrui Meng] update pyspark's SparseVector.__str__ e761d32 [Xiangrui Meng] make LabelPoint.parse compatible with the dense format used before v1.0 and deprecate loadLabeledData and saveLabeledData 9e63a02 [Xiangrui Meng] add loadVectors and loadLabeledPoints 19aa523 [Xiangrui Meng] update toString and add parsers for Vectors and LabeledPoint --- python/pyspark/mllib/linalg.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'python/pyspark/mllib/linalg.py') diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 2766842720..db39ed0acd 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -43,11 +43,11 @@ class SparseVector(object): or two sorted lists containing indices and values. >>> print SparseVector(4, {1: 1.0, 3: 5.5}) - [1: 1.0, 3: 5.5] + (4,[1,3],[1.0,5.5]) >>> print SparseVector(4, [(1, 1.0), (3, 5.5)]) - [1: 1.0, 3: 5.5] + (4,[1,3],[1.0,5.5]) >>> print SparseVector(4, [1, 3], [1.0, 5.5]) - [1: 1.0, 3: 5.5] + (4,[1,3],[1.0,5.5]) """ self.size = int(size) assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" @@ -160,10 +160,9 @@ class SparseVector(object): return result def __str__(self): - inds = self.indices - vals = self.values - entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) - return "[" + entries + "]" + inds = "[" + ",".join([str(i) for i in self.indices]) + "]" + vals = "[" + ",".join([str(v) for v in self.values]) + "]" + return "(" + ",".join((str(self.size), inds, vals)) + ")" def __repr__(self): inds = self.indices @@ -213,11 +212,11 @@ class Vectors(object): or two sorted lists containing indices and values. >>> print Vectors.sparse(4, {1: 1.0, 3: 5.5}) - [1: 1.0, 3: 5.5] + (4,[1,3],[1.0,5.5]) >>> print Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) - [1: 1.0, 3: 5.5] + (4,[1,3],[1.0,5.5]) >>> print Vectors.sparse(4, [1, 3], [1.0, 5.5]) - [1: 1.0, 3: 5.5] + (4,[1,3],[1.0,5.5]) """ return SparseVector(size, *args) @@ -232,6 +231,21 @@ class Vectors(object): """ return array(elements, dtype=float64) + @staticmethod + def stringify(vector): + """ + Converts a vector into a string, which can be recognized by + Vectors.parse(). + + >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0])) + '(2,[1],[1.0])' + >>> Vectors.stringify(Vectors.dense([0.0, 1.0])) + '[0.0,1.0]' + """ + if type(vector) == SparseVector: + return str(vector) + else: + return "[" + ",".join([str(v) for v in vector]) + "]" def _test(): import doctest -- cgit v1.2.3