From e9b275b7697e7ad3b52b157d3274acc17ca8d828 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 30 Jul 2014 17:34:32 -0700 Subject: SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets Per discussion at https://issues.apache.org/jira/browse/SPARK-2341 , this is a look at deprecating the multiclass parameter. Thoughts welcome of course. Author: Sean Owen Closes #1663 from srowen/SPARK-2341 and squashes the following commits: 8a3abd7 [Sean Owen] Suppress MIMA error for removed package private classes 18a8c8e [Sean Owen] Updates from review 83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse target as a double (i.e. multiclass = true) --- python/pyspark/mllib/util.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'python/pyspark/mllib/util.py') diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a707a9dcd5..d94900cefd 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -29,15 +29,18 @@ class MLUtils: Helper methods to load, save and pre-process data used in MLlib. """ + @deprecated @staticmethod def _parse_libsvm_line(line, multiclass): + return _parse_libsvm_line(line) + + @staticmethod + def _parse_libsvm_line(line): """ Parses a line in LIBSVM format into (label, indices, values). """ items = line.split(None) label = float(items[0]) - if not multiclass: - label = 1.0 if label > 0.5 else 0.0 nnz = len(items) - 1 indices = np.zeros(nnz, dtype=np.int32) values = np.zeros(nnz) @@ -64,8 +67,13 @@ class MLUtils: " but got " % type(v)) return " ".join(items) + @deprecated @staticmethod def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None): + return loadLibSVMFile(sc, path, numFeatures, minPartitions) + + @staticmethod + def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): """ Loads labeled data in the LIBSVM format into an RDD of LabeledPoint. 
The LIBSVM format is a text-based format used by @@ -81,13 +89,6 @@ class MLUtils: @param sc: Spark context @param path: file or directory path in any Hadoop-supported file system URI - @param multiclass: whether the input labels contain more than - two classes. If false, any label with value - greater than 0.5 will be mapped to 1.0, or - 0.0 otherwise. So it works for both +1/-1 and - 1/0 cases. If true, the double value parsed - directly from the label string will be used - as the label value. @param numFeatures: number of features, which will be determined from the input data if a nonpositive value is given. This is useful when the dataset is @@ -105,7 +106,7 @@ class MLUtils: >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0") >>> tempFile.flush() >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() - >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect() + >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() >>> tempFile.close() >>> type(examples[0]) == LabeledPoint True @@ -124,7 +125,7 @@ class MLUtils: """ lines = sc.textFile(path, minPartitions) - parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass)) + parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) if numFeatures <= 0: parsed.cache() numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 -- cgit v1.2.3