diff options
author | Sean Owen <srowen@gmail.com> | 2014-07-30 17:34:32 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-07-30 17:34:32 -0700 |
commit | e9b275b7697e7ad3b52b157d3274acc17ca8d828 (patch) | |
tree | c72a43b2a387bf15f6960b99d4c3a42c2dedaead /python/pyspark/mllib/util.py | |
parent | 88a519db90d66ee5a1455ef4fcc1ad2a687e3d0b (diff) | |
download | spark-e9b275b7697e7ad3b52b157d3274acc17ca8d828.tar.gz spark-e9b275b7697e7ad3b52b157d3274acc17ca8d828.tar.bz2 spark-e9b275b7697e7ad3b52b157d3274acc17ca8d828.zip |
SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets
Per discussion at https://issues.apache.org/jira/browse/SPARK-2341 , this is a look at deprecating the multiclass parameter. Thoughts welcome of course.
Author: Sean Owen <srowen@gmail.com>
Closes #1663 from srowen/SPARK-2341 and squashes the following commits:
8a3abd7 [Sean Owen] Suppress MIMA error for removed package private classes
18a8c8e [Sean Owen] Updates from review
83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse target as a double (ie. multiclass = true)
Diffstat (limited to 'python/pyspark/mllib/util.py')
-rw-r--r-- | python/pyspark/mllib/util.py | 23 |
1 files changed, 12 insertions, 11 deletions
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a707a9dcd5..d94900cefd 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -29,15 +29,18 @@ class MLUtils: Helper methods to load, save and pre-process data used in MLlib. """ + @deprecated @staticmethod def _parse_libsvm_line(line, multiclass): + return _parse_libsvm_line(line) + + @staticmethod + def _parse_libsvm_line(line): """ Parses a line in LIBSVM format into (label, indices, values). """ items = line.split(None) label = float(items[0]) - if not multiclass: - label = 1.0 if label > 0.5 else 0.0 nnz = len(items) - 1 indices = np.zeros(nnz, dtype=np.int32) values = np.zeros(nnz) @@ -64,8 +67,13 @@ class MLUtils: " but got " % type(v)) return " ".join(items) + @deprecated @staticmethod def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None): + return loadLibSVMFile(sc, path, numFeatures, minPartitions) + + @staticmethod + def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): """ Loads labeled data in the LIBSVM format into an RDD of LabeledPoint. The LIBSVM format is a text-based format used by @@ -81,13 +89,6 @@ class MLUtils: @param sc: Spark context @param path: file or directory path in any Hadoop-supported file system URI - @param multiclass: whether the input labels contain more than - two classes. If false, any label with value - greater than 0.5 will be mapped to 1.0, or - 0.0 otherwise. So it works for both +1/-1 and - 1/0 cases. If true, the double value parsed - directly from the label string will be used - as the label value. @param numFeatures: number of features, which will be determined from the input data if a nonpositive value is given. This is useful when the dataset is @@ -105,7 +106,7 @@ class MLUtils: >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0") >>> tempFile.flush() >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() - >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect() + >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() >>> tempFile.close() >>> type(examples[0]) == LabeledPoint True @@ -124,7 +125,7 @@ class MLUtils: """ lines = sc.textFile(path, minPartitions) - parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass)) + parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) if numFeatures <= 0: parsed.cache() numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 |