aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorSean Owen <srowen@gmail.com>2014-07-30 17:34:32 -0700
committerXiangrui Meng <meng@databricks.com>2014-07-30 17:34:32 -0700
commite9b275b7697e7ad3b52b157d3274acc17ca8d828 (patch)
treec72a43b2a387bf15f6960b99d4c3a42c2dedaead /python
parent88a519db90d66ee5a1455ef4fcc1ad2a687e3d0b (diff)
downloadspark-e9b275b7697e7ad3b52b157d3274acc17ca8d828.tar.gz
spark-e9b275b7697e7ad3b52b157d3274acc17ca8d828.tar.bz2
spark-e9b275b7697e7ad3b52b157d3274acc17ca8d828.zip
SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets
Per discussion at https://issues.apache.org/jira/browse/SPARK-2341 , this is a look at deprecating the multiclass parameter. Thoughts welcome of course. Author: Sean Owen <srowen@gmail.com> Closes #1663 from srowen/SPARK-2341 and squashes the following commits: 8a3abd7 [Sean Owen] Suppress MIMA error for removed package private classes 18a8c8e [Sean Owen] Updates from review 83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse target as a double (ie. multiclass = true)
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/util.py23
1 file changed, 12 insertions, 11 deletions
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index a707a9dcd5..d94900cefd 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -29,15 +29,18 @@ class MLUtils:
Helper methods to load, save and pre-process data used in MLlib.
"""
+ @deprecated
@staticmethod
def _parse_libsvm_line(line, multiclass):
+ return _parse_libsvm_line(line)
+
+ @staticmethod
+ def _parse_libsvm_line(line):
"""
Parses a line in LIBSVM format into (label, indices, values).
"""
items = line.split(None)
label = float(items[0])
- if not multiclass:
- label = 1.0 if label > 0.5 else 0.0
nnz = len(items) - 1
indices = np.zeros(nnz, dtype=np.int32)
values = np.zeros(nnz)
@@ -64,8 +67,13 @@ class MLUtils:
" but got " % type(v))
return " ".join(items)
+ @deprecated
@staticmethod
def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None):
+ return loadLibSVMFile(sc, path, numFeatures, minPartitions)
+
+ @staticmethod
+ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
"""
Loads labeled data in the LIBSVM format into an RDD of
LabeledPoint. The LIBSVM format is a text-based format used by
@@ -81,13 +89,6 @@ class MLUtils:
@param sc: Spark context
@param path: file or directory path in any Hadoop-supported file
system URI
- @param multiclass: whether the input labels contain more than
- two classes. If false, any label with value
- greater than 0.5 will be mapped to 1.0, or
- 0.0 otherwise. So it works for both +1/-1 and
- 1/0 cases. If true, the double value parsed
- directly from the label string will be used
- as the label value.
@param numFeatures: number of features, which will be determined
from the input data if a nonpositive value
is given. This is useful when the dataset is
@@ -105,7 +106,7 @@ class MLUtils:
>>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
>>> tempFile.flush()
>>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
- >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect()
+ >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
>>> tempFile.close()
>>> type(examples[0]) == LabeledPoint
True
@@ -124,7 +125,7 @@ class MLUtils:
"""
lines = sc.textFile(path, minPartitions)
- parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass))
+ parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
if numFeatures <= 0:
parsed.cache()
numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1