From e9b275b7697e7ad3b52b157d3274acc17ca8d828 Mon Sep 17 00:00:00 2001
From: Sean Owen <srowen@gmail.com>
Date: Wed, 30 Jul 2014 17:34:32 -0700
Subject: SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets

Per discussion at https://issues.apache.org/jira/browse/SPARK-2341 , this is a look at deprecating the multiclass parameter. Thoughts welcome of course.

Author: Sean Owen <srowen@gmail.com>

Closes #1663 from srowen/SPARK-2341 and squashes the following commits:

8a3abd7 [Sean Owen] Suppress MIMA error for removed package private classes
18a8c8e [Sean Owen] Updates from review
83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse target as a double (ie. multiclass = true)
---
 python/pyspark/mllib/util.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'python/pyspark/mllib/util.py')

diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index a707a9dcd5..d94900cefd 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -29,15 +29,18 @@ class MLUtils:
     Helper methods to load, save and pre-process data used in MLlib.
     """
 
+    @deprecated
     @staticmethod
     def _parse_libsvm_line(line, multiclass):
+        return _parse_libsvm_line(line)
+
+    @staticmethod
+    def _parse_libsvm_line(line):
         """
         Parses a line in LIBSVM format into (label, indices, values).
         """
         items = line.split(None)
         label = float(items[0])
-        if not multiclass:
-            label = 1.0 if label > 0.5 else 0.0
         nnz = len(items) - 1
         indices = np.zeros(nnz, dtype=np.int32)
         values = np.zeros(nnz)
@@ -64,8 +67,13 @@ class MLUtils:
                             " but got " % type(v))
         return " ".join(items)
 
+    @deprecated
     @staticmethod
     def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None):
+        return loadLibSVMFile(sc, path, numFeatures, minPartitions)
+
+    @staticmethod
+    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
         """
         Loads labeled data in the LIBSVM format into an RDD of
         LabeledPoint. The LIBSVM format is a text-based format used by
@@ -81,13 +89,6 @@ class MLUtils:
         @param sc: Spark context
         @param path: file or directory path in any Hadoop-supported file
                      system URI
-        @param multiclass: whether the input labels contain more than
-                           two classes. If false, any label with value
-                           greater than 0.5 will be mapped to 1.0, or
-                           0.0 otherwise. So it works for both +1/-1 and
-                           1/0 cases. If true, the double value parsed
-                           directly from the label string will be used
-                           as the label value.
         @param numFeatures: number of features, which will be determined
                             from the input data if a nonpositive value
                             is given. This is useful when the dataset is
@@ -105,7 +106,7 @@ class MLUtils:
         >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
         >>> tempFile.flush()
         >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
-        >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect()
+        >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
         >>> tempFile.close()
         >>> type(examples[0]) == LabeledPoint
         True
@@ -124,7 +125,7 @@ class MLUtils:
         """
 
         lines = sc.textFile(path, minPartitions)
-        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass))
+        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
         if numFeatures <= 0:
             parsed.cache()
             numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
-- 
cgit v1.2.3