author    Sean Owen <srowen@gmail.com>          2014-07-30 17:34:32 -0700
committer Xiangrui Meng <meng@databricks.com>   2014-07-30 17:34:32 -0700
commit    e9b275b7697e7ad3b52b157d3274acc17ca8d828 (patch)
tree      c72a43b2a387bf15f6960b99d4c3a42c2dedaead /mllib/src/main
parent    88a519db90d66ee5a1455ef4fcc1ad2a687e3d0b (diff)
SPARK-2341 [MLLIB] loadLibSVMFile doesn't handle regression datasets
Per discussion at https://issues.apache.org/jira/browse/SPARK-2341, this is a look at deprecating the multiclass parameter. Thoughts welcome, of course.

Author: Sean Owen <srowen@gmail.com>

Closes #1663 from srowen/SPARK-2341 and squashes the following commits:

8a3abd7 [Sean Owen] Suppress MIMA error for removed package-private classes
18a8c8e [Sean Owen] Updates from review
83d0092 [Sean Owen] Deprecated methods with multiclass, and instead always parse the target as a double (i.e. multiclass = true)
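To make the behavioral change concrete, a minimal usage sketch (not part of the commit; it assumes a live SparkContext named `sc`, and the data path is hypothetical):

import org.apache.spark.mllib.util.MLUtils

// Deprecated by this patch: the multiclass flag is ignored from now on.
val before = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm.txt", true, -1)

// Preferred form after the patch: the label is always parsed as a double,
// so binary, multiclass, and regression targets all load the same way.
val after = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm.txt")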
Diffstat (limited to 'mllib/src/main')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala  56
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala       52
2 files changed, 16 insertions(+), 92 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
deleted file mode 100644
index e25bf18b78..0000000000
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-/** Trait for label parsers. */
-private trait LabelParser extends Serializable {
- /** Parses a string label into a double label. */
- def parse(labelString: String): Double
-}
-
-/** Factory methods for label parsers. */
-private object LabelParser {
- def getInstance(multiclass: Boolean): LabelParser = {
- if (multiclass) MulticlassLabelParser else BinaryLabelParser
- }
-}
-
-/**
- * Label parser for binary labels, which outputs 1.0 (positive) if the value is greater than 0.5,
- * or 0.0 (negative) otherwise. So it works with +1/-1 labeling and +1/0 labeling.
- */
-private object BinaryLabelParser extends LabelParser {
- /** Gets the default instance of BinaryLabelParser. */
- def getInstance(): LabelParser = this
-
- /**
- * Parses the input label into positive (1.0) if the value is greater than 0.5,
- * or negative (0.0) otherwise.
- */
- override def parse(labelString: String): Double = if (labelString.toDouble > 0.5) 1.0 else 0.0
-}
-
-/**
- * Label parser for multiclass labels, which converts the input label to double.
- */
-private object MulticlassLabelParser extends LabelParser {
- /** Gets the default instance of MulticlassLabelParser. */
- def getInstance(): LabelParser = this
-
- override def parse(labelString: String): Double = labelString.toDouble
-}
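The deletion above is the crux of the fix: BinaryLabelParser clipped every label to {0.0, 1.0}, which destroyed regression targets. A standalone, REPL-style sketch of the old and new parsing (not part of the commit), with values that follow directly from the deleted logic:

// Removed behavior (BinaryLabelParser): threshold at 0.5, so +1/-1 and
// 1/0 labelings both collapse to {0.0, 1.0}.
def binaryParse(labelString: String): Double =
  if (labelString.toDouble > 0.5) 1.0 else 0.0

// New behavior (formerly MulticlassLabelParser): take the label verbatim.
def parse(labelString: String): Double = labelString.toDouble

binaryParse("-1")  // 0.0 -- fine for binary classification
binaryParse("2.5") // 1.0 -- a regression target was silently clipped
parse("2.5")       // 2.5 -- regression targets now survive loading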
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 30de24ad89..dc10a19478 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -55,7 +55,6 @@ object MLUtils {
*
* @param sc Spark context
* @param path file or directory path in any Hadoop-supported file system URI
- * @param labelParser parser for labels
* @param numFeatures number of features, which will be determined from the input data if a
* nonpositive value is given. This is useful when the dataset is already split
* into multiple files and you want to load them separately, because some
@@ -64,10 +63,9 @@ object MLUtils {
* @param minPartitions min number of partitions
* @return labeled data stored as an RDD[LabeledPoint]
*/
- private def loadLibSVMFile(
+ def loadLibSVMFile(
sc: SparkContext,
path: String,
- labelParser: LabelParser,
numFeatures: Int,
minPartitions: Int): RDD[LabeledPoint] = {
val parsed = sc.textFile(path, minPartitions)
@@ -75,7 +73,7 @@ object MLUtils {
.filter(line => !(line.isEmpty || line.startsWith("#")))
.map { line =>
val items = line.split(' ')
- val label = labelParser.parse(items.head)
+ val label = items.head.toDouble
val (indices, values) = items.tail.map { item =>
val indexAndValue = item.split(':')
val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
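For readers who want to try the new parsing in isolation, a self-contained sketch of the per-line logic introduced above (runnable without a cluster; a LIBSVM line has the form `label index1:value1 index2:value2 ...` with 1-based indices):

// Parses one LIBSVM line, e.g. "2.5 1:0.3 4:1.2", into a label plus
// 0-based sparse (index, value) pairs, mirroring the hunk above.
def parseLine(line: String): (Double, Array[(Int, Double)]) = {
  val items = line.split(' ')
  val label = items.head.toDouble            // no thresholding any more
  val pairs = items.tail.map { item =>
    val indexAndValue = item.split(':')
    val index = indexAndValue(0).toInt - 1   // 1-based -> 0-based
    val value = indexAndValue(1).toDouble
    (index, value)
  }
  (label, pairs)
}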
@@ -102,36 +100,14 @@ object MLUtils {
// Convenient methods for `loadLibSVMFile`.
- /**
- * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint].
- * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
- * Each line represents a labeled sparse feature vector using the following format:
- * {{{label index1:value1 index2:value2 ...}}}
- * where the indices are one-based and in ascending order.
- * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
- * where the feature indices are converted to zero-based.
- *
- * @param sc Spark context
- * @param path file or directory path in any Hadoop-supported file system URI
- * @param multiclass whether the input labels contain more than two classes. If false, any label
- * with value greater than 0.5 will be mapped to 1.0, or 0.0 otherwise. So it
- * works for both +1/-1 and 1/0 cases. If true, the double value parsed directly
- * from the label string will be used as the label value.
- * @param numFeatures number of features, which will be determined from the input data if a
- * nonpositive value is given. This is useful when the dataset is already split
- * into multiple files and you want to load them separately, because some
- * features may not present in certain files, which leads to inconsistent
- * feature dimensions.
- * @param minPartitions min number of partitions
- * @return labeled data stored as an RDD[LabeledPoint]
- */
- def loadLibSVMFile(
+ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
+ def loadLibSVMFile(
sc: SparkContext,
path: String,
multiclass: Boolean,
numFeatures: Int,
minPartitions: Int): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, LabelParser.getInstance(multiclass), numFeatures, minPartitions)
+ loadLibSVMFile(sc, path, numFeatures, minPartitions)
/**
* Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of
@@ -140,26 +116,30 @@ object MLUtils {
def loadLibSVMFile(
sc: SparkContext,
path: String,
+ numFeatures: Int): RDD[LabeledPoint] =
+ loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions)
+
+ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
+ def loadLibSVMFile(
+ sc: SparkContext,
+ path: String,
multiclass: Boolean,
numFeatures: Int): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, multiclass, numFeatures, sc.defaultMinPartitions)
+ loadLibSVMFile(sc, path, numFeatures)
- /**
- * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the number of features
- * determined automatically and the default number of partitions.
- */
+ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
def loadLibSVMFile(
sc: SparkContext,
path: String,
multiclass: Boolean): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, multiclass, -1, sc.defaultMinPartitions)
+ loadLibSVMFile(sc, path)
/**
* Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of
* features determined automatically and the default number of partitions.
*/
def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, multiclass = false, -1, sc.defaultMinPartitions)
+ loadLibSVMFile(sc, path, -1)
/**
* Save labeled data in LIBSVM format.
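The remaining non-deprecated overloads now chain their defaults as the hunks above show: minPartitions falls back to sc.defaultMinPartitions, and a nonpositive numFeatures (-1) means the feature dimension is determined from the data. A usage sketch (hypothetical paths; assumes a live SparkContext `sc`):

import org.apache.spark.mllib.util.MLUtils

val a = MLUtils.loadLibSVMFile(sc, "data/part-00000", 100, 8) // everything explicit
val b = MLUtils.loadLibSVMFile(sc, "data/part-00000", 100)    // default minPartitions
val c = MLUtils.loadLibSVMFile(sc, "data/part-00000")         // numFeatures inferred (-1)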