aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala2
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala4
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala56
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala52
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala41
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala14
-rw-r--r--project/MimaExcludes.scala8
-rw-r--r--python/pyspark/mllib/util.py23
8 files changed, 46 insertions, 154 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
index 4811bb70e4..05b7d66f8d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -91,7 +91,7 @@ object LinearRegression extends App {
Logger.getRootLogger.setLevel(Level.WARN)
- val examples = MLUtils.loadLibSVMFile(sc, params.input, multiclass = true).cache()
+ val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()
val splits = examples.randomSplit(Array(0.8, 0.2))
val training = splits(0).cache()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
index 537e68a099..88acd9dbb0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
@@ -22,7 +22,7 @@ import scopt.OptionParser
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
-import org.apache.spark.mllib.util.{MLUtils, MulticlassLabelParser}
+import org.apache.spark.mllib.util.MLUtils
/**
* An example naive Bayes app. Run with
@@ -76,7 +76,7 @@ object SparseNaiveBayes {
if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions
val examples =
- MLUtils.loadLibSVMFile(sc, params.input, multiclass = true, params.numFeatures, minPartitions)
+ MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
// Cache examples because it will be used in both training and evaluation.
examples.cache()
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
deleted file mode 100644
index e25bf18b78..0000000000
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LabelParsers.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-/** Trait for label parsers. */
-private trait LabelParser extends Serializable {
- /** Parses a string label into a double label. */
- def parse(labelString: String): Double
-}
-
-/** Factory methods for label parsers. */
-private object LabelParser {
- def getInstance(multiclass: Boolean): LabelParser = {
- if (multiclass) MulticlassLabelParser else BinaryLabelParser
- }
-}
-
-/**
- * Label parser for binary labels, which outputs 1.0 (positive) if the value is greater than 0.5,
- * or 0.0 (negative) otherwise. So it works with +1/-1 labeling and +1/0 labeling.
- */
-private object BinaryLabelParser extends LabelParser {
- /** Gets the default instance of BinaryLabelParser. */
- def getInstance(): LabelParser = this
-
- /**
- * Parses the input label into positive (1.0) if the value is greater than 0.5,
- * or negative (0.0) otherwise.
- */
- override def parse(labelString: String): Double = if (labelString.toDouble > 0.5) 1.0 else 0.0
-}
-
-/**
- * Label parser for multiclass labels, which converts the input label to double.
- */
-private object MulticlassLabelParser extends LabelParser {
- /** Gets the default instance of MulticlassLabelParser. */
- def getInstance(): LabelParser = this
-
- override def parse(labelString: String): Double = labelString.toDouble
-}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 30de24ad89..dc10a19478 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -55,7 +55,6 @@ object MLUtils {
*
* @param sc Spark context
* @param path file or directory path in any Hadoop-supported file system URI
- * @param labelParser parser for labels
* @param numFeatures number of features, which will be determined from the input data if a
* nonpositive value is given. This is useful when the dataset is already split
* into multiple files and you want to load them separately, because some
@@ -64,10 +63,9 @@ object MLUtils {
* @param minPartitions min number of partitions
* @return labeled data stored as an RDD[LabeledPoint]
*/
- private def loadLibSVMFile(
+ def loadLibSVMFile(
sc: SparkContext,
path: String,
- labelParser: LabelParser,
numFeatures: Int,
minPartitions: Int): RDD[LabeledPoint] = {
val parsed = sc.textFile(path, minPartitions)
@@ -75,7 +73,7 @@ object MLUtils {
.filter(line => !(line.isEmpty || line.startsWith("#")))
.map { line =>
val items = line.split(' ')
- val label = labelParser.parse(items.head)
+ val label = items.head.toDouble
val (indices, values) = items.tail.map { item =>
val indexAndValue = item.split(':')
val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
@@ -102,36 +100,14 @@ object MLUtils {
// Convenient methods for `loadLibSVMFile`.
- /**
- * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint].
- * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
- * Each line represents a labeled sparse feature vector using the following format:
- * {{{label index1:value1 index2:value2 ...}}}
- * where the indices are one-based and in ascending order.
- * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
- * where the feature indices are converted to zero-based.
- *
- * @param sc Spark context
- * @param path file or directory path in any Hadoop-supported file system URI
- * @param multiclass whether the input labels contain more than two classes. If false, any label
- * with value greater than 0.5 will be mapped to 1.0, or 0.0 otherwise. So it
- * works for both +1/-1 and 1/0 cases. If true, the double value parsed directly
- * from the label string will be used as the label value.
- * @param numFeatures number of features, which will be determined from the input data if a
- * nonpositive value is given. This is useful when the dataset is already split
- * into multiple files and you want to load them separately, because some
- * features may not present in certain files, which leads to inconsistent
- * feature dimensions.
- * @param minPartitions min number of partitions
- * @return labeled data stored as an RDD[LabeledPoint]
- */
- def loadLibSVMFile(
+ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
+ def loadLibSVMFile(
sc: SparkContext,
path: String,
multiclass: Boolean,
numFeatures: Int,
minPartitions: Int): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, LabelParser.getInstance(multiclass), numFeatures, minPartitions)
+ loadLibSVMFile(sc, path, numFeatures, minPartitions)
/**
* Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of
@@ -140,26 +116,30 @@ object MLUtils {
def loadLibSVMFile(
sc: SparkContext,
path: String,
+ numFeatures: Int): RDD[LabeledPoint] =
+ loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions)
+
+ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
+ def loadLibSVMFile(
+ sc: SparkContext,
+ path: String,
multiclass: Boolean,
numFeatures: Int): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, multiclass, numFeatures, sc.defaultMinPartitions)
+ loadLibSVMFile(sc, path, numFeatures)
- /**
- * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the number of features
- * determined automatically and the default number of partitions.
- */
+ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0")
def loadLibSVMFile(
sc: SparkContext,
path: String,
multiclass: Boolean): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, multiclass, -1, sc.defaultMinPartitions)
+ loadLibSVMFile(sc, path)
/**
* Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of
* features determined automatically and the default number of partitions.
*/
def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] =
- loadLibSVMFile(sc, path, multiclass = false, -1, sc.defaultMinPartitions)
+ loadLibSVMFile(sc, path, -1)
/**
* Save labeled data in LIBSVM format.
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala
deleted file mode 100644
index ac85677f2f..0000000000
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/LabelParsersSuite.scala
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.util
-
-import org.scalatest.FunSuite
-
-class LabelParsersSuite extends FunSuite {
- test("binary label parser") {
- for (parser <- Seq(BinaryLabelParser, BinaryLabelParser.getInstance())) {
- assert(parser.parse("+1") === 1.0)
- assert(parser.parse("1") === 1.0)
- assert(parser.parse("0") === 0.0)
- assert(parser.parse("-1") === 0.0)
- }
- }
-
- test("multiclass label parser") {
- for (parser <- Seq(MulticlassLabelParser, MulticlassLabelParser.getInstance())) {
- assert(parser.parse("0") == 0.0)
- assert(parser.parse("+1") === 1.0)
- assert(parser.parse("1") === 1.0)
- assert(parser.parse("2") === 2.0)
- assert(parser.parse("3") === 3.0)
- }
- }
-}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index c14870fb96..8ef2bb1bf6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -63,9 +63,9 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
test("loadLibSVMFile") {
val lines =
"""
- |+1 1:1.0 3:2.0 5:3.0
- |-1
- |-1 2:4.0 4:5.0 6:6.0
+ |1 1:1.0 3:2.0 5:3.0
+ |0
+ |0 2:4.0 4:5.0 6:6.0
""".stripMargin
val tempDir = Files.createTempDir()
tempDir.deleteOnExit()
@@ -73,7 +73,7 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
Files.write(lines, file, Charsets.US_ASCII)
val path = tempDir.toURI.toString
- val pointsWithNumFeatures = loadLibSVMFile(sc, path, multiclass = false, 6).collect()
+ val pointsWithNumFeatures = loadLibSVMFile(sc, path, 6).collect()
val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect()
for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) {
@@ -86,11 +86,11 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
assert(points(2).features === Vectors.sparse(6, Seq((1, 4.0), (3, 5.0), (5, 6.0))))
}
- val multiclassPoints = loadLibSVMFile(sc, path, multiclass = true).collect()
+ val multiclassPoints = loadLibSVMFile(sc, path).collect()
assert(multiclassPoints.length === 3)
assert(multiclassPoints(0).label === 1.0)
- assert(multiclassPoints(1).label === -1.0)
- assert(multiclassPoints(2).label === -1.0)
+ assert(multiclassPoints(1).label === 0.0)
+ assert(multiclassPoints(2).label === 0.0)
Utils.deleteRecursively(tempDir)
}
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 5ff88f0dd1..5a835f5820 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -97,6 +97,14 @@ object MimaExcludes {
"org.apache.spark.mllib.tree.impurity.Entropy.calculate"),
ProblemFilters.exclude[IncompatibleMethTypeProblem](
"org.apache.spark.mllib.tree.impurity.Variance.calculate")
+ ) ++
+ Seq ( // Package-private classes removed in SPARK-2341
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser"),
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser$"),
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser"),
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser$"),
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser"),
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser$")
)
case v if v.startsWith("1.0") =>
Seq(
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index a707a9dcd5..d94900cefd 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -29,15 +29,18 @@ class MLUtils:
Helper methods to load, save and pre-process data used in MLlib.
"""
+ @deprecated
@staticmethod
def _parse_libsvm_line(line, multiclass):
+ return _parse_libsvm_line(line)
+
+ @staticmethod
+ def _parse_libsvm_line(line):
"""
Parses a line in LIBSVM format into (label, indices, values).
"""
items = line.split(None)
label = float(items[0])
- if not multiclass:
- label = 1.0 if label > 0.5 else 0.0
nnz = len(items) - 1
indices = np.zeros(nnz, dtype=np.int32)
values = np.zeros(nnz)
@@ -64,8 +67,13 @@ class MLUtils:
" but got " % type(v))
return " ".join(items)
+ @deprecated
@staticmethod
def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None):
+ return loadLibSVMFile(sc, path, numFeatures, minPartitions)
+
+ @staticmethod
+ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
"""
Loads labeled data in the LIBSVM format into an RDD of
LabeledPoint. The LIBSVM format is a text-based format used by
@@ -81,13 +89,6 @@ class MLUtils:
@param sc: Spark context
@param path: file or directory path in any Hadoop-supported file
system URI
- @param multiclass: whether the input labels contain more than
- two classes. If false, any label with value
- greater than 0.5 will be mapped to 1.0, or
- 0.0 otherwise. So it works for both +1/-1 and
- 1/0 cases. If true, the double value parsed
- directly from the label string will be used
- as the label value.
@param numFeatures: number of features, which will be determined
from the input data if a nonpositive value
is given. This is useful when the dataset is
@@ -105,7 +106,7 @@ class MLUtils:
>>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
>>> tempFile.flush()
>>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
- >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect()
+ >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
>>> tempFile.close()
>>> type(examples[0]) == LabeledPoint
True
@@ -124,7 +125,7 @@ class MLUtils:
"""
lines = sc.textFile(path, minPartitions)
- parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l, multiclass))
+ parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
if numFeatures <= 0:
parsed.cache()
numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1