aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
author: Oleksiy Dyagilev <oleksiy_dyagilev@epam.com> 2015-06-23 13:12:19 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-06-23 13:12:19 -0700
commit: a8031183aff2e23de9204ddfc7e7f5edbf052a7e (patch)
tree: 357836fe5f56da368b3b4869d3f0f0ca089123e3 /mllib
parent: f2fb0285ab6d4225c5350f109dea6c1c017bb491 (diff)
download: spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.tar.gz
spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.tar.bz2
spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.zip
[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector
Fix the LabeledPoint parser when there is whitespace between the label and the features vector, e.g. (y, [x1, x2, x3]).

Author: Oleksiy Dyagilev <oleksiy_dyagilev@epam.com>

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala | 2
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala | 7
3 files changed, 14 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3578..a841c5caf0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
}
} else if (token == ")") {
parsing = false
+ } else if (token.trim.isEmpty){
+ // ignore whitespaces between delim chars, e.g. ", ["
} else {
// expecting a number
items.append(parseDouble(token))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a06de..f8d0af8820 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
}
}
+ test("parse labeled points with whitespaces") {
+ val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+ assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+ }
+
test("parse labeled points with v0.9 format") {
val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba9be..fa4f74d71b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
}
}
}
+
+ test("parser with whitespaces") {
+ val s = "(0.0, [1.0, 2.0])"
+ val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+ assert(parsed(0).asInstanceOf[Double] === 0.0)
+ assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+ }
}