[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

Fix the LabeledPoint parser when there is whitespace between the label and the features vector, e.g. `(y, [x1, x2, x3])`.

Author: Oleksiy Dyagilev <oleksiy_dyagilev@epam.com>

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position
author: Oleksiy Dyagilev <oleksiy_dyagilev@epam.com> 2015-06-23 13:12:19 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-06-23 13:12:19 -0700
commit: a8031183aff2e23de9204ddfc7e7f5edbf052a7e (patch)
tree: 357836fe5f56da368b3b4869d3f0f0ca089123e3 /mllib
parent: f2fb0285ab6d4225c5350f109dea6c1c017bb491 (diff)
download: spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.tar.gz
spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.tar.bz2
spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.zip
3 files changed, 14 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3578..a841c5caf0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
         }
       } else if (token == ")") {
         parsing = false
+      } else if (token.trim.isEmpty){
+          // ignore whitespaces between delim chars, e.g. ", ["
       } else {
         // expecting a number
         items.append(parseDouble(token))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a06de..f8d0af8820 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
     }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba9be..fa4f74d71b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
       }
     }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }
author	Oleksiy Dyagilev <oleksiy_dyagilev@epam.com>	2015-06-23 13:12:19 -0700
committer	Xiangrui Meng <meng@databricks.com>	2015-06-23 13:12:19 -0700
commit	a8031183aff2e23de9204ddfc7e7f5edbf052a7e (patch)
tree	357836fe5f56da368b3b4869d3f0f0ca089123e3 /mllib
parent	f2fb0285ab6d4225c5350f109dea6c1c017bb491 (diff)
download	spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.tar.gz spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.tar.bz2 spark-a8031183aff2e23de9204ddfc7e7f5edbf052a7e.zip