[SPARK-10117] [MLLIB] Implement SQL data source API for reading LIBSVM data

It is convenient to implement the data source API for the LIBSVM format to have better integration with DataFrames and the ML pipeline API. Two options are implemented. * `numFeatures`: Specify the dimension of the features vector * `vectorType`: Specify the type of output vector. `sparse` is default. Author: lewuathe <lewuathe@me.com> Closes #8537 from Lewuathe/SPARK-10117 and squashes the following commits: 986999d [lewuathe] Change unit test phrase 11d513f [lewuathe] Fix some reviews 21600a4 [lewuathe] Merge branch 'master' into SPARK-10117 9ce63c7 [lewuathe] Rewrite service loader file 1fdd2df [lewuathe] Merge branch 'SPARK-10117' of github.com:Lewuathe/spark into SPARK-10117 ba3657c [lewuathe] Merge branch 'master' into SPARK-10117 0ea1c1c [lewuathe] LibSVMRelation is registered into META-INF 4f40891 [lewuathe] Improve test suites 5ab62ab [lewuathe] Merge branch 'master' into SPARK-10117 8660d0e [lewuathe] Fix Java unit test b56a948 [lewuathe] Merge branch 'master' into SPARK-10117 2c12894 [lewuathe] Remove unnecessary tag 7d693c2 [lewuathe] Resolv conflict 62010af [lewuathe] Merge branch 'master' into SPARK-10117 a97ee97 [lewuathe] Fix some points aef9564 [lewuathe] Fix 70ee4dd [lewuathe] Add Java test 3fd8dce [lewuathe] [SPARK-10117] Implement SQL data source API for reading LIBSVM data 40d3027 [lewuathe] Add Java test 7056d4a [lewuathe] Merge branch 'master' into SPARK-10117 99accaa [lewuathe] [SPARK-10117] Implement SQL data source API for reading LIBSVM data
author: lewuathe <lewuathe@me.com> 2015-09-09 09:29:10 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-09-09 09:29:10 -0700
commit: 2ddeb63126d26149eda197e85b7b26ef16a6e97c (patch)
tree: 6d208f63c719af95e7392b4dcf3cd21949301e0e /mllib/src/test/scala/org/apache
parent: c1bc4f439f54625c01a585691e5293cd9961eb0c (diff)
download: spark-2ddeb63126d26149eda197e85b7b26ef16a6e97c.tar.gz
spark-2ddeb63126d26149eda197e85b7b26ef16a6e97c.tar.bz2
spark-2ddeb63126d26149eda197e85b7b26ef16a6e97c.zip
1 files changed, 76 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/LibSVMRelationSuite.scala
new file mode 100644
index 0000000000..8ed134128c
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/source/LibSVMRelationSuite.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.source
+
+import java.io.File
+
+import com.google.common.base.Charsets
+import com.google.common.io.Files
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.linalg.{SparseVector, Vectors, DenseVector}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.util.Utils
+
+/** Tests loading a small on-disk sample through the "libsvm" SQL data source. */
+class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
+  var path: String = _ // URI of the sample directory; assigned in beforeAll()
+
+  // Writes the sample as a single part file so that every test can load it by directory URI.
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    val sampleDir = Utils.createTempDir()
+    val sample = """
+        |1 1:1.0 3:2.0 5:3.0
+        |0
+        |0 2:4.0 4:5.0 6:6.0
+      """.stripMargin
+    Files.write(sample, new File(sampleDir.getPath, "part-00000"), Charsets.US_ASCII)
+    path = sampleDir.toURI.toString
+  }
+
+  test("select as sparse vector") {
+    val loaded = sqlContext.read.format("libsvm").load(path)
+    assert(loaded.columns(0) == "label")
+    assert(loaded.columns(1) == "features")
+    val firstRow = loaded.first()
+    assert(firstRow.getDouble(0) == 1.0)
+    val features = firstRow.getAs[SparseVector](1)
+    val expected = Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))
+    assert(features == expected)
+  }
+
+  test("select as dense vector") {
+    val loaded = sqlContext.read.format("libsvm").option("vectorType", "dense").load(path)
+    assert(loaded.columns(0) == "label")
+    assert(loaded.columns(1) == "features")
+    assert(loaded.count() == 3)
+    val firstRow = loaded.first()
+    assert(firstRow.getDouble(0) == 1.0)
+    val features = firstRow.getAs[DenseVector](1)
+    val expected = Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)
+    assert(features == expected)
+  }
+
+  test("select a vector with specifying the longer dimension") {
+    val loaded = sqlContext.read.format("libsvm").option("numFeatures", "100").load(path)
+    val features = loaded.first().getAs[SparseVector](1)
+    val expected = Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))
+    assert(features == expected)
+  }
+}
author	lewuathe <lewuathe@me.com>	2015-09-09 09:29:10 -0700
committer	Xiangrui Meng <meng@databricks.com>	2015-09-09 09:29:10 -0700
commit	2ddeb63126d26149eda197e85b7b26ef16a6e97c (patch)
tree	6d208f63c719af95e7392b4dcf3cd21949301e0e /mllib/src/test/scala/org/apache
parent	c1bc4f439f54625c01a585691e5293cd9961eb0c (diff)
download	spark-2ddeb63126d26149eda197e85b7b26ef16a6e97c.tar.gz spark-2ddeb63126d26149eda197e85b7b26ef16a6e97c.tar.bz2 spark-2ddeb63126d26149eda197e85b7b26ef16a6e97c.zip