diff options
author | Xiangrui Meng <meng@databricks.com> | 2015-05-07 15:45:37 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-05-07 15:45:37 -0700 |
commit | e43803b8f477b2c8d28836ac163cb54328d13f1a (patch) | |
tree | 3da358b42d71c182cf1ff262dc0ec509124d3b39 /mllib | |
parent | 658a478d3f86456df09d0fbb1ba438fb36d8725c (diff) | |
download | spark-e43803b8f477b2c8d28836ac163cb54328d13f1a.tar.gz spark-e43803b8f477b2c8d28836ac163cb54328d13f1a.tar.bz2 spark-e43803b8f477b2c8d28836ac163cb54328d13f1a.zip |
[SPARK-6948] [MLLIB] compress vectors in VectorAssembler
The compression is based on storage. brkyvz
Author: Xiangrui Meng <meng@databricks.com>
Closes #5985 from mengxr/SPARK-6948 and squashes the following commits:
df56a00 [Xiangrui Meng] update python tests
6d90d45 [Xiangrui Meng] compress vectors in VectorAssembler
Diffstat (limited to 'mllib')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 | ||||
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala | 10 |
2 files changed, 10 insertions, 2 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index b5a69cee6d..796758a70e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -102,6 +102,6 @@ object VectorAssembler { case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } - Vectors.sparse(cur, indices.result(), values.result()) + Vectors.sparse(cur, indices.result(), values.result()).compressed } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index 57d0278e03..0db27607bc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import org.scalatest.FunSuite import org.apache.spark.SparkException -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SQLContext} @@ -48,6 +48,14 @@ class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext { } } + test("assemble should compress vectors") { + import org.apache.spark.ml.feature.VectorAssembler.assemble + val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) + assert(v1.isInstanceOf[SparseVector]) + val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) + assert(v2.isInstanceOf[DenseVector]) + } + test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) |