about summary refs log tree commit diff
path: root/mllib/src
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2015-04-24 08:27:48 -0700
committerXiangrui Meng <meng@databricks.com>2015-04-24 08:27:48 -0700
commit78b39c7e0de8c9dc748cfbf8f78578a9524b6a94 (patch)
treeb64aad685d1115cc26a18e5a20b11ee9956e0e72 /mllib/src
parent8509519d8bcf99e2d1b5e21da514d51357f9116d (diff)
downloadspark-78b39c7e0de8c9dc748cfbf8f78578a9524b6a94.tar.gz
spark-78b39c7e0de8c9dc748cfbf8f78578a9524b6a94.tar.bz2
spark-78b39c7e0de8c9dc748cfbf8f78578a9524b6a94.zip
[SPARK-7115] [MLLIB] skip the very first 1 in poly expansion
yinxusen

Author: Xiangrui Meng <meng@databricks.com>

Closes #5681 from mengxr/SPARK-7115 and squashes the following commits:

9ac27cd [Xiangrui Meng] skip the very first 1 in poly expansion
Diffstat (limited to 'mllib/src')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala | 22
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala | 22
2 files changed, 24 insertions, 20 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index c3a59a361d..d855f04799 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -87,7 +87,9 @@ object PolynomialExpansion {
if (multiplier == 0.0) {
// do nothing
} else if (degree == 0 || lastIdx < 0) {
- polyValues(curPolyIdx) = multiplier
+ if (curPolyIdx >= 0) { // skip the very first 1
+ polyValues(curPolyIdx) = multiplier
+ }
} else {
val v = values(lastIdx)
val lastIdx1 = lastIdx - 1
@@ -116,8 +118,10 @@ object PolynomialExpansion {
if (multiplier == 0.0) {
// do nothing
} else if (degree == 0 || lastIdx < 0) {
- polyIndices += curPolyIdx
- polyValues += multiplier
+ if (curPolyIdx >= 0) { // skip the very first 1
+ polyIndices += curPolyIdx
+ polyValues += multiplier
+ }
} else {
// Skip all zeros at the tail.
val v = values(lastIdx)
@@ -139,8 +143,8 @@ object PolynomialExpansion {
private def expand(dv: DenseVector, degree: Int): DenseVector = {
val n = dv.size
val polySize = getPolySize(n, degree)
- val polyValues = new Array[Double](polySize)
- expandDense(dv.values, n - 1, degree, 1.0, polyValues, 0)
+ val polyValues = new Array[Double](polySize - 1)
+ expandDense(dv.values, n - 1, degree, 1.0, polyValues, -1)
new DenseVector(polyValues)
}
@@ -149,12 +153,12 @@ object PolynomialExpansion {
val nnz = sv.values.length
val nnzPolySize = getPolySize(nnz, degree)
val polyIndices = mutable.ArrayBuilder.make[Int]
- polyIndices.sizeHint(nnzPolySize)
+ polyIndices.sizeHint(nnzPolySize - 1)
val polyValues = mutable.ArrayBuilder.make[Double]
- polyValues.sizeHint(nnzPolySize)
+ polyValues.sizeHint(nnzPolySize - 1)
expandSparse(
- sv.indices, sv.values, nnz - 1, sv.size - 1, degree, 1.0, polyIndices, polyValues, 0)
- new SparseVector(polySize, polyIndices.result(), polyValues.result())
+ sv.indices, sv.values, nnz - 1, sv.size - 1, degree, 1.0, polyIndices, polyValues, -1)
+ new SparseVector(polySize - 1, polyIndices.result(), polyValues.result())
}
def expand(v: Vector, degree: Int): Vector = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
index b0a537be42..c1d64fba0a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
@@ -44,11 +44,11 @@ class PolynomialExpansionSuite extends FunSuite with MLlibTestSparkContext {
)
val twoDegreeExpansion: Array[Vector] = Array(
- Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5), Array(1.0, -2.0, 4.0, 2.3, -4.6, 5.29)),
- Vectors.dense(1.0, -2.0, 4.0, 2.3, -4.6, 5.29),
- Vectors.dense(Array(1.0) ++ Array.fill[Double](9)(0.0)),
- Vectors.dense(1.0, 0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
- Vectors.sparse(10, Array(0), Array(1.0)))
+ Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
+ Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
+ Vectors.dense(new Array[Double](9)),
+ Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
+ Vectors.sparse(9, Array.empty, Array.empty))
val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")
@@ -76,13 +76,13 @@ class PolynomialExpansionSuite extends FunSuite with MLlibTestSparkContext {
)
val threeDegreeExpansion: Array[Vector] = Array(
- Vectors.sparse(20, Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
- Array(1.0, -2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
- Vectors.dense(1.0, -2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
- Vectors.dense(Array(1.0) ++ Array.fill[Double](19)(0.0)),
- Vectors.dense(1.0, 0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
+ Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
+ Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
+ Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
+ Vectors.dense(new Array[Double](19)),
+ Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
-1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
- Vectors.sparse(20, Array(0), Array(1.0)))
+ Vectors.sparse(19, Array.empty, Array.empty))
val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")