aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorOctavian Geagla <ogeagla@gmail.com>2015-05-07 14:49:55 -0700
committerJoseph K. Bradley <joseph@databricks.com>2015-05-07 14:49:55 -0700
commit658a478d3f86456df09d0fbb1ba438fb36d8725c (patch)
treecafcc6978110c48afbd818323ba9d3c83e8f1c4a /mllib
parent347a329a36c94ff37363e4dffcbd5a24dc6a6714 (diff)
downloadspark-658a478d3f86456df09d0fbb1ba438fb36d8725c.tar.gz
spark-658a478d3f86456df09d0fbb1ba438fb36d8725c.tar.bz2
spark-658a478d3f86456df09d0fbb1ba438fb36d8725c.zip
[SPARK-5726] [MLLIB] Elementwise (Hadamard) Vector Product Transformer
See https://issues.apache.org/jira/browse/SPARK-5726 Author: Octavian Geagla <ogeagla@gmail.com> Author: Joseph K. Bradley <joseph@databricks.com> Closes #4580 from ogeagla/spark-mllib-weighting and squashes the following commits: fac12ad [Octavian Geagla] [SPARK-5726] [MLLIB] Use new createTransformFunc. 90f7e39 [Joseph K. Bradley] small cleanups 4595165 [Octavian Geagla] [SPARK-5726] [MLLIB] Remove erroneous test case. ded3ac6 [Octavian Geagla] [SPARK-5726] [MLLIB] Pass style checks. 37d4705 [Octavian Geagla] [SPARK-5726] [MLLIB] Incorporated feedback. 1dffeee [Octavian Geagla] [SPARK-5726] [MLLIB] Pass style checks. e436896 [Octavian Geagla] [SPARK-5726] [MLLIB] Remove 'TF' from 'ElementwiseProductTF' cb520e6 [Octavian Geagla] [SPARK-5726] [MLLIB] Rename HadamardProduct to ElementwiseProduct 4922722 [Octavian Geagla] [SPARK-5726] [MLLIB] Hadamard Vector Product Transformer
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala55
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala64
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala61
3 files changed, 180 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
new file mode 100644
index 0000000000..f8b56293e3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.param.Param
+import org.apache.spark.mllib.feature
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.sql.types.DataType
+
+/**
+ * :: AlphaComponent ::
+ * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a
+ * provided "weight" vector. In other words, it scales each column of the dataset by a scalar
+ * multiplier.
+ *
+ * The weight vector is supplied through the [[scalingVec]] param. This class sets no default
+ * for it, so it has to be provided via [[setScalingVec]] before transforming.
+ */
+@AlphaComponent
+class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
+
+  // NOTE(review): the registered param name ("scalingVector") differs from the member name
+  // (`scalingVec`) -- confirm whether name-based param lookups are expected to match the member.
+  /**
+   * the vector to multiply with input vectors
+   * @group param
+   */
+  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+
+  /** @group setParam */
+  def setScalingVec(value: Vector): this.type = set(scalingVec, value)
+
+  /** @group getParam */
+  def getScalingVec: Vector = getOrDefault(scalingVec)
+
+  /**
+   * Builds the function applied to each input vector: the element-wise product with the
+   * vector held in [[scalingVec]], delegating to the mllib `ElementwiseProduct`.
+   *
+   * Fails with an `IllegalArgumentException` when no scaling vector is available.
+   */
+  override protected def createTransformFunc: Vector => Vector = {
+    // Membership in `params` only says the param is declared, which is always the case here;
+    // `isDefined` checks that a value (explicitly set or defaulted) actually exists.
+    require(isDefined(scalingVec), "transformation requires a weight vector")
+    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
+    elemScaler.transform
+  }
+
+  /** The output column holds vectors, so its SQL type is the vector user-defined type. */
+  override protected def outputDataType: DataType = new VectorUDT()
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
new file mode 100644
index 0000000000..b0985baf9b
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.mllib.linalg._
+
+/**
+ * :: Experimental ::
+ * Element-wise (Hadamard) product transformer: every component of an input vector is
+ * multiplied by the component of a fixed "weight" vector at the same index, i.e. each column
+ * of the dataset is rescaled by its own scalar factor.
+ * @param scalingVector per-component multipliers; input vectors must have the same size.
+ */
+@Experimental
+class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
+
+  /**
+   * Computes the Hadamard product of `vector` with `scalingVector`.
+   *
+   * The value array of the input is copied before scaling, so the input's values are left
+   * untouched. Dense input yields a vector built with `Vectors.dense`, sparse input one built
+   * with `Vectors.sparse` over the same indices.
+   *
+   * @param vector vector to be transformed; its size must equal that of `scalingVector`.
+   * @return transformed vector.
+   */
+  override def transform(vector: Vector): Vector = {
+    require(vector.size == scalingVector.size,
+      s"vector sizes do not match: Expected ${scalingVector.size} but found ${vector.size}")
+    vector match {
+      case dense: DenseVector =>
+        Vectors.dense(scaleDense(dense.values))
+      case sparse: SparseVector =>
+        Vectors.sparse(sparse.size, sparse.indices, scaleSparse(sparse.indices, sparse.values))
+      case other =>
+        throw new IllegalArgumentException("Does not support vector type " + other.getClass)
+    }
+  }
+
+  /** Returns a copy of `source` in which entry `i` is multiplied by `scalingVector(i)`. */
+  private def scaleDense(source: Array[Double]): Array[Double] = {
+    val scaled = source.clone()
+    val n = scalingVector.size
+    var pos = 0
+    while (pos < n) {
+      scaled(pos) *= scalingVector(pos)
+      pos += 1
+    }
+    scaled
+  }
+
+  /**
+   * Returns a copy of `source` in which the k-th stored value is multiplied by the scaling
+   * factor at that value's logical index, `scalingVector(indices(k))`.
+   */
+  private def scaleSparse(indices: Array[Int], source: Array[Double]): Array[Double] = {
+    val scaled = source.clone()
+    val n = scaled.length
+    var pos = 0
+    while (pos < n) {
+      scaled(pos) *= scalingVector(indices(pos))
+      pos += 1
+    }
+    scaled
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
new file mode 100644
index 0000000000..f3a482abda
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+/**
+ * Tests for the mllib `ElementwiseProduct` transformer on dense and sparse input.
+ */
+class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext {
+
+  test("elementwise (hadamard) product should properly apply vector to dense data set") {
+    val denseData = Array(
+      Vectors.dense(1.0, 4.0, 1.9, -9.0)
+    )
+    // The 0.0 factor checks that a component can be zeroed out by the scaling vector.
+    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
+    val transformer = new ElementwiseProduct(scalingVec)
+    val transformedData = transformer.transform(sc.makeRDD(denseData))
+    val transformedVecs = transformedData.collect()
+    val transformedVec = transformedVecs(0)
+    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
+    assert(transformedVec ~== expectedVec absTol 1E-5,
+      s"Expected transformed vector $expectedVec but found $transformedVec")
+  }
+
+  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
+    val sparseData = Array(
+      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
+    )
+    val dataRDD = sc.parallelize(sparseData, 3)
+    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
+    val transformer = new ElementwiseProduct(scalingVec)
+    // Transform the same data locally and through the RDD overload so both paths can be
+    // compared against each other.
+    val data2 = sparseData.map(transformer.transform)
+    // Collect once and reuse the result; every `collect()` call runs its own Spark job.
+    val data2Collected = transformer.transform(dataRDD).collect()
+
+    assert((sparseData, data2, data2Collected).zipped.forall {
+      case (_: DenseVector, _: DenseVector, _: DenseVector) => true
+      case (_: SparseVector, _: SparseVector, _: SparseVector) => true
+      case _ => false
+    }, "The vector type should be preserved after hadamard product")
+
+    assert((data2, data2Collected).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
+    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
+  }
+}