aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/mllib-feature-extraction.md54
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala55
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala64
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala61
4 files changed, 234 insertions, 0 deletions
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 80842b27ef..03fedd0101 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -477,3 +477,57 @@ sc.stop();
</div>
</div>
+## ElementwiseProduct
+
+ElementwiseProduct multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v`, and the transforming vector, `w`, to yield a result vector.
+
+`\[ \begin{pmatrix}
+v_1 \\
+\vdots \\
+v_N
+\end{pmatrix} \circ \begin{pmatrix}
+ w_1 \\
+ \vdots \\
+ w_N
+ \end{pmatrix}
+= \begin{pmatrix}
+ v_1 w_1 \\
+ \vdots \\
+ v_N w_N
+ \end{pmatrix}
+\]`
+
+[`ElementwiseProduct`](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) has the following parameter in the constructor:
+
+* `scalingVector`: the transforming vector (`w` in the formula above).
+
+`ElementwiseProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the weighting on a `Vector` to produce a transformed `Vector` or on an `RDD[Vector]` to produce a transformed `RDD[Vector]`.
+
+### Example
+
+The example below demonstrates how to load a simple vectors file, extract a set of vectors, and then transform those vectors using a transforming vector.
+
+
+<div class="codetabs">
+<div data-lang="scala">
+{% highlight scala %}
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+
+// Load the text file and parse each space-separated line into a dense vector:
+val data = sc.textFile("data/mllib/kmeans_data.txt")
+val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
+
+// The transforming vector must have the same size as each input vector.
+val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+val transformer = new ElementwiseProduct(transformingVector)
+
+// Batch transform and per-row transform give the same results:
+val transformedData = transformer.transform(parsedData)
+val transformedData2 = parsedData.map(x => transformer.transform(x))
+
+{% endhighlight %}
+</div>
+</div>
+
+
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
new file mode 100644
index 0000000000..f8b56293e3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.param.Param
+import org.apache.spark.mllib.feature
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.sql.types.DataType
+
+/**
+ * :: AlphaComponent ::
+ * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a
+ * provided "weight" vector. In other words, it scales each column of the dataset by a scalar
+ * multiplier.
+ *
+ * The weight vector has no default and must be supplied through [[setScalingVec]] before the
+ * transformer is used.
+ */
+@AlphaComponent
+class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
+
+  /**
+   * The vector to multiply with input vectors.
+   * NOTE(review): the registered param name is "scalingVector" while the field is named
+   * `scalingVec` -- confirm whether the two are meant to match.
+   * @group param
+   */
+  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+
+  /** @group setParam */
+  def setScalingVec(value: Vector): this.type = set(scalingVec, value)
+
+  /** @group getParam */
+  def getScalingVec: Vector = getOrDefault(scalingVec)
+
+  /**
+   * Builds the per-row function that multiplies an input vector element-wise by the
+   * configured scaling vector.
+   *
+   * @return a function mapping each input vector to its scaled counterpart.
+   * @throws IllegalArgumentException if no scaling vector has been set.
+   */
+  override protected def createTransformFunc: Vector => Vector = {
+    // Check that the param actually has a value. Testing membership in `params` only asks
+    // whether the param is declared on this class, so it could never fail here.
+    require(isDefined(scalingVec), "transformation requires a weight vector")
+    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
+    elemScaler.transform
+  }
+
+  /** The output column holds vectors, so its SQL type is the vector UDT. */
+  override protected def outputDataType: DataType = new VectorUDT()
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
new file mode 100644
index 0000000000..b0985baf9b
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.mllib.linalg._
+
+/**
+ * :: Experimental ::
+ * Computes the element-wise (Hadamard) product of every input vector with a fixed "weight"
+ * vector, i.e. each column of the dataset is multiplied by its own scalar factor.
+ * @param scalingVector The per-component factors applied to each input vector.
+ */
+@Experimental
+class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
+
+  /**
+   * Multiplies the given vector component-wise by `scalingVector`.
+   *
+   * The input is not modified: its values are copied before scaling, and the result has the
+   * same representation (dense or sparse) as the input.
+   *
+   * @param vector vector to be transformed; its size must equal the scaling vector's size.
+   * @return a new vector holding the scaled values.
+   * @throws IllegalArgumentException if the sizes differ or the vector type is unsupported.
+   */
+  override def transform(vector: Vector): Vector = {
+    val expectedSize = scalingVector.size
+    require(vector.size == expectedSize,
+      s"vector sizes do not match: Expected ${scalingVector.size} but found ${vector.size}")
+    vector match {
+      case dense: DenseVector =>
+        // Scale a copy of every component in place.
+        val scaled: Array[Double] = dense.values.clone()
+        var pos = 0
+        while (pos < expectedSize) {
+          scaled(pos) = scaled(pos) * scalingVector(pos)
+          pos += 1
+        }
+        Vectors.dense(scaled)
+      case sparse: SparseVector =>
+        // Only the stored entries are scaled; each one uses the factor at its own index.
+        val activeIndices = sparse.indices
+        val scaled = sparse.values.clone()
+        val activeCount = scaled.length
+        var pos = 0
+        while (pos < activeCount) {
+          scaled(pos) = scaled(pos) * scalingVector(activeIndices(pos))
+          pos += 1
+        }
+        Vectors.sparse(sparse.size, activeIndices, scaled)
+      case other =>
+        throw new IllegalArgumentException(s"Does not support vector type ${other.getClass}")
+    }
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
new file mode 100644
index 0000000000..f3a482abda
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+/** Checks the element-wise product transformer on dense and sparse inputs. */
+class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext {
+
+  test("elementwise (hadamard) product should properly apply vector to dense data set") {
+    val weights = Vectors.dense(2.0, 0.5, 0.0, 0.25)
+    val inputRDD = sc.makeRDD(Array(Vectors.dense(1.0, 4.0, 1.9, -9.0)))
+
+    val collected = new ElementwiseProduct(weights).transform(inputRDD).collect()
+    val actual = collected(0)
+    val expected = Vectors.dense(2.0, 2.0, 0.0, -2.25)
+
+    assert(actual ~== expected absTol 1E-5,
+      s"Expected transformed vector $expected but found $actual")
+  }
+
+  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
+    val inputs = Array(
+      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
+    )
+    val inputRDD = sc.parallelize(inputs, 3)
+    val scaler = new ElementwiseProduct(Vectors.dense(1.0, 0.0, 0.5))
+
+    // Apply the transformer both row by row on the driver and in batch on the RDD.
+    val localResult = inputs.map(scaler.transform)
+    val distributedResult = scaler.transform(inputRDD)
+
+    // Each output must keep the representation (dense or sparse) of its input.
+    val typesPreserved = (inputs, localResult, distributedResult.collect()).zipped.forall {
+      case (_: DenseVector, _: DenseVector, _: DenseVector) => true
+      case (_: SparseVector, _: SparseVector, _: SparseVector) => true
+      case _ => false
+    }
+    assert(typesPreserved, "The vector type should be preserved after hadamard product")
+
+    // Both code paths must agree with each other and with the hand-computed result.
+    assert((localResult, distributedResult.collect()).zipped.forall { (local, distributed) =>
+      local ~== distributed absTol 1E-5
+    })
+    assert(localResult(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
+  }
+}