From f25a3ea8d3ee6972efb925826981918549deacaa Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Thu, 21 Apr 2016 16:50:09 -0700 Subject: [SPARK-14734][ML][MLLIB] Added asML, fromML methods for all spark.mllib Vector, Matrix types ## What changes were proposed in this pull request? For maintaining wrappers around spark.mllib algorithms in spark.ml, it will be useful to have ```private[spark]``` methods for converting from one linear algebra representation to another. This PR adds asML, fromML methods for all spark.mllib Vector and Matrix types. ## How was this patch tested? Unit tests for all conversions Author: Joseph K. Bradley Closes #12504 from jkbradley/linalg-conversions. --- .../org/apache/spark/mllib/linalg/Matrices.scala | 35 +++++++++++++++++++++- .../org/apache/spark/mllib/linalg/Vectors.scala | 33 ++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) (limited to 'mllib/src/main') diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 8c09b69b3c..bb5d6d9d51 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -24,7 +24,8 @@ import scala.collection.mutable.{ArrayBuffer, ArrayBuilder => MArrayBuilder, Has import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM} import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.annotation.Since +import org.apache.spark.ml.{linalg => newlinalg} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.catalyst.util.GenericArrayData @@ -158,6 +159,12 @@ sealed trait Matrix extends Serializable { */ @Since("1.5.0") def numActives: Int + + /** + * Convert this matrix to the new mllib-local representation. 
+ * This does NOT copy the data; it copies references. + */ + private[spark] def asML: newlinalg.Matrix } private[spark] class MatrixUDT extends UserDefinedType[Matrix] { @@ -419,6 +426,10 @@ class DenseMatrix @Since("1.3.0") ( } } } + + private[spark] override def asML: newlinalg.DenseMatrix = { + new newlinalg.DenseMatrix(numRows, numCols, values, isTransposed) + } } /** @@ -515,6 +526,11 @@ object DenseMatrix { } matrix } + + /** Convert new linalg type to spark.mllib type. Light copy; only copies references */ + private[spark] def fromML(m: newlinalg.DenseMatrix): DenseMatrix = { + new DenseMatrix(m.numRows, m.numCols, m.values, m.isTransposed) + } } /** @@ -721,6 +737,10 @@ class SparseMatrix @Since("1.3.0") ( } } } + + private[spark] override def asML: newlinalg.SparseMatrix = { + new newlinalg.SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) + } } /** @@ -895,6 +915,11 @@ object SparseMatrix { SparseMatrix.fromCOO(n, n, nnzVals.map(v => (v._2, v._2, v._1))) } } + + /** Convert new linalg type to spark.mllib type. Light copy; only copies references */ + private[spark] def fromML(m: newlinalg.SparseMatrix): SparseMatrix = { + new SparseMatrix(m.numRows, m.numCols, m.colPtrs, m.rowIndices, m.values, m.isTransposed) + } } /** @@ -1177,4 +1202,12 @@ object Matrices { SparseMatrix.fromCOO(numRows, numCols, entries) } } + + /** Convert new linalg type to spark.mllib type. 
Light copy; only copies references */ + private[spark] def fromML(m: newlinalg.Matrix): Matrix = m match { + case dm: newlinalg.DenseMatrix => + DenseMatrix.fromML(dm) + case sm: newlinalg.SparseMatrix => + SparseMatrix.fromML(sm) + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 5812cdde2c..5ec83e8d5c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -30,6 +30,7 @@ import org.json4s.jackson.JsonMethods.{compact, parse => parseJson, render} import org.apache.spark.SparkException import org.apache.spark.annotation.{AlphaComponent, Since} +import org.apache.spark.ml.{linalg => newlinalg} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow @@ -180,6 +181,12 @@ sealed trait Vector extends Serializable { */ @Since("1.6.0") def toJson: String + + /** + * Convert this vector to the new mllib-local representation. + * This does NOT copy the data; it copies references. + */ + private[spark] def asML: newlinalg.Vector } /** @@ -573,6 +580,14 @@ object Vectors { /** Max number of nonzero entries used in computing hash code. */ private[linalg] val MAX_HASH_NNZ = 128 + + /** Convert new linalg type to spark.mllib type. 
Light copy; only copies references */ + private[spark] def fromML(v: newlinalg.Vector): Vector = v match { + case dv: newlinalg.DenseVector => + DenseVector.fromML(dv) + case sv: newlinalg.SparseVector => + SparseVector.fromML(sv) + } } /** @@ -686,6 +701,10 @@ class DenseVector @Since("1.0.0") ( val jValue = ("type" -> 1) ~ ("values" -> values.toSeq) compact(render(jValue)) } + + private[spark] override def asML: newlinalg.DenseVector = { + new newlinalg.DenseVector(values) + } } @Since("1.3.0") @@ -694,6 +713,11 @@ object DenseVector { /** Extracts the value array from a dense vector. */ @Since("1.3.0") def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values) + + /** Convert new linalg type to spark.mllib type. Light copy; only copies references */ + private[spark] def fromML(v: newlinalg.DenseVector): DenseVector = { + new DenseVector(v.values) + } } /** @@ -882,6 +906,10 @@ class SparseVector @Since("1.0.0") ( ("values" -> values.toSeq) compact(render(jValue)) } + + private[spark] override def asML: newlinalg.SparseVector = { + new newlinalg.SparseVector(size, indices, values) + } } @Since("1.3.0") @@ -889,4 +917,9 @@ object SparseVector { @Since("1.3.0") def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] = Some((sv.size, sv.indices, sv.values)) + + /** Convert new linalg type to spark.mllib type. Light copy; only copies references */ + private[spark] def fromML(v: newlinalg.SparseVector): SparseVector = { + new SparseVector(v.size, v.indices, v.values) + } } -- cgit v1.2.3