From e158478a9fff5e63ae0336a54b3f360d0cd38921 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 28 Jun 2016 06:28:22 -0700 Subject: [SPARK-16242][MLLIB][PYSPARK] Conversion between old/new matrix columns in a DataFrame (Python) ## What changes were proposed in this pull request? This PR implements Python wrappers for #13888 to convert old/new matrix columns in a DataFrame. ## How was this patch tested? Doctest in Python. Author: Yanbo Liang Closes #13935 from yanboliang/spark-16242. --- python/pyspark/mllib/util.py | 80 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'python/pyspark') diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a7e6bcc754..48867a08db 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -281,6 +281,86 @@ class MLUtils(object): raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols)) + @staticmethod + @since("2.0.0") + def convertMatrixColumnsToML(dataset, *cols): + """ + Converts matrix columns in an input DataFrame from the + :py:class:`pyspark.mllib.linalg.Matrix` type to the new + :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` + package. + + :param dataset: + input dataset + :param cols: + a list of matrix columns to be converted. + New matrix columns will be ignored. If unspecified, all old + matrix columns will be converted except nested ones. + :return: + the input dataset with old matrix columns converted to the + new matrix type + + >>> import pyspark + >>> from pyspark.mllib.linalg import Matrices + >>> from pyspark.mllib.util import MLUtils + >>> df = spark.createDataFrame( + ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]), + ... 
Matrices.dense(2, 2, range(4)))], ["id", "x", "y"]) + >>> r1 = MLUtils.convertMatrixColumnsToML(df).first() + >>> isinstance(r1.x, pyspark.ml.linalg.SparseMatrix) + True + >>> isinstance(r1.y, pyspark.ml.linalg.DenseMatrix) + True + >>> r2 = MLUtils.convertMatrixColumnsToML(df, "x").first() + >>> isinstance(r2.x, pyspark.ml.linalg.SparseMatrix) + True + >>> isinstance(r2.y, pyspark.mllib.linalg.DenseMatrix) + True + """ + if not isinstance(dataset, DataFrame): + raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) + return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols)) + + @staticmethod + @since("2.0.0") + def convertMatrixColumnsFromML(dataset, *cols): + """ + Converts matrix columns in an input DataFrame to the + :py:class:`pyspark.mllib.linalg.Matrix` type from the new + :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` + package. + + :param dataset: + input dataset + :param cols: + a list of matrix columns to be converted. + Old matrix columns will be ignored. If unspecified, all new + matrix columns will be converted except nested ones. + :return: + the input dataset with new matrix columns converted to the + old matrix type + + >>> import pyspark + >>> from pyspark.ml.linalg import Matrices + >>> from pyspark.mllib.util import MLUtils + >>> df = spark.createDataFrame( + ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]), + ... 
Matrices.dense(2, 2, range(4)))], ["id", "x", "y"]) + >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first() + >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix) + True + >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix) + True + >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first() + >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix) + True + >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix) + True + """ + if not isinstance(dataset, DataFrame): + raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) + return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols)) + class Saveable(object): """ -- cgit v1.2.3