aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-06-28 06:28:22 -0700
committerYanbo Liang <ybliang8@gmail.com>2016-06-28 06:28:22 -0700
commite158478a9fff5e63ae0336a54b3f360d0cd38921 (patch)
treee32067f95b522f5d4f107544138c6ef531bfe2d5 /python
parentf6b497fcdddc705a9e1022e20b0dbc15da1b5a5a (diff)
downloadspark-e158478a9fff5e63ae0336a54b3f360d0cd38921.tar.gz
spark-e158478a9fff5e63ae0336a54b3f360d0cd38921.tar.bz2
spark-e158478a9fff5e63ae0336a54b3f360d0cd38921.zip
[SPARK-16242][MLLIB][PYSPARK] Conversion between old/new matrix columns in a DataFrame (Python)
## What changes were proposed in this pull request? This PR implements python wrappers for #13888 to convert old/new matrix columns in a DataFrame. ## How was this patch tested? Doctest in python. Author: Yanbo Liang <ybliang8@gmail.com> Closes #13935 from yanboliang/spark-16242.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/util.py80
1 files changed, 80 insertions, 0 deletions
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index a7e6bcc754..48867a08db 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -281,6 +281,86 @@ class MLUtils(object):
raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols))
+ @staticmethod
+ @since("2.0.0")
+ def convertMatrixColumnsToML(dataset, *cols):
+ """
+ Converts matrix columns in an input DataFrame from the
+ :py:class:`pyspark.mllib.linalg.Matrix` type to the new
+ :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml`
+ package.
+
+ :param dataset:
+ input dataset
+ :param cols:
+ a list of matrix columns to be converted.
+ New matrix columns will be ignored. If unspecified, all old
+ matrix columns will be converted except nested ones.
+ :return:
+ the input dataset with old matrix columns converted to the
+ new matrix type
+
+ >>> import pyspark
+ >>> from pyspark.mllib.linalg import Matrices
+ >>> from pyspark.mllib.util import MLUtils
+ >>> df = spark.createDataFrame(
+ ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
+ ... Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
+ >>> r1 = MLUtils.convertMatrixColumnsToML(df).first()
+ >>> isinstance(r1.x, pyspark.ml.linalg.SparseMatrix)
+ True
+ >>> isinstance(r1.y, pyspark.ml.linalg.DenseMatrix)
+ True
+ >>> r2 = MLUtils.convertMatrixColumnsToML(df, "x").first()
+ >>> isinstance(r2.x, pyspark.ml.linalg.SparseMatrix)
+ True
+ >>> isinstance(r2.y, pyspark.mllib.linalg.DenseMatrix)
+ True
+ """
+ if not isinstance(dataset, DataFrame):
+ raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
+ return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols))
+
+ @staticmethod
+ @since("2.0.0")
+ def convertMatrixColumnsFromML(dataset, *cols):
+ """
+ Converts matrix columns in an input DataFrame to the
+ :py:class:`pyspark.mllib.linalg.Matrix` type from the new
+ :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml`
+ package.
+
+ :param dataset:
+ input dataset
+ :param cols:
+ a list of matrix columns to be converted.
+ Old matrix columns will be ignored. If unspecified, all new
+ matrix columns will be converted except nested ones.
+ :return:
+ the input dataset with new matrix columns converted to the
+ old matrix type
+
+ >>> import pyspark
+ >>> from pyspark.ml.linalg import Matrices
+ >>> from pyspark.mllib.util import MLUtils
+ >>> df = spark.createDataFrame(
+ ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
+ ... Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
+ >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first()
+ >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix)
+ True
+ >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix)
+ True
+ >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first()
+ >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix)
+ True
+ >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix)
+ True
+ """
+ if not isinstance(dataset, DataFrame):
+ raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
+ return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols))
+
class Saveable(object):
"""