aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2016-06-17 21:22:29 -0700
committerYanbo Liang <ybliang8@gmail.com>2016-06-17 21:22:29 -0700
commitedb23f9e47eecfe60992dde0e037ec1985c77e1d (patch)
tree11158f998ed7ba18050f332835cbaaca65a286c7 /python
parentaf2a4b0826b2358c0fe75c3e4d7fd8f7bccdd8e5 (diff)
downloadspark-edb23f9e47eecfe60992dde0e037ec1985c77e1d.tar.gz
spark-edb23f9e47eecfe60992dde0e037ec1985c77e1d.tar.bz2
spark-edb23f9e47eecfe60992dde0e037ec1985c77e1d.zip
[SPARK-15946][MLLIB] Conversion between old/new vector columns in a DataFrame (Python)
## What changes were proposed in this pull request?

This PR implements python wrappers for #13662 to convert old/new vector columns in a DataFrame.

## How was this patch tested?

doctest in Python

cc: yanboliang

Author: Xiangrui Meng <meng@databricks.com>

Closes #13731 from mengxr/SPARK-15946.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/util.py82
1 files changed, 82 insertions, 0 deletions
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index a316ee1ad4..a7e6bcc754 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -26,6 +26,7 @@ if sys.version > '3':
from pyspark import SparkContext, since
from pyspark.mllib.common import callMLlibFunc, inherit_doc
from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
+from pyspark.sql import DataFrame
class MLUtils(object):
@@ -200,6 +201,86 @@ class MLUtils(object):
"""
return callMLlibFunc("loadVectors", sc, path)
+ @staticmethod
+ @since("2.0.0")
+ def convertVectorColumnsToML(dataset, *cols):
+ """
+ Converts vector columns in an input DataFrame from the
+ :py:class:`pyspark.mllib.linalg.Vector` type to the new
+ :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml`
+ package.
+
+ :param dataset:
+ input dataset
+ :param cols:
+ a list of vector columns to be converted.
+ New vector columns will be ignored. If unspecified, all old
+ vector columns will be converted excepted nested ones.
+ :return:
+ the input dataset with old vector columns converted to the
+ new vector type
+
+ >>> import pyspark
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> from pyspark.mllib.util import MLUtils
+ >>> df = spark.createDataFrame(
+ ... [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))],
+ ... ["id", "x", "y"])
+ >>> r1 = MLUtils.convertVectorColumnsToML(df).first()
+ >>> isinstance(r1.x, pyspark.ml.linalg.SparseVector)
+ True
+ >>> isinstance(r1.y, pyspark.ml.linalg.DenseVector)
+ True
+ >>> r2 = MLUtils.convertVectorColumnsToML(df, "x").first()
+ >>> isinstance(r2.x, pyspark.ml.linalg.SparseVector)
+ True
+ >>> isinstance(r2.y, pyspark.mllib.linalg.DenseVector)
+ True
+ """
+ if not isinstance(dataset, DataFrame):
+ raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
+ return callMLlibFunc("convertVectorColumnsToML", dataset, list(cols))
+
+ @staticmethod
+ @since("2.0.0")
+ def convertVectorColumnsFromML(dataset, *cols):
+ """
+ Converts vector columns in an input DataFrame to the
+ :py:class:`pyspark.mllib.linalg.Vector` type from the new
+ :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml`
+ package.
+
+ :param dataset:
+ input dataset
+ :param cols:
+ a list of vector columns to be converted.
+ Old vector columns will be ignored. If unspecified, all new
+ vector columns will be converted except nested ones.
+ :return:
+ the input dataset with new vector columns converted to the
+ old vector type
+
+ >>> import pyspark
+ >>> from pyspark.ml.linalg import Vectors
+ >>> from pyspark.mllib.util import MLUtils
+ >>> df = spark.createDataFrame(
+ ... [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))],
+ ... ["id", "x", "y"])
+ >>> r1 = MLUtils.convertVectorColumnsFromML(df).first()
+ >>> isinstance(r1.x, pyspark.mllib.linalg.SparseVector)
+ True
+ >>> isinstance(r1.y, pyspark.mllib.linalg.DenseVector)
+ True
+ >>> r2 = MLUtils.convertVectorColumnsFromML(df, "x").first()
+ >>> isinstance(r2.x, pyspark.mllib.linalg.SparseVector)
+ True
+ >>> isinstance(r2.y, pyspark.ml.linalg.DenseVector)
+ True
+ """
+ if not isinstance(dataset, DataFrame):
+ raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
+ return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols))
+
class Saveable(object):
"""
@@ -355,6 +436,7 @@ def _test():
.master("local[2]")\
.appName("mllib.util tests")\
.getOrCreate()
+ globs['spark'] = spark
globs['sc'] = spark.sparkContext
(failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
spark.stop()