[SPARK-13032][ML][PYSPARK] PySpark support for model export/import, taking LinearRegression as an example

* Implement ```MLWriter/MLWritable/MLReader/MLReadable``` for PySpark. * Make ```LinearRegression``` support ```save/load``` as an example. After this is merged, the work for other transformers/estimators will be easy, and we can then list the tasks and distribute them to the community. cc mengxr jkbradley Author: Yanbo Liang <ybliang8@gmail.com> Author: Joseph K. Bradley <joseph@databricks.com> Closes #10469 from yanboliang/spark-11939.
author: Yanbo Liang <ybliang8@gmail.com> 2016-01-29 09:22:24 -0800
committer: Joseph K. Bradley <joseph@databricks.com> 2016-01-29 09:22:24 -0800
commit: e51b6eaa9e9c007e194d858195291b2b9fb27322 (patch)
tree: b6af90c439154fe7514fd32e47a56a693ffd745a /python/pyspark/ml/util.py
parent: 55561e7693dd2a5bf3c7f8026c725421801fd0ec (diff)
download: spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.tar.gz
spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.tar.bz2
spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.zip
1 files changed, 141 insertions, 1 deletions
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index cee9d67b05..d7a813f56c 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -15,8 +15,27 @@
 # limitations under the License.
 #
 
-from functools import wraps
+import sys
 import uuid
+from functools import wraps
+
+if sys.version > '3':  # Python 3 has no `basestring`; alias it to `str` for isinstance checks
+    basestring = str  # NOTE(review): lexicographic string test; sys.version_info is sturdier
+
+from pyspark import SparkContext, since
+from pyspark.mllib.common import inherit_doc
+
+
+def _jvm():
+    """
+    Returns the JVM view stored on the SparkContext class (``SparkContext._jvm``).
+    Raises AttributeError when it is falsy (presumably: no SparkContext created yet).
+    """
+    jvm = SparkContext._jvm  # class-level attribute, read without an instance
+    if jvm:
+        return jvm
+    else:
+        raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?")
 
 
 def keyword_only(func):
@@ -52,3 +71,124 @@ class Identifiable(object):
         concatenates the class name, "_", and 12 random hex chars.
         """
         return cls.__name__ + "_" + uuid.uuid4().hex[12:]
+
+
+@inherit_doc
+class JavaMLWriter(object):
+    """
+    .. note:: Experimental
+
+    Utility class that saves ML instances through the writer of their `_java_obj`.
+
+    .. versionadded:: 2.0.0
+    """
+
+    def __init__(self, instance):
+        instance._transfer_params_to_java()  # presumably syncs Python-side params to the JVM
+        self._jwrite = instance._java_obj.write()  # writer that all methods delegate to
+
+    def save(self, path):
+        """Save the ML instance to `path`; raises TypeError for a non-string path."""
+        if not isinstance(path, basestring):
+            raise TypeError("path should be a basestring, got type %s" % type(path))
+        self._jwrite.save(path)
+
+    def overwrite(self):
+        """Overwrites if the output path already exists. Returns self for chaining."""
+        self._jwrite.overwrite()
+        return self
+
+    def context(self, sqlContext):
+        """Sets the SQL context to use for saving. Returns self for chaining."""
+        self._jwrite.context(sqlContext._ssql_ctx)  # passes the context's `_ssql_ctx` attribute
+        return self
+
+
+@inherit_doc
+class MLWritable(object):
+    """
+    .. note:: Experimental
+
+    Mixin for ML instances that provide a JavaMLWriter through :py:meth:`write`.
+
+    .. versionadded:: 2.0.0
+    """
+
+    def write(self):
+        """Returns a new JavaMLWriter instance for this ML instance."""
+        return JavaMLWriter(self)
+
+    def save(self, path):
+        """Save this ML instance to the given path, a shortcut of `write().save(path)`."""
+        self.write().save(path)
+
+
+@inherit_doc
+class JavaMLReader(object):
+    """
+    .. note:: Experimental
+
+    Utility class that loads ML instances through the reader of their peer Java class.
+
+    .. versionadded:: 2.0.0
+    """
+
+    def __init__(self, clazz):
+        self._clazz = clazz  # Python class that load() instantiates
+        self._jread = self._load_java_obj(clazz).read()  # reader obtained from the Java peer
+
+    def load(self, path):
+        """Load the ML instance from `path`; raises TypeError for a non-string path."""
+        if not isinstance(path, basestring):
+            raise TypeError("path should be a basestring, got type %s" % type(path))
+        java_obj = self._jread.load(path)
+        instance = self._clazz()  # no-arg construction; Java object attached below
+        instance._java_obj = java_obj
+        instance._resetUid(java_obj.uid())  # take over the uid reported by the Java object
+        instance._transfer_params_from_java()  # presumably pulls params from the JVM object
+        return instance
+
+    def context(self, sqlContext):
+        """Sets the SQL context to use for loading. Returns self for chaining."""
+        self._jread.context(sqlContext._ssql_ctx)  # passes the context's `_ssql_ctx` attribute
+        return self
+
+    @classmethod
+    def _java_loader_class(cls, clazz):
+        """
+        Returns the full class name of the Java ML instance. The default
+        implementation replaces "pyspark" by "org.apache.spark" in
+        the Python module name and appends the Python class name.
+        """
+        java_package = clazz.__module__.replace("pyspark", "org.apache.spark")  # all matches
+        return ".".join([java_package, clazz.__name__])
+
+    @classmethod
+    def _load_java_obj(cls, clazz):
+        """Load the peer Java object of the ML instance by walking the JVM view by name."""
+        java_class = cls._java_loader_class(clazz)
+        java_obj = _jvm()
+        for name in java_class.split("."):
+            java_obj = getattr(java_obj, name)  # descend one dotted segment at a time
+        return java_obj
+
+
+@inherit_doc
+class MLReadable(object):
+    """
+    .. note:: Experimental
+
+    Mixin for classes that provide a JavaMLReader through the :py:meth:`read` classmethod.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @classmethod
+    def read(cls):
+        """Returns a new JavaMLReader instance for this class."""
+        return JavaMLReader(cls)
+
+    @classmethod
+    def load(cls, path):
+        """Reads an ML instance from the input path, a shortcut of `read().load(path)`."""
+        return cls.read().load(path)
author	Yanbo Liang <ybliang8@gmail.com>	2016-01-29 09:22:24 -0800
committer	Joseph K. Bradley <joseph@databricks.com>	2016-01-29 09:22:24 -0800
commit	e51b6eaa9e9c007e194d858195291b2b9fb27322 (patch)
tree	b6af90c439154fe7514fd32e47a56a693ffd745a /python/pyspark/ml/util.py
parent	55561e7693dd2a5bf3c7f8026c725421801fd0ec (diff)
download	spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.tar.gz spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.tar.bz2 spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.zip