aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/ml/util.py
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-01-29 09:22:24 -0800
committerJoseph K. Bradley <joseph@databricks.com>2016-01-29 09:22:24 -0800
commite51b6eaa9e9c007e194d858195291b2b9fb27322 (patch)
treeb6af90c439154fe7514fd32e47a56a693ffd745a /python/pyspark/ml/util.py
parent55561e7693dd2a5bf3c7f8026c725421801fd0ec (diff)
downloadspark-e51b6eaa9e9c007e194d858195291b2b9fb27322.tar.gz
spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.tar.bz2
spark-e51b6eaa9e9c007e194d858195291b2b9fb27322.zip
[SPARK-13032][ML][PYSPARK] PySpark support model export/import and take LinearRegression as example
* Implement ```MLWriter/MLWritable/MLReader/MLReadable``` for PySpark. * Make ```LinearRegression``` support ```save/load``` as an example. After this is merged, the work for other transformers/estimators will be easy, and then we can list and distribute the tasks to the community. cc mengxr jkbradley Author: Yanbo Liang <ybliang8@gmail.com> Author: Joseph K. Bradley <joseph@databricks.com> Closes #10469 from yanboliang/spark-11939.
Diffstat (limited to 'python/pyspark/ml/util.py')
-rw-r--r--python/pyspark/ml/util.py142
1 files changed, 141 insertions, 1 deletions
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index cee9d67b05..d7a813f56c 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -15,8 +15,27 @@
# limitations under the License.
#
-from functools import wraps
+import sys
import uuid
+from functools import wraps
+
+# Python 3 has no ``basestring``; alias it to ``str`` so the isinstance checks
+# in this module work on both major versions. Compare the numeric version
+# tuple rather than the ``sys.version`` string, which orders lexicographically
+# (a major version of 10 or more would sort below '3').
+if sys.version_info[0] >= 3:
+    basestring = str
+
+from pyspark import SparkContext, since
+from pyspark.mllib.common import inherit_doc
+
+
+def _jvm():
+    """
+    Fetch the JVM view stored on SparkContext.
+
+    Only usable once SparkContext has been initialized; raises
+    AttributeError when ``SparkContext._jvm`` is unset or otherwise falsy.
+    """
+    jvm_view = SparkContext._jvm
+    if not jvm_view:
+        raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?")
+    return jvm_view
def keyword_only(func):
@@ -52,3 +71,124 @@ class Identifiable(object):
concatenates the class name, "_", and 12 random hex chars.
"""
return cls.__name__ + "_" + uuid.uuid4().hex[12:]
+
+
+@inherit_doc
+class JavaMLWriter(object):
+    """
+    .. note:: Experimental
+
+    Writer that saves an ML instance by delegating to the writer object
+    obtained from the instance's Java peer (its Scala implementation).
+
+    .. versionadded:: 2.0.0
+    """
+
+    def __init__(self, instance):
+        # Going by its name, this pushes the Python-side params to the Java
+        # peer; it must run before the peer's writer is requested.
+        instance._transfer_params_to_java()
+        java_peer = instance._java_obj
+        self._jwrite = java_peer.write()
+
+    def save(self, path):
+        """Write the ML instance to ``path``; raises TypeError for a non-string path."""
+        if isinstance(path, basestring):
+            self._jwrite.save(path)
+        else:
+            raise TypeError("path should be a basestring, got type %s" % type(path))
+
+    def overwrite(self):
+        """Request overwriting of an existing output path; returns this writer for chaining."""
+        self._jwrite.overwrite()
+        return self
+
+    def context(self, sqlContext):
+        """Set the SQL context used when saving; returns this writer for chaining."""
+        java_sql_ctx = sqlContext._ssql_ctx
+        self._jwrite.context(java_sql_ctx)
+        return self
+
+
+@inherit_doc
+class MLWritable(object):
+    """
+    .. note:: Experimental
+
+    Mixin giving an ML instance a ``write()`` / ``save()`` API backed by
+    JavaMLWriter.
+
+    .. versionadded:: 2.0.0
+    """
+
+    def write(self):
+        """Return a JavaMLWriter bound to this ML instance."""
+        writer = JavaMLWriter(self)
+        return writer
+
+    def save(self, path):
+        """Save this ML instance to ``path``; equivalent to `write().save(path)`."""
+        writer = self.write()
+        writer.save(path)
+
+
+@inherit_doc
+class JavaMLReader(object):
+    """
+    .. note:: Experimental
+
+    Reader that loads an ML instance by delegating to the reader object
+    obtained from the matching Java class (its Scala implementation).
+
+    .. versionadded:: 2.0.0
+    """
+
+    def __init__(self, clazz):
+        # Keep the Python class so load() can instantiate it later.
+        self._clazz = clazz
+        java_peer_class = self._load_java_obj(clazz)
+        self._jread = java_peer_class.read()
+
+    def load(self, path):
+        """Load an ML instance from ``path``; raises TypeError for a non-string path."""
+        if not isinstance(path, basestring):
+            raise TypeError("path should be a basestring, got type %s" % type(path))
+        loaded_java = self._jread.load(path)
+        # Build a fresh Python wrapper, attach the loaded Java object, then
+        # copy the uid and params over from it.
+        py_instance = self._clazz()
+        py_instance._java_obj = loaded_java
+        py_instance._resetUid(loaded_java.uid())
+        py_instance._transfer_params_from_java()
+        return py_instance
+
+    def context(self, sqlContext):
+        """Set the SQL context used when loading; returns this reader for chaining."""
+        java_sql_ctx = sqlContext._ssql_ctx
+        self._jread.context(java_sql_ctx)
+        return self
+
+    @classmethod
+    def _java_loader_class(cls, clazz):
+        """
+        Build the fully qualified name of the Java class for ``clazz``.
+
+        By default the Python module path is reused with "pyspark"
+        replaced by "org.apache.spark", followed by the class name.
+        """
+        module_path = clazz.__module__
+        java_package = module_path.replace("pyspark", "org.apache.spark")
+        return java_package + "." + clazz.__name__
+
+    @classmethod
+    def _load_java_obj(cls, clazz):
+        """Resolve the Java peer of ``clazz`` by walking the dotted name from the JVM view."""
+        dotted_name = cls._java_loader_class(clazz)
+        node = _jvm()
+        # Descend one attribute per dotted component.
+        for part in dotted_name.split("."):
+            node = getattr(node, part)
+        return node
+
+
+@inherit_doc
+class MLReadable(object):
+    """
+    .. note:: Experimental
+
+    Mixin giving a class a ``read()`` / ``load()`` API backed by
+    JavaMLReader.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @classmethod
+    def read(cls):
+        """Return a JavaMLReader bound to this class."""
+        reader = JavaMLReader(cls)
+        return reader
+
+    @classmethod
+    def load(cls, path):
+        """Read an ML instance from ``path``; equivalent to `read().load(path)`."""
+        reader = cls.read()
+        return reader.load(path)