aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib/tests.py
diff options
context:
space:
mode:
author Xiangrui Meng <meng@databricks.com> 2015-03-02 22:27:01 -0800
committer Xiangrui Meng <meng@databricks.com> 2015-03-02 22:27:01 -0800
commit 7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b (patch)
tree 4fc615db1b5144cf7b430ea3bc26bda2cd49cad8 /python/pyspark/mllib/tests.py
parent 54d19689ff8d786acde5b8ada6741854ffadadea (diff)
download spark-7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b.tar.gz
spark-7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b.tar.bz2
spark-7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b.zip
[SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib
Similar to `MatrixFactorizationModel`, we only need wrappers to support save/load for tree models in Python. jkbradley Author: Xiangrui Meng <meng@databricks.com> Closes #4854 from mengxr/SPARK-6097 and squashes the following commits: 4586a4d [Xiangrui Meng] fix more typos 8ebcac2 [Xiangrui Meng] fix python style 91172d8 [Xiangrui Meng] fix typos 201b3b9 [Xiangrui Meng] update user guide b5158e2 [Xiangrui Meng] support tree model save/load in PySpark/MLlib
Diffstat (limited to 'python/pyspark/mllib/tests.py')
-rw-r--r-- python/pyspark/mllib/tests.py 27
1 files changed, 26 insertions, 1 deletions
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 06207a076e..5328d99b69 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -19,7 +19,9 @@
Fuller unit tests for Python MLlib.
"""
+import os
import sys
+import tempfile
import array as pyarray
from numpy import array, array_equal
@@ -195,7 +197,8 @@ class ListTests(PySparkTestCase):
def test_classification(self):
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
- from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
+ from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
+ RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
data = [
LabeledPoint(0.0, [1, 0, 0]),
LabeledPoint(1.0, [0, 1, 1]),
@@ -205,6 +208,8 @@ class ListTests(PySparkTestCase):
rdd = self.sc.parallelize(data)
features = [p.features.tolist() for p in data]
+ temp_dir = tempfile.mkdtemp()
+
lr_model = LogisticRegressionWithSGD.train(rdd)
self.assertTrue(lr_model.predict(features[0]) <= 0)
self.assertTrue(lr_model.predict(features[1]) > 0)
@@ -231,6 +236,11 @@ class ListTests(PySparkTestCase):
self.assertTrue(dt_model.predict(features[2]) <= 0)
self.assertTrue(dt_model.predict(features[3]) > 0)
+ dt_model_dir = os.path.join(temp_dir, "dt")
+ dt_model.save(self.sc, dt_model_dir)
+ same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
+ self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())
+
rf_model = RandomForest.trainClassifier(
rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
self.assertTrue(rf_model.predict(features[0]) <= 0)
@@ -238,6 +248,11 @@ class ListTests(PySparkTestCase):
self.assertTrue(rf_model.predict(features[2]) <= 0)
self.assertTrue(rf_model.predict(features[3]) > 0)
+ rf_model_dir = os.path.join(temp_dir, "rf")
+ rf_model.save(self.sc, rf_model_dir)
+ same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
+ self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())
+
gbt_model = GradientBoostedTrees.trainClassifier(
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
self.assertTrue(gbt_model.predict(features[0]) <= 0)
@@ -245,6 +260,16 @@ class ListTests(PySparkTestCase):
self.assertTrue(gbt_model.predict(features[2]) <= 0)
self.assertTrue(gbt_model.predict(features[3]) > 0)
+ gbt_model_dir = os.path.join(temp_dir, "gbt")
+ gbt_model.save(self.sc, gbt_model_dir)
+ same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
+ self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())
+
+ try:
+ os.removedirs(temp_dir)
+ except OSError:
+ pass
+
def test_regression(self):
from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
RidgeRegressionWithSGD