author     Burak Yavuz <brkyvz@gmail.com>       2015-05-12 12:17:05 -0700
committer  Xiangrui Meng <meng@databricks.com>  2015-05-12 12:17:05 -0700
commit     8e935b0a214f8b477fe9579fbf6a2d0a27b59118 (patch)
tree       409915759b0929b79e0b85c3a824a6b2e4f5f591
parent     b9b01f44f687f35460db9e0ab0b426897747596a (diff)
[SPARK-7487] [ML] Feature Parity in PySpark for ml.regression
Added LinearRegression Python API

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6016 from brkyvz/ml-reg and squashes the following commits:

11c9ef9 [Burak Yavuz] address comments
1027a40 [Burak Yavuz] fix typo
4c699ad [Burak Yavuz] added tree regressor api
8afead2 [Burak Yavuz] made mixin for DT
fa51c74 [Burak Yavuz] save additions
0640d48 [Burak Yavuz] added ml.regression
82aac48 [Burak Yavuz] added linear regression
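For reference, the new API can be exercised end to end in a few lines. The sketch below is assembled from the doctests added in python/pyspark/ml/regression.py later in this patch and assumes a local Spark installation; the application name is arbitrary.

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import LinearRegression

sc = SparkContext("local[2]", "linear regression example")
sqlContext = SQLContext(sc)

# Two training points: label 1.0 at feature 1.0 and label 0.0 at the zero vector.
df = sqlContext.createDataFrame([
    (1.0, Vectors.dense(1.0)),
    (0.0, Vectors.sparse(1, [], []))], ["label", "features"])

lr = LinearRegression(maxIter=5, regParam=0.0)  # regParam=0.0 gives ordinary least squares
model = lr.fit(df)

test = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
print(model.transform(test).head().prediction)  # -1.0, as in the doctest below

sc.stop()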
-rw-r--r--  python/docs/pyspark.ml.rst                           16
-rw-r--r--  python/pyspark/ml/classification.py                   2
-rw-r--r--  python/pyspark/ml/param/_shared_params_code_gen.py   69
-rw-r--r--  python/pyspark/ml/param/shared.py                    108
-rw-r--r--  python/pyspark/ml/regression.py                      520
-rwxr-xr-x  python/run-tests                                       2
6 files changed, 709 insertions(+), 8 deletions(-)
diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst
index 220ea595df..a42217a4ae 100644
--- a/python/docs/pyspark.ml.rst
+++ b/python/docs/pyspark.ml.rst
@@ -25,6 +25,22 @@ pyspark.ml.classification module
:undoc-members:
:inherited-members:
+pyspark.ml.recommendation module
+--------------------------------
+
+.. automodule:: pyspark.ml.recommendation
+ :members:
+ :undoc-members:
+ :inherited-members:
+
+pyspark.ml.regression module
+----------------------------
+
+.. automodule:: pyspark.ml.regression
+ :members:
+ :undoc-members:
+ :inherited-members:
+
pyspark.ml.tuning module
--------------------------------
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 45754bc9d4..8a009c4ac7 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -91,7 +91,7 @@ if __name__ == "__main__":
globs = globals().copy()
# The small batch size here ensures that we see multiple batches,
# even in these small test examples:
- sc = SparkContext("local[2]", "ml.feature tests")
+ sc = SparkContext("local[2]", "ml.classification tests")
sqlContext = SQLContext(sc)
globs['sc'] = sc
globs['sqlContext'] = sqlContext
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 3be0979b92..4a5cc6e64f 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -38,16 +38,13 @@ header = """#
# python _shared_params_code_gen.py > shared.py
-def _gen_param_code(name, doc, defaultValueStr):
+def _gen_param_header(name, doc, defaultValueStr):
"""
- Generates Python code for a shared param class.
+ Generates the header part for a shared param class.
:param name: param name
:param doc: param doc
- :param defaultValueStr: string representation of the default value
- :return: code string
"""
- # TODO: How to correctly inherit instance attributes?
template = '''class Has$Name(Params):
"""
Mixin for param $name: $doc.
@@ -61,8 +58,27 @@ def _gen_param_code(name, doc, defaultValueStr):
#: param for $doc
self.$name = Param(self, "$name", "$doc")
if $defaultValueStr is not None:
- self._setDefault($name=$defaultValueStr)
+ self._setDefault($name=$defaultValueStr)'''
+
+ Name = name[0].upper() + name[1:]
+ return template \
+ .replace("$name", name) \
+ .replace("$Name", Name) \
+ .replace("$doc", doc) \
+ .replace("$defaultValueStr", str(defaultValueStr))
+
+def _gen_param_code(name, doc, defaultValueStr):
+ """
+ Generates Python code for a shared param class.
+
+ :param name: param name
+ :param doc: param doc
+ :param defaultValueStr: string representation of the default value
+ :return: code string
+ """
+ # TODO: How to correctly inherit instance attributes?
+ template = '''
def set$Name(self, value):
"""
Sets the value of :py:attr:`$name`.
@@ -104,5 +120,44 @@ if __name__ == "__main__":
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = []
for name, doc, defaultValueStr in shared:
- code.append(_gen_param_code(name, doc, defaultValueStr))
+ param_code = _gen_param_header(name, doc, defaultValueStr)
+ code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
+
+ decisionTreeParams = [
+ ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
+ "depth 1 means 1 internal node + 2 leaf nodes."),
+ ("maxBins", "Max number of bins for" +
+ " discretizing continuous features. Must be >=2 and >= number of categories for any" +
+ " categorical feature."),
+ ("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
+ "If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
+ "split will be discarded as invalid. Should be >= 1."),
+ ("minInfoGain", "Minimum information gain for a split to be considered at a tree node."),
+ ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
+ ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
+ "instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
+ "Caching can speed up training of deeper trees.")]
+
+ decisionTreeCode = '''class DecisionTreeParams(Params):
+ """
+ Mixin for Decision Tree parameters.
+ """
+
+ # a placeholder to make it appear in the generated doc
+ $dummyPlaceHolders
+
+ def __init__(self):
+ super(DecisionTreeParams, self).__init__()
+ $realParams'''
+ dtParamMethods = ""
+ dummyPlaceholders = ""
+ realParams = ""
+ paramTemplate = """$name = Param($owner, "$name", "$doc")"""
+ for name, doc in decisionTreeParams:
+ variable = paramTemplate.replace("$name", name).replace("$doc", doc)
+ dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n "
+ realParams += "self." + variable.replace("$owner", "self") + "\n "
+ dtParamMethods += _gen_param_code(name, doc, None) + "\n"
+ code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders)
+ .replace("$realParams", realParams) + dtParamMethods)
print("\n\n\n".join(code))
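For illustration, the template substitution that _gen_param_header and _gen_param_code perform above can be exercised in isolation. The following standalone sketch is not part of the patch and uses a made-up param name; it only demonstrates the placeholder-replacement technique.

# Minimal sketch of the generator's string-template approach: a per-param
# template is expanded by plain str.replace() calls on $name/$Name placeholders.
def gen_getter(name):
    template = '''
    def get$Name(self):
        """
        Gets the value of $name or its default value.
        """
        return self.getOrDefault(self.$name)'''
    Name = name[0].upper() + name[1:]  # capitalize the first letter for the method name
    return template.replace("$name", name).replace("$Name", Name)

print(gen_getter("featureScaling"))  # "featureScaling" is hypothetical, for illustration only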
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 4b22322b89..779cabe853 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -424,3 +424,111 @@ class HasStepSize(Params):
Gets the value of stepSize or its default value.
"""
return self.getOrDefault(self.stepSize)
+
+
+class DecisionTreeParams(Params):
+ """
+ Mixin for Decision Tree parameters.
+ """
+
+ # a placeholder to make it appear in the generated doc
+ maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
+ maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
+ minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
+ minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
+ maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
+ cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+
+ def __init__(self):
+ super(DecisionTreeParams, self).__init__()
+ #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
+ #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.
+ self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
+ #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
+ self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
+ #: param for Minimum information gain for a split to be considered at a tree node.
+ self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
+ #: param for Maximum memory in MB allocated to histogram aggregation.
+ self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
+ #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
+ self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+
+ def setMaxDepth(self, value):
+ """
+ Sets the value of :py:attr:`maxDepth`.
+ """
+ self.paramMap[self.maxDepth] = value
+ return self
+
+ def getMaxDepth(self):
+ """
+ Gets the value of maxDepth or its default value.
+ """
+ return self.getOrDefault(self.maxDepth)
+
+ def setMaxBins(self, value):
+ """
+ Sets the value of :py:attr:`maxBins`.
+ """
+ self.paramMap[self.maxBins] = value
+ return self
+
+ def getMaxBins(self):
+ """
+ Gets the value of maxBins or its default value.
+ """
+ return self.getOrDefault(self.maxBins)
+
+ def setMinInstancesPerNode(self, value):
+ """
+ Sets the value of :py:attr:`minInstancesPerNode`.
+ """
+ self.paramMap[self.minInstancesPerNode] = value
+ return self
+
+ def getMinInstancesPerNode(self):
+ """
+ Gets the value of minInstancesPerNode or its default value.
+ """
+ return self.getOrDefault(self.minInstancesPerNode)
+
+ def setMinInfoGain(self, value):
+ """
+ Sets the value of :py:attr:`minInfoGain`.
+ """
+ self.paramMap[self.minInfoGain] = value
+ return self
+
+ def getMinInfoGain(self):
+ """
+ Gets the value of minInfoGain or its default value.
+ """
+ return self.getOrDefault(self.minInfoGain)
+
+ def setMaxMemoryInMB(self, value):
+ """
+ Sets the value of :py:attr:`maxMemoryInMB`.
+ """
+ self.paramMap[self.maxMemoryInMB] = value
+ return self
+
+ def getMaxMemoryInMB(self):
+ """
+ Gets the value of maxMemoryInMB or its default value.
+ """
+ return self.getOrDefault(self.maxMemoryInMB)
+
+ def setCacheNodeIds(self, value):
+ """
+ Sets the value of :py:attr:`cacheNodeIds`.
+ """
+ self.paramMap[self.cacheNodeIds] = value
+ return self
+
+ def getCacheNodeIds(self):
+ """
+ Gets the value of cacheNodeIds or its default value.
+ """
+ return self.getOrDefault(self.cacheNodeIds)
+
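The generated DecisionTreeParams mixin above is inherited by the tree estimators added in regression.py below. A minimal usage sketch, assuming a working pyspark environment:

from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor()                 # mixes in DecisionTreeParams
dt.setMaxDepth(3).setMinInstancesPerNode(2)  # setters return self, so calls chain
print(dt.getMaxDepth())                      # 3, the explicitly set value
print(dt.getMaxMemoryInMB())                 # 256, the default set in __init__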
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
new file mode 100644
index 0000000000..0ab5c6c3d2
--- /dev/null
+++ b/python/pyspark/ml/regression.py
@@ -0,0 +1,520 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.ml.util import keyword_only
+from pyspark.ml.wrapper import JavaEstimator, JavaModel
+from pyspark.ml.param.shared import *
+from pyspark.mllib.common import inherit_doc
+
+
+__all__ = ['DecisionTreeRegressor', 'DecisionTreeRegressionModel', 'GBTRegressor',
+ 'GBTRegressionModel', 'LinearRegression', 'LinearRegressionModel',
+ 'RandomForestRegressor', 'RandomForestRegressionModel']
+
+
+@inherit_doc
+class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
+ HasRegParam, HasTol):
+ """
+ Linear regression.
+
+ The learning objective is to minimize the squared error, with regularization.
+ The specific squared error loss function used is:
+ L = 1/2n ||A weights - y||^2
+
+ This supports multiple types of regularization:
+ - none (a.k.a. ordinary least squares)
+ - L2 (ridge regression)
+ - L1 (Lasso)
+ - L2 + L1 (elastic net)
+
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> df = sqlContext.createDataFrame([
+ ... (1.0, Vectors.dense(1.0)),
+ ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
+ >>> lr = LinearRegression(maxIter=5, regParam=0.0)
+ >>> model = lr.fit(df)
+ >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
+ >>> model.transform(test0).head().prediction
+ -1.0
+ >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
+ >>> model.transform(test1).head().prediction
+ 1.0
+ >>> lr.setParams("vector")
+ Traceback (most recent call last):
+ ...
+ TypeError: Method setParams forces keyword arguments.
+ """
+ _java_class = "org.apache.spark.ml.regression.LinearRegression"
+ # a placeholder to make it appear in the generated doc
+ elasticNetParam = \
+ Param(Params._dummy(), "elasticNetParam",
+ "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
+ "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
+
+ @keyword_only
+ def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6):
+ """
+ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
+ """
+ super(LinearRegression, self).__init__()
+ #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty
+ # is an L2 penalty. For alpha = 1, it is an L1 penalty.
+ self.elasticNetParam = \
+ Param(self, "elasticNetParam",
+ "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " +
+ "is an L2 penalty. For alpha = 1, it is an L1 penalty.")
+ self._setDefault(maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
+ kwargs = self.__init__._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6):
+ """
+ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
+ Sets params for linear regression.
+ """
+ kwargs = self.setParams._input_kwargs
+ return self._set(**kwargs)
+
+ def _create_model(self, java_model):
+ return LinearRegressionModel(java_model)
+
+ def setElasticNetParam(self, value):
+ """
+ Sets the value of :py:attr:`elasticNetParam`.
+ """
+ self.paramMap[self.elasticNetParam] = value
+ return self
+
+ def getElasticNetParam(self):
+ """
+ Gets the value of elasticNetParam or its default value.
+ """
+ return self.getOrDefault(self.elasticNetParam)
+
+
+class LinearRegressionModel(JavaModel):
+ """
+ Model fitted by LinearRegression.
+ """
+
+
+class TreeRegressorParams(object):
+ """
+ Private class to track supported impurity measures.
+ """
+ supportedImpurities = ["variance"]
+
+
+class RandomForestParams(object):
+ """
+ Private class to track supported random forest parameters.
+ """
+ supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"]
+
+
+class GBTParams(object):
+ """
+ Private class to track supported GBT params.
+ """
+ supportedLossTypes = ["squared", "absolute"]
+
+
+@inherit_doc
+class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
+ DecisionTreeParams, HasCheckpointInterval):
+ """
+ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
+ learning algorithm for regression.
+ It supports both continuous and categorical features.
+
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> df = sqlContext.createDataFrame([
+ ... (1.0, Vectors.dense(1.0)),
+ ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
+ >>> dt = DecisionTreeRegressor(maxDepth=2)
+ >>> model = dt.fit(df)
+ >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
+ >>> model.transform(test0).head().prediction
+ 0.0
+ >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
+ >>> model.transform(test1).head().prediction
+ 1.0
+ """
+
+ _java_class = "org.apache.spark.ml.regression.DecisionTreeRegressor"
+ # a placeholder to make it appear in the generated doc
+ impurity = Param(Params._dummy(), "impurity",
+ "Criterion used for information gain calculation (case-insensitive). " +
+ "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
+
+ @keyword_only
+ def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance"):
+ """
+ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance")
+ """
+ super(DecisionTreeRegressor, self).__init__()
+ #: param for Criterion used for information gain calculation (case-insensitive).
+ self.impurity = \
+ Param(self, "impurity",
+ "Criterion used for information gain calculation (case-insensitive). " +
+ "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
+ self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ impurity="variance")
+ kwargs = self.__init__._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ impurity="variance"):
+ """
+ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
+ impurity="variance")
+ Sets params for the DecisionTreeRegressor.
+ """
+ kwargs = self.setParams._input_kwargs
+ return self._set(**kwargs)
+
+ def _create_model(self, java_model):
+ return DecisionTreeRegressionModel(java_model)
+
+ def setImpurity(self, value):
+ """
+ Sets the value of :py:attr:`impurity`.
+ """
+ self.paramMap[self.impurity] = value
+ return self
+
+ def getImpurity(self):
+ """
+ Gets the value of impurity or its default value.
+ """
+ return self.getOrDefault(self.impurity)
+
+
+class DecisionTreeRegressionModel(JavaModel):
+ """
+ Model fitted by DecisionTreeRegressor.
+ """
+
+
+@inherit_doc
+class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
+ DecisionTreeParams, HasCheckpointInterval):
+ """
+ `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
+ learning algorithm for regression.
+ It supports both continuous and categorical features.
+
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> df = sqlContext.createDataFrame([
+ ... (1.0, Vectors.dense(1.0)),
+ ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
+ >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2)
+ >>> model = rf.fit(df)
+ >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
+ >>> model.transform(test0).head().prediction
+ 0.0
+ >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
+ >>> model.transform(test1).head().prediction
+ 0.5
+ """
+
+ _java_class = "org.apache.spark.ml.regression.RandomForestRegressor"
+ # a placeholder to make it appear in the generated doc
+ impurity = Param(Params._dummy(), "impurity",
+ "Criterion used for information gain calculation (case-insensitive). " +
+ "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
+ subsamplingRate = Param(Params._dummy(), "subsamplingRate",
+ "Fraction of the training data used for learning each decision tree, " +
+ "in range (0, 1].")
+ numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1)")
+ featureSubsetStrategy = \
+ Param(Params._dummy(), "featureSubsetStrategy",
+ "The number of features to consider for splits at each tree node. Supported " +
+ "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
+
+ @keyword_only
+ def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance",
+ numTrees=20, featureSubsetStrategy="auto", seed=42):
+ """
+ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", \
+ numTrees=20, featureSubsetStrategy="auto", seed=42)
+ """
+ super(RandomForestRegressor, self).__init__()
+ #: param for Criterion used for information gain calculation (case-insensitive).
+ self.impurity = \
+ Param(self, "impurity",
+ "Criterion used for information gain calculation (case-insensitive). " +
+ "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
+ #: param for Fraction of the training data used for learning each decision tree,
+ # in range (0, 1]
+ self.subsamplingRate = Param(self, "subsamplingRate",
+ "Fraction of the training data used for learning each " +
+ "decision tree, in range (0, 1].")
+ #: param for Number of trees to train (>= 1)
+ self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1)")
+ #: param for The number of features to consider for splits at each tree node
+ self.featureSubsetStrategy = \
+ Param(self, "featureSubsetStrategy",
+ "The number of features to consider for splits at each tree node. Supported " +
+ "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
+ self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+ impurity="variance", numTrees=20, featureSubsetStrategy="auto")
+ kwargs = self.__init__._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42,
+ impurity="variance", numTrees=20, featureSubsetStrategy="auto"):
+ """
+ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=42, \
+ impurity="variance", numTrees=20, featureSubsetStrategy="auto")
+ Sets params for the RandomForestRegressor.
+ """
+ kwargs = self.setParams._input_kwargs
+ return self._set(**kwargs)
+
+ def _create_model(self, java_model):
+ return RandomForestRegressionModel(java_model)
+
+ def setImpurity(self, value):
+ """
+ Sets the value of :py:attr:`impurity`.
+ """
+ self.paramMap[self.impurity] = value
+ return self
+
+ def getImpurity(self):
+ """
+ Gets the value of impurity or its default value.
+ """
+ return self.getOrDefault(self.impurity)
+
+ def setSubsamplingRate(self, value):
+ """
+ Sets the value of :py:attr:`subsamplingRate`.
+ """
+ self.paramMap[self.subsamplingRate] = value
+ return self
+
+ def getSubsamplingRate(self):
+ """
+ Gets the value of subsamplingRate or its default value.
+ """
+ return self.getOrDefault(self.subsamplingRate)
+
+ def setNumTrees(self, value):
+ """
+ Sets the value of :py:attr:`numTrees`.
+ """
+ self.paramMap[self.numTrees] = value
+ return self
+
+ def getNumTrees(self):
+ """
+ Gets the value of numTrees or its default value.
+ """
+ return self.getOrDefault(self.numTrees)
+
+ def setFeatureSubsetStrategy(self, value):
+ """
+ Sets the value of :py:attr:`featureSubsetStrategy`.
+ """
+ self.paramMap[self.featureSubsetStrategy] = value
+ return self
+
+ def getFeatureSubsetStrategy(self):
+ """
+ Gets the value of featureSubsetStrategy or its default value.
+ """
+ return self.getOrDefault(self.featureSubsetStrategy)
+
+
+class RandomForestRegressionModel(JavaModel):
+ """
+ Model fitted by RandomForestRegressor.
+ """
+
+
+@inherit_doc
+class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
+ DecisionTreeParams, HasCheckpointInterval):
+ """
+ `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
+ learning algorithm for regression.
+ It supports both continuous and categorical features.
+
+ >>> from pyspark.mllib.linalg import Vectors
+ >>> df = sqlContext.createDataFrame([
+ ... (1.0, Vectors.dense(1.0)),
+ ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
+ >>> gbt = GBTRegressor(maxIter=5, maxDepth=2)
+ >>> model = gbt.fit(df)
+ >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
+ >>> model.transform(test0).head().prediction
+ 0.0
+ >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
+ >>> model.transform(test1).head().prediction
+ 1.0
+ """
+
+ _java_class = "org.apache.spark.ml.regression.GBTRegressor"
+ # a placeholder to make it appear in the generated doc
+ lossType = Param(Params._dummy(), "lossType",
+ "Loss function which GBT tries to minimize (case-insensitive). " +
+ "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
+ subsamplingRate = Param(Params._dummy(), "subsamplingRate",
+ "Fraction of the training data used for learning each decision tree, " +
+ "in range (0, 1].")
+ stepSize = Param(Params._dummy(), "stepSize",
+ "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the " +
+ "contribution of each estimator")
+
+ @keyword_only
+ def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="squared",
+ maxIter=20, stepSize=0.1):
+ """
+ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="squared", \
+ maxIter=20, stepSize=0.1)
+ """
+ super(GBTRegressor, self).__init__()
+ #: param for Loss function which GBT tries to minimize (case-insensitive).
+ self.lossType = Param(self, "lossType",
+ "Loss function which GBT tries to minimize (case-insensitive). " +
+ "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
+ #: Fraction of the training data used for learning each decision tree, in range (0, 1].
+ self.subsamplingRate = Param(self, "subsamplingRate",
+ "Fraction of the training data used for learning each " +
+ "decision tree, in range (0, 1].")
+ #: Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of
+ # each estimator
+ self.stepSize = Param(self, "stepSize",
+ "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " +
+ "the contribution of each estimator")
+ self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ lossType="squared", maxIter=20, stepSize=0.1)
+ kwargs = self.__init__._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ lossType="squared", maxIter=20, stepSize=0.1):
+ """
+ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
+ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
+ lossType="squared", maxIter=20, stepSize=0.1)
+ Sets params for Gradient Boosted Tree Regression.
+ """
+ kwargs = self.setParams._input_kwargs
+ return self._set(**kwargs)
+
+ def _create_model(self, java_model):
+ return GBTRegressionModel(java_model)
+
+ def setLossType(self, value):
+ """
+ Sets the value of :py:attr:`lossType`.
+ """
+ self.paramMap[self.lossType] = value
+ return self
+
+ def getLossType(self):
+ """
+ Gets the value of lossType or its default value.
+ """
+ return self.getOrDefault(self.lossType)
+
+ def setSubsamplingRate(self, value):
+ """
+ Sets the value of :py:attr:`subsamplingRate`.
+ """
+ self.paramMap[self.subsamplingRate] = value
+ return self
+
+ def getSubsamplingRate(self):
+ """
+ Gets the value of subsamplingRate or its default value.
+ """
+ return self.getOrDefault(self.subsamplingRate)
+
+ def setStepSize(self, value):
+ """
+ Sets the value of :py:attr:`stepSize`.
+ """
+ self.paramMap[self.stepSize] = value
+ return self
+
+ def getStepSize(self):
+ """
+ Gets the value of stepSize or its default value.
+ """
+ return self.getOrDefault(self.stepSize)
+
+
+class GBTRegressionModel(JavaModel):
+ """
+ Model fitted by GBTRegressor.
+ """
+
+
+if __name__ == "__main__":
+ import doctest
+ from pyspark.context import SparkContext
+ from pyspark.sql import SQLContext
+ globs = globals().copy()
+ # The small batch size here ensures that we see multiple batches,
+ # even in these small test examples:
+ sc = SparkContext("local[2]", "ml.regression tests")
+ sqlContext = SQLContext(sc)
+ globs['sc'] = sc
+ globs['sqlContext'] = sqlContext
+ (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+ sc.stop()
+ if failure_count:
+ exit(-1)
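The tree-based estimators above follow the same fit/transform pattern as LinearRegression. A combined sketch drawn from the doctests, assuming sc and sqlContext are set up as in the __main__ block:

from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor

df = sqlContext.createDataFrame([
    (1.0, Vectors.dense(1.0)),
    (0.0, Vectors.sparse(1, [], []))], ["label", "features"])

rf_model = RandomForestRegressor(numTrees=2, maxDepth=2).fit(df)
gbt_model = GBTRegressor(maxIter=5, maxDepth=2).fit(df)

test = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
print(rf_model.transform(test).head().prediction)   # 0.5 in the doctest above
print(gbt_model.transform(test).head().prediction)  # 1.0 in the doctest above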
diff --git a/python/run-tests b/python/run-tests
index f9ca26467f..f2757a3967 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -98,6 +98,8 @@ function run_ml_tests() {
echo "Run ml tests ..."
run_test "pyspark/ml/feature.py"
run_test "pyspark/ml/classification.py"
+ run_test "pyspark/ml/recommendation.py"
+ run_test "pyspark/ml/regression.py"
run_test "pyspark/ml/tuning.py"
run_test "pyspark/ml/tests.py"
run_test "pyspark/ml/evaluation.py"