author     Burak Yavuz <brkyvz@gmail.com>        2015-05-12 12:17:05 -0700
committer  Xiangrui Meng <meng@databricks.com>   2015-05-12 12:17:05 -0700
commit     8e935b0a214f8b477fe9579fbf6a2d0a27b59118 (patch)
tree       409915759b0929b79e0b85c3a824a6b2e4f5f591 /python/pyspark/ml/param
parent     b9b01f44f687f35460db9e0ab0b426897747596a (diff)
download   spark-8e935b0a214f8b477fe9579fbf6a2d0a27b59118.tar.gz
           spark-8e935b0a214f8b477fe9579fbf6a2d0a27b59118.tar.bz2
           spark-8e935b0a214f8b477fe9579fbf6a2d0a27b59118.zip
[SPARK-7487] [ML] Feature Parity in PySpark for ml.regression
Added LinearRegression Python API

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6016 from brkyvz/ml-reg and squashes the following commits:

11c9ef9 [Burak Yavuz] address comments
1027a40 [Burak Yavuz] fix typo
4c699ad [Burak Yavuz] added tree regressor api
8afead2 [Burak Yavuz] made mixin for DT
fa51c74 [Burak Yavuz] save additions
0640d48 [Burak Yavuz] added ml.regression
82aac48 [Burak Yavuz] added linear regression
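For orientation, a minimal usage sketch of the new Python LinearRegression API that this change adds (an illustration, not part of the patch: it assumes a running SQLContext, a hypothetical training DataFrame `df` with "label" and "features" columns, and that the fitted model exposes weights/intercept under these attribute names in this Spark version):

    from pyspark.ml.regression import LinearRegression

    # df is a hypothetical DataFrame with "label" and "features" columns
    lr = LinearRegression(maxIter=10, regParam=0.01, elasticNetParam=0.5)
    model = lr.fit(df)                     # fit returns a LinearRegressionModel
    print(model.weights, model.intercept)  # fitted coefficients (attribute names assumed for this version)
    predictions = model.transform(df)      # appends a "prediction" column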
Diffstat (limited to 'python/pyspark/ml/param')
-rw-r--r--  python/pyspark/ml/param/_shared_params_code_gen.py |  69
-rw-r--r--  python/pyspark/ml/param/shared.py                  | 108
2 files changed, 170 insertions, 7 deletions
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 3be0979b92..4a5cc6e64f 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -38,16 +38,13 @@ header = """#
# python _shared_params_code_gen.py > shared.py
-def _gen_param_code(name, doc, defaultValueStr):
+def _gen_param_header(name, doc, defaultValueStr):
"""
- Generates Python code for a shared param class.
+ Generates the header part for shared variables
:param name: param name
:param doc: param doc
- :param defaultValueStr: string representation of the default value
- :return: code string
"""
- # TODO: How to correctly inherit instance attributes?
template = '''class Has$Name(Params):
"""
Mixin for param $name: $doc.
@@ -61,8 +58,27 @@ def _gen_param_code(name, doc, defaultValueStr):
#: param for $doc
self.$name = Param(self, "$name", "$doc")
if $defaultValueStr is not None:
- self._setDefault($name=$defaultValueStr)
+ self._setDefault($name=$defaultValueStr)'''
+
+ Name = name[0].upper() + name[1:]
+ return template \
+ .replace("$name", name) \
+ .replace("$Name", Name) \
+ .replace("$doc", doc) \
+ .replace("$defaultValueStr", str(defaultValueStr))
+
+def _gen_param_code(name, doc, defaultValueStr):
+ """
+ Generates Python code for a shared param class.
+
+ :param name: param name
+ :param doc: param doc
+ :param defaultValueStr: string representation of the default value
+ :return: code string
+ """
+ # TODO: How to correctly inherit instance attributes?
+ template = '''
def set$Name(self, value):
"""
Sets the value of :py:attr:`$name`.
@@ -104,5 +120,44 @@ if __name__ == "__main__":
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = []
for name, doc, defaultValueStr in shared:
- code.append(_gen_param_code(name, doc, defaultValueStr))
+ param_code = _gen_param_header(name, doc, defaultValueStr)
+ code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
+
+ decisionTreeParams = [
+ ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
+ "depth 1 means 1 internal node + 2 leaf nodes."),
+ ("maxBins", "Max number of bins for" +
+ " discretizing continuous features. Must be >=2 and >= number of categories for any" +
+ " categorical feature."),
+ ("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
+ "If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
+ "split will be discarded as invalid. Should be >= 1."),
+ ("minInfoGain", "Minimum information gain for a split to be considered at a tree node."),
+ ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
+ ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
+ "instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
+ "Caching can speed up training of deeper trees.")]
+
+ decisionTreeCode = '''class DecisionTreeParams(Params):
+ """
+ Mixin for Decision Tree parameters.
+ """
+
+ # a placeholder to make it appear in the generated doc
+ $dummyPlaceHolders
+
+ def __init__(self):
+ super(DecisionTreeParams, self).__init__()
+ $realParams'''
+ dtParamMethods = ""
+ dummyPlaceholders = ""
+ realParams = ""
+ paramTemplate = """$name = Param($owner, "$name", "$doc")"""
+ for name, doc in decisionTreeParams:
+ variable = paramTemplate.replace("$name", name).replace("$doc", doc)
+ dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n "
+ realParams += "self." + variable.replace("$owner", "self") + "\n "
+ dtParamMethods += _gen_param_code(name, doc, None) + "\n"
+ code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders)
+ .replace("$realParams", realParams) + dtParamMethods)
print("\n\n\n".join(code))
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 4b22322b89..779cabe853 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -424,3 +424,111 @@ class HasStepSize(Params):
Gets the value of stepSize or its default value.
"""
return self.getOrDefault(self.stepSize)
+
+
+class DecisionTreeParams(Params):
+ """
+ Mixin for Decision Tree parameters.
+ """
+
+ # a placeholder to make it appear in the generated doc
+ maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
+ maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
+ minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
+ minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
+ maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
+ cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+
+ def __init__(self):
+ super(DecisionTreeParams, self).__init__()
+ #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
+ #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.
+ self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
+ #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
+ self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
+ #: param for Minimum information gain for a split to be considered at a tree node.
+ self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
+ #: param for Maximum memory in MB allocated to histogram aggregation.
+ self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
+ #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
+ self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+
+ def setMaxDepth(self, value):
+ """
+ Sets the value of :py:attr:`maxDepth`.
+ """
+ self.paramMap[self.maxDepth] = value
+ return self
+
+ def getMaxDepth(self):
+ """
+ Gets the value of maxDepth or its default value.
+ """
+ return self.getOrDefault(self.maxDepth)
+
+ def setMaxBins(self, value):
+ """
+ Sets the value of :py:attr:`maxBins`.
+ """
+ self.paramMap[self.maxBins] = value
+ return self
+
+ def getMaxBins(self):
+ """
+ Gets the value of maxBins or its default value.
+ """
+ return self.getOrDefault(self.maxBins)
+
+ def setMinInstancesPerNode(self, value):
+ """
+ Sets the value of :py:attr:`minInstancesPerNode`.
+ """
+ self.paramMap[self.minInstancesPerNode] = value
+ return self
+
+ def getMinInstancesPerNode(self):
+ """
+ Gets the value of minInstancesPerNode or its default value.
+ """
+ return self.getOrDefault(self.minInstancesPerNode)
+
+ def setMinInfoGain(self, value):
+ """
+ Sets the value of :py:attr:`minInfoGain`.
+ """
+ self.paramMap[self.minInfoGain] = value
+ return self
+
+ def getMinInfoGain(self):
+ """
+ Gets the value of minInfoGain or its default value.
+ """
+ return self.getOrDefault(self.minInfoGain)
+
+ def setMaxMemoryInMB(self, value):
+ """
+ Sets the value of :py:attr:`maxMemoryInMB`.
+ """
+ self.paramMap[self.maxMemoryInMB] = value
+ return self
+
+ def getMaxMemoryInMB(self):
+ """
+ Gets the value of maxMemoryInMB or its default value.
+ """
+ return self.getOrDefault(self.maxMemoryInMB)
+
+ def setCacheNodeIds(self, value):
+ """
+ Sets the value of :py:attr:`cacheNodeIds`.
+ """
+ self.paramMap[self.cacheNodeIds] = value
+ return self
+
+ def getCacheNodeIds(self):
+ """
+ Gets the value of cacheNodeIds or its default value.
+ """
+ return self.getOrDefault(self.cacheNodeIds)
+
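Finally, a hedged usage sketch of the chainable setters and getters that DecisionTreeParams contributes, shown through one of the tree estimators added in the same PR (assumes pyspark is importable and a SparkContext is running; `training_df` is a hypothetical DataFrame):

    from pyspark.ml.regression import DecisionTreeRegressor

    dt = DecisionTreeRegressor(featuresCol="features", labelCol="label")
    # each set* method writes to the param map and returns self, so calls chain;
    # get* returns the explicitly set value or falls back to the default
    dt.setMaxDepth(5).setMaxBins(32).setMinInstancesPerNode(1)
    print(dt.getMaxDepth(), dt.getMaxBins())
    # model = dt.fit(training_df)  # training_df: hypothetical (label, features) DataFrame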