author    Xiangrui Meng <meng@databricks.com>          2015-05-20 17:26:26 -0700
committer Joseph K. Bradley <joseph@databricks.com>    2015-05-20 17:26:26 -0700
commit    c330e52dae6a3ec7e67ca82e2c2f4ea873976458 (patch)
tree      a6e98424c41b264292f6b8f7b777c7dc8e0547f3 /python
parent    f2faa7af30662e3bdf15780f8719c71108f8e30b (diff)
[SPARK-7762] [MLLIB] set default value for outputCol
Set a default value for `outputCol` instead of forcing users to name it.
This is useful for intermediate transformers in the pipeline.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6289 from mengxr/SPARK-7762 and squashes the following commits:

54edebc [Xiangrui Meng] merge master
bff8667 [Xiangrui Meng] update unit test
171246b [Xiangrui Meng] add unit test for outputCol
a4321bd [Xiangrui Meng] set default value for outputCol
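In user terms, a stage that mixes in HasOutputCol can now be constructed without naming its output column. A hedged usage sketch, assuming a pyspark.ml.feature transformer of this era such as Tokenizer and an already running SparkContext (e.g. a pyspark shell); the exact uid string is illustrative:

# Hedged usage sketch: assumes pyspark.ml.feature.Tokenizer and an active
# SparkContext (constructing ML wrappers needs the JVM).
from pyspark.ml.feature import Tokenizer

tok = Tokenizer(inputCol="text")
# No outputCol given: the default is now derived from the stage's uid.
print(tok.getOutputCol())                           # e.g. Tokenizer_4db9...__output
print(tok.getOutputCol() == tok.uid + "__output")   # True

# An explicit name still takes precedence over the default.
tok2 = Tokenizer(inputCol="text", outputCol="words")
print(tok2.getOutputCol())                          # 'words'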
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/ml/param/_shared_params_code_gen.py | 2
-rw-r--r--  python/pyspark/ml/param/shared.py                  | 3
2 files changed, 3 insertions, 2 deletions
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index ccb929af18..69efc424ec 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -116,7 +116,7 @@ if __name__ == "__main__":
("rawPredictionCol", "raw prediction (a.k.a. confidence) column name", "'rawPrediction'"),
("inputCol", "input column name", None),
("inputCols", "input column names", None),
- ("outputCol", "output column name", None),
+ ("outputCol", "output column name", "self.uid + '__output'"),
("numFeatures", "number of features", None),
("checkpointInterval", "checkpoint interval (>= 1)", None),
("seed", "random seed", "hash(type(self).__name__)"),
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 0b93788899..bc088e4c29 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -280,6 +280,7 @@ class HasOutputCol(Params):
super(HasOutputCol, self).__init__()
#: param for output column name
self.outputCol = Param(self, "outputCol", "output column name")
+ self._setDefault(outputCol=self.uid + '__output')

def setOutputCol(self, value):
"""
@@ -459,7 +460,7 @@ class DecisionTreeParams(Params):
self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
#: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+
def setMaxDepth(self, value):
"""
Sets the value of :py:attr:`maxDepth`.
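Stepping back from the diff, a minimal, self-contained sketch (hypothetical names, not the pyspark classes) of why a uid-derived default helps intermediate pipeline stages: every instance gets its own collision-free output column name, and an explicit name still overrides it.

# Minimal, self-contained sketch (hypothetical names, not pyspark classes)
# of the uid-derived default pattern used in this commit.
import uuid


class Stage(object):
    """Toy stand-in for a pipeline stage with an outputCol param."""

    def __init__(self):
        self.uid = type(self).__name__ + "_" + uuid.uuid4().hex[:8]
        self._default = {"outputCol": self.uid + "__output"}
        self._set = {}

    def setOutputCol(self, value):
        self._set["outputCol"] = value
        return self

    def getOutputCol(self):
        # Explicit values win; otherwise fall back to the uid-derived default.
        return self._set.get("outputCol", self._default["outputCol"])


a, b = Stage(), Stage()
print(a.getOutputCol())  # e.g. Stage_1f3a9c2e__output
print(b.getOutputCol())  # a different name, so two stages in a pipeline never clash
print(a.setOutputCol("features").getOutputCol())  # an explicit name still wins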