aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-04-09 15:10:10 -0700
committerXiangrui Meng <meng@databricks.com>2015-04-09 15:10:10 -0700
commita0411aebee7c134f0426f0c2b2cb4c1c7856a291 (patch)
treeb64884fa9957f5ce3102d39c621df5a4b7a32aa0 /python
parent7d92db342e01fa694d3522fb8d2254d6297a4203 (diff)
downloadspark-a0411aebee7c134f0426f0c2b2cb4c1c7856a291.tar.gz
spark-a0411aebee7c134f0426f0c2b2cb4c1c7856a291.tar.bz2
spark-a0411aebee7c134f0426f0c2b2cb4c1c7856a291.zip
[SPARK-6264] [MLLIB] Support FPGrowth algorithm in Python API
Support FPGrowth algorithm in Python API. Should we remove "Experimental" which were marked for FPGrowth and FPGrowthModel in Scala? jkbradley Author: Yanbo Liang <ybliang8@gmail.com> Closes #5213 from yanboliang/spark-6264 and squashes the following commits: ed62ead [Yanbo Liang] trigger jenkins 8ce0359 [Yanbo Liang] fix docstring style 544c725 [Yanbo Liang] address comments a2d7cf7 [Yanbo Liang] add doc for FPGrowth.train() dcf7d73 [Yanbo Liang] add python doc b18fd07 [Yanbo Liang] trigger jenkins 2c951b8 [Yanbo Liang] fix typos 7f62c8f [Yanbo Liang] add fpm to __init__.py b96206a [Yanbo Liang] Support FPGrowth algorithm in Python API
Diffstat (limited to 'python')
-rw-r--r--python/docs/pyspark.mllib.rst7
-rw-r--r--python/pyspark/mllib/__init__.py2
-rw-r--r--python/pyspark/mllib/fpm.py81
-rwxr-xr-xpython/run-tests1
4 files changed, 90 insertions, 1 deletion
diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst
index 15101470af..26ece4c2c3 100644
--- a/python/docs/pyspark.mllib.rst
+++ b/python/docs/pyspark.mllib.rst
@@ -31,6 +31,13 @@ pyspark.mllib.feature module
:undoc-members:
:show-inheritance:
+pyspark.mllib.fpm module
+------------------------
+
+.. automodule:: pyspark.mllib.fpm
+ :members:
+ :undoc-members:
+
pyspark.mllib.linalg module
---------------------------
diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index 6449800d9c..f2ef573fe9 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -25,7 +25,7 @@ import numpy
if numpy.version.version < '1.4':
raise Exception("MLlib requires NumPy 1.4+")
-__all__ = ['classification', 'clustering', 'feature', 'linalg', 'random',
+__all__ = ['classification', 'clustering', 'feature', 'fpm', 'linalg', 'random',
'recommendation', 'regression', 'stat', 'tree', 'util']
import sys
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py
new file mode 100644
index 0000000000..3aa6d79d70
--- /dev/null
+++ b/python/pyspark/mllib/fpm.py
@@ -0,0 +1,81 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark import SparkContext
+from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc
+
+__all__ = ['FPGrowth', 'FPGrowthModel']
+
+
+@inherit_doc
+class FPGrowthModel(JavaModelWrapper):
+
+    """
+    .. note:: Experimental
+
+    An FP-Growth model for mining frequent itemsets
+    using the Parallel FP-Growth algorithm.
+
+    >>> data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
+    >>> rdd = sc.parallelize(data, 2)
+    >>> model = FPGrowth.train(rdd, 0.6, 2)
+    >>> sorted(model.freqItemsets().collect())
+    [([u'a'], 4), ([u'c'], 3), ([u'c', u'a'], 3)]
+    """
+
+    def freqItemsets(self):
+        """
+        Get the frequent itemsets of this model.
+
+        Returns an RDD of (itemset, frequency) pairs, as shown in the
+        class doctest above (itemsets are lists of items).
+        """
+        return self.call("getFreqItemsets")
+
+
+class FPGrowth(object):
+    """
+    .. note:: Experimental
+
+    A Parallel FP-growth algorithm to mine frequent itemsets.
+    """
+
+    @classmethod
+    def train(cls, data, minSupport=0.3, numPartitions=-1):
+        """
+        Computes an FP-Growth model that contains frequent itemsets.
+
+        :param data: The input data set, each element
+            contains a transaction.
+        :param minSupport: The minimal support level
+            (default: `0.3`).
+        :param numPartitions: The number of partitions used by parallel
+            FP-growth (default: same as input data).
+        :return: An :class:`FPGrowthModel` wrapping the trained Java model.
+        """
+        # Coerce explicitly so callers may pass e.g. an int support or a
+        # float partition count without confusing the JVM-side call.
+        model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions))
+        return FPGrowthModel(model)
+
+
+def _test():
+    """
+    Run this module's doctests against a local 4-thread SparkContext,
+    exiting non-zero if any doctest fails.
+    """
+    import doctest
+    import pyspark.mllib.fpm
+    globs = pyspark.mllib.fpm.__dict__.copy()
+    # The doctests reference a SparkContext bound to the name `sc`.
+    globs['sc'] = SparkContext('local[4]', 'PythonTest')
+    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+    # Stop the context before exiting so the JVM shuts down cleanly.
+    globs['sc'].stop()
+    if failure_count:
+        exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/python/run-tests b/python/run-tests
index b7630c356c..f569a56fb7 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -77,6 +77,7 @@ function run_mllib_tests() {
run_test "pyspark/mllib/clustering.py"
run_test "pyspark/mllib/evaluation.py"
run_test "pyspark/mllib/feature.py"
+ run_test "pyspark/mllib/fpm.py"
run_test "pyspark/mllib/linalg.py"
run_test "pyspark/mllib/rand.py"
run_test "pyspark/mllib/recommendation.py"