From 411ff6afb485c9d8cfc667c9346f836f2529ea9f Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 4 Nov 2015 15:28:19 -0800 Subject: [SPARK-10028][MLLIB][PYTHON] Add Python API for PrefixSpan Author: Yu ISHIKAWA Closes #9469 from yu-iskw/SPARK-10028. --- python/pyspark/mllib/fpm.py | 69 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) (limited to 'python/pyspark') diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index bdabba9602..2039decc0c 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -23,7 +23,7 @@ from pyspark import SparkContext, since from pyspark.rdd import ignore_unicode_prefix from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc -__all__ = ['FPGrowth', 'FPGrowthModel'] +__all__ = ['FPGrowth', 'FPGrowthModel', 'PrefixSpan', 'PrefixSpanModel'] @inherit_doc @@ -85,6 +85,73 @@ class FPGrowth(object): """ +@inherit_doc +@ignore_unicode_prefix +class PrefixSpanModel(JavaModelWrapper): + """ + .. note:: Experimental + + Model fitted by PrefixSpan + + >>> data = [ + ... [["a", "b"], ["c"]], + ... [["a"], ["c", "b"], ["a", "b"]], + ... [["a", "b"], ["e"]], + ... [["f"]]] + >>> rdd = sc.parallelize(data, 2) + >>> model = PrefixSpan.train(rdd) + >>> sorted(model.freqSequences().collect()) + [FreqSequence(sequence=[[u'a']], freq=3), FreqSequence(sequence=[[u'a'], [u'a']], freq=1), ... + + .. versionadded:: 1.6.0 + """ + + @since("1.6.0") + def freqSequences(self): + """Gets frequent sequences""" + return self.call("getFreqSequences").map(lambda x: PrefixSpan.FreqSequence(x[0], x[1])) + + +class PrefixSpan(object): + """ + .. note:: Experimental + + A parallel PrefixSpan algorithm to mine frequent sequential patterns. + The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: + Mining Sequential Patterns Efficiently by Prefix-Projected Pattern Growth + ([[http://doi.org/10.1109/ICDE.2001.914830]]). + + .. 
versionadded:: 1.6.0 + """ + + @classmethod + @since("1.6.0") + def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000): + """ + Finds the complete set of frequent sequential patterns in the input sequences of itemsets. + + :param data: The input data set, each element contains a sequence of itemsets. + :param minSupport: the minimal support level of the sequential pattern, any pattern that appears + more than (minSupport * size-of-the-dataset) times will be output (default: `0.1`) + :param maxPatternLength: the maximal length of the sequential pattern, any frequent pattern + whose length does not exceed maxPatternLength will be output. (default: `10`) + :param maxLocalProjDBSize: The maximum number of items (including delimiters used in + the internal storage format) allowed in a projected database before local + processing. If a projected database exceeds this size, another + iteration of distributed prefix growth is run. (default: `32000000`) + """ + model = callMLlibFunc("trainPrefixSpanModel", + data, minSupport, maxPatternLength, maxLocalProjDBSize) + return PrefixSpanModel(model) + + class FreqSequence(namedtuple("FreqSequence", ["sequence", "freq"])): + """ + Represents a (sequence, freq) tuple. + + .. versionadded:: 1.6.0 + """ + + def _test(): import doctest import pyspark.mllib.fpm -- cgit v1.2.3