diff options
author | zhangjiajin <zhangjiajin@huawei.com> | 2015-08-01 01:56:27 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-08-01 01:56:27 -0700 |
commit | d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e (patch) | |
tree | c58b3009b8c8d2a936834022bf28aeb2f1d472df /mllib/src/test | |
parent | 65038973a17904e0e04d453799ec108af240fbab (diff) | |
download | spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.tar.gz spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.tar.bz2 spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.zip |
[SPARK-8999] [MLLIB] PrefixSpan non-temporal sequences
mengxr Extends PrefixSpan to non-temporal itemsets. Continues work by zhangjiajin
* Internal API uses List[Set[Int]] which is likely not efficient; will need to refactor during QA
Closes #7646
Author: zhangjiajin <zhangjiajin@huawei.com>
Author: Feynman Liang <fliang@databricks.com>
Author: zhang jiajin <zhangjiajin@huawei.com>
Closes #7818 from feynmanliang/SPARK-8999-nonTemporal and squashes the following commits:
4ded81d [Feynman Liang] Replace all filters to filter nonempty
350e67e [Feynman Liang] Code review feedback
03156ca [Feynman Liang] Fix tests, drop delimiters at boundaries of sequences
d1fe0ed [Feynman Liang] Remove comments
86ca4e5 [Feynman Liang] Fix style
7c7bf39 [Feynman Liang] Fixed itemSet sequences
6073b10 [Feynman Liang] Basic itemset functionality, failing test
1a7fb48 [Feynman Liang] Add delimiter to results
5db00aa [Feynman Liang] Working for items, not itemsets
6787716 [Feynman Liang] Working on temporal sequences
f1114b9 [Feynman Liang] Add -1 delimiter
00fe756 [Feynman Liang] Reset base files for rebase
f486dcd [zhangjiajin] change maxLocalProjDBSize and fix a bug (remove -3 from frequent items).
60a0b76 [zhangjiajin] fixed a scala style error.
740c203 [zhangjiajin] fixed a scala style error.
5785cb8 [zhangjiajin] support non-temporal sequence
a5d649d [zhangjiajin] restore original version
09dc409 [zhangjiajin] Merge branch 'master' of https://github.com/apache/spark into multiItems_2
ae8c02d [zhangjiajin] Fixed some Scala style errors.
216ab0c [zhangjiajin] Support non-temporal sequence in PrefixSpan
b572f54 [zhangjiajin] initialize file before rebase.
f06772f [zhangjiajin] fix a scala style error.
a7e50d4 [zhangjiajin] Add feature: Collect enough frequent prefixes before projection in PrefixSpan.
c1d13d0 [zhang jiajin] Delete PrefixspanSuite.scala
d9d8137 [zhang jiajin] Delete Prefixspan.scala
c6ceb63 [zhangjiajin] Add new algorithm PrefixSpan and test file.
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala | 237 |
1 file changed, 199 insertions, 38 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala index 6dd2dc926a..457f32670f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { - test("PrefixSpan using Integer type") { + test("PrefixSpan using Integer type, singleton itemsets") { /* library("arulesSequences") @@ -35,12 +35,12 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { */ val sequences = Array( - Array(1, 3, 4, 5), - Array(2, 3, 1), - Array(2, 4, 1), - Array(3, 1, 3, 4, 5), - Array(3, 4, 4, 3), - Array(6, 5, 3)) + Array(1, -1, 3, -1, 4, -1, 5), + Array(2, -1, 3, -1, 1), + Array(2, -1, 4, -1, 1), + Array(3, -1, 1, -1, 3, -1, 4, -1, 5), + Array(3, -1, 4, -1, 4, -1, 3), + Array(6, -1, 5, -1, 3)) val rdd = sc.parallelize(sequences, 2).cache() @@ -50,64 +50,225 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { val result1 = prefixspan.run(rdd) val expectedValue1 = Array( (Array(1), 4L), - (Array(1, 3), 2L), - (Array(1, 3, 4), 2L), - (Array(1, 3, 4, 5), 2L), - (Array(1, 3, 5), 2L), - (Array(1, 4), 2L), - (Array(1, 4, 5), 2L), - (Array(1, 5), 2L), + (Array(1, -1, 3), 2L), + (Array(1, -1, 3, -1, 4), 2L), + (Array(1, -1, 3, -1, 4, -1, 5), 2L), + (Array(1, -1, 3, -1, 5), 2L), + (Array(1, -1, 4), 2L), + (Array(1, -1, 4, -1, 5), 2L), + (Array(1, -1, 5), 2L), (Array(2), 2L), - (Array(2, 1), 2L), + (Array(2, -1, 1), 2L), (Array(3), 5L), - (Array(3, 1), 2L), - (Array(3, 3), 2L), - (Array(3, 4), 3L), - (Array(3, 4, 5), 2L), - (Array(3, 5), 2L), + (Array(3, -1, 1), 2L), + (Array(3, -1, 3), 2L), + (Array(3, -1, 4), 3L), + (Array(3, -1, 4, -1, 5), 2L), + (Array(3, -1, 5), 2L), (Array(4), 4L), - (Array(4, 5), 2L), + 
(Array(4, -1, 5), 2L), (Array(5), 3L) ) - assert(compareResults(expectedValue1, result1.collect())) + compareResults(expectedValue1, result1.collect()) prefixspan.setMinSupport(0.5).setMaxPatternLength(50) val result2 = prefixspan.run(rdd) val expectedValue2 = Array( (Array(1), 4L), (Array(3), 5L), - (Array(3, 4), 3L), + (Array(3, -1, 4), 3L), (Array(4), 4L), (Array(5), 3L) ) - assert(compareResults(expectedValue2, result2.collect())) + compareResults(expectedValue2, result2.collect()) prefixspan.setMinSupport(0.33).setMaxPatternLength(2) val result3 = prefixspan.run(rdd) val expectedValue3 = Array( (Array(1), 4L), - (Array(1, 3), 2L), - (Array(1, 4), 2L), - (Array(1, 5), 2L), - (Array(2, 1), 2L), + (Array(1, -1, 3), 2L), + (Array(1, -1, 4), 2L), + (Array(1, -1, 5), 2L), + (Array(2, -1, 1), 2L), (Array(2), 2L), (Array(3), 5L), - (Array(3, 1), 2L), - (Array(3, 3), 2L), - (Array(3, 4), 3L), - (Array(3, 5), 2L), + (Array(3, -1, 1), 2L), + (Array(3, -1, 3), 2L), + (Array(3, -1, 4), 3L), + (Array(3, -1, 5), 2L), (Array(4), 4L), - (Array(4, 5), 2L), + (Array(4, -1, 5), 2L), (Array(5), 3L) ) - assert(compareResults(expectedValue3, result3.collect())) + compareResults(expectedValue3, result3.collect()) + } + + test("PrefixSpan using Integer type, variable-size itemsets") { + val sequences = Array( + Array(1, -1, 1, 2, 3, -1, 1, 3, -1, 4, -1, 3, 6), + Array(1, 4, -1, 3, -1, 2, 3, -1, 1, 5), + Array(5, 6, -1, 1, 2, -1, 4, 6, -1, 3, -1, 2), + Array(5, -1, 7, -1, 1, 6, -1, 3, -1, 2, -1, 3)) + val rdd = sc.parallelize(sequences, 2).cache() + val prefixspan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5) + val result = prefixspan.run(rdd) + + /* + To verify results, create file "prefixSpanSeqs" with content + (format = (transactionID, idxInTransaction, numItemsinItemset, itemset)): + 1 1 1 1 + 1 2 3 1 2 3 + 1 3 2 1 3 + 1 4 1 4 + 1 5 2 3 6 + 2 1 2 1 4 + 2 2 1 3 + 2 3 2 2 3 + 2 4 2 1 5 + 3 1 2 5 6 + 3 2 2 1 2 + 3 3 2 4 6 + 3 4 1 3 + 3 5 1 2 + 4 1 1 5 + 4 2 1 7 + 4 3 2 
1 6 + 4 4 1 3 + 4 5 1 2 + 4 6 1 3 + In R, run: + library("arulesSequences") + prefixSpanSeqs = read_baskets("prefixSpanSeqs", info = c("sequenceID","eventID","SIZE")) + freqItemSeq = cspade(prefixSpanSeqs, + parameter = list(support = 0.5, maxlen = 5 )) + resSeq = as(freqItemSeq, "data.frame") + resSeq + + sequence support + 1 <{1}> 1.00 + 2 <{2}> 1.00 + 3 <{3}> 1.00 + 4 <{4}> 0.75 + 5 <{5}> 0.75 + 6 <{6}> 0.75 + 7 <{1},{6}> 0.50 + 8 <{2},{6}> 0.50 + 9 <{5},{6}> 0.50 + 10 <{1,2},{6}> 0.50 + 11 <{1},{4}> 0.50 + 12 <{2},{4}> 0.50 + 13 <{1,2},{4}> 0.50 + 14 <{1},{3}> 1.00 + 15 <{2},{3}> 0.75 + 16 <{2,3}> 0.50 + 17 <{3},{3}> 0.75 + 18 <{4},{3}> 0.75 + 19 <{5},{3}> 0.50 + 20 <{6},{3}> 0.50 + 21 <{5},{6},{3}> 0.50 + 22 <{6},{2},{3}> 0.50 + 23 <{5},{2},{3}> 0.50 + 24 <{5},{1},{3}> 0.50 + 25 <{2},{4},{3}> 0.50 + 26 <{1},{4},{3}> 0.50 + 27 <{1,2},{4},{3}> 0.50 + 28 <{1},{3},{3}> 0.75 + 29 <{1,2},{3}> 0.50 + 30 <{1},{2},{3}> 0.50 + 31 <{1},{2,3}> 0.50 + 32 <{1},{2}> 1.00 + 33 <{1,2}> 0.50 + 34 <{3},{2}> 0.75 + 35 <{4},{2}> 0.50 + 36 <{5},{2}> 0.50 + 37 <{6},{2}> 0.50 + 38 <{5},{6},{2}> 0.50 + 39 <{6},{3},{2}> 0.50 + 40 <{5},{3},{2}> 0.50 + 41 <{5},{1},{2}> 0.50 + 42 <{4},{3},{2}> 0.50 + 43 <{1},{3},{2}> 0.75 + 44 <{5},{6},{3},{2}> 0.50 + 45 <{5},{1},{3},{2}> 0.50 + 46 <{1},{1}> 0.50 + 47 <{2},{1}> 0.50 + 48 <{3},{1}> 0.50 + 49 <{5},{1}> 0.50 + 50 <{2,3},{1}> 0.50 + 51 <{1},{3},{1}> 0.50 + 52 <{1},{2,3},{1}> 0.50 + 53 <{1},{2},{1}> 0.50 + */ + val expectedValue = Array( + (Array(1), 4L), + (Array(2), 4L), + (Array(3), 4L), + (Array(4), 3L), + (Array(5), 3L), + (Array(6), 3L), + (Array(1, -1, 6), 2L), + (Array(2, -1, 6), 2L), + (Array(5, -1, 6), 2L), + (Array(1, 2, -1, 6), 2L), + (Array(1, -1, 4), 2L), + (Array(2, -1, 4), 2L), + (Array(1, 2, -1, 4), 2L), + (Array(1, -1, 3), 4L), + (Array(2, -1, 3), 3L), + (Array(2, 3), 2L), + (Array(3, -1, 3), 3L), + (Array(4, -1, 3), 3L), + (Array(5, -1, 3), 2L), + (Array(6, -1, 3), 2L), + (Array(5, -1, 6, -1, 3), 2L), + (Array(6, -1, 2, -1, 
3), 2L), + (Array(5, -1, 2, -1, 3), 2L), + (Array(5, -1, 1, -1, 3), 2L), + (Array(2, -1, 4, -1, 3), 2L), + (Array(1, -1, 4, -1, 3), 2L), + (Array(1, 2, -1, 4, -1, 3), 2L), + (Array(1, -1, 3, -1, 3), 3L), + (Array(1, 2, -1, 3), 2L), + (Array(1, -1, 2, -1, 3), 2L), + (Array(1, -1, 2, 3), 2L), + (Array(1, -1, 2), 4L), + (Array(1, 2), 2L), + (Array(3, -1, 2), 3L), + (Array(4, -1, 2), 2L), + (Array(5, -1, 2), 2L), + (Array(6, -1, 2), 2L), + (Array(5, -1, 6, -1, 2), 2L), + (Array(6, -1, 3, -1, 2), 2L), + (Array(5, -1, 3, -1, 2), 2L), + (Array(5, -1, 1, -1, 2), 2L), + (Array(4, -1, 3, -1, 2), 2L), + (Array(1, -1, 3, -1, 2), 3L), + (Array(5, -1, 6, -1, 3, -1, 2), 2L), + (Array(5, -1, 1, -1, 3, -1, 2), 2L), + (Array(1, -1, 1), 2L), + (Array(2, -1, 1), 2L), + (Array(3, -1, 1), 2L), + (Array(5, -1, 1), 2L), + (Array(2, 3, -1, 1), 2L), + (Array(1, -1, 3, -1, 1), 2L), + (Array(1, -1, 2, 3, -1, 1), 2L), + (Array(1, -1, 2, -1, 1), 2L)) + + compareResults(expectedValue, result.collect()) } private def compareResults( - expectedValue: Array[(Array[Int], Long)], - actualValue: Array[(Array[Int], Long)]): Boolean = { - expectedValue.map(x => (x._1.toSeq, x._2)).toSet == - actualValue.map(x => (x._1.toSeq, x._2)).toSet + expectedValue: Array[(Array[Int], Long)], + actualValue: Array[(Array[Int], Long)]): Unit = { + val expectedSet = expectedValue.map(x => (x._1.toSeq, x._2)).toSet + val actualSet = actualValue.map(x => (x._1.toSeq, x._2)).toSet + assert(expectedSet === actualSet) + } + + private def insertDelimiter(sequence: Array[Int]): Array[Int] = { + sequence.zip(Seq.fill(sequence.length)(PrefixSpan.DELIMITER)).map { case (a, b) => + List(a, b) + }.flatten } } |