diff options
author | zhangjiajin <zhangjiajin@huawei.com> | 2015-08-01 01:56:27 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-08-01 01:56:27 -0700 |
commit | d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e (patch) | |
tree | c58b3009b8c8d2a936834022bf28aeb2f1d472df /mllib/src/test | |
parent | 65038973a17904e0e04d453799ec108af240fbab (diff) | |
download | spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.tar.gz spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.tar.bz2 spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.zip |
[SPARK-8999] [MLLIB] PrefixSpan non-temporal sequences
mengxr Extends PrefixSpan to non-temporal itemsets. Continues work by zhangjiajin
* Internal API uses List[Set[Int]] which is likely not efficient; will need to refactor during QA
Closes #7646
Author: zhangjiajin <zhangjiajin@huawei.com>
Author: Feynman Liang <fliang@databricks.com>
Author: zhang jiajin <zhangjiajin@huawei.com>
Closes #7818 from feynmanliang/SPARK-8999-nonTemporal and squashes the following commits:
4ded81d [Feynman Liang] Replace all filters to filter nonempty
350e67e [Feynman Liang] Code review feedback
03156ca [Feynman Liang] Fix tests, drop delimiters at boundaries of sequences
d1fe0ed [Feynman Liang] Remove comments
86ca4e5 [Feynman Liang] Fix style
7c7bf39 [Feynman Liang] Fixed itemSet sequences
6073b10 [Feynman Liang] Basic itemset functionality, failing test
1a7fb48 [Feynman Liang] Add delimiter to results
5db00aa [Feynman Liang] Working for items, not itemsets
6787716 [Feynman Liang] Working on temporal sequences
f1114b9 [Feynman Liang] Add -1 delimiter
00fe756 [Feynman Liang] Reset base files for rebase
f486dcd [zhangjiajin] change maxLocalProjDBSize and fix a bug (remove -3 from frequent items).
60a0b76 [zhangjiajin] fixed a scala style error.
740c203 [zhangjiajin] fixed a scala style error.
5785cb8 [zhangjiajin] support non-temporal sequence
a5d649d [zhangjiajin] restore original version
09dc409 [zhangjiajin] Merge branch 'master' of https://github.com/apache/spark into multiItems_2
ae8c02d [zhangjiajin] Fixed some Scala style errors.
216ab0c [zhangjiajin] Support non-temporal sequence in PrefixSpan
b572f54 [zhangjiajin] initialize file before rebase.
f06772f [zhangjiajin] fix a scala style error.
a7e50d4 [zhangjiajin] Add feature: Collect enough frequent prefixes before projection in PrefixSpan.
c1d13d0 [zhang jiajin] Delete PrefixspanSuite.scala
d9d8137 [zhang jiajin] Delete Prefixspan.scala
c6ceb63 [zhangjiajin] Add new algorithm PrefixSpan and test file.
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala | 237 |
1 file changed, 199 insertions, 38 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala index 6dd2dc926a..457f32670f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { - test("PrefixSpan using Integer type") { + test("PrefixSpan using Integer type, singleton itemsets") { /* library("arulesSequences") @@ -35,12 +35,12 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { */ val sequences = Array( - Array(1, 3, 4, 5), - Array(2, 3, 1), - Array(2, 4, 1), - Array(3, 1, 3, 4, 5), - Array(3, 4, 4, 3), - Array(6, 5, 3)) + Array(1, -1, 3, -1, 4, -1, 5), + Array(2, -1, 3, -1, 1), + Array(2, -1, 4, -1, 1), + Array(3, -1, 1, -1, 3, -1, 4, -1, 5), + Array(3, -1, 4, -1, 4, -1, 3), + Array(6, -1, 5, -1, 3)) val rdd = sc.parallelize(sequences, 2).cache() @@ -50,64 +50,225 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext { val result1 = prefixspan.run(rdd) val expectedValue1 = Array( (Array(1), 4L), - (Array(1, 3), 2L), - (Array(1, 3, 4), 2L), - (Array(1, 3, 4, 5), 2L), - (Array(1, 3, 5), 2L), - (Array(1, 4), 2L), - (Array(1, 4, 5), 2L), - (Array(1, 5), 2L), + (Array(1, -1, 3), 2L), + (Array(1, -1, 3, -1, 4), 2L), + (Array(1, -1, 3, -1, 4, -1, 5), 2L), + (Array(1, -1, 3, -1, 5), 2L), + (Array(1, -1, 4), 2L), + (Array(1, -1, 4, -1, 5), 2L), + (Array(1, -1, 5), 2L), (Array(2), 2L), - (Array(2, 1), 2L), + (Array(2, -1, 1), 2L), (Array(3), 5L), - (Array(3, 1), 2L), - (Array(3, 3), 2L), - (Array(3, 4), 3L), - (Array(3, 4, 5), 2L), - (Array(3, 5), 2L), + (Array(3, -1, 1), 2L), + (Array(3, -1, 3), 2L), + (Array(3, -1, 4), 3L), + (Array(3, -1, 4, -1, 5), 2L), + (Array(3, -1, 5), 2L), (Array(4), 4L), - (Array(4, 5), 2L), + 
(Array(4, -1, 5), 2L), (Array(5), 3L) ) - assert(compareResults(expectedValue1, result1.collect())) + compareResults(expectedValue1, result1.collect()) prefixspan.setMinSupport(0.5).setMaxPatternLength(50) val result2 = prefixspan.run(rdd) val expectedValue2 = Array( (Array(1), 4L), (Array(3), 5L), - (Array(3, 4), 3L), + (Array(3, -1, 4), 3L), (Array(4), 4L), (Array(5), 3L) ) - assert(compareResults(expectedValue2, result2.collect())) + compareResults(expectedValue2, result2.collect()) prefixspan.setMinSupport(0.33).setMaxPatternLength(2) val result3 = prefixspan.run(rdd) val expectedValue3 = Array( (Array(1), 4L), - (Array(1, 3), 2L), - (Array(1, 4), 2L), - (Array(1, 5), 2L), - (Array(2, 1), 2L), + (Array(1, -1, 3), 2L), + (Array(1, -1, 4), 2L), + (Array(1, -1, 5), 2L), + (Array(2, -1, 1), 2L), (Array(2), 2L), (Array(3), 5L), - (Array(3, 1), 2L), - (Array(3, 3), 2L), - (Array(3, 4), 3L), - (Array(3, 5), 2L), + (Array(3, -1, 1), 2L), + (Array(3, -1, 3), 2L), + (Array(3, -1, 4), 3L), + (Array(3, -1, 5), 2L), (Array(4), 4L), - (Array(4, 5), 2L), + (Array(4, -1, 5), 2L), (Array(5), 3L) ) - assert(compareResults(expectedValue3, result3.collect())) + compareResults(expectedValue3, result3.collect()) + } + + test("PrefixSpan using Integer type, variable-size itemsets") { + val sequences = Array( + Array(1, -1, 1, 2, 3, -1, 1, 3, -1, 4, -1, 3, 6), + Array(1, 4, -1, 3, -1, 2, 3, -1, 1, 5), + Array(5, 6, -1, 1, 2, -1, 4, 6, -1, 3, -1, 2), + Array(5, -1, 7, -1, 1, 6, -1, 3, -1, 2, -1, 3)) + val rdd = sc.parallelize(sequences, 2).cache() + val prefixspan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5) + val result = prefixspan.run(rdd) + + /* + To verify results, create file "prefixSpanSeqs" with content + (format = (transactionID, idxInTransaction, numItemsinItemset, itemset)): + 1 1 1 1 + 1 2 3 1 2 3 + 1 3 2 1 3 + 1 4 1 4 + 1 5 2 3 6 + 2 1 2 1 4 + 2 2 1 3 + 2 3 2 2 3 + 2 4 2 1 5 + 3 1 2 5 6 + 3 2 2 1 2 + 3 3 2 4 6 + 3 4 1 3 + 3 5 1 2 + 4 1 1 5 + 4 2 1 7 + 4 3 2 
1 6 + 4 4 1 3 + 4 5 1 2 + 4 6 1 3 + In R, run: + library("arulesSequences") + prefixSpanSeqs = read_baskets("prefixSpanSeqs", info = c("sequenceID","eventID","SIZE")) + freqItemSeq = cspade(prefixSpanSeqs, + parameter = list(support = 0.5, maxlen = 5 )) + resSeq = as(freqItemSeq, "data.frame") + resSeq + + sequence support + 1 <{1}> 1.00 + 2 <{2}> 1.00 + 3 <{3}> 1.00 + 4 <{4}> 0.75 + 5 <{5}> 0.75 + 6 <{6}> 0.75 + 7 <{1},{6}> 0.50 + 8 <{2},{6}> 0.50 + 9 <{5},{6}> 0.50 + 10 <{1,2},{6}> 0.50 + 11 <{1},{4}> 0.50 + 12 <{2},{4}> 0.50 + 13 <{1,2},{4}> 0.50 + 14 <{1},{3}> 1.00 + 15 <{2},{3}> 0.75 + 16 <{2,3}> 0.50 + 17 <{3},{3}> 0.75 + 18 <{4},{3}> 0.75 + 19 <{5},{3}> 0.50 + 20 <{6},{3}> 0.50 + 21 <{5},{6},{3}> 0.50 + 22 <{6},{2},{3}> 0.50 + 23 <{5},{2},{3}> 0.50 + 24 <{5},{1},{3}> 0.50 + 25 <{2},{4},{3}> 0.50 + 26 <{1},{4},{3}> 0.50 + 27 <{1,2},{4},{3}> 0.50 + 28 <{1},{3},{3}> 0.75 + 29 <{1,2},{3}> 0.50 + 30 <{1},{2},{3}> 0.50 + 31 <{1},{2,3}> 0.50 + 32 <{1},{2}> 1.00 + 33 <{1,2}> 0.50 + 34 <{3},{2}> 0.75 + 35 <{4},{2}> 0.50 + 36 <{5},{2}> 0.50 + 37 <{6},{2}> 0.50 + 38 <{5},{6},{2}> 0.50 + 39 <{6},{3},{2}> 0.50 + 40 <{5},{3},{2}> 0.50 + 41 <{5},{1},{2}> 0.50 + 42 <{4},{3},{2}> 0.50 + 43 <{1},{3},{2}> 0.75 + 44 <{5},{6},{3},{2}> 0.50 + 45 <{5},{1},{3},{2}> 0.50 + 46 <{1},{1}> 0.50 + 47 <{2},{1}> 0.50 + 48 <{3},{1}> 0.50 + 49 <{5},{1}> 0.50 + 50 <{2,3},{1}> 0.50 + 51 <{1},{3},{1}> 0.50 + 52 <{1},{2,3},{1}> 0.50 + 53 <{1},{2},{1}> 0.50 + */ + val expectedValue = Array( + (Array(1), 4L), + (Array(2), 4L), + (Array(3), 4L), + (Array(4), 3L), + (Array(5), 3L), + (Array(6), 3L), + (Array(1, -1, 6), 2L), + (Array(2, -1, 6), 2L), + (Array(5, -1, 6), 2L), + (Array(1, 2, -1, 6), 2L), + (Array(1, -1, 4), 2L), + (Array(2, -1, 4), 2L), + (Array(1, 2, -1, 4), 2L), + (Array(1, -1, 3), 4L), + (Array(2, -1, 3), 3L), + (Array(2, 3), 2L), + (Array(3, -1, 3), 3L), + (Array(4, -1, 3), 3L), + (Array(5, -1, 3), 2L), + (Array(6, -1, 3), 2L), + (Array(5, -1, 6, -1, 3), 2L), + (Array(6, -1, 2, -1, 
3), 2L), + (Array(5, -1, 2, -1, 3), 2L), + (Array(5, -1, 1, -1, 3), 2L), + (Array(2, -1, 4, -1, 3), 2L), + (Array(1, -1, 4, -1, 3), 2L), + (Array(1, 2, -1, 4, -1, 3), 2L), + (Array(1, -1, 3, -1, 3), 3L), + (Array(1, 2, -1, 3), 2L), + (Array(1, -1, 2, -1, 3), 2L), + (Array(1, -1, 2, 3), 2L), + (Array(1, -1, 2), 4L), + (Array(1, 2), 2L), + (Array(3, -1, 2), 3L), + (Array(4, -1, 2), 2L), + (Array(5, -1, 2), 2L), + (Array(6, -1, 2), 2L), + (Array(5, -1, 6, -1, 2), 2L), + (Array(6, -1, 3, -1, 2), 2L), + (Array(5, -1, 3, -1, 2), 2L), + (Array(5, -1, 1, -1, 2), 2L), + (Array(4, -1, 3, -1, 2), 2L), + (Array(1, -1, 3, -1, 2), 3L), + (Array(5, -1, 6, -1, 3, -1, 2), 2L), + (Array(5, -1, 1, -1, 3, -1, 2), 2L), + (Array(1, -1, 1), 2L), + (Array(2, -1, 1), 2L), + (Array(3, -1, 1), 2L), + (Array(5, -1, 1), 2L), + (Array(2, 3, -1, 1), 2L), + (Array(1, -1, 3, -1, 1), 2L), + (Array(1, -1, 2, 3, -1, 1), 2L), + (Array(1, -1, 2, -1, 1), 2L)) + + compareResults(expectedValue, result.collect()) } private def compareResults( - expectedValue: Array[(Array[Int], Long)], - actualValue: Array[(Array[Int], Long)]): Boolean = { - expectedValue.map(x => (x._1.toSeq, x._2)).toSet == - actualValue.map(x => (x._1.toSeq, x._2)).toSet + expectedValue: Array[(Array[Int], Long)], + actualValue: Array[(Array[Int], Long)]): Unit = { + val expectedSet = expectedValue.map(x => (x._1.toSeq, x._2)).toSet + val actualSet = actualValue.map(x => (x._1.toSeq, x._2)).toSet + assert(expectedSet === actualSet) + } + + private def insertDelimiter(sequence: Array[Int]): Array[Int] = { + sequence.zip(Seq.fill(sequence.length)(PrefixSpan.DELIMITER)).map { case (a, b) => + List(a, b) + }.flatten } } |