aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
authorzhangjiajin <zhangjiajin@huawei.com>2015-08-01 01:56:27 -0700
committerXiangrui Meng <meng@databricks.com>2015-08-01 01:56:27 -0700
commitd2a9b66f6c0de89d6d16370af1c77c7f51b11d3e (patch)
treec58b3009b8c8d2a936834022bf28aeb2f1d472df /mllib/src/test
parent65038973a17904e0e04d453799ec108af240fbab (diff)
downloadspark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.tar.gz
spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.tar.bz2
spark-d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e.zip
[SPARK-8999] [MLLIB] PrefixSpan non-temporal sequences
mengxr Extends PrefixSpan to non-temporal itemsets. Continues work by zhangjiajin * Internal API uses List[Set[Int]] which is likely not efficient; will need to refactor during QA Closes #7646 Author: zhangjiajin <zhangjiajin@huawei.com> Author: Feynman Liang <fliang@databricks.com> Author: zhang jiajin <zhangjiajin@huawei.com> Closes #7818 from feynmanliang/SPARK-8999-nonTemporal and squashes the following commits: 4ded81d [Feynman Liang] Replace all filters to filter nonempty 350e67e [Feynman Liang] Code review feedback 03156ca [Feynman Liang] Fix tests, drop delimiters at boundaries of sequences d1fe0ed [Feynman Liang] Remove comments 86ca4e5 [Feynman Liang] Fix style 7c7bf39 [Feynman Liang] Fixed itemSet sequences 6073b10 [Feynman Liang] Basic itemset functionality, failing test 1a7fb48 [Feynman Liang] Add delimiter to results 5db00aa [Feynman Liang] Working for items, not itemsets 6787716 [Feynman Liang] Working on temporal sequences f1114b9 [Feynman Liang] Add -1 delimiter 00fe756 [Feynman Liang] Reset base files for rebase f486dcd [zhangjiajin] change maxLocalProjDBSize and fix a bug (remove -3 from frequent items). 60a0b76 [zhangjiajin] fixed a scala style error. 740c203 [zhangjiajin] fixed a scala style error. 5785cb8 [zhangjiajin] support non-temporal sequence a5d649d [zhangjiajin] restore original version 09dc409 [zhangjiajin] Merge branch 'master' of https://github.com/apache/spark into multiItems_2 ae8c02d [zhangjiajin] Fixed some Scala style errors. 216ab0c [zhangjiajin] Support non-temporal sequence in PrefixSpan b572f54 [zhangjiajin] initialize file before rebase. f06772f [zhangjiajin] fix a scala style error. a7e50d4 [zhangjiajin] Add feature: Collect enough frequent prefixes before projection in PrefixSpan. c1d13d0 [zhang jiajin] Delete PrefixspanSuite.scala d9d8137 [zhang jiajin] Delete Prefixspan.scala c6ceb63 [zhangjiajin] Add new algorithm PrefixSpan and test file.
Diffstat (limited to 'mllib/src/test')
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala237
1 file changed, 199 insertions, 38 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
index 6dd2dc926a..457f32670f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
@@ -21,7 +21,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
- test("PrefixSpan using Integer type") {
+ test("PrefixSpan using Integer type, singleton itemsets") {
/*
library("arulesSequences")
@@ -35,12 +35,12 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
*/
val sequences = Array(
- Array(1, 3, 4, 5),
- Array(2, 3, 1),
- Array(2, 4, 1),
- Array(3, 1, 3, 4, 5),
- Array(3, 4, 4, 3),
- Array(6, 5, 3))
+ Array(1, -1, 3, -1, 4, -1, 5),
+ Array(2, -1, 3, -1, 1),
+ Array(2, -1, 4, -1, 1),
+ Array(3, -1, 1, -1, 3, -1, 4, -1, 5),
+ Array(3, -1, 4, -1, 4, -1, 3),
+ Array(6, -1, 5, -1, 3))
val rdd = sc.parallelize(sequences, 2).cache()
@@ -50,64 +50,225 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
val result1 = prefixspan.run(rdd)
val expectedValue1 = Array(
(Array(1), 4L),
- (Array(1, 3), 2L),
- (Array(1, 3, 4), 2L),
- (Array(1, 3, 4, 5), 2L),
- (Array(1, 3, 5), 2L),
- (Array(1, 4), 2L),
- (Array(1, 4, 5), 2L),
- (Array(1, 5), 2L),
+ (Array(1, -1, 3), 2L),
+ (Array(1, -1, 3, -1, 4), 2L),
+ (Array(1, -1, 3, -1, 4, -1, 5), 2L),
+ (Array(1, -1, 3, -1, 5), 2L),
+ (Array(1, -1, 4), 2L),
+ (Array(1, -1, 4, -1, 5), 2L),
+ (Array(1, -1, 5), 2L),
(Array(2), 2L),
- (Array(2, 1), 2L),
+ (Array(2, -1, 1), 2L),
(Array(3), 5L),
- (Array(3, 1), 2L),
- (Array(3, 3), 2L),
- (Array(3, 4), 3L),
- (Array(3, 4, 5), 2L),
- (Array(3, 5), 2L),
+ (Array(3, -1, 1), 2L),
+ (Array(3, -1, 3), 2L),
+ (Array(3, -1, 4), 3L),
+ (Array(3, -1, 4, -1, 5), 2L),
+ (Array(3, -1, 5), 2L),
(Array(4), 4L),
- (Array(4, 5), 2L),
+ (Array(4, -1, 5), 2L),
(Array(5), 3L)
)
- assert(compareResults(expectedValue1, result1.collect()))
+ compareResults(expectedValue1, result1.collect())
prefixspan.setMinSupport(0.5).setMaxPatternLength(50)
val result2 = prefixspan.run(rdd)
val expectedValue2 = Array(
(Array(1), 4L),
(Array(3), 5L),
- (Array(3, 4), 3L),
+ (Array(3, -1, 4), 3L),
(Array(4), 4L),
(Array(5), 3L)
)
- assert(compareResults(expectedValue2, result2.collect()))
+ compareResults(expectedValue2, result2.collect())
prefixspan.setMinSupport(0.33).setMaxPatternLength(2)
val result3 = prefixspan.run(rdd)
val expectedValue3 = Array(
(Array(1), 4L),
- (Array(1, 3), 2L),
- (Array(1, 4), 2L),
- (Array(1, 5), 2L),
- (Array(2, 1), 2L),
+ (Array(1, -1, 3), 2L),
+ (Array(1, -1, 4), 2L),
+ (Array(1, -1, 5), 2L),
+ (Array(2, -1, 1), 2L),
(Array(2), 2L),
(Array(3), 5L),
- (Array(3, 1), 2L),
- (Array(3, 3), 2L),
- (Array(3, 4), 3L),
- (Array(3, 5), 2L),
+ (Array(3, -1, 1), 2L),
+ (Array(3, -1, 3), 2L),
+ (Array(3, -1, 4), 3L),
+ (Array(3, -1, 5), 2L),
(Array(4), 4L),
- (Array(4, 5), 2L),
+ (Array(4, -1, 5), 2L),
(Array(5), 3L)
)
- assert(compareResults(expectedValue3, result3.collect()))
+ compareResults(expectedValue3, result3.collect())
+ }
+
+ test("PrefixSpan using Integer type, variable-size itemsets") {
+ val sequences = Array(
+ Array(1, -1, 1, 2, 3, -1, 1, 3, -1, 4, -1, 3, 6),
+ Array(1, 4, -1, 3, -1, 2, 3, -1, 1, 5),
+ Array(5, 6, -1, 1, 2, -1, 4, 6, -1, 3, -1, 2),
+ Array(5, -1, 7, -1, 1, 6, -1, 3, -1, 2, -1, 3))
+ val rdd = sc.parallelize(sequences, 2).cache()
+ val prefixspan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5)
+ val result = prefixspan.run(rdd)
+
+ /*
+ To verify results, create file "prefixSpanSeqs" with content
+    (format = (transactionID, idxInTransaction, numItemsInItemset, itemset)):
+ 1 1 1 1
+ 1 2 3 1 2 3
+ 1 3 2 1 3
+ 1 4 1 4
+ 1 5 2 3 6
+ 2 1 2 1 4
+ 2 2 1 3
+ 2 3 2 2 3
+ 2 4 2 1 5
+ 3 1 2 5 6
+ 3 2 2 1 2
+ 3 3 2 4 6
+ 3 4 1 3
+ 3 5 1 2
+ 4 1 1 5
+ 4 2 1 7
+ 4 3 2 1 6
+ 4 4 1 3
+ 4 5 1 2
+ 4 6 1 3
+ In R, run:
+ library("arulesSequences")
+ prefixSpanSeqs = read_baskets("prefixSpanSeqs", info = c("sequenceID","eventID","SIZE"))
+ freqItemSeq = cspade(prefixSpanSeqs,
+ parameter = list(support = 0.5, maxlen = 5 ))
+ resSeq = as(freqItemSeq, "data.frame")
+ resSeq
+
+ sequence support
+ 1 <{1}> 1.00
+ 2 <{2}> 1.00
+ 3 <{3}> 1.00
+ 4 <{4}> 0.75
+ 5 <{5}> 0.75
+ 6 <{6}> 0.75
+ 7 <{1},{6}> 0.50
+ 8 <{2},{6}> 0.50
+ 9 <{5},{6}> 0.50
+ 10 <{1,2},{6}> 0.50
+ 11 <{1},{4}> 0.50
+ 12 <{2},{4}> 0.50
+ 13 <{1,2},{4}> 0.50
+ 14 <{1},{3}> 1.00
+ 15 <{2},{3}> 0.75
+ 16 <{2,3}> 0.50
+ 17 <{3},{3}> 0.75
+ 18 <{4},{3}> 0.75
+ 19 <{5},{3}> 0.50
+ 20 <{6},{3}> 0.50
+ 21 <{5},{6},{3}> 0.50
+ 22 <{6},{2},{3}> 0.50
+ 23 <{5},{2},{3}> 0.50
+ 24 <{5},{1},{3}> 0.50
+ 25 <{2},{4},{3}> 0.50
+ 26 <{1},{4},{3}> 0.50
+ 27 <{1,2},{4},{3}> 0.50
+ 28 <{1},{3},{3}> 0.75
+ 29 <{1,2},{3}> 0.50
+ 30 <{1},{2},{3}> 0.50
+ 31 <{1},{2,3}> 0.50
+ 32 <{1},{2}> 1.00
+ 33 <{1,2}> 0.50
+ 34 <{3},{2}> 0.75
+ 35 <{4},{2}> 0.50
+ 36 <{5},{2}> 0.50
+ 37 <{6},{2}> 0.50
+ 38 <{5},{6},{2}> 0.50
+ 39 <{6},{3},{2}> 0.50
+ 40 <{5},{3},{2}> 0.50
+ 41 <{5},{1},{2}> 0.50
+ 42 <{4},{3},{2}> 0.50
+ 43 <{1},{3},{2}> 0.75
+ 44 <{5},{6},{3},{2}> 0.50
+ 45 <{5},{1},{3},{2}> 0.50
+ 46 <{1},{1}> 0.50
+ 47 <{2},{1}> 0.50
+ 48 <{3},{1}> 0.50
+ 49 <{5},{1}> 0.50
+ 50 <{2,3},{1}> 0.50
+ 51 <{1},{3},{1}> 0.50
+ 52 <{1},{2,3},{1}> 0.50
+ 53 <{1},{2},{1}> 0.50
+ */
+ val expectedValue = Array(
+ (Array(1), 4L),
+ (Array(2), 4L),
+ (Array(3), 4L),
+ (Array(4), 3L),
+ (Array(5), 3L),
+ (Array(6), 3L),
+ (Array(1, -1, 6), 2L),
+ (Array(2, -1, 6), 2L),
+ (Array(5, -1, 6), 2L),
+ (Array(1, 2, -1, 6), 2L),
+ (Array(1, -1, 4), 2L),
+ (Array(2, -1, 4), 2L),
+ (Array(1, 2, -1, 4), 2L),
+ (Array(1, -1, 3), 4L),
+ (Array(2, -1, 3), 3L),
+ (Array(2, 3), 2L),
+ (Array(3, -1, 3), 3L),
+ (Array(4, -1, 3), 3L),
+ (Array(5, -1, 3), 2L),
+ (Array(6, -1, 3), 2L),
+ (Array(5, -1, 6, -1, 3), 2L),
+ (Array(6, -1, 2, -1, 3), 2L),
+ (Array(5, -1, 2, -1, 3), 2L),
+ (Array(5, -1, 1, -1, 3), 2L),
+ (Array(2, -1, 4, -1, 3), 2L),
+ (Array(1, -1, 4, -1, 3), 2L),
+ (Array(1, 2, -1, 4, -1, 3), 2L),
+ (Array(1, -1, 3, -1, 3), 3L),
+ (Array(1, 2, -1, 3), 2L),
+ (Array(1, -1, 2, -1, 3), 2L),
+ (Array(1, -1, 2, 3), 2L),
+ (Array(1, -1, 2), 4L),
+ (Array(1, 2), 2L),
+ (Array(3, -1, 2), 3L),
+ (Array(4, -1, 2), 2L),
+ (Array(5, -1, 2), 2L),
+ (Array(6, -1, 2), 2L),
+ (Array(5, -1, 6, -1, 2), 2L),
+ (Array(6, -1, 3, -1, 2), 2L),
+ (Array(5, -1, 3, -1, 2), 2L),
+ (Array(5, -1, 1, -1, 2), 2L),
+ (Array(4, -1, 3, -1, 2), 2L),
+ (Array(1, -1, 3, -1, 2), 3L),
+ (Array(5, -1, 6, -1, 3, -1, 2), 2L),
+ (Array(5, -1, 1, -1, 3, -1, 2), 2L),
+ (Array(1, -1, 1), 2L),
+ (Array(2, -1, 1), 2L),
+ (Array(3, -1, 1), 2L),
+ (Array(5, -1, 1), 2L),
+ (Array(2, 3, -1, 1), 2L),
+ (Array(1, -1, 3, -1, 1), 2L),
+ (Array(1, -1, 2, 3, -1, 1), 2L),
+ (Array(1, -1, 2, -1, 1), 2L))
+
+ compareResults(expectedValue, result.collect())
}
private def compareResults(
- expectedValue: Array[(Array[Int], Long)],
- actualValue: Array[(Array[Int], Long)]): Boolean = {
- expectedValue.map(x => (x._1.toSeq, x._2)).toSet ==
- actualValue.map(x => (x._1.toSeq, x._2)).toSet
+ expectedValue: Array[(Array[Int], Long)],
+ actualValue: Array[(Array[Int], Long)]): Unit = {
+ val expectedSet = expectedValue.map(x => (x._1.toSeq, x._2)).toSet
+ val actualSet = actualValue.map(x => (x._1.toSeq, x._2)).toSet
+ assert(expectedSet === actualSet)
+ }
+
+ private def insertDelimiter(sequence: Array[Int]): Array[Int] = {
+ sequence.zip(Seq.fill(sequence.length)(PrefixSpan.DELIMITER)).map { case (a, b) =>
+ List(a, b)
+ }.flatten
}
}