diff options
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala | 114 |
1 files changed, 114 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala index 66ae3543ec..ddc296a428 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala @@ -39,6 +39,22 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.9) .setNumPartitions(1) .run(rdd) + + /* Verify results using the `R` code: + transactions = as(sapply( + list("r z h k p", + "z y x w v u t s", + "s x o n r", + "x z y m t s q e", + "z", + "x z y r q t p"), + FUN=function(x) strsplit(x," ",fixed=TRUE)), + "transactions") + > eclat(transactions, parameter = list(support = 0.9)) + ... + eclat - zero frequent items + set of 0 itemsets + */ assert(model6.freqItemsets.count() === 0) val model3 = fpg @@ -48,6 +64,33 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { val freqItemsets3 = model3.freqItemsets.collect().map { itemset => (itemset.items.toSet, itemset.freq) } + + /* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.5)) + fpDF = as(sort(fp), "data.frame") + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == "support"] = "freq" + > fpDF + items freq + 13 {z} 5 + 14 {x} 4 + 1 {s,x} 3 + 2 {t,x,y,z} 3 + 3 {t,y,z} 3 + 4 {t,x,y} 3 + 5 {x,y,z} 3 + 6 {y,z} 3 + 7 {x,y} 3 + 8 {t,y} 3 + 9 {t,x,z} 3 + 10 {t,z} 3 + 11 {t,x} 3 + 12 {x,z} 3 + 15 {t} 3 + 16 {y} 3 + 17 {s} 3 + 18 {r} 3 + */ val expected = Set( (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L), (Set("r"), 3L), @@ -62,12 +105,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.3) .setNumPartitions(4) .run(rdd) + + /* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.3)) + fpDF = as(fp, "data.frame") + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == "support"] = "freq" + > nrow(fpDF) + [1] 54 + */ assert(model2.freqItemsets.count() === 54) val model1 = fpg .setMinSupport(0.1) .setNumPartitions(8) .run(rdd) + + /* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.1)) + fpDF = as(fp, "data.frame") + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == "support"] = "freq" + > nrow(fpDF) + [1] 625 + */ assert(model1.freqItemsets.count() === 625) } @@ -89,6 +150,23 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.9) .setNumPartitions(1) .run(rdd) + + /* Verify results using the `R` code: + transactions = as(sapply( + list("1 2 3", + "1 2 3 4", + "5 4 3 2 1", + "6 5 4 3 2 1", + "2 4", + "1 3", + "1 7"), + FUN=function(x) strsplit(x," ",fixed=TRUE)), + "transactions") + > eclat(transactions, parameter = list(support = 0.9)) + ... + eclat - zero frequent items + set of 0 itemsets + */ assert(model6.freqItemsets.count() === 0) val model3 = fpg @@ -100,6 +178,24 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { val freqItemsets3 = model3.freqItemsets.collect().map { itemset => (itemset.items.toSet, itemset.freq) } + + /* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.5)) + fpDF = as(sort(fp), "data.frame") + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == "support"] = "freq" + > fpDF + items freq + 6 {1} 6 + 3 {1,3} 5 + 7 {2} 5 + 8 {3} 5 + 1 {2,4} 4 + 2 {1,2,3} 4 + 4 {2,3} 4 + 5 {1,2} 4 + 9 {4} 4 + */ val expected = Set( (Set(1), 6L), (Set(2), 5L), (Set(3), 5L), (Set(4), 4L), (Set(1, 2), 4L), (Set(1, 3), 5L), (Set(2, 3), 4L), @@ -110,12 +206,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.3) .setNumPartitions(4) .run(rdd) + + /* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.3)) + fpDF = as(fp, "data.frame") + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == "support"] = "freq" + > nrow(fpDF) + [1] 15 + */ assert(model2.freqItemsets.count() === 15) val model1 = fpg .setMinSupport(0.1) .setNumPartitions(8) .run(rdd) + + /* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.1)) + fpDF = as(fp, "data.frame") + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == "support"] = "freq" + > nrow(fpDF) + [1] 65 + */ assert(model1.freqItemsets.count() === 65) } } |