author    Kashif Rasul <kashif.rasul@gmail.com>    2015-07-08 08:44:58 -0700
committer Xiangrui Meng <meng@databricks.com>      2015-07-08 08:44:58 -0700
commit    3bb217750ada18a49c40d974ac57050ef2abfd2c
tree      2c034a7cda307541d0cedc8b1bb50dac432703f4 /mllib
parent    8a9d9cc1561cf157793c90db6700ffa6f1f00a69
[SPARK-8872] [MLLIB] added verification results from R for FPGrowthSuite
Author: Kashif Rasul <kashif.rasul@gmail.com>

Closes #7269 from kashif/SPARK-8872 and squashes the following commits:

2d5457f [Kashif Rasul] added R code for FP Int type
3de6808 [Kashif Rasul] added verification results from R for FPGrowthSuite
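For context, every hunk below exercises the same pattern: build an RDD of transactions, fit FPGrowth at a given minSupport, and compare freqItemsets against the arules eclat output recorded in the new comments. A minimal, self-contained sketch of that pattern follows; the transactions are the ones from the first test in the diff, while the local SparkContext setup is an assumption for illustration (the suite gets its context from the MLlibTestSparkContext fixture instead):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthCheck {
  def main(args: Array[String]): Unit = {
    // Local context stands in for the suite's MLlibTestSparkContext fixture.
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("FPGrowthCheck"))

    // The same six transactions the R comments verify against.
    val transactions = Seq(
      "r z h k p",
      "z y x w v u t s",
      "s x o n r",
      "x z y m t s q e",
      "z",
      "x z y r q t p").map(_.split(" "))
    val rdd = sc.parallelize(transactions, 2).cache()

    val model = new FPGrowth()
      .setMinSupport(0.5)
      .setNumPartitions(1)
      .run(rdd)

    // freq is an absolute count; R's eclat reports relative support,
    // hence the `support * length(transactions)` rescaling in the comments.
    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("{", ",", "}") + ": " + itemset.freq)
    }
    sc.stop()
  }
}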
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala  114
1 file changed, 114 insertions(+), 0 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index 66ae3543ec..ddc296a428 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -39,6 +39,22 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.9)
.setNumPartitions(1)
.run(rdd)
+
+ /* Verify results using the `R` code:
+ transactions = as(sapply(
+ list("r z h k p",
+ "z y x w v u t s",
+ "s x o n r",
+ "x z y m t s q e",
+ "z",
+ "x z y r q t p"),
+ FUN=function(x) strsplit(x," ",fixed=TRUE)),
+ "transactions")
+ > eclat(transactions, parameter = list(support = 0.9))
+ ...
+ eclat - zero frequent items
+ set of 0 itemsets
+ */
assert(model6.freqItemsets.count() === 0)
val model3 = fpg
@@ -48,6 +64,33 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
(itemset.items.toSet, itemset.freq)
}
+
+ /* Verify results using the `R` code:
+ fp = eclat(transactions, parameter = list(support = 0.5))
+ fpDF = as(sort(fp), "data.frame")
+ fpDF$support = fpDF$support * length(transactions)
+ names(fpDF)[names(fpDF) == "support"] = "freq"
+ > fpDF
+ items freq
+ 13 {z} 5
+ 14 {x} 4
+ 1 {s,x} 3
+ 2 {t,x,y,z} 3
+ 3 {t,y,z} 3
+ 4 {t,x,y} 3
+ 5 {x,y,z} 3
+ 6 {y,z} 3
+ 7 {x,y} 3
+ 8 {t,y} 3
+ 9 {t,x,z} 3
+ 10 {t,z} 3
+ 11 {t,x} 3
+ 12 {x,z} 3
+ 15 {t} 3
+ 16 {y} 3
+ 17 {s} 3
+ 18 {r} 3
+ */
val expected = Set(
(Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
(Set("r"), 3L),
@@ -62,12 +105,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.3)
.setNumPartitions(4)
.run(rdd)
+
+ /* Verify results using the `R` code:
+ fp = eclat(transactions, parameter = list(support = 0.3))
+ fpDF = as(fp, "data.frame")
+ fpDF$support = fpDF$support * length(transactions)
+ names(fpDF)[names(fpDF) == "support"] = "freq"
+ > nrow(fpDF)
+ [1] 54
+ */
assert(model2.freqItemsets.count() === 54)
val model1 = fpg
.setMinSupport(0.1)
.setNumPartitions(8)
.run(rdd)
+
+ /* Verify results using the `R` code:
+ fp = eclat(transactions, parameter = list(support = 0.1))
+ fpDF = as(fp, "data.frame")
+ fpDF$support = fpDF$support * length(transactions)
+ names(fpDF)[names(fpDF) == "support"] = "freq"
+ > nrow(fpDF)
+ [1] 625
+ */
assert(model1.freqItemsets.count() === 625)
}
@@ -89,6 +150,23 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.9)
.setNumPartitions(1)
.run(rdd)
+
+ /* Verify results using the `R` code:
+ transactions = as(sapply(
+ list("1 2 3",
+ "1 2 3 4",
+ "5 4 3 2 1",
+ "6 5 4 3 2 1",
+ "2 4",
+ "1 3",
+ "1 7"),
+ FUN=function(x) strsplit(x," ",fixed=TRUE)),
+ "transactions")
+ > eclat(transactions, parameter = list(support = 0.9))
+ ...
+ eclat - zero frequent items
+ set of 0 itemsets
+ */
assert(model6.freqItemsets.count() === 0)
val model3 = fpg
@@ -100,6 +178,24 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
(itemset.items.toSet, itemset.freq)
}
+
+ /* Verify results using the `R` code:
+ fp = eclat(transactions, parameter = list(support = 0.5))
+ fpDF = as(sort(fp), "data.frame")
+ fpDF$support = fpDF$support * length(transactions)
+ names(fpDF)[names(fpDF) == "support"] = "freq"
+ > fpDF
+ items freq
+ 6 {1} 6
+ 3 {1,3} 5
+ 7 {2} 5
+ 8 {3} 5
+ 1 {2,4} 4
+ 2 {1,2,3} 4
+ 4 {2,3} 4
+ 5 {1,2} 4
+ 9 {4} 4
+ */
val expected = Set(
(Set(1), 6L), (Set(2), 5L), (Set(3), 5L), (Set(4), 4L),
(Set(1, 2), 4L), (Set(1, 3), 5L), (Set(2, 3), 4L),
@@ -110,12 +206,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.setMinSupport(0.3)
.setNumPartitions(4)
.run(rdd)
+
+ /* Verify results using the `R` code:
+ fp = eclat(transactions, parameter = list(support = 0.3))
+ fpDF = as(fp, "data.frame")
+ fpDF$support = fpDF$support * length(transactions)
+ names(fpDF)[names(fpDF) == "support"] = "freq"
+ > nrow(fpDF)
+ [1] 15
+ */
assert(model2.freqItemsets.count() === 15)
val model1 = fpg
.setMinSupport(0.1)
.setNumPartitions(8)
.run(rdd)
+
+ /* Verify results using the `R` code:
+ fp = eclat(transactions, parameter = list(support = 0.1))
+ fpDF = as(fp, "data.frame")
+ fpDF$support = fpDF$support * length(transactions)
+ names(fpDF)[names(fpDF) == "support"] = "freq"
+ > nrow(fpDF)
+ [1] 65
+ */
assert(model1.freqItemsets.count() === 65)
}
}
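A note on the arithmetic shared by all of the R snippets above: eclat reports relative support, while FPGrowthModel.freqItemsets carries absolute counts, hence the fpDF$support * length(transactions) rescaling in every comment. A minimal sketch of the equivalent cutoff computation, under the assumption that FPGrowth rounds the count threshold up (minCount = ceil(minSupport * n)):

object MinCountSketch {
  // Absolute-count cutoff implied by a relative minSupport over n transactions.
  // Assumption: an itemset is kept iff its count >= ceil(minSupport * n).
  def minCount(minSupport: Double, n: Long): Long =
    math.ceil(minSupport * n).toLong

  def main(args: Array[String]): Unit = {
    val n = 6L // the six string transactions in the first test
    println(minCount(0.9, n)) // 6 -- nothing reaches 6, matching "set of 0 itemsets"
    println(minCount(0.5, n)) // 3 -- matches the freq >= 3 rows in the first table
    println(minCount(0.3, n)) // 2 -- the cutoff behind the 54-itemset check
  }
}

The same arithmetic explains the Int-typed test: with seven transactions, minSupport = 0.5 gives ceil(3.5) = 4, which is exactly the smallest freq in the second table.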