4.1 Performance Optimization: FP-Growth Algorithm
In this subsection I introduce the FP-Growth (Frequent
Pattern Growth) algorithm as an alternative to Apriori, using
the fast C backend via fim4r to mine rules on the same data
and compare runtime.
# =========================================================
# High-Performance Mining with FP-Growth
# =========================================================
# Ensure packages
# Parameters (tune if needed)
fpg_supp <- 0.01
fpg_conf <- 0.25
cat("Running FP-Growth algorithm (C-implementation via fim4r)...\n")
## Running FP-Growth algorithm (C-implementation via fim4r)...
cat(sprintf("Parameters: Support = %.3f, Confidence = %.2f\n", fpg_supp, fpg_conf))
## Parameters: Support = 0.010, Confidence = 0.25
time_fpg <- system.time({
fpg_rules <- arules::fim4r(
trans,
method = "fpgrowth",
target = "rules",
supp = fpg_supp,
conf = fpg_conf
)
})
## fim4r.fpgrowth
##
## Parameter specification:
## supp conf target report
## 1 25 rules scl
##
## Data size: 964462 transactions and 48452 items
## Result: 7 rules
# Optional: remove empty LHS rules
if (length(fpg_rules) > 0) {
fpg_rules <- fpg_rules[size(lhs(fpg_rules)) > 0]
}
cat("\n[FP-Growth Results]\n")
##
## [FP-Growth Results]
cat("Execution Time:", time_fpg["elapsed"], "seconds\n")
## Execution Time: 0.882 seconds
cat("Rules Generated:", length(fpg_rules), "\n")
## Rules Generated: 7
# =========================================================
# Performance Comparison: Apriori vs. FP-Growth
# =========================================================
# Assuming 'rules' is the object from Section 3.5 (Apriori after redundant pruning)
cat("--- Algorithm Comparison ---\n")
## --- Algorithm Comparison ---
# Create comparison table
algo_comparison <- data.frame(
Algorithm = c("Apriori", "FP-Growth"),
Rules_Found = c(length(rules), length(fpg_rules)),
Execution_Time_sec = c(NA, time_fpg["elapsed"])
)
knitr::kable(algo_comparison, digits = 2, format = "html",
col.names = c("Algorithm", "Rules Found", "Execution Time (seconds)")) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12
)
|
|
Algorithm
|
Rules Found
|
Execution Time (seconds)
|
|
|
Apriori
|
18
|
NA
|
|
elapsed
|
FP-Growth
|
7
|
0.88
|
# Note: Even if rule counts are identical, execution time differs.
# FP-Growth constructs a tree structure (FP-Tree) instead of generating candidates.
4.2 Rule Quality & Consistency Checks
I compare the rule quality and the overlap between Apriori and
FP-Growth results.
From the quality summaries, I observe that FP-Growth shows slightly
higher average confidence (Mean: 0.309 vs 0.293) but lower average and
maximum lift compared with Apriori (Max Lift: 2.57 vs 5.79). This
suggests that while FP-Growth identifies highly reliable rules (higher
confidence), Apriori is more effective at capturing rare but
high-strength associations that may be filtered out by the higher
support threshold in FP-Growth.
For rule overlap, the intersection size is 7, and the exclusive rules
are mainly in Apriori (11 rules in Apriori vs 0 in FP-Growth). This
means FP-Growth confirms a subset of Apriori patterns, specifically
those with higher support levels. The absence of exclusive rules in
FP-Growth suggests it provides a more conservative, high-frequency set
of rules, whereas Apriori’s broader range helps explain its practical
value in discovering “hidden gems” or niche associations with lower
support but higher lift.
# =========================================================
# Rule Quality Comparison (Lift/Confidence)
# =========================================================
# Safety checks
stopifnot(exists("rules"), exists("fpg_rules"))
# Extract quality metrics
apriori_q <- quality(rules)[, c("support", "confidence", "lift")]
fpg_q <- quality(fpg_rules)[, c("support", "confidence", "lift")]
# Summary statistics
cat("[Apriori Quality Summary]\n")
## [Apriori Quality Summary]
apriori_summary_df <- data.frame(
Metric = c("Support", "Confidence", "Lift"),
Min = c(min(apriori_q$support), min(apriori_q$confidence), min(apriori_q$lift)),
Mean = c(mean(apriori_q$support), mean(apriori_q$confidence), mean(apriori_q$lift)),
Max = c(max(apriori_q$support), max(apriori_q$confidence), max(apriori_q$lift))
)
formattable::formattable(apriori_summary_df)
|
Metric
|
Min
|
Mean
|
Max
|
|
Support
|
0.0051044
|
0.009380302
|
0.01954561
|
|
Confidence
|
0.2527340
|
0.293323569
|
0.37895907
|
|
Lift
|
1.7420349
|
2.400580954
|
5.78901715
|
cat("\n[FP-Growth Quality Summary]\n")
##
## [FP-Growth Quality Summary]
fpg_summary_df <- data.frame(
Metric = c("Support", "Confidence", "Lift"),
Min = c(min(fpg_q$support), min(fpg_q$confidence), min(fpg_q$lift)),
Mean = c(mean(fpg_q$support), mean(fpg_q$confidence), mean(fpg_q$lift)),
Max = c(max(fpg_q$support), max(fpg_q$confidence), max(fpg_q$lift))
)
formattable::formattable(fpg_summary_df)
|
Metric
|
Min
|
Mean
|
Max
|
|
Support
|
0.01016214
|
0.01363188
|
0.01954561
|
|
Confidence
|
0.26787232
|
0.30920496
|
0.37895907
|
|
Lift
|
1.81674944
|
2.24012537
|
2.57015611
|
# Compare top-k by lift
k <- 10
cat(sprintf("\nTop-%d rules by lift (Apriori):\n", k))
##
## Top-10 rules by lift (Apriori):
inspect(head(sort(rules, by = "lift"), k))
## lhs rhs support confidence coverage lift count kulczynski certainty
## [1] {Organic Cilantro} => {Limes} 0.005439302 0.2527340 0.02152184 5.789017 5246 0.1886622 0.2186210
## [2] {Organic Large Extra Fancy Fuji Apple} => {Bag of Organic Bananas} 0.007272448 0.3089323 0.02354059 2.619325 7014 0.1852964 0.2165268
## [3] {Organic Fuji Apple} => {Banana} 0.010606950 0.3789591 0.02798970 2.570156 10230 0.2254485 0.2715524
## [4] {Organic Raspberries} => {Bag of Organic Bananas} 0.012699308 0.2966264 0.04281247 2.514988 12248 0.2021496 0.2025753
## [5] {Organic Hass Avocado} => {Bag of Organic Bananas} 0.019545612 0.2936293 0.06656561 2.489576 18851 0.2296747 0.1991775
## [6] {Honeycrisp Apple} => {Banana} 0.008993615 0.3606953 0.02493411 2.446288 8674 0.2108456 0.2501300
## [7] {Apple Honeycrisp Organic} => {Bag of Organic Bananas} 0.007302517 0.2789970 0.02617418 2.365514 7043 0.1704562 0.1825887
## [8] {Organic Cucumber} => {Bag of Organic Bananas} 0.006616124 0.2674126 0.02474125 2.267295 6381 0.1617542 0.1694553
## [9] {Cucumber Kirby} => {Banana} 0.010162142 0.3334921 0.03047191 2.261793 9801 0.2012066 0.2182222
## [10] {Organic Gala Apples} => {Bag of Organic Bananas} 0.005886183 0.2592593 0.02270385 2.198165 5677 0.1545830 0.1602117
cat("\n=== Apriori Top Rules (Table View) ===\n")
##
## === Apriori Top Rules (Table View) ===
apriori_top_df <- as(head(sort(rules, by = "lift"), k), "data.frame")
knitr::kable(apriori_top_df, digits = 2, format = "html") %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12
) %>%
kableExtra::scroll_box(width = "100%", height = "300px")
|
|
rules
|
support
|
confidence
|
coverage
|
lift
|
count
|
kulczynski
|
certainty
|
|
4
|
{Organic Cilantro} => {Limes}
|
0.01
|
0.25
|
0.02
|
5.79
|
5246
|
0.19
|
0.22
|
|
6
|
{Organic Large Extra Fancy Fuji Apple} => {Bag of Organic Bananas}
|
0.01
|
0.31
|
0.02
|
2.62
|
7014
|
0.19
|
0.22
|
|
9
|
{Organic Fuji Apple} => {Banana}
|
0.01
|
0.38
|
0.03
|
2.57
|
10230
|
0.23
|
0.27
|
|
15
|
{Organic Raspberries} => {Bag of Organic Bananas}
|
0.01
|
0.30
|
0.04
|
2.51
|
12248
|
0.20
|
0.20
|
|
18
|
{Organic Hass Avocado} => {Bag of Organic Bananas}
|
0.02
|
0.29
|
0.07
|
2.49
|
18851
|
0.23
|
0.20
|
|
7
|
{Honeycrisp Apple} => {Banana}
|
0.01
|
0.36
|
0.02
|
2.45
|
8674
|
0.21
|
0.25
|
|
10
|
{Apple Honeycrisp Organic} => {Bag of Organic Bananas}
|
0.01
|
0.28
|
0.03
|
2.37
|
7043
|
0.17
|
0.18
|
|
11
|
{Organic Cucumber} => {Bag of Organic Bananas}
|
0.01
|
0.27
|
0.02
|
2.27
|
6381
|
0.16
|
0.17
|
|
13
|
{Cucumber Kirby} => {Banana}
|
0.01
|
0.33
|
0.03
|
2.26
|
9801
|
0.20
|
0.22
|
|
3
|
{Organic Gala Apples} => {Bag of Organic Bananas}
|
0.01
|
0.26
|
0.02
|
2.20
|
5677
|
0.15
|
0.16
|
cat(sprintf("\nTop-%d rules by lift (FP-Growth):\n", k))
##
## Top-10 rules by lift (FP-Growth):
inspect(head(sort(fpg_rules, by = "lift"), k))
## lhs rhs support confidence
## [1] {Organic Fuji Apple} => {Banana} 0.01060695 0.3789591
## [2] {Organic Raspberries} => {Bag of Organic Bananas} 0.01269931 0.2966264
## [3] {Organic Hass Avocado} => {Bag of Organic Bananas} 0.01954561 0.2936293
## [4] {Cucumber Kirby} => {Banana} 0.01016214 0.3334921
## [5] {Organic Avocado} => {Banana} 0.01682907 0.3058241
## [6] {Strawberries} => {Banana} 0.01283306 0.2880315
## [7] {Large Lemon} => {Banana} 0.01274700 0.2678723
## lift count
## [1] 2.570156 10230
## [2] 2.514988 12248
## [3] 2.489576 18851
## [4] 2.261793 9801
## [5] 2.074144 16231
## [6] 1.953472 12377
## [7] 1.816749 12294
cat("\n=== FP-Growth Top Rules (Table View) ===\n")
##
## === FP-Growth Top Rules (Table View) ===
fpg_top_df <- as(head(sort(fpg_rules, by = "lift"), k), "data.frame")
knitr::kable(fpg_top_df, digits = 2, format = "html") %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12
) %>%
kableExtra::scroll_box(width = "100%", height = "300px")
|
|
rules
|
support
|
confidence
|
lift
|
count
|
|
16
|
{Organic Fuji Apple} => {Banana}
|
0.01
|
0.38
|
2.57
|
10230
|
|
7
|
{Organic Raspberries} => {Bag of Organic Bananas}
|
0.01
|
0.30
|
2.51
|
12248
|
|
2
|
{Organic Hass Avocado} => {Bag of Organic Bananas}
|
0.02
|
0.29
|
2.49
|
18851
|
|
12
|
{Cucumber Kirby} => {Banana}
|
0.01
|
0.33
|
2.26
|
9801
|
|
5
|
{Organic Avocado} => {Banana}
|
0.02
|
0.31
|
2.07
|
16231
|
|
11
|
{Strawberries} => {Banana}
|
0.01
|
0.29
|
1.95
|
12377
|
|
6
|
{Large Lemon} => {Banana}
|
0.01
|
0.27
|
1.82
|
12294
|
# =========================================================
# Rule Overlap (Apriori vs FP-Growth)
# =========================================================
# Convert rules to comparable string form
apriori_rules_chr <- as(rules, "data.frame")$rules
fpg_rules_chr <- as(fpg_rules, "data.frame")$rules
# Overlap stats
overlap <- intersect(apriori_rules_chr, fpg_rules_chr)
only_apriori <- setdiff(apriori_rules_chr, fpg_rules_chr)
only_fpg <- setdiff(fpg_rules_chr, apriori_rules_chr)
cat("[Rule Overlap Summary]
")
## [Rule Overlap Summary]
cat("Overlap rules:", length(overlap), "\n")
## Overlap rules: 7
cat("Only Apriori rules:", length(only_apriori), "\n")
## Only Apriori rules: 11
cat("Only FP-Growth rules:", length(only_fpg), "\n")
## Only FP-Growth rules: 0
# Optional: show a few exclusive rules
cat("\nExamples only in Apriori (up to 5):\n")
##
## Examples only in Apriori (up to 5):
if (length(only_apriori) > 0) {
formattable::formattable(data.frame(Rules = head(only_apriori, 5)))
}
|
Rules
|
|
{Blueberries} => {Banana}
|
|
{Original Hummus} => {Banana}
|
|
{Organic Gala Apples} => {Bag of Organic Bananas}
|
|
{Organic Cilantro} => {Limes}
|
|
{Yellow Onions} => {Banana}
|
cat("\nExamples only in FP-Growth (up to 5):\n")
##
## Examples only in FP-Growth (up to 5):
if (length(only_fpg) > 0) {
formattable::formattable(data.frame(Rules = head(only_fpg, 5)))
}