library(arules)
library(arulesViz)
library(plotly)
library(htmlwidgets)
# Read the store transaction log into an arules transactions object (a sparse
# item matrix). format = "single" means each row of the CSV pairs one
# transaction ID (column "Transaction") with one item (column "Item");
# repeated items within a transaction are dropped via rm.duplicates.
store <- read.transactions(
  file = "StoreTransactions.csv",
  format = "single",
  header = TRUE,
  sep = ",",
  cols = c("Transaction", "Item"),
  rm.duplicates = TRUE
)
# Print size, density, most frequent items and transaction-length distribution
summary(store)
## transactions as itemMatrix in sparse format with
## 6613 rows (elements/itemsets/transactions) and
## 103 columns (items) and a density of 0.02028367
##
## most frequent items:
## Coffee Bread Tea Cake Pastry (Other)
## 3188 2146 941 694 576 6271
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10
## 2555 2154 1078 546 187 67 18 3 2 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.089 3.000 10.000
##
## includes extended item information - examples:
## labels
## 1 Adjustment
## 2 Afternoon with the baker
## 3 Alfajores
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 10
## 3 1000
# Bar chart of the 20 most frequently purchased items
itemFrequencyPlot(store, topN = 20,
                  main = "Top 20 Items")
# Visualize the sparse matrix for the first 5 transactions
# (bounded by the number of transactions so a short file does not error)
arules::image(store[seq_len(min(5, length(store)))])
# Visualize a random sample of up to 100 transactions.
# The RNG seed is fixed so the sampled plot is identical on every run, and the
# sample size is capped because sampling without replacement fails when fewer
# than 100 transactions are available.
set.seed(123)
arules::image(sample(store, min(100, length(store))))
# Mine association rules with the same ballpark thresholds as the lecture:
# minimum support 0.5%, minimum confidence 30%, at most 3 items per rule.
# NOTE(review): minlen is left at its default, so rules with an empty LHS
# (e.g. {} => {Coffee}) can be part of the result; consider minlen = 2 if
# those trivial rules are unwanted -- confirm before changing.
rules <- apriori(
  store,
  parameter = list(
    support = 0.005,
    confidence = 0.30,
    maxlen = 3,
    target = "rules"
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 3 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 33
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[103 item(s), 6613 transaction(s)] done [0.00s].
## sorting and recoding items ... [36 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3
## done [0.00s].
## writing ... [42 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Number of rules returned by apriori(); used to judge whether the support /
# confidence thresholds need to be relaxed or tightened.
length(rules)
## [1] 42
# Insight: If you get zero rules, relax the thresholds slightly (uncomment one line at a time):
# rules <- apriori(store, parameter = list(supp = 0.005, conf = 0.25, maxlen = 3))
# If you get too many rules, tighten the thresholds:
# rules <- apriori(store, parameter = list(supp = 0.02, conf = 0.40, maxlen = 3))
# Inspect rules: order them by lift, strongest association first, and print
# the ten highest-lift rules with their quality measures.
rules_lift <- sort(rules, by = "lift", decreasing = TRUE)
inspect(head(rules_lift, n = 10))
## lhs rhs support confidence coverage
## [1] {Cake, Sandwich} => {Coffee} 0.005595040 0.7551020 0.007409648
## [2] {Toast} => {Coffee} 0.025706941 0.7296137 0.035233631
## [3] {NONE, Sandwich} => {Coffee} 0.005141388 0.6415094 0.008014517
## [4] {Spanish Brunch} => {Coffee} 0.014063209 0.6326531 0.022228943
## [5] {Cake, Hot chocolate} => {Coffee} 0.006502344 0.6323529 0.010282776
## [6] {Salad} => {Coffee} 0.007863300 0.6117647 0.012853470
## [7] {NONE} => {Coffee} 0.041735975 0.5810526 0.071828217
## [8] {Medialuna} => {Coffee} 0.032965371 0.5751979 0.057311356
## [9] {Sandwich} => {Coffee} 0.042340844 0.5679513 0.074550129
## [10] {Pastry} => {Coffee} 0.048691970 0.5590278 0.087101164
## lift count
## [1] 1.566339 37
## [2] 1.513468 170
## [3] 1.330710 34
## [4] 1.312338 93
## [5] 1.311716 43
## [6] 1.269009 52
## [7] 1.205301 276
## [8] 1.193157 218
## [9] 1.178125 280
## [10] 1.159614 322
# Order the rules by confidence, highest first, and print the top ten.
rules_conf <- sort(rules, by = "confidence", decreasing = TRUE)
inspect(head(rules_conf, n = 10))
## lhs rhs support confidence coverage
## [1] {Cake, Sandwich} => {Coffee} 0.005595040 0.7551020 0.007409648
## [2] {Toast} => {Coffee} 0.025706941 0.7296137 0.035233631
## [3] {NONE, Sandwich} => {Coffee} 0.005141388 0.6415094 0.008014517
## [4] {Spanish Brunch} => {Coffee} 0.014063209 0.6326531 0.022228943
## [5] {Cake, Hot chocolate} => {Coffee} 0.006502344 0.6323529 0.010282776
## [6] {Salad} => {Coffee} 0.007863300 0.6117647 0.012853470
## [7] {NONE} => {Coffee} 0.041735975 0.5810526 0.071828217
## [8] {Medialuna} => {Coffee} 0.032965371 0.5751979 0.057311356
## [9] {Sandwich} => {Coffee} 0.042340844 0.5679513 0.074550129
## [10] {Pastry} => {Coffee} 0.048691970 0.5590278 0.087101164
## lift count
## [1] 1.566339 37
## [2] 1.513468 170
## [3] 1.330710 34
## [4] 1.312338 93
## [5] 1.311716 43
## [6] 1.269009 52
## [7] 1.205301 276
## [8] 1.193157 218
## [9] 1.178125 280
## [10] 1.159614 322
# Scatter plot of (up to) the 100 highest-lift rules: support on one axis,
# confidence on the other, points shaded by lift.
plot(
  head(rules_lift, n = 100),
  method = "scatterplot",
  measure = c("support", "confidence"),
  shading = "lift"
)
# Grouped-matrix view of (up to) the 50 highest-lift rules.
plot(
  head(rules_lift, n = 50),
  method = "grouped"
)
# - Darker clusters, or higher-confidence points with high lift, are the most actionable pairs/sets for cross-selling, placement, or bundles.
# - Customers who buy a muffin or pastry often also buy coffee (lift > 1, confidence ≈ 0.5–0.65). Consider placing or bundling these together.
# - Sweet snacks such as brownies tend to point to coffee on the RHS, so they could be bundled under a promotion theme like “afternoon tea”.
# Export every mined rule (with its quality measures) as a plain CSV so the
# results are easy to share; row names are omitted from the file.
rules_df <- as(rules, "data.frame")
write.csv(
  rules_df,
  file = "association_rules_Yuhan_Xinyuan.csv",
  row.names = FALSE
)