library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(htmlwidgets)
# load the store transaction data into a sparse matrix
store <- read.transactions("StoreTransactions.csv",
format = "single",
sep = ",",
cols = c("Transaction", "Item"),
header = TRUE,
rm.duplicates = TRUE)
summary(store)
## transactions as itemMatrix in sparse format with
## 6613 rows (elements/itemsets/transactions) and
## 103 columns (items) and a density of 0.02028367
##
## most frequent items:
## Coffee Bread Tea Cake Pastry (Other)
## 3188 2146 941 694 576 6271
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10
## 2555 2154 1078 546 187 67 18 3 2 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.089 3.000 10.000
##
## includes extended item information - examples:
## labels
## 1 Adjustment
## 2 Afternoon with the baker
## 3 Alfajores
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 10
## 3 1000
# --- Visual check: sparse matrix like in lecture (optional, simple) ---
arules::image(store[1:5])
arules::image(sample(store, 100))
# get frequent pairs (size=2). adjust supp if you get too many/too few.
pairs_2 <- eclat(store,
parameter = list(
supp = 0.005,
maxlen = 2
))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.005 1 2 frequent itemsets TRUE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 33
##
## create itemset ...
## set transactions ...[103 item(s), 6613 transaction(s)] done [0.00s].
## sorting and recoding items ... [36 item(s)] done [0.00s].
## creating sparse bit matrix ... [36 row(s), 6613 column(s)] done [0.00s].
## writing ... [110 set(s)] done [0.00s].
## Creating S4 object ... done [0.00s].
length(pairs_2) # how many pairs we found
## [1] 110
# add quality measures: support, leverage (<0 means avoid each other), lift (<1 means negative assoc)
# Get support and lift for each pair
q_pairs <- interestMeasure(
pairs_2,
measure = c("support", "lift"),
transactions = store
)
pairs_df <- data.frame(
pair = labels(pairs_2),
support = q_pairs[, "support"],
lift = q_pairs[, "lift"],
row.names = NULL,
stringsAsFactors = FALSE
)
# "Substitute" candidates: lift < 1 (co-occur less than expected)
subs_df <- pairs_df[pairs_df$lift < 1, ]
subs_df <- subs_df[order(subs_df$lift), ] # smallest lift first
# Look at the strongest 10 substitutes
head(subs_df, 10)
## pair support lift
## 7 {Coffee,Scandinavian} 0.005292605 0.3761759
## 13 {Coffee,Farm House} 0.007712082 0.4198072
## 74 {Bread,Coffee} 0.089823076 0.5741653
## 39 {Bread,Juice} 0.007863300 0.6024077
## 28 {Bread,Soup} 0.007258430 0.6375615
## 73 {Bread,Tea} 0.029487373 0.6385778
## 22 {Bread,Muffin} 0.008014517 0.6805083
## 70 {Bread,Cake} 0.023287464 0.6838015
## 63 {Bread,Sandwich} 0.016936338 0.7000675
## 25 {Bread,Toast} 0.008014517 0.7009528
plot(pairs_df$support,
pairs_df$lift,
xlab = "support",
ylab = "lift",
main = "All Item Pairs: Support vs Lift",
pch = 19, cex = 0.6)
abline(h = 1, lty = 2, col = "red")
Most pairs lie around lift ≈ 1 (no strong relationship).
Points below the red line (lift < 1) are candidates for substitution: customers tend to buy these items separately rather than together.
Lift < 1 ⇒ the two items appear together less often than expected if they were independent.
The smaller the lift, the stronger the negative association ⇒ more likely the items are substitutes
“The pair {Coffee,Scandinavian} has lift 0.37:
customers tend to buy one or the other, not both. This suggests these
items behave as substitutes, so co-promoting them in a bundle may not be
effective; instead we should bundle each with complementary bakery
items.”
top_subs <- head(subs_df, 10) # 10 smallest lifts
barplot(top_subs$lift,
names.arg = top_subs$pair,
las = 2,
ylim = c(0, 1),
main = "Top 10 Substitute Pairs (lift < 1)",
ylab = "lift")
abline(h = 1, lty = 2)
These 10 pairs have the lowest lifts in the dataset, meaning they are
the strongest substitutes. For example, if
{bread, toast}
appears at the left with lift ≈ 0.7, shoppers are about 30% less likely
to buy them together than we’d expect from their individual
frequencies.