knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(arules)
library(arulesViz)
library(tidyverse)
library(caret)
#Association rules
txns <- read.transactions(
"GroceryDataSet.csv",
format = "basket",
sep = ",",
rm.duplicates = TRUE
)
#Run Apriori - minimum support of 0.01, confidence of 0.25, and at least one item on each side
rules <- apriori(txns,
parameter = list(support = 0.01,
confidence = 0.25,
minlen = 2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 98
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [170 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
#Top 10 by lift sorted
top10 <- head(sort(rules, by = "lift"), 10)
as(top10, "data.frame") |>
arrange(desc(lift)) |>
mutate(support = round(support, 4),
confidence = round(confidence, 4),
lift = round(lift, 4)) |>
select(rules, support, confidence, lift) |>
print()
## rules support confidence
## 1 {citrus fruit,other vegetables} => {root vegetables} 0.0104 0.3592
## 2 {other vegetables,tropical fruit} => {root vegetables} 0.0123 0.3428
## 3 {beef} => {root vegetables} 0.0174 0.3314
## 4 {citrus fruit,root vegetables} => {other vegetables} 0.0104 0.5862
## 5 {root vegetables,tropical fruit} => {other vegetables} 0.0123 0.5845
## 6 {other vegetables,whole milk} => {root vegetables} 0.0232 0.3098
## 7 {curd,whole milk} => {yogurt} 0.0101 0.3852
## 8 {other vegetables,yogurt} => {root vegetables} 0.0129 0.2974
## 9 {other vegetables,yogurt} => {tropical fruit} 0.0123 0.2834
## 10 {other vegetables,rolls/buns} => {root vegetables} 0.0122 0.2864
## lift
## 1 3.2950
## 2 3.1448
## 3 3.0404
## 4 3.0296
## 5 3.0210
## 6 2.8421
## 7 2.7614
## 8 2.7287
## 9 2.7005
## 10 2.6275
#Network graph - nodes are items, edges are rules, color = lift, size = support
#Top10 by lift graph
plot(top10, method = "graph")
There are two patterns that can be seen from the results. The first and most dominant one is a produce cluster, in which citrus fruit, tropical fruit, other vegetables, and root vegetables tend to be purchased together, with root vegetables acting as a natural endpoint across six of the ten rules. The strongest rule, {citrus fruit, other vegetables} => {root vegetables}, has a lift of 3.30, meaning that customers with those two items in their basket are 3.3 times more likely to also buy root vegetables than just a random customer. It’s probable that these are customers shopping with a meal in mind already, rather than just grabbing items at random.
The second pattern is a smaller dairy cluster, where {curd, whole milk} => {yogurt} has a lift of 2.76, meaning that there is a group of shoppers making multiple dairy purchases in a single trip. This is visible in the network graph as a separate, loosely connected node away from the produce hub.
#Cluster analysis
#Narrows down to top 30 items by frequency
top_items <- names(sort(itemFrequency(txns), decreasing = TRUE))[1:30]
#Convert to a binary matrix - 1 if the item was purchased, 0 otherwise
bmat <- as(txns[, top_items], "matrix") * 1L
bmat_df <- as.data.frame(bmat)
#Center and scale before clustering
bmat_scaled <- predict(preProcess(bmat_df, method = c("center", "scale")), bmat_df)
set.seed(33)
km <- kmeans(bmat_scaled, centers = 4, nstart = 25, iter.max = 100)
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(km$cluster))
##
## 1 2 3 4
## 477 514 2198 6646
#Profile each cluster by average purchase rate per item, print top 5 per cluster
#What does each cluster buy?
bmat_df |>
mutate(cluster = km$cluster) |>
group_by(cluster) |>
summarise(across(everything(), mean)) |>
pivot_longer(-cluster, names_to = "item", values_to = "avg") |>
group_by(cluster) |>
slice_max(avg, n = 5) |>
print(n = 40)
## # A tibble: 20 × 3
## # Groups: cluster [4]
## cluster item avg
## <int> <chr> <dbl>
## 1 1 napkins 1
## 2 1 whole milk 0.373
## 3 1 other vegetables 0.260
## 4 1 soda 0.233
## 5 1 yogurt 0.231
## 6 2 beef 1
## 7 2 whole milk 0.403
## 8 2 other vegetables 0.377
## 9 2 root vegetables 0.333
## 10 2 rolls/buns 0.261
## 11 3 whole milk 0.527
## 12 3 other vegetables 0.478
## 13 3 yogurt 0.339
## 14 3 root vegetables 0.276
## 15 3 tropical fruit 0.269
## 16 4 soda 0.165
## 17 4 rolls/buns 0.154
## 18 4 whole milk 0.146
## 19 4 canned beer 0.0963
## 20 4 shopping bags 0.0825
The clustering identified four groups of shoppers. Clusters 3 and 4 are by far the largest. The fourth cluster, which is the largest, shows low purchase rates across most items, with soda, beer, and rolls/buns being the most common. This suggests that these are customers who make smaller and less varied trips, they might just be trying to top up their existing food pantry. Cluster 3 is the opposite, where we have high rates of whole milk, vegetables, yogurt, and fruit, which points to customers doing a more complete weekly shop. Perhaps clusters 3 and 4 show the difference between shoppers who use this store as their main one, compared to ones who don’t.
The two smaller clusters are more specific. Cluster 2 is almost entirely defined by beef purchases, paired with root vegetables and other vegetables. This is likely customers shopping around for a specific meal. Cluster 1 is defined by napkins, alongside dairy and soda, which could reflect customers shopping for a specific occasion or event rather than routine groceries. Overall, the clustering suggests that, while most customers make relatively simple trips, a meaningful subset shops in more deliberate and predictable patterns, which is where targeted promotions would have the most impact.