library(dplyr)
library(tidyr)
library(ggplot2)
library(magrittr)
library(readr)
library(tibble)
library(stringr)
library(purrr)
library(cluster)
library(knitr)raw <- read_csv(
"https://github.com/vincent-usny/hw10/raw/refs/heads/main/GroceryDataSet.csv",
col_names = FALSE,
show_col_types = FALSE
)
txns <- raw %>%
mutate(tid = row_number()) %>%
pivot_longer(cols = -tid, values_to = "item") %>%
filter(!is.na(item), item != "") %>%
select(tid, item) %>%
mutate(item = str_trim(item))
n_txns <- n_distinct(txns$tid)
n_items <- n_distinct(txns$item)
cat("Transactions :", n_txns, "\n")## Transactions : 9835
## Unique items : 169
item_freq <- txns %>%
count(item, sort = TRUE) %>%
mutate(support = n / n_txns)
item_freq %>%
head(20) %>%
kable(digits = 4, caption = "Top 20 Items by Support")| item | n | support |
|---|---|---|
| whole milk | 2513 | 0.2555 |
| other vegetables | 1903 | 0.1935 |
| rolls/buns | 1809 | 0.1839 |
| soda | 1715 | 0.1744 |
| yogurt | 1372 | 0.1395 |
| bottled water | 1086 | 0.1104 |
| root vegetables | 1072 | 0.1090 |
| tropical fruit | 1032 | 0.1049 |
| shopping bags | 968 | 0.0984 |
| sausage | 924 | 0.0940 |
| pastry | 875 | 0.0890 |
| citrus fruit | 814 | 0.0828 |
| bottled beer | 790 | 0.0803 |
| newspapers | 784 | 0.0797 |
| canned beer | 764 | 0.0777 |
| pip fruit | 744 | 0.0756 |
| fruit/vegetable juice | 709 | 0.0721 |
| whipped/sour cream | 705 | 0.0717 |
| brown bread | 638 | 0.0649 |
| domestic eggs | 624 | 0.0634 |
item_freq %>%
head(20) %>%
mutate(item = reorder(item, support)) %>%
ggplot(aes(x = support, y = item, fill = support)) +
geom_col(show.legend = FALSE) +
scale_fill_gradient(low = "#a8d5e2", high = "#1a6b8a") +
scale_x_continuous(labels = scales::percent_format()) +
labs(
title = "Top 20 Items by Transaction Support",
x = "Support (%)",
y = NULL
) +
theme_minimal(base_size = 12)Whole milk dominates by a mile, appearing in about 1 in 4 baskets. It’s essentially the gravity center of this entire dataset.
Association rules describe the relationship {antecedent} => {consequent}.
# For efficiency, work with lists of item sets per transaction
txn_list <- txns %>%
group_by(tid) %>%
summarise(items = list(item), .groups = "drop")
# Item supports
item_sup <- item_freq %>%
select(item, support) %>%
deframe() # named numeric vector
# For each transaction, generate all 2-item combinations
pair_counts <- txn_list$items %>%
map(function(its) {
if (length(its) < 2) return(NULL)
combn(sort(its), 2, simplify = FALSE)
}) %>%
compact() %>%
flatten() %>%
map_chr(~ paste(.x, collapse = "|||")) %>%
table() %>%
as_tibble(.name_repair = "minimal") %>%
setNames(c("pair", "n")) %>%
mutate(
antecedent = str_split_fixed(pair, "\\|\\|\\|", 2)[, 1],
consequent = str_split_fixed(pair, "\\|\\|\\|", 2)[, 2],
pair_support = as.integer(n) / n_txns
)
cat("Candidate pairs generated:", nrow(pair_counts), "\n")## Candidate pairs generated: 9559
MIN_SUPPORT <- 0.005 # rule must appear in >= 0.5% of transactions
MIN_CONFIDENCE <- 0.20 # at least 20% of the time, {A} => {B}
rules <- pair_counts %>%
filter(pair_support >= MIN_SUPPORT) %>%
# Both directions: A -> B and B -> A
bind_rows(
pair_counts %>%
filter(pair_support >= MIN_SUPPORT) %>%
rename(antecedent = consequent, consequent = antecedent)
) %>%
mutate(
sup_antecedent = item_sup[antecedent],
sup_consequent = item_sup[consequent],
confidence = pair_support / sup_antecedent,
lift = confidence / sup_consequent
) %>%
filter(confidence >= MIN_CONFIDENCE) %>%
select(antecedent, consequent, pair_support, confidence, lift) %>%
rename(support = pair_support) %>%
arrange(desc(lift))
cat("Rules meeting thresholds:", nrow(rules), "\n")## Rules meeting thresholds: 264
rules %>%
head(10) %>%
mutate(
Rule = paste0("{", antecedent, "} => {", consequent, "}"),
Support = round(support, 4),
Confidence = round(confidence, 4),
Lift = round(lift, 4)
) %>%
select(Rule, Support, Confidence, Lift) %>%
kable(caption = "Top 10 Association Rules by Lift")| Rule | Support | Confidence | Lift |
|---|---|---|---|
| {herbs} => {root vegetables} | 0.0070 | 0.4312 | 3.9565 |
| {berries} => {whipped/sour cream} | 0.0090 | 0.2722 | 3.7969 |
| {sliced cheese} => {sausage} | 0.0070 | 0.2863 | 3.0474 |
| {beef} => {root vegetables} | 0.0174 | 0.3314 | 3.0404 |
| {onions} => {root vegetables} | 0.0095 | 0.3049 | 2.7975 |
| {grapes} => {tropical fruit} | 0.0061 | 0.2727 | 2.5991 |
| {pip fruit} => {tropical fruit} | 0.0204 | 0.2702 | 2.5746 |
| {soft cheese} => {yogurt} | 0.0060 | 0.3512 | 2.5175 |
| {herbs} => {other vegetables} | 0.0077 | 0.4750 | 2.4549 |
| {onions} => {other vegetables} | 0.0142 | 0.4590 | 2.3723 |
rules %>%
head(10) %>%
mutate(Rule = paste0("{", antecedent, "}\n=> {", consequent, "}"),
Rule = reorder(Rule, lift)) %>%
ggplot(aes(x = lift, y = Rule, fill = confidence)) +
geom_col() +
geom_text(aes(label = round(lift, 2)), hjust = -0.15, size = 3.5) +
scale_fill_gradient(low = "#f7c59f", high = "#e05c5c",
name = "Confidence") +
scale_x_continuous(expand = expansion(mult = c(0, 0.12))) +
labs(
title = "Top 10 Association Rules by Lift",
x = "Lift",
y = NULL
) +
theme_minimal(base_size = 11)Lift is the most useful metric here because it controls for item popularity. Herbs and root vegetables topping this chart makes intuitive sense.
rules %>%
ggplot(aes(x = support, y = confidence, color = lift, size = lift)) +
geom_point(alpha = 0.65) +
scale_color_gradient(low = "#91c4f2", high = "#c0392b", name = "Lift") +
scale_size_continuous(range = c(1, 6), guide = "none") +
scale_x_continuous(labels = scales::percent_format()) +
scale_y_continuous(labels = scales::percent_format()) +
labs(
title = "Association Rules: Support vs. Confidence",
subtitle = "Bubble size and color reflect Lift",
x = "Support",
y = "Confidence"
) +
theme_minimal(base_size = 12)Each bubble is one rule, and you want to look at the top-right corner: high support and high confidence. The color tells you that those red bubbles are the really interesting ones where the co-purchase is way above what you’d expect by chance.
top15 <- item_freq$item[1:15]
heatmap_data <- pair_counts %>%
filter(antecedent %in% top15, consequent %in% top15) %>%
select(antecedent, consequent, pair_support) %>%
bind_rows(
tibble(antecedent = top15, consequent = top15, pair_support = item_sup[top15])
)
# Make symmetric
heatmap_sym <- heatmap_data %>%
bind_rows(
heatmap_data %>% rename(antecedent = consequent, consequent = antecedent)
) %>%
group_by(antecedent, consequent) %>%
summarise(pair_support = max(pair_support), .groups = "drop")
heatmap_sym %>%
ggplot(aes(x = antecedent, y = consequent, fill = pair_support)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "#1a6b8a",
name = "Support", labels = scales::percent_format()) +
labs(
title = "Co-occurrence Support Heatmap — Top 15 Items",
x = NULL, y = NULL
) +
theme_minimal(base_size = 10) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))The darker the cell, the more often that pair ends up in the same basket. Whole milk lights up the whole right column and bottom row because it co-occurs with almost everything.
We cluster the top 50 items based on their co-occurrence profiles across transactions.
TOP_N <- 50
top_items <- item_freq$item[1:TOP_N]
# Build item x item co-occurrence matrix (support for each pair)
cooc_wide <- pair_counts %>%
filter(antecedent %in% top_items, consequent %in% top_items) %>%
select(antecedent, consequent, pair_support) %>%
bind_rows(
# Symmetric: add reversed pairs
pair_counts %>%
filter(antecedent %in% top_items, consequent %in% top_items) %>%
rename(antecedent = consequent, consequent = antecedent) %>%
select(antecedent, consequent, pair_support)
) %>%
group_by(antecedent, consequent) %>%
summarise(pair_support = max(pair_support), .groups = "drop") %>%
pivot_wider(
names_from = consequent,
values_from = pair_support,
values_fill = 0
) %>%
column_to_rownames("antecedent")
# Keep only columns that exist as rows (square matrix)
common <- intersect(rownames(cooc_wide), colnames(cooc_wide))
cooc_mat <- as.matrix(cooc_wide[common, common])
cat("Co-occurrence matrix:", nrow(cooc_mat), "x", ncol(cooc_mat), "\n")## Co-occurrence matrix: 50 x 50
set.seed(42)
wss <- map_dbl(1:10, function(k) {
kmeans(cooc_mat, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})
tibble(k = 1:10, wss = wss) %>%
ggplot(aes(x = k, y = wss)) +
geom_line(color = "#1a6b8a", linewidth = 1) +
geom_point(size = 3, color = "#e05c5c") +
scale_x_continuous(breaks = 1:10) +
labs(
title = "Elbow Method — Choosing K",
x = "Number of Clusters (K)",
y = "Total Within-Cluster SS"
) +
theme_minimal(base_size = 12)We’re looking for the bend where adding another cluster stops buying us much. K=5 is a reasonable call here — the curve flattens noticeably after that point, meaning five groups capture the main structure without overfitting.
K <- 5
set.seed(42)
km <- kmeans(cooc_mat, centers = K, nstart = 50, iter.max = 200)
cluster_df <- tibble(
item = names(km$cluster),
cluster = km$cluster
) %>%
left_join(item_freq %>% select(item, support), by = "item") %>%
arrange(cluster, desc(support))
cluster_df %>%
group_by(cluster) %>%
summarise(
n_items = n(),
top_items = paste(head(item, 5), collapse = ", "),
avg_support = round(mean(support), 4)
) %>%
kable(caption = paste("K-Means Cluster Summary (K =", K, ")"))| cluster | n_items | top_items | avg_support |
|---|---|---|---|
| 1 | 21 | canned beer, chocolate, white bread, cream cheese, waffles | 0.0360 |
| 2 | 10 | soda, bottled water, root vegetables, tropical fruit, shopping bags | 0.1010 |
| 3 | 1 | whole milk | 0.2555 |
| 4 | 15 | bottled beer, newspapers, fruit/vegetable juice, brown bread, domestic eggs | 0.0599 |
| 5 | 3 | other vegetables, rolls/buns, yogurt | 0.1723 |
# PCA for 2D visualization
pca <- prcomp(cooc_mat, scale. = TRUE)
pca_df <- as_tibble(pca$x[, 1:2], rownames = "item") %>%
left_join(cluster_df %>% select(item, cluster), by = "item") %>%
mutate(cluster = factor(cluster))
pca_df %>%
ggplot(aes(x = PC1, y = PC2, color = cluster, label = item)) +
geom_point(size = 3, alpha = 0.8) +
geom_text(aes(label = item), size = 2.8, vjust = -0.7, hjust = 0.5, show.legend = FALSE) +
scale_color_brewer(palette = "Set1", name = "Cluster") +
labs(
title = "K-Means Clusters of Grocery Items (PCA projection)",
subtitle = paste("Top", nrow(cooc_mat), "items clustered by co-occurrence profile"),
x = paste0("PC1 (", round(summary(pca)$importance[2,1]*100,1), "% var)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2]*100,1), "% var)")
) +
theme_minimal(base_size = 11)PCA squashes the 50-item co-occurrence space down to two dimensions so we can actually see what the clusters are doing. Items that land close together tend to be bought by the same types of shoppers.
sil <- silhouette(km$cluster, dist(cooc_mat))
sil_df <- as.data.frame(sil[, 1:3]) %>%
mutate(item = rownames(cooc_mat)) %>%
arrange(cluster, sil_width) %>%
mutate(order = row_number(), cluster = factor(cluster))
ggplot(sil_df, aes(x = order, y = sil_width, fill = cluster)) +
geom_col(width = 0.9) +
scale_fill_brewer(palette = "Set1", name = "Cluster") +
geom_hline(yintercept = mean(sil_df$sil_width),
linetype = "dashed", color = "gray40") +
labs(
title = "Silhouette Plot - Cluster Cohesion & Separation",
subtitle = paste("Average silhouette width:", round(mean(sil_df$sil_width), 3)),
x = "Items",
y = "Silhouette Width"
) +
theme_minimal(base_size = 11)Each bar shows how well that item fits its assigned cluster. Positive is good, negative means it probably belongs somewhere else. A decent average silhouette around 0.2–0.3 is typical for grocery data since shopping baskets naturally overlap a lot across categories.