1. Data Loading

library(dplyr)
library(tidyr)
library(ggplot2)
library(magrittr)
library(readr)
library(tibble)
library(stringr)
library(purrr)
library(cluster)
library(knitr)
raw <- read_csv(
  "https://github.com/vincent-usny/hw10/raw/refs/heads/main/GroceryDataSet.csv",
  col_names       = FALSE,
  show_col_types  = FALSE
)

txns <- raw %>%
  mutate(tid = row_number()) %>%
  pivot_longer(cols = -tid, values_to = "item") %>%
  filter(!is.na(item), item != "") %>%
  select(tid, item) %>%
  mutate(item = str_trim(item))

n_txns <- n_distinct(txns$tid)
n_items <- n_distinct(txns$item)

cat("Transactions :", n_txns,  "\n")
## Transactions : 9835
cat("Unique items :", n_items, "\n")
## Unique items : 169

2. Exploratory Analysis

item_freq <- txns %>%
  count(item, sort = TRUE) %>%
  mutate(support = n / n_txns)

item_freq %>%
  head(20) %>%
  kable(digits = 4, caption = "Top 20 Items by Support")
Top 20 Items by Support
item n support
whole milk 2513 0.2555
other vegetables 1903 0.1935
rolls/buns 1809 0.1839
soda 1715 0.1744
yogurt 1372 0.1395
bottled water 1086 0.1104
root vegetables 1072 0.1090
tropical fruit 1032 0.1049
shopping bags 968 0.0984
sausage 924 0.0940
pastry 875 0.0890
citrus fruit 814 0.0828
bottled beer 790 0.0803
newspapers 784 0.0797
canned beer 764 0.0777
pip fruit 744 0.0756
fruit/vegetable juice 709 0.0721
whipped/sour cream 705 0.0717
brown bread 638 0.0649
domestic eggs 624 0.0634
item_freq %>%
  head(20) %>%
  mutate(item = reorder(item, support)) %>%
  ggplot(aes(x = support, y = item, fill = support)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d5e2", high = "#1a6b8a") +
  scale_x_continuous(labels = scales::percent_format()) +
  labs(
    title = "Top 20 Items by Transaction Support",
    x     = "Support (%)",
    y     = NULL
  ) +
  theme_minimal(base_size = 12)

Whole milk dominates by a mile, appearing in about 1 in 4 baskets. It’s essentially the gravity center of this entire dataset.


3. Market Basket Analysis

Association rules describe the relationship {antecedent} => {consequent}.

# For efficiency, work with lists of item sets per transaction
txn_list <- txns %>%
  group_by(tid) %>%
  summarise(items = list(item), .groups = "drop")

# Item supports 
item_sup <- item_freq %>%
  select(item, support) %>%
  deframe()                         # named numeric vector

# For each transaction, generate all 2-item combinations
pair_counts <- txn_list$items %>%
  map(function(its) {
    if (length(its) < 2) return(NULL)
    combn(sort(its), 2, simplify = FALSE)
  }) %>%
  compact() %>%
  flatten() %>%
  map_chr(~ paste(.x, collapse = "|||")) %>%
  table() %>%
  as_tibble(.name_repair = "minimal") %>%
  setNames(c("pair", "n")) %>%
  mutate(
    antecedent  = str_split_fixed(pair, "\\|\\|\\|", 2)[, 1],
    consequent  = str_split_fixed(pair, "\\|\\|\\|", 2)[, 2],
    pair_support = as.integer(n) / n_txns
  )

cat("Candidate pairs generated:", nrow(pair_counts), "\n")
## Candidate pairs generated: 9559
MIN_SUPPORT    <- 0.005    # rule must appear in >= 0.5% of transactions
MIN_CONFIDENCE <- 0.20     # at least 20% of the time, {A} => {B}

rules <- pair_counts %>%
  filter(pair_support >= MIN_SUPPORT) %>%
  # Both directions: A -> B  and  B -> A
  bind_rows(
    pair_counts %>%
      filter(pair_support >= MIN_SUPPORT) %>%
      rename(antecedent = consequent, consequent = antecedent)
  ) %>%
  mutate(
    sup_antecedent = item_sup[antecedent],
    sup_consequent = item_sup[consequent],
    confidence     = pair_support / sup_antecedent,
    lift           = confidence   / sup_consequent
  ) %>%
  filter(confidence >= MIN_CONFIDENCE) %>%
  select(antecedent, consequent, pair_support, confidence, lift) %>%
  rename(support = pair_support) %>%
  arrange(desc(lift))

cat("Rules meeting thresholds:", nrow(rules), "\n")
## Rules meeting thresholds: 264

4. Results

4.1 Top 10 Rules by Lift

rules %>%
  head(10) %>%
  mutate(
    Rule       = paste0("{", antecedent, "} => {", consequent, "}"),
    Support    = round(support,    4),
    Confidence = round(confidence, 4),
    Lift       = round(lift,       4)
  ) %>%
  select(Rule, Support, Confidence, Lift) %>%
  kable(caption = "Top 10 Association Rules by Lift")
Top 10 Association Rules by Lift
Rule Support Confidence Lift
{herbs} => {root vegetables} 0.0070 0.4312 3.9565
{berries} => {whipped/sour cream} 0.0090 0.2722 3.7969
{sliced cheese} => {sausage} 0.0070 0.2863 3.0474
{beef} => {root vegetables} 0.0174 0.3314 3.0404
{onions} => {root vegetables} 0.0095 0.3049 2.7975
{grapes} => {tropical fruit} 0.0061 0.2727 2.5991
{pip fruit} => {tropical fruit} 0.0204 0.2702 2.5746
{soft cheese} => {yogurt} 0.0060 0.3512 2.5175
{herbs} => {other vegetables} 0.0077 0.4750 2.4549
{onions} => {other vegetables} 0.0142 0.4590 2.3723
rules %>%
  head(10) %>%
  mutate(Rule = paste0("{", antecedent, "}\n=> {", consequent, "}"),
         Rule = reorder(Rule, lift)) %>%
  ggplot(aes(x = lift, y = Rule, fill = confidence)) +
  geom_col() +
  geom_text(aes(label = round(lift, 2)), hjust = -0.15, size = 3.5) +
  scale_fill_gradient(low = "#f7c59f", high = "#e05c5c",
                      name = "Confidence") +
  scale_x_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(
    title = "Top 10 Association Rules by Lift",
    x     = "Lift",
    y     = NULL
  ) +
  theme_minimal(base_size = 11)

Lift is the most useful metric here because it controls for item popularity. Herbs and root vegetables topping this chart makes intuitive sense.

4.2 Support vs. Confidence vs. Lift (Scatter)

rules %>%
  ggplot(aes(x = support, y = confidence, color = lift, size = lift)) +
  geom_point(alpha = 0.65) +
  scale_color_gradient(low = "#91c4f2", high = "#c0392b", name = "Lift") +
  scale_size_continuous(range = c(1, 6), guide = "none") +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title    = "Association Rules: Support vs. Confidence",
    subtitle = "Bubble size and color reflect Lift",
    x        = "Support",
    y        = "Confidence"
  ) +
  theme_minimal(base_size = 12)

Each bubble is one rule, and you want to look at the top-right corner: high support and high confidence. The color tells you that those red bubbles are the really interesting ones where the co-purchase is way above what you’d expect by chance.

4.3 Item Co-occurrence Heatmap (Top 15 Items)

top15 <- item_freq$item[1:15]

heatmap_data <- pair_counts %>%
  filter(antecedent %in% top15, consequent %in% top15) %>%
  select(antecedent, consequent, pair_support) %>%
  bind_rows(
    tibble(antecedent = top15, consequent = top15, pair_support = item_sup[top15])
  )

# Make symmetric
heatmap_sym <- heatmap_data %>%
  bind_rows(
    heatmap_data %>% rename(antecedent = consequent, consequent = antecedent)
  ) %>%
  group_by(antecedent, consequent) %>%
  summarise(pair_support = max(pair_support), .groups = "drop")

heatmap_sym %>%
  ggplot(aes(x = antecedent, y = consequent, fill = pair_support)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "#1a6b8a",
                      name = "Support", labels = scales::percent_format()) +
  labs(
    title = "Co-occurrence Support Heatmap — Top 15 Items",
    x     = NULL, y = NULL
  ) +
  theme_minimal(base_size = 10) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The darker the cell, the more often that pair ends up in the same basket. Whole milk lights up the whole right column and bottom row because it co-occurs with almost everything.


5. Extra Credit — Cluster Analysis

We cluster the top 50 items based on their co-occurrence profiles across transactions.

TOP_N <- 50
top_items <- item_freq$item[1:TOP_N]

# Build item x item co-occurrence matrix (support for each pair)
cooc_wide <- pair_counts %>%
  filter(antecedent %in% top_items, consequent %in% top_items) %>%
  select(antecedent, consequent, pair_support) %>%
  bind_rows(
    # Symmetric: add reversed pairs
    pair_counts %>%
      filter(antecedent %in% top_items, consequent %in% top_items) %>%
      rename(antecedent = consequent, consequent = antecedent) %>%
      select(antecedent, consequent, pair_support)
  ) %>%
  group_by(antecedent, consequent) %>%
  summarise(pair_support = max(pair_support), .groups = "drop") %>%
  pivot_wider(
    names_from  = consequent,
    values_from = pair_support,
    values_fill = 0
  ) %>%
  column_to_rownames("antecedent")

# Keep only columns that exist as rows (square matrix)
common <- intersect(rownames(cooc_wide), colnames(cooc_wide))
cooc_mat <- as.matrix(cooc_wide[common, common])

cat("Co-occurrence matrix:", nrow(cooc_mat), "x", ncol(cooc_mat), "\n")
## Co-occurrence matrix: 50 x 50
set.seed(42)
wss <- map_dbl(1:10, function(k) {
  kmeans(cooc_mat, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})

tibble(k = 1:10, wss = wss) %>%
  ggplot(aes(x = k, y = wss)) +
  geom_line(color = "#1a6b8a", linewidth = 1) +
  geom_point(size = 3, color = "#e05c5c") +
  scale_x_continuous(breaks = 1:10) +
  labs(
    title = "Elbow Method — Choosing K",
    x     = "Number of Clusters (K)",
    y     = "Total Within-Cluster SS"
  ) +
  theme_minimal(base_size = 12)

We’re looking for the bend where adding another cluster stops buying us much. K=5 is a reasonable call here — the curve flattens noticeably after that point, meaning five groups capture the main structure without overfitting.

K <- 5
set.seed(42)
km <- kmeans(cooc_mat, centers = K, nstart = 50, iter.max = 200)

cluster_df <- tibble(
  item    = names(km$cluster),
  cluster = km$cluster
) %>%
  left_join(item_freq %>% select(item, support), by = "item") %>%
  arrange(cluster, desc(support))

cluster_df %>%
  group_by(cluster) %>%
  summarise(
    n_items     = n(),
    top_items   = paste(head(item, 5), collapse = ", "),
    avg_support = round(mean(support), 4)
  ) %>%
  kable(caption = paste("K-Means Cluster Summary (K =", K, ")"))
K-Means Cluster Summary (K = 5 )
cluster n_items top_items avg_support
1 21 canned beer, chocolate, white bread, cream cheese, waffles 0.0360
2 10 soda, bottled water, root vegetables, tropical fruit, shopping bags 0.1010
3 1 whole milk 0.2555
4 15 bottled beer, newspapers, fruit/vegetable juice, brown bread, domestic eggs 0.0599
5 3 other vegetables, rolls/buns, yogurt 0.1723
# PCA for 2D visualization
pca <- prcomp(cooc_mat, scale. = TRUE)
pca_df <- as_tibble(pca$x[, 1:2], rownames = "item") %>%
  left_join(cluster_df %>% select(item, cluster), by = "item") %>%
  mutate(cluster = factor(cluster))

pca_df %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster, label = item)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_text(aes(label = item), size = 2.8, vjust = -0.7, hjust = 0.5, show.legend = FALSE) +
  scale_color_brewer(palette = "Set1", name = "Cluster") +
  labs(
    title    = "K-Means Clusters of Grocery Items (PCA projection)",
    subtitle = paste("Top", nrow(cooc_mat), "items clustered by co-occurrence profile"),
    x        = paste0("PC1 (", round(summary(pca)$importance[2,1]*100,1), "% var)"),
    y        = paste0("PC2 (", round(summary(pca)$importance[2,2]*100,1), "% var)")
  ) +
  theme_minimal(base_size = 11)

PCA squashes the 50-item co-occurrence space down to two dimensions so we can actually see what the clusters are doing. Items that land close together tend to be bought by the same types of shoppers.

sil <- silhouette(km$cluster, dist(cooc_mat))

sil_df <- as.data.frame(sil[, 1:3]) %>%
  mutate(item = rownames(cooc_mat)) %>%
  arrange(cluster, sil_width) %>%
  mutate(order = row_number(), cluster = factor(cluster))

ggplot(sil_df, aes(x = order, y = sil_width, fill = cluster)) +
  geom_col(width = 0.9) +
  scale_fill_brewer(palette = "Set1", name = "Cluster") +
  geom_hline(yintercept = mean(sil_df$sil_width),
             linetype = "dashed", color = "gray40") +
  labs(
    title    = "Silhouette Plot - Cluster Cohesion & Separation",
    subtitle = paste("Average silhouette width:", round(mean(sil_df$sil_width), 3)),
    x        = "Items",
    y        = "Silhouette Width"
  ) +
  theme_minimal(base_size = 11)

Each bar shows how well that item fits its assigned cluster. Positive is good, negative means it probably belongs somewhere else. A decent average silhouette around 0.2–0.3 is typical for grocery data since shopping baskets naturally overlap a lot across categories.


6. Summary

Association Rules

  • 264 rules met the thresholds (support >= 0.5%, confidence >= 20%).
  • The highest-lift rules involve specialty produce pairings (e.g., citrus fruit, root vegetables, other vegetables) often purchased together with whole milk or yogurt.
  • Lift > 2 indicates a meaningful positive association; the top rules here show items are 2-3x more likely to be purchased together than by chance alone.

Cluster Analysis

  • K-Means with K = 5 (chosen via elbow method) partitions the top 50 items into coherent shopping segments visible in the PCA plot.
  • Clusters tend to reflect natural grocery departments: dairy/staples, fresh produce, baked goods/breakfast, beverages, and convenience/snack items.