1. Data Loading

library(dplyr)
library(tidyr)
library(ggplot2)
library(magrittr)
library(readr)
library(tibble)
library(stringr)
library(purrr)
library(cluster)
library(knitr)

raw <- read_csv(
  "https://github.com/vincent-usny/hw10/raw/refs/heads/main/GroceryDataSet.csv",
  col_names       = FALSE,
  show_col_types  = FALSE
)

txns <- raw %>%
  mutate(tid = row_number()) %>%
  pivot_longer(cols = -tid, values_to = "item") %>%
  filter(!is.na(item), item != "") %>%
  select(tid, item) %>%
  mutate(item = str_trim(item))

n_txns <- n_distinct(txns$tid)
n_items <- n_distinct(txns$item)

cat("Transactions :", n_txns,  "\n")

## Transactions : 9835

cat("Unique items :", n_items, "\n")

## Unique items : 169

2. Exploratory Analysis

item_freq <- txns %>%
  count(item, sort = TRUE) %>%
  mutate(support = n / n_txns)

item_freq %>%
  head(20) %>%
  kable(digits = 4, caption = "Top 20 Items by Support")

Top 20 Items by Support
item	n	support
whole milk	2513	0.2555
other vegetables	1903	0.1935
rolls/buns	1809	0.1839
soda	1715	0.1744
yogurt	1372	0.1395
bottled water	1086	0.1104
root vegetables	1072	0.1090
tropical fruit	1032	0.1049
shopping bags	968	0.0984
sausage	924	0.0940
pastry	875	0.0890
citrus fruit	814	0.0828
bottled beer	790	0.0803
newspapers	784	0.0797
canned beer	764	0.0777
pip fruit	744	0.0756
fruit/vegetable juice	709	0.0721
whipped/sour cream	705	0.0717
brown bread	638	0.0649
domestic eggs	624	0.0634

item_freq %>%
  head(20) %>%
  mutate(item = reorder(item, support)) %>%
  ggplot(aes(x = support, y = item, fill = support)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d5e2", high = "#1a6b8a") +
  scale_x_continuous(labels = scales::percent_format()) +
  labs(
    title = "Top 20 Items by Transaction Support",
    x     = "Support (%)",
    y     = NULL
  ) +
  theme_minimal(base_size = 12)

Whole milk dominates by a mile, appearing in about 1 in 4 baskets. It’s essentially the gravity center of this entire dataset.

3. Market Basket Analysis

Association rules describe the relationship {antecedent} => {consequent}.

# For efficiency, work with lists of item sets per transaction
txn_list <- txns %>%
  group_by(tid) %>%
  summarise(items = list(item), .groups = "drop")

# Item supports 
item_sup <- item_freq %>%
  select(item, support) %>%
  deframe()                         # named numeric vector

# For each transaction, generate all 2-item combinations
pair_counts <- txn_list$items %>%
  map(function(its) {
    if (length(its) < 2) return(NULL)
    combn(sort(its), 2, simplify = FALSE)
  }) %>%
  compact() %>%
  flatten() %>%
  map_chr(~ paste(.x, collapse = "|||")) %>%
  table() %>%
  as_tibble(.name_repair = "minimal") %>%
  setNames(c("pair", "n")) %>%
  mutate(
    antecedent  = str_split_fixed(pair, "\\|\\|\\|", 2)[, 1],
    consequent  = str_split_fixed(pair, "\\|\\|\\|", 2)[, 2],
    pair_support = as.integer(n) / n_txns
  )

cat("Candidate pairs generated:", nrow(pair_counts), "\n")

## Candidate pairs generated: 9559

MIN_SUPPORT    <- 0.005    # rule must appear in >= 0.5% of transactions
MIN_CONFIDENCE <- 0.20     # at least 20% of the time, {A} => {B}

rules <- pair_counts %>%
  filter(pair_support >= MIN_SUPPORT) %>%
  # Both directions: A -> B  and  B -> A
  bind_rows(
    pair_counts %>%
      filter(pair_support >= MIN_SUPPORT) %>%
      rename(antecedent = consequent, consequent = antecedent)
  ) %>%
  mutate(
    sup_antecedent = item_sup[antecedent],
    sup_consequent = item_sup[consequent],
    confidence     = pair_support / sup_antecedent,
    lift           = confidence   / sup_consequent
  ) %>%
  filter(confidence >= MIN_CONFIDENCE) %>%
  select(antecedent, consequent, pair_support, confidence, lift) %>%
  rename(support = pair_support) %>%
  arrange(desc(lift))

cat("Rules meeting thresholds:", nrow(rules), "\n")

## Rules meeting thresholds: 264

4. Results

4.1 Top 10 Rules by Lift

rules %>%
  head(10) %>%
  mutate(
    Rule       = paste0("{", antecedent, "} => {", consequent, "}"),
    Support    = round(support,    4),
    Confidence = round(confidence, 4),
    Lift       = round(lift,       4)
  ) %>%
  select(Rule, Support, Confidence, Lift) %>%
  kable(caption = "Top 10 Association Rules by Lift")

Top 10 Association Rules by Lift
Rule	Support	Confidence	Lift
{herbs} => {root vegetables}	0.0070	0.4312	3.9565
{berries} => {whipped/sour cream}	0.0090	0.2722	3.7969
{sliced cheese} => {sausage}	0.0070	0.2863	3.0474
{beef} => {root vegetables}	0.0174	0.3314	3.0404
{onions} => {root vegetables}	0.0095	0.3049	2.7975
{grapes} => {tropical fruit}	0.0061	0.2727	2.5991
{pip fruit} => {tropical fruit}	0.0204	0.2702	2.5746
{soft cheese} => {yogurt}	0.0060	0.3512	2.5175
{herbs} => {other vegetables}	0.0077	0.4750	2.4549
{onions} => {other vegetables}	0.0142	0.4590	2.3723

rules %>%
  head(10) %>%
  mutate(Rule = paste0("{", antecedent, "}\n=> {", consequent, "}"),
         Rule = reorder(Rule, lift)) %>%
  ggplot(aes(x = lift, y = Rule, fill = confidence)) +
  geom_col() +
  geom_text(aes(label = round(lift, 2)), hjust = -0.15, size = 3.5) +
  scale_fill_gradient(low = "#f7c59f", high = "#e05c5c",
                      name = "Confidence") +
  scale_x_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(
    title = "Top 10 Association Rules by Lift",
    x     = "Lift",
    y     = NULL
  ) +
  theme_minimal(base_size = 11)

Lift is the most useful metric here because it controls for item popularity. Herbs and root vegetables topping this chart makes intuitive sense.

4.2 Support vs. Confidence vs. Lift (Scatter)

rules %>%
  ggplot(aes(x = support, y = confidence, color = lift, size = lift)) +
  geom_point(alpha = 0.65) +
  scale_color_gradient(low = "#91c4f2", high = "#c0392b", name = "Lift") +
  scale_size_continuous(range = c(1, 6), guide = "none") +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title    = "Association Rules: Support vs. Confidence",
    subtitle = "Bubble size and color reflect Lift",
    x        = "Support",
    y        = "Confidence"
  ) +
  theme_minimal(base_size = 12)

Each bubble is one rule, and you want to look at the top-right corner: high support and high confidence. The color tells you that those red bubbles are the really interesting ones where the co-purchase is way above what you’d expect by chance.

4.3 Item Co-occurrence Heatmap (Top 15 Items)

top15 <- item_freq$item[1:15]

heatmap_data <- pair_counts %>%
  filter(antecedent %in% top15, consequent %in% top15) %>%
  select(antecedent, consequent, pair_support) %>%
  bind_rows(
    tibble(antecedent = top15, consequent = top15, pair_support = item_sup[top15])
  )

# Make symmetric
heatmap_sym <- heatmap_data %>%
  bind_rows(
    heatmap_data %>% rename(antecedent = consequent, consequent = antecedent)
  ) %>%
  group_by(antecedent, consequent) %>%
  summarise(pair_support = max(pair_support), .groups = "drop")

heatmap_sym %>%
  ggplot(aes(x = antecedent, y = consequent, fill = pair_support)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "#1a6b8a",
                      name = "Support", labels = scales::percent_format()) +
  labs(
    title = "Co-occurrence Support Heatmap — Top 15 Items",
    x     = NULL, y = NULL
  ) +
  theme_minimal(base_size = 10) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The darker the cell, the more often that pair ends up in the same basket. Whole milk lights up the whole right column and bottom row because it co-occurs with almost everything.

5. Extra Credit — Cluster Analysis

We cluster the top 50 items based on their co-occurrence profiles across transactions.

TOP_N <- 50
top_items <- item_freq$item[1:TOP_N]

# Build item x item co-occurrence matrix (support for each pair)
cooc_wide <- pair_counts %>%
  filter(antecedent %in% top_items, consequent %in% top_items) %>%
  select(antecedent, consequent, pair_support) %>%
  bind_rows(
    # Symmetric: add reversed pairs
    pair_counts %>%
      filter(antecedent %in% top_items, consequent %in% top_items) %>%
      rename(antecedent = consequent, consequent = antecedent) %>%
      select(antecedent, consequent, pair_support)
  ) %>%
  group_by(antecedent, consequent) %>%
  summarise(pair_support = max(pair_support), .groups = "drop") %>%
  pivot_wider(
    names_from  = consequent,
    values_from = pair_support,
    values_fill = 0
  ) %>%
  column_to_rownames("antecedent")

# Keep only columns that exist as rows (square matrix)
common <- intersect(rownames(cooc_wide), colnames(cooc_wide))
cooc_mat <- as.matrix(cooc_wide[common, common])

cat("Co-occurrence matrix:", nrow(cooc_mat), "x", ncol(cooc_mat), "\n")

## Co-occurrence matrix: 50 x 50

set.seed(42)
wss <- map_dbl(1:10, function(k) {
  kmeans(cooc_mat, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})

tibble(k = 1:10, wss = wss) %>%
  ggplot(aes(x = k, y = wss)) +
  geom_line(color = "#1a6b8a", linewidth = 1) +
  geom_point(size = 3, color = "#e05c5c") +
  scale_x_continuous(breaks = 1:10) +
  labs(
    title = "Elbow Method — Choosing K",
    x     = "Number of Clusters (K)",
    y     = "Total Within-Cluster SS"
  ) +
  theme_minimal(base_size = 12)

We’re looking for the bend where adding another cluster stops buying us much. K=5 is a reasonable call here — the curve flattens noticeably after that point, meaning five groups capture the main structure without overfitting.

K <- 5
set.seed(42)
km <- kmeans(cooc_mat, centers = K, nstart = 50, iter.max = 200)

cluster_df <- tibble(
  item    = names(km$cluster),
  cluster = km$cluster
) %>%
  left_join(item_freq %>% select(item, support), by = "item") %>%
  arrange(cluster, desc(support))

cluster_df %>%
  group_by(cluster) %>%
  summarise(
    n_items     = n(),
    top_items   = paste(head(item, 5), collapse = ", "),
    avg_support = round(mean(support), 4)
  ) %>%
  kable(caption = paste("K-Means Cluster Summary (K =", K, ")"))

K-Means Cluster Summary (K = 5 )
cluster	n_items	top_items	avg_support
1	21	canned beer, chocolate, white bread, cream cheese, waffles	0.0360
2	10	soda, bottled water, root vegetables, tropical fruit, shopping bags	0.1010
3	1	whole milk	0.2555
4	15	bottled beer, newspapers, fruit/vegetable juice, brown bread, domestic eggs	0.0599
5	3	other vegetables, rolls/buns, yogurt	0.1723

# PCA for 2D visualization
pca <- prcomp(cooc_mat, scale. = TRUE)
pca_df <- as_tibble(pca$x[, 1:2], rownames = "item") %>%
  left_join(cluster_df %>% select(item, cluster), by = "item") %>%
  mutate(cluster = factor(cluster))

pca_df %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster, label = item)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_text(aes(label = item), size = 2.8, vjust = -0.7, hjust = 0.5, show.legend = FALSE) +
  scale_color_brewer(palette = "Set1", name = "Cluster") +
  labs(
    title    = "K-Means Clusters of Grocery Items (PCA projection)",
    subtitle = paste("Top", nrow(cooc_mat), "items clustered by co-occurrence profile"),
    x        = paste0("PC1 (", round(summary(pca)$importance[2,1]*100,1), "% var)"),
    y        = paste0("PC2 (", round(summary(pca)$importance[2,2]*100,1), "% var)")
  ) +
  theme_minimal(base_size = 11)

PCA squashes the 50-item co-occurrence space down to two dimensions so we can actually see what the clusters are doing. Items that land close together tend to be bought by the same types of shoppers.

sil <- silhouette(km$cluster, dist(cooc_mat))

sil_df <- as.data.frame(sil[, 1:3]) %>%
  mutate(item = rownames(cooc_mat)) %>%
  arrange(cluster, sil_width) %>%
  mutate(order = row_number(), cluster = factor(cluster))

ggplot(sil_df, aes(x = order, y = sil_width, fill = cluster)) +
  geom_col(width = 0.9) +
  scale_fill_brewer(palette = "Set1", name = "Cluster") +
  geom_hline(yintercept = mean(sil_df$sil_width),
             linetype = "dashed", color = "gray40") +
  labs(
    title    = "Silhouette Plot - Cluster Cohesion & Separation",
    subtitle = paste("Average silhouette width:", round(mean(sil_df$sil_width), 3)),
    x        = "Items",
    y        = "Silhouette Width"
  ) +
  theme_minimal(base_size = 11)

Each bar shows how well that item fits its assigned cluster. Positive is good, negative means it probably belongs somewhere else. A decent average silhouette around 0.2–0.3 is typical for grocery data since shopping baskets naturally overlap a lot across categories.

6. Summary

Association Rules

264 rules met the thresholds (support >= 0.5%, confidence >= 20%).
The highest-lift rules involve specialty produce pairings (e.g., citrus fruit, root vegetables, other vegetables) often purchased together with whole milk or yogurt.
Lift > 2 indicates a meaningful positive association; the top rules here show items are 2-3x more likely to be purchased together than by chance alone.

Cluster Analysis

K-Means with K = 5 (chosen via elbow method) partitions the top 50 items into coherent shopping segments visible in the PCA plot.
Clusters tend to reflect natural grocery departments: dairy/staples, fresh produce, baked goods/breakfast, beverages, and convenience/snack items.

Homework 10

Groceries Dataset

Haoming Chen

4/28/2026