#Importing the data

raw <- read.csv(
  "GroceryDataSet.csv",
  header = FALSE, 
  stringsAsFactors = FALSE,
  na.strings = c("", "NA")
)

cat("Raw dimensions:", nrow(raw), "rows x", ncol(raw), "columns\n")

## Raw dimensions: 9835 rows x 32 columns

cat("First few rows (first 6 cols):\n")

## First few rows (first 6 cols):

print(head(raw[, 1:min(6, ncol(raw))]))

##                 V1                  V2             V3                       V4
## 1     citrus fruit semi-finished bread      margarine              ready soups
## 2   tropical fruit              yogurt         coffee                     <NA>
## 3       whole milk                <NA>           <NA>                     <NA>
## 4        pip fruit              yogurt  cream cheese              meat spreads
## 5 other vegetables          whole milk condensed milk long life bakery product
## 6       whole milk              butter         yogurt                     rice
##                 V5   V6
## 1             <NA> <NA>
## 2             <NA> <NA>
## 3             <NA> <NA>
## 4             <NA> <NA>
## 5             <NA> <NA>
## 6 abrasive cleaner <NA>

Fixing formating

trans_list <- apply(raw, 1, function(row) {
  items <- row[!is.na(row) & nchar(trimws(row)) > 0]
  trimws(unique(items))   # deduplicate within one basket
})
 
transactions <- as(trans_list, "transactions")
 
cat("\n--- Transaction Summary ---\n")

## 
## --- Transaction Summary ---

summary(transactions)

## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics

#Find the most frequent item

dev.new(width = 9, height = 5.5, noRStudioGD = FALSE)
itemFrequencyPlot(
  transactions,
  topN      = 20,
  type      = "absolute",
  col       = "#4472C4",
  border    = "white",
  main      = "Top 20 Most Frequent Grocery Items",
  ylab      = "Frequency (count)",
  cex.names = 0.85
)

Association rule(Apriori)

rules <- apriori(
  transactions,
  parameter = list(
    support    = 0.01,
    confidence = 0.20,
    minlen     = 2
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.2    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [231 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

cat("\n--- Rules Summary ---\n")

## 
## --- Rules Summary ---

summary(rules)

## set of 231 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3 
## 151  80 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.346   3.000   3.000 
## 
## summary of quality measures:
##     support          confidence        coverage            lift       
##  Min.   :0.01007   Min.   :0.2006   Min.   :0.01729   Min.   :0.8991  
##  1st Qu.:0.01200   1st Qu.:0.2470   1st Qu.:0.03437   1st Qu.:1.4455  
##  Median :0.01474   Median :0.3180   Median :0.05236   Median :1.7278  
##  Mean   :0.01903   Mean   :0.3324   Mean   :0.06304   Mean   :1.7924  
##  3rd Qu.:0.02222   3rd Qu.:0.4035   3rd Qu.:0.07565   3rd Qu.:2.0781  
##  Max.   :0.07483   Max.   :0.5862   Max.   :0.25552   Max.   :3.2950  
##      count      
##  Min.   : 99.0  
##  1st Qu.:118.0  
##  Median :145.0  
##  Mean   :187.2  
##  3rd Qu.:218.5  
##  Max.   :736.0  
## 
## mining info:
##          data ntransactions support confidence
##  transactions          9835    0.01        0.2
##                                                                                          call
##  apriori(data = transactions, parameter = list(support = 0.01, confidence = 0.2, minlen = 2))

calculating the support, confidence, and lift

rules_df <- as(rules, "data.frame")
cat("\nTotal rules found:", nrow(rules_df), "\n")

## 
## Total rules found: 231

# calculating rules
cat("\nSupport range:    ", round(range(rules_df$support),    4), "\n")

## 
## Support range:     0.0101 0.0748

cat("Confidence range: ", round(range(rules_df$confidence), 4), "\n")

## Confidence range:  0.2006 0.5862

cat("Lift range:       ", round(range(rules_df$lift),        4), "\n")

## Lift range:        0.8991 3.295

Calculating Top rules by life

top10_lift <- head(sort(rules, by = "lift"), 10)
 
cat("\n========== TOP 10 RULES BY LIFT ==========\n")

## 
## ========== TOP 10 RULES BY LIFT ==========

inspect(top10_lift)

##      lhs                                   rhs                  support   
## [1]  {citrus fruit, other vegetables}   => {root vegetables}    0.01037112
## [2]  {other vegetables, yogurt}         => {whipped/sour cream} 0.01016777
## [3]  {other vegetables, tropical fruit} => {root vegetables}    0.01230300
## [4]  {beef}                             => {root vegetables}    0.01738688
## [5]  {citrus fruit, root vegetables}    => {other vegetables}   0.01037112
## [6]  {root vegetables, tropical fruit}  => {other vegetables}   0.01230300
## [7]  {other vegetables, whole milk}     => {root vegetables}    0.02318251
## [8]  {curd, whole milk}                 => {yogurt}             0.01006609
## [9]  {other vegetables, yogurt}         => {root vegetables}    0.01291307
## [10] {other vegetables, yogurt}         => {tropical fruit}     0.01230300
##      confidence coverage   lift     count
## [1]  0.3591549  0.02887646 3.295045 102  
## [2]  0.2341920  0.04341637 3.267062 100  
## [3]  0.3427762  0.03589222 3.144780 121  
## [4]  0.3313953  0.05246568 3.040367 171  
## [5]  0.5862069  0.01769192 3.029608 102  
## [6]  0.5845411  0.02104728 3.020999 121  
## [7]  0.3097826  0.07483477 2.842082 228  
## [8]  0.3852140  0.02613116 2.761356  99  
## [9]  0.2974239  0.04341637 2.728698 127  
## [10] 0.2833724  0.04341637 2.700550 121

top10_df <- as(top10_lift, "data.frame")
top10_df <- top10_df[, c("rules", "support", "confidence", "coverage", "lift", "count")]
top10_df$support    <- round(top10_df$support,    4)
top10_df$confidence <- round(top10_df$confidence, 4)
top10_df$lift       <- round(top10_df$lift,        4)
 
cat("\nTop 10 rules (formatted):\n")

## 
## Top 10 rules (formatted):

print(top10_df, row.names = FALSE)

##                                                   rules support confidence
##    {citrus fruit,other vegetables} => {root vegetables}  0.0104     0.3592
##       {other vegetables,yogurt} => {whipped/sour cream}  0.0102     0.2342
##  {other vegetables,tropical fruit} => {root vegetables}  0.0123     0.3428
##                             {beef} => {root vegetables}  0.0174     0.3314
##    {citrus fruit,root vegetables} => {other vegetables}  0.0104     0.5862
##  {root vegetables,tropical fruit} => {other vegetables}  0.0123     0.5845
##      {other vegetables,whole milk} => {root vegetables}  0.0232     0.3098
##                           {curd,whole milk} => {yogurt}  0.0101     0.3852
##          {other vegetables,yogurt} => {root vegetables}  0.0129     0.2974
##           {other vegetables,yogurt} => {tropical fruit}  0.0123     0.2834
##    coverage   lift count
##  0.02887646 3.2950   102
##  0.04341637 3.2671   100
##  0.03589222 3.1448   121
##  0.05246568 3.0404   171
##  0.01769192 3.0296   102
##  0.02104728 3.0210   121
##  0.07483477 2.8421   228
##  0.02613116 2.7614    99
##  0.04341637 2.7287   127
##  0.04341637 2.7005   121

top10_df$rule_label <- paste0(
  gsub("\\{|\\}", "", sub(" =>.*",  "", top10_df$rules)),
  "\n=> ",
  gsub("\\{|\\}", "", sub(".*=> ", "", top10_df$rules))
)
 
dev.new(width = 10, height = 6, noRStudioGD = FALSE)
print(
  ggplot(top10_df, aes(x = reorder(rule_label, lift), y = lift, fill = confidence)) +
    geom_col(width = 0.7) +
    geom_text(aes(label = round(lift, 2)), hjust = -0.15, size = 3.2) +
    coord_flip() +
    scale_fill_gradient(low = "#AED6F1", high = "#1A5276", name = "Confidence") +
    labs(
      title    = "Top 10 Association Rules by Lift",
      subtitle = "Bar length = Lift  |  Colour = Confidence",
      x        = NULL,
      y        = "Lift"
    ) +
    theme_minimal(base_size = 11) +
    theme(plot.title = element_text(face = "bold"))
)

dev.new(width = 8, height = 5.5, noRStudioGD = FALSE)
plot(
  rules,
  method  = "scatterplot",
  measure = c("support", "confidence"),
  shading = "lift",
  main    = "Association Rules — Support vs Confidence (colour = Lift)"
)

## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

dev.new(width = 10, height = 8, noRStudioGD = FALSE)
plot(
  head(sort(rules, by = "lift"), 20),
  method = "graph",
  engine = "igraph",
  main   = "Top 20 Rules by Lift — Network Graph"
)

LLM AI was use to try and make this print in the correct direction but it still couldn’t be fixed

dev.new(width = 14, height = 8, noRStudioGD = FALSE)
par(mar = c(10, 2, 4, 2))  # extra bottom margin for LHS labels
plot(
  head(sort(rules, by = "lift"), 30),
  method     = "grouped",
  main       = "Top 30 Rules by Lift — Grouped Matrix",
  cex        = 0.75        # shrink all text so labels fit horizontally
)

## Warning: Unknown control parameters: main, cex

## Available control parameters (with default values):
## k     =  20
## aggr.fun  =  function (x, ...)  UseMethod("mean")
## rhs_max   =  10
## lhs_label_items   =  2
## col   =  c("#EE0000FF", "#EEEEEEFF")
## groups    =  NULL
## engine    =  ggplot2
## verbose   =  FALSE

par(mar = c(5, 4, 4, 2))   # reset margins back to default

Cluster Analysis extra credit

TOP_N_ITEMS <- 50
 
freq_items <- sort(itemFrequency(transactions, type = "absolute"), decreasing = TRUE)
top_items  <- names(freq_items[1:TOP_N_ITEMS])
 
trans_sub <- transactions[, top_items]
bin_mat   <- as(trans_sub, "matrix") * 1L
 
cat("Binary matrix dimensions:", nrow(bin_mat), "x", ncol(bin_mat), "\n")

## Binary matrix dimensions: 9835 x 50

# --- Elbow plot to choose k ---
set.seed(42)
sample_idx <- sample(nrow(bin_mat), min(3000, nrow(bin_mat)))
bin_sample <- bin_mat[sample_idx, ]
 
wss <- sapply(2:10, function(k) {
  kmeans(bin_sample, centers = k, nstart = 10, iter.max = 50)$tot.withinss
})

Plot to see number of clusters (Elbow plot)

dev.new(width = 7.5, height = 5, noRStudioGD = FALSE)
plot(
  2:10, wss,
  type = "b", pch = 19, col = "#E74C3C",
  xlab = "Number of Clusters (k)",
  ylab = "Total Within-Cluster SS",
  main = "Elbow Plot — Optimal Number of Clusters"
)
abline(v = 4, lty = 2, col = "grey50")
text(4.15, max(wss) * 0.97, "k = 4", col = "grey40", cex = 0.85)
 
# --- Fit k-means with k = 4 ---
K  <- 4
km <- kmeans(bin_mat, centers = K, nstart = 25, iter.max = 100)
 
cat("\nk-means cluster sizes (k =", K, "):\n")

## 
## k-means cluster sizes (k = 4 ):

print(table(km$cluster))

## 
##    1    2    3    4 
## 1774 5009 1809 1243

PCA cluster visualisation

pca_result <- prcomp(bin_mat, scale. = FALSE)
 
pca_df <- data.frame(
  PC1     = pca_result$x[, 1],
  PC2     = pca_result$x[, 2],
  Cluster = factor(km$cluster)
)
 
dev.new(width = 8, height = 6, noRStudioGD = FALSE)
print(
  ggplot(pca_df, aes(PC1, PC2, colour = Cluster)) +
    geom_point(alpha = 0.30, size = 0.7) +
    stat_ellipse(level = 0.85, linewidth = 0.9) +
    scale_colour_brewer(palette = "Set1") +
    labs(
      title    = paste("k-Means Clustering (k =", K, ") — PCA Projection"),
      subtitle = paste("Based on top", TOP_N_ITEMS, "most frequent items"),
      x = "PC1", y = "PC2"
    ) +
    theme_minimal(base_size = 12) +
    theme(plot.title = element_text(face = "bold"))
)
 
# --- Cluster profiles ---
cluster_profiles <- data.frame(
  item = colnames(bin_mat),
  t(aggregate(bin_mat, by = list(km$cluster), FUN = mean)[, -1])
)
colnames(cluster_profiles)[-1] <- paste0("Cluster_", 1:K)
cluster_profiles$item <- colnames(bin_mat)
 
cat("\nTop 5 items per cluster (by mean purchase rate):\n")

## 
## Top 5 items per cluster (by mean purchase rate):

for (k in 1:K) {
  col  <- paste0("Cluster_", k)
  top5 <- cluster_profiles[order(-cluster_profiles[[col]]), ][1:5, c("item", col)]
  cat(sprintf("\n  Cluster %d:\n", k))
  print(top5, row.names = FALSE)
}

## 
##   Cluster 1:
##             item Cluster_1
##       whole milk 1.0000000
##       rolls/buns 0.2142052
##           yogurt 0.1871477
##             soda 0.1431793
##  root vegetables 0.1426156
## 
##   Cluster 2:
##           item  Cluster_2
##     rolls/buns 0.15392294
##    canned beer 0.10521062
##         yogurt 0.09502895
##  bottled water 0.08644440
##  shopping bags 0.08544620
## 
##   Cluster 3:
##              item Cluster_3
##  other vegetables 0.9944721
##        whole milk 0.4068546
##   root vegetables 0.2603648
##            yogurt 0.2371476
##        rolls/buns 0.2144831
## 
##   Cluster 4:
##           item Cluster_4
##           soda 1.0000000
##     rolls/buns 0.2172164
##  bottled water 0.1568785
##  shopping bags 0.1351569
##        sausage 0.1206758

Cluster heatmap

profile_long <- melt(
  cluster_profiles,
  id.vars       = "item",
  variable.name = "Cluster",
  value.name    = "MeanPurchaseRate"
)
 
dev.new(width = 9, height = 7, noRStudioGD = FALSE)
print(
  ggplot(profile_long,
         aes(x = Cluster, y = reorder(item, MeanPurchaseRate), fill = MeanPurchaseRate)) +
    geom_tile(colour = "white") +
    scale_fill_gradient(low = "#EBF5FB", high = "#1A5276", name = "Purchase\nRate") +
    labs(
      title = "Cluster Profiles — Item Purchase Rates per Cluster",
      x = NULL, y = "Item"
    ) +
    theme_minimal(base_size = 10) +
    theme(
      axis.text.y = element_text(size = 7),
      plot.title  = element_text(face = "bold")
    )
)

Assignment 10

2026-05-11

Fixing formating

Cluster Analysis extra credit