#Importing the data
raw <- read.csv(
"GroceryDataSet.csv",
header = FALSE,
stringsAsFactors = FALSE,
na.strings = c("", "NA")
)
cat("Raw dimensions:", nrow(raw), "rows x", ncol(raw), "columns\n")
## Raw dimensions: 9835 rows x 32 columns
cat("First few rows (first 6 cols):\n")
## First few rows (first 6 cols):
print(head(raw[, 1:min(6, ncol(raw))]))
## V1 V2 V3 V4
## 1 citrus fruit semi-finished bread margarine ready soups
## 2 tropical fruit yogurt coffee <NA>
## 3 whole milk <NA> <NA> <NA>
## 4 pip fruit yogurt cream cheese meat spreads
## 5 other vegetables whole milk condensed milk long life bakery product
## 6 whole milk butter yogurt rice
## V5 V6
## 1 <NA> <NA>
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> <NA>
## 6 abrasive cleaner <NA>
trans_list <- apply(raw, 1, function(row) {
items <- row[!is.na(row) & nchar(trimws(row)) > 0]
trimws(unique(items)) # deduplicate within one basket
})
transactions <- as(trans_list, "transactions")
cat("\n--- Transaction Summary ---\n")
##
## --- Transaction Summary ---
summary(transactions)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
#Find the most frequent item
dev.new(width = 9, height = 5.5, noRStudioGD = FALSE)
itemFrequencyPlot(
transactions,
topN = 20,
type = "absolute",
col = "#4472C4",
border = "white",
main = "Top 20 Most Frequent Grocery Items",
ylab = "Frequency (count)",
cex.names = 0.85
)
Association rule(Apriori)
rules <- apriori(
transactions,
parameter = list(
support = 0.01,
confidence = 0.20,
minlen = 2
)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 98
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [231 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
cat("\n--- Rules Summary ---\n")
##
## --- Rules Summary ---
summary(rules)
## set of 231 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 151 80
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.346 3.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01007 Min. :0.2006 Min. :0.01729 Min. :0.8991
## 1st Qu.:0.01200 1st Qu.:0.2470 1st Qu.:0.03437 1st Qu.:1.4455
## Median :0.01474 Median :0.3180 Median :0.05236 Median :1.7278
## Mean :0.01903 Mean :0.3324 Mean :0.06304 Mean :1.7924
## 3rd Qu.:0.02222 3rd Qu.:0.4035 3rd Qu.:0.07565 3rd Qu.:2.0781
## Max. :0.07483 Max. :0.5862 Max. :0.25552 Max. :3.2950
## count
## Min. : 99.0
## 1st Qu.:118.0
## Median :145.0
## Mean :187.2
## 3rd Qu.:218.5
## Max. :736.0
##
## mining info:
## data ntransactions support confidence
## transactions 9835 0.01 0.2
## call
## apriori(data = transactions, parameter = list(support = 0.01, confidence = 0.2, minlen = 2))
calculating the support, confidence, and lift
rules_df <- as(rules, "data.frame")
cat("\nTotal rules found:", nrow(rules_df), "\n")
##
## Total rules found: 231
# calculating rules
cat("\nSupport range: ", round(range(rules_df$support), 4), "\n")
##
## Support range: 0.0101 0.0748
cat("Confidence range: ", round(range(rules_df$confidence), 4), "\n")
## Confidence range: 0.2006 0.5862
cat("Lift range: ", round(range(rules_df$lift), 4), "\n")
## Lift range: 0.8991 3.295
Calculating Top rules by life
top10_lift <- head(sort(rules, by = "lift"), 10)
cat("\n========== TOP 10 RULES BY LIFT ==========\n")
##
## ========== TOP 10 RULES BY LIFT ==========
inspect(top10_lift)
## lhs rhs support
## [1] {citrus fruit, other vegetables} => {root vegetables} 0.01037112
## [2] {other vegetables, yogurt} => {whipped/sour cream} 0.01016777
## [3] {other vegetables, tropical fruit} => {root vegetables} 0.01230300
## [4] {beef} => {root vegetables} 0.01738688
## [5] {citrus fruit, root vegetables} => {other vegetables} 0.01037112
## [6] {root vegetables, tropical fruit} => {other vegetables} 0.01230300
## [7] {other vegetables, whole milk} => {root vegetables} 0.02318251
## [8] {curd, whole milk} => {yogurt} 0.01006609
## [9] {other vegetables, yogurt} => {root vegetables} 0.01291307
## [10] {other vegetables, yogurt} => {tropical fruit} 0.01230300
## confidence coverage lift count
## [1] 0.3591549 0.02887646 3.295045 102
## [2] 0.2341920 0.04341637 3.267062 100
## [3] 0.3427762 0.03589222 3.144780 121
## [4] 0.3313953 0.05246568 3.040367 171
## [5] 0.5862069 0.01769192 3.029608 102
## [6] 0.5845411 0.02104728 3.020999 121
## [7] 0.3097826 0.07483477 2.842082 228
## [8] 0.3852140 0.02613116 2.761356 99
## [9] 0.2974239 0.04341637 2.728698 127
## [10] 0.2833724 0.04341637 2.700550 121
top10_df <- as(top10_lift, "data.frame")
top10_df <- top10_df[, c("rules", "support", "confidence", "coverage", "lift", "count")]
top10_df$support <- round(top10_df$support, 4)
top10_df$confidence <- round(top10_df$confidence, 4)
top10_df$lift <- round(top10_df$lift, 4)
cat("\nTop 10 rules (formatted):\n")
##
## Top 10 rules (formatted):
print(top10_df, row.names = FALSE)
## rules support confidence
## {citrus fruit,other vegetables} => {root vegetables} 0.0104 0.3592
## {other vegetables,yogurt} => {whipped/sour cream} 0.0102 0.2342
## {other vegetables,tropical fruit} => {root vegetables} 0.0123 0.3428
## {beef} => {root vegetables} 0.0174 0.3314
## {citrus fruit,root vegetables} => {other vegetables} 0.0104 0.5862
## {root vegetables,tropical fruit} => {other vegetables} 0.0123 0.5845
## {other vegetables,whole milk} => {root vegetables} 0.0232 0.3098
## {curd,whole milk} => {yogurt} 0.0101 0.3852
## {other vegetables,yogurt} => {root vegetables} 0.0129 0.2974
## {other vegetables,yogurt} => {tropical fruit} 0.0123 0.2834
## coverage lift count
## 0.02887646 3.2950 102
## 0.04341637 3.2671 100
## 0.03589222 3.1448 121
## 0.05246568 3.0404 171
## 0.01769192 3.0296 102
## 0.02104728 3.0210 121
## 0.07483477 2.8421 228
## 0.02613116 2.7614 99
## 0.04341637 2.7287 127
## 0.04341637 2.7005 121
top10_df$rule_label <- paste0(
gsub("\\{|\\}", "", sub(" =>.*", "", top10_df$rules)),
"\n=> ",
gsub("\\{|\\}", "", sub(".*=> ", "", top10_df$rules))
)
dev.new(width = 10, height = 6, noRStudioGD = FALSE)
print(
ggplot(top10_df, aes(x = reorder(rule_label, lift), y = lift, fill = confidence)) +
geom_col(width = 0.7) +
geom_text(aes(label = round(lift, 2)), hjust = -0.15, size = 3.2) +
coord_flip() +
scale_fill_gradient(low = "#AED6F1", high = "#1A5276", name = "Confidence") +
labs(
title = "Top 10 Association Rules by Lift",
subtitle = "Bar length = Lift | Colour = Confidence",
x = NULL,
y = "Lift"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
)
dev.new(width = 8, height = 5.5, noRStudioGD = FALSE)
plot(
rules,
method = "scatterplot",
measure = c("support", "confidence"),
shading = "lift",
main = "Association Rules — Support vs Confidence (colour = Lift)"
)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
dev.new(width = 10, height = 8, noRStudioGD = FALSE)
plot(
head(sort(rules, by = "lift"), 20),
method = "graph",
engine = "igraph",
main = "Top 20 Rules by Lift — Network Graph"
)
LLM AI was use to try and make this print in the correct direction but it still couldn’t be fixed
dev.new(width = 14, height = 8, noRStudioGD = FALSE)
par(mar = c(10, 2, 4, 2)) # extra bottom margin for LHS labels
plot(
head(sort(rules, by = "lift"), 30),
method = "grouped",
main = "Top 30 Rules by Lift — Grouped Matrix",
cex = 0.75 # shrink all text so labels fit horizontally
)
## Warning: Unknown control parameters: main, cex
## Available control parameters (with default values):
## k = 20
## aggr.fun = function (x, ...) UseMethod("mean")
## rhs_max = 10
## lhs_label_items = 2
## col = c("#EE0000FF", "#EEEEEEFF")
## groups = NULL
## engine = ggplot2
## verbose = FALSE
par(mar = c(5, 4, 4, 2)) # reset margins back to default
TOP_N_ITEMS <- 50
freq_items <- sort(itemFrequency(transactions, type = "absolute"), decreasing = TRUE)
top_items <- names(freq_items[1:TOP_N_ITEMS])
trans_sub <- transactions[, top_items]
bin_mat <- as(trans_sub, "matrix") * 1L
cat("Binary matrix dimensions:", nrow(bin_mat), "x", ncol(bin_mat), "\n")
## Binary matrix dimensions: 9835 x 50
# --- Elbow plot to choose k ---
set.seed(42)
sample_idx <- sample(nrow(bin_mat), min(3000, nrow(bin_mat)))
bin_sample <- bin_mat[sample_idx, ]
wss <- sapply(2:10, function(k) {
kmeans(bin_sample, centers = k, nstart = 10, iter.max = 50)$tot.withinss
})
Plot to see number of clusters (Elbow plot)
dev.new(width = 7.5, height = 5, noRStudioGD = FALSE)
plot(
2:10, wss,
type = "b", pch = 19, col = "#E74C3C",
xlab = "Number of Clusters (k)",
ylab = "Total Within-Cluster SS",
main = "Elbow Plot — Optimal Number of Clusters"
)
abline(v = 4, lty = 2, col = "grey50")
text(4.15, max(wss) * 0.97, "k = 4", col = "grey40", cex = 0.85)
# --- Fit k-means with k = 4 ---
K <- 4
km <- kmeans(bin_mat, centers = K, nstart = 25, iter.max = 100)
cat("\nk-means cluster sizes (k =", K, "):\n")
##
## k-means cluster sizes (k = 4 ):
print(table(km$cluster))
##
## 1 2 3 4
## 1774 5009 1809 1243
PCA cluster visualisation
pca_result <- prcomp(bin_mat, scale. = FALSE)
pca_df <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = factor(km$cluster)
)
dev.new(width = 8, height = 6, noRStudioGD = FALSE)
print(
ggplot(pca_df, aes(PC1, PC2, colour = Cluster)) +
geom_point(alpha = 0.30, size = 0.7) +
stat_ellipse(level = 0.85, linewidth = 0.9) +
scale_colour_brewer(palette = "Set1") +
labs(
title = paste("k-Means Clustering (k =", K, ") — PCA Projection"),
subtitle = paste("Based on top", TOP_N_ITEMS, "most frequent items"),
x = "PC1", y = "PC2"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))
)
# --- Cluster profiles ---
cluster_profiles <- data.frame(
item = colnames(bin_mat),
t(aggregate(bin_mat, by = list(km$cluster), FUN = mean)[, -1])
)
colnames(cluster_profiles)[-1] <- paste0("Cluster_", 1:K)
cluster_profiles$item <- colnames(bin_mat)
cat("\nTop 5 items per cluster (by mean purchase rate):\n")
##
## Top 5 items per cluster (by mean purchase rate):
for (k in 1:K) {
col <- paste0("Cluster_", k)
top5 <- cluster_profiles[order(-cluster_profiles[[col]]), ][1:5, c("item", col)]
cat(sprintf("\n Cluster %d:\n", k))
print(top5, row.names = FALSE)
}
##
## Cluster 1:
## item Cluster_1
## whole milk 1.0000000
## rolls/buns 0.2142052
## yogurt 0.1871477
## soda 0.1431793
## root vegetables 0.1426156
##
## Cluster 2:
## item Cluster_2
## rolls/buns 0.15392294
## canned beer 0.10521062
## yogurt 0.09502895
## bottled water 0.08644440
## shopping bags 0.08544620
##
## Cluster 3:
## item Cluster_3
## other vegetables 0.9944721
## whole milk 0.4068546
## root vegetables 0.2603648
## yogurt 0.2371476
## rolls/buns 0.2144831
##
## Cluster 4:
## item Cluster_4
## soda 1.0000000
## rolls/buns 0.2172164
## bottled water 0.1568785
## shopping bags 0.1351569
## sausage 0.1206758
Cluster heatmap
profile_long <- melt(
cluster_profiles,
id.vars = "item",
variable.name = "Cluster",
value.name = "MeanPurchaseRate"
)
dev.new(width = 9, height = 7, noRStudioGD = FALSE)
print(
ggplot(profile_long,
aes(x = Cluster, y = reorder(item, MeanPurchaseRate), fill = MeanPurchaseRate)) +
geom_tile(colour = "white") +
scale_fill_gradient(low = "#EBF5FB", high = "#1A5276", name = "Purchase\nRate") +
labs(
title = "Cluster Profiles — Item Purchase Rates per Cluster",
x = NULL, y = "Item"
) +
theme_minimal(base_size = 10) +
theme(
axis.text.y = element_text(size = 7),
plot.title = element_text(face = "bold")
)
)