Market Basket Analysis - Grocery Dataset
# Load libraries
library(arules)
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.4.3
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.4.3
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
1. Load Grocery Dataset from GitHub
# Remote location of the grocery transactions CSV.
url <- "https://raw.githubusercontent.com/tanzil64/DATA624/refs/heads/main/GroceryDataSet.csv"

# Read the file as basket-format transactions with comma-separated items.
groceries <- read.transactions(file = url, format = "basket", sep = ",")
2. Dataset Summary
# Overview of the transactions object: dimensions, density, most frequent
# items and the distribution of transaction lengths.
summary(object = groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# Bar chart of the 20 most frequent items, shown as absolute counts.
itemFrequencyPlot(
  x = groceries,
  type = "absolute",
  topN = 20,
  main = "Top 20 Grocery Items"
)

3. Association Rule Mining
# Mine association rules with Apriori: minimum support 0.001, minimum
# confidence 0.2, and at least two items per rule (lhs + rhs).
rules <- apriori(
  data = groceries,
  parameter = list(support = 0.001, confidence = 0.2, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [21633 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Overview of the mined rule set: rule-length distribution and the ranges
# of the quality measures.
summary(object = rules)
## set of 21633 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 620 9337 9824 1792 60
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 4.000 3.599 4.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001017 Min. :0.2000 Min. :0.001017 Min. : 0.8028
## 1st Qu.:0.001118 1st Qu.:0.2632 1st Qu.:0.002745 1st Qu.: 2.1178
## Median :0.001322 Median :0.3548 Median :0.004169 Median : 2.7571
## Mean :0.001948 Mean :0.3967 Mean :0.005840 Mean : 3.0214
## 3rd Qu.:0.001932 3rd Qu.:0.5000 3rd Qu.:0.006101 3rd Qu.: 3.6148
## Max. :0.074835 Max. :1.0000 Max. :0.255516 Max. :35.7158
## count
## Min. : 10.00
## 1st Qu.: 11.00
## Median : 13.00
## Mean : 19.15
## 3rd Qu.: 19.00
## Max. :736.00
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.001 0.2
## call
## apriori(data = groceries, parameter = list(support = 0.001, confidence = 0.2, minlen = 2))
4. Top 10 Rules by Lift
# Order every mined rule from highest to lowest lift.
rules_lift <- sort(rules, by = "lift", decreasing = TRUE)

# Keep the ten highest-lift rules and print them.
top10_rules <- head(rules_lift, n = 10)
inspect(top10_rules)
## lhs rhs support
## [1] {bottled beer, red/blush wine} => {liquor} 0.001931876
## [2] {hamburger meat, soda} => {Instant food products} 0.001220132
## [3] {ham, white bread} => {processed cheese} 0.001931876
## [4] {bottled beer, liquor} => {red/blush wine} 0.001931876
## [5] {Instant food products, soda} => {hamburger meat} 0.001220132
## [6] {curd, sugar} => {flour} 0.001118454
## [7] {baking powder, sugar} => {flour} 0.001016777
## [8] {processed cheese, white bread} => {ham} 0.001931876
## [9] {fruit/vegetable juice, ham} => {processed cheese} 0.001118454
## [10] {margarine, sugar} => {flour} 0.001626843
## confidence coverage lift count
## [1] 0.3958333 0.004880529 35.71579 19
## [2] 0.2105263 0.005795628 26.20919 12
## [3] 0.3800000 0.005083884 22.92822 19
## [4] 0.4130435 0.004677173 21.49356 19
## [5] 0.6315789 0.001931876 18.99565 12
## [6] 0.3235294 0.003457041 18.60767 11
## [7] 0.3125000 0.003253686 17.97332 10
## [8] 0.4634146 0.004168785 17.80345 19
## [9] 0.2894737 0.003863752 17.46610 11
## [10] 0.2962963 0.005490595 17.04137 16
# Coerce the top-10 rule set to a data frame.
top10_df <- as(top10_rules, "data.frame")

# Display the table.
top10_df

# Export the table to CSV, omitting row names.
write.csv(x = top10_df, file = "Top10RulesByLift.csv", row.names = FALSE)
5. Rule Visualization
# Scatterplot of all rules: support against confidence, shaded by lift.
plot(x = rules, measure = c("support", "confidence"), shading = "lift")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Graph visualization of the ten highest-lift rules.
# The earlier `control = list(type = "items")` was rejected with
# "Unknown control parameters: type", so its only effect was a warning.
# It is dropped here and the graph method's default controls are used.
plot(
  top10_rules,
  method = "graph"
)

6. Cluster Analysis
# Dense transaction-by-item matrix of the full data set.
groceries_matrix <- as(groceries, "matrix")

# Labels of the 20 items with the highest frequency.
item_freq <- itemFrequency(groceries)
top_items <- names(sort(item_freq, decreasing = TRUE))[seq_len(20)]

# Restrict to those items and coerce each column to numeric for k-means.
cluster_data <- apply(
  groceries_matrix[, top_items],
  MARGIN = 2,
  FUN = as.numeric
)
# Fix the RNG seed so the random starts below are reproducible.
set.seed(123)

# Partition the transactions into 3 clusters, using 25 random starts.
kmeans_result <- kmeans(x = cluster_data, centers = 3, nstart = 25)

# Number of transactions assigned to each cluster.
kmeans_result[["size"]]
## [1] 6154 1777 1904
# Per-cluster mean of every item column.
cluster_summary <- aggregate(
  x = cluster_data,
  by = list(cluster = kmeans_result[["cluster"]]),
  FUN = mean
)
cluster_summary

# Write the per-cluster summary to CSV, omitting row names.
write.csv(x = cluster_summary, file = "ClusterSummary.csv", row.names = FALSE)