Market Basket Analysis - Grocery Dataset

# Load libraries
library(arules)
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.4.3
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.4.3
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3

1. Load Grocery Dataset from GitHub

# Raw CSV of grocery baskets hosted on GitHub
url <- "https://raw.githubusercontent.com/tanzil64/DATA624/refs/heads/main/GroceryDataSet.csv"

# Parse the file in basket format with comma-separated items
groceries <- read.transactions(file = url, format = "basket", sep = ",")

2. Dataset Summary

# Overview of the transaction set: dimensions and density, the most frequent
# items, and the distribution of basket sizes.
summary(groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics
# Bar chart of the 20 most frequent items, using absolute counts
itemFrequencyPlot(
  x = groceries, topN = 20, type = "absolute", main = "Top 20 Grocery Items"
)

3. Association Rule Mining

# Mine association rules with Apriori: minimum support 0.001, minimum
# confidence 0.20, and at least two items per rule (minlen = 2).
rules <- apriori(
  data = groceries,
  parameter = list(support = 0.001, confidence = 0.20, minlen = 2)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.2    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [21633 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the mined rule set: rule-length distribution and the spread of
# support, confidence, coverage, lift and count across all rules.
summary(rules)
## set of 21633 rules
## 
## rule length distribution (lhs + rhs):sizes
##    2    3    4    5    6 
##  620 9337 9824 1792   60 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   4.000   3.599   4.000   6.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001017   Min.   :0.2000   Min.   :0.001017   Min.   : 0.8028  
##  1st Qu.:0.001118   1st Qu.:0.2632   1st Qu.:0.002745   1st Qu.: 2.1178  
##  Median :0.001322   Median :0.3548   Median :0.004169   Median : 2.7571  
##  Mean   :0.001948   Mean   :0.3967   Mean   :0.005840   Mean   : 3.0214  
##  3rd Qu.:0.001932   3rd Qu.:0.5000   3rd Qu.:0.006101   3rd Qu.: 3.6148  
##  Max.   :0.074835   Max.   :1.0000   Max.   :0.255516   Max.   :35.7158  
##      count       
##  Min.   : 10.00  
##  1st Qu.: 11.00  
##  Median : 13.00  
##  Mean   : 19.15  
##  3rd Qu.: 19.00  
##  Max.   :736.00  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.001        0.2
##                                                                                        call
##  apriori(data = groceries, parameter = list(support = 0.001, confidence = 0.2, minlen = 2))

4. Top 10 Rules by Lift

# Order all rules from highest to lowest lift
rules_lift <- sort(rules, by = "lift", decreasing = TRUE)

# Keep the ten strongest rules and print them with their quality measures
top10_rules <- head(rules_lift, n = 10)
inspect(top10_rules)
##      lhs                                rhs                     support    
## [1]  {bottled beer, red/blush wine}  => {liquor}                0.001931876
## [2]  {hamburger meat, soda}          => {Instant food products} 0.001220132
## [3]  {ham, white bread}              => {processed cheese}      0.001931876
## [4]  {bottled beer, liquor}          => {red/blush wine}        0.001931876
## [5]  {Instant food products, soda}   => {hamburger meat}        0.001220132
## [6]  {curd, sugar}                   => {flour}                 0.001118454
## [7]  {baking powder, sugar}          => {flour}                 0.001016777
## [8]  {processed cheese, white bread} => {ham}                   0.001931876
## [9]  {fruit/vegetable juice, ham}    => {processed cheese}      0.001118454
## [10] {margarine, sugar}              => {flour}                 0.001626843
##      confidence coverage    lift     count
## [1]  0.3958333  0.004880529 35.71579 19   
## [2]  0.2105263  0.005795628 26.20919 12   
## [3]  0.3800000  0.005083884 22.92822 19   
## [4]  0.4130435  0.004677173 21.49356 19   
## [5]  0.6315789  0.001931876 18.99565 12   
## [6]  0.3235294  0.003457041 18.60767 11   
## [7]  0.3125000  0.003253686 17.97332 10   
## [8]  0.4634146  0.004168785 17.80345 19   
## [9]  0.2894737  0.003863752 17.46610 11   
## [10] 0.2962963  0.005490595 17.04137 16
# Coerce the top rules to a plain data frame
top10_df <- as(top10_rules, "data.frame")

# Show the table
top10_df

# Persist the table to disk without row names
write.csv(x = top10_df, file = "Top10RulesByLift.csv", row.names = FALSE)

5. Rule Visualization

# Scatterplot of all rules: support vs. confidence, shaded by lift
plot(x = rules, measure = c("support", "confidence"), shading = "lift")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Graph visualization of the ten highest-lift rules.
# The former `control = list(type = "items")` argument is omitted here: the
# installed arulesViz graph method does not recognise `type`, so passing it
# only raised an "Unknown control parameters" warning.
plot(
  top10_rules,
  method = "graph"
)

6. Cluster Analysis

# Convert transactions into a dense logical matrix (one column per item)
groceries_matrix <- as(groceries, "matrix")

# Per-item frequency, used to rank the items
item_freq <- itemFrequency(groceries)

# Names of the (up to) 20 most frequent items. head() is used instead of
# [1:20] so a dataset with fewer than 20 items does not produce NA names,
# which would make the column subset below fail.
top_items <- head(names(sort(item_freq, decreasing = TRUE)), 20)

# drop = FALSE keeps a matrix even when only a single item is selected
cluster_data <- groceries_matrix[, top_items, drop = FALSE]

# Convert TRUE/FALSE to 1/0 while keeping the matrix shape and column names
storage.mode(cluster_data) <- "double"

# K-means on the item indicators; the seed is fixed so the random starts
# are reproducible.
set.seed(123)
kmeans_result <- kmeans(x = cluster_data, centers = 3, nstart = 25)

# Number of transactions assigned to each cluster
kmeans_result[["size"]]
## [1] 6154 1777 1904
# Mean of every item indicator within each cluster
cluster_summary <- aggregate(
  x = cluster_data,
  by = list(cluster = kmeans_result[["cluster"]]),
  FUN = mean
)

cluster_summary

# Write the per-cluster profile to disk without row names
write.csv(x = cluster_summary, file = "ClusterSummary.csv", row.names = FALSE)