Loading and Preparing the Data

# Load required packages
library(arules)
library(arulesViz)
library(tidyverse)
library(dendextend)
library(plotly)
library(heatmaply)

# Pull the grocery baskets straight from GitHub as a transactions object.
# The file is comma-separated in "basket" format, and rm.duplicates = TRUE
# asks arules to drop items repeated within a single transaction.
github_url <- "https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/624/GroceryDataSet.csv"
groceries <- read.transactions(
  file = github_url,
  format = "basket",
  sep = ",",
  rm.duplicates = TRUE
)

# Overview of the transactions: dimensions, density, most frequent items,
# and the distribution of basket sizes
summary(groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics

Exploratory Data Analysis

# Dataset dimensions: rows are transactions, columns are distinct items
cat("Number of transactions:", nrow(groceries), "\n")
## Number of transactions: 9835
cat("Number of unique items:", ncol(groceries), "\n")
## Number of unique items: 169
# Bar chart of the 20 most frequently purchased items (absolute counts)
itemFrequencyPlot(
  groceries,
  topN = 20,
  type = "absolute",
  main = "Top 20 Items by Frequency"
)

Mining Association Rules

# Run Apriori on the baskets, keeping rules with support >= 0.001,
# confidence >= 0.5, and at least two items per rule (lhs + rhs)
rules <- apriori(
  data = groceries,
  parameter = list(support = 0.001, confidence = 0.5, minlen = 2)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [5668 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summary of rules: rule-length distribution and quality measures
# (support, confidence, coverage, lift, count)
summary(rules)
## set of 5668 rules
## 
## rule length distribution (lhs + rhs):sizes
##    2    3    4    5    6 
##   11 1461 3211  939   46 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    3.00    4.00    3.92    4.00    6.00 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.001017   Min.   :0.5000   Min.   :0.001017   Min.   : 1.957  
##  1st Qu.:0.001118   1st Qu.:0.5455   1st Qu.:0.001729   1st Qu.: 2.464  
##  Median :0.001322   Median :0.6000   Median :0.002135   Median : 2.899  
##  Mean   :0.001668   Mean   :0.6250   Mean   :0.002788   Mean   : 3.262  
##  3rd Qu.:0.001729   3rd Qu.:0.6842   3rd Qu.:0.002949   3rd Qu.: 3.691  
##  Max.   :0.022267   Max.   :1.0000   Max.   :0.043416   Max.   :18.996  
##      count      
##  Min.   : 10.0  
##  1st Qu.: 11.0  
##  Median : 13.0  
##  Mean   : 16.4  
##  3rd Qu.: 17.0  
##  Max.   :219.0  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.001        0.5
##                                                                                        call
##  apriori(data = groceries, parameter = list(support = 0.001, confidence = 0.5, minlen = 2))

Analyzing the Rules

# Rank the rules from highest to lowest lift, then show the ten strongest
rules_sorted <- rules %>%
  sort(by = "lift", decreasing = TRUE)
top_rules <- rules_sorted %>%
  head(n = 10)
inspect(top_rules)
##      lhs                         rhs                  support confidence    coverage     lift count
## [1]  {Instant food products,                                                                       
##       soda}                   => {hamburger meat} 0.001220132  0.6315789 0.001931876 18.99565    12
## [2]  {popcorn,                                                                                     
##       soda}                   => {salty snack}    0.001220132  0.6315789 0.001931876 16.69779    12
## [3]  {baking powder,                                                                               
##       flour}                  => {sugar}          0.001016777  0.5555556 0.001830198 16.40807    10
## [4]  {ham,                                                                                         
##       processed cheese}       => {white bread}    0.001931876  0.6333333 0.003050330 15.04549    19
## [5]  {Instant food products,                                                                       
##       whole milk}             => {hamburger meat} 0.001525165  0.5000000 0.003050330 15.03823    15
## [6]  {curd,                                                                                        
##       other vegetables,                                                                            
##       whipped/sour cream,                                                                          
##       yogurt}                 => {cream cheese}   0.001016777  0.5882353 0.001728521 14.83409    10
## [7]  {domestic eggs,                                                                               
##       processed cheese}       => {white bread}    0.001118454  0.5238095 0.002135231 12.44364    11
## [8]  {other vegetables,                                                                            
##       tropical fruit,                                                                              
##       white bread,                                                                                 
##       yogurt}                 => {butter}         0.001016777  0.6666667 0.001525165 12.03058    10
## [9]  {hamburger meat,                                                                              
##       whipped/sour cream,                                                                          
##       yogurt}                 => {butter}         0.001016777  0.6250000 0.001626843 11.27867    10
## [10] {domestic eggs,                                                                               
##       other vegetables,                                                                            
##       tropical fruit,                                                                              
##       whole milk,                                                                                  
##       yogurt}                 => {butter}         0.001016777  0.6250000 0.001626843 11.27867    10

The ten rules with the highest lift reveal strong relationships between grocery items, with lift values ranging from 11.28 to 19.00. These patterns suggest specific purchasing behaviors related to meal preparation, convenience foods, and baking.

Visualizing the Rules

# Draw the ten highest-lift rules as a graph using the htmlwidget engine
plot(
  top_rules,
  method = "graph",
  engine = "htmlwidget"
)

Strong associations exist between convenience foods and snacks, suggesting intentional purchases for quick meals or leisure. Customers who buy instant food products with soda are about 19 times more likely than the average shopper to also purchase hamburger meat, presumably for fast meals. Additionally, popcorn and soda predict salty snack purchases with a lift of 16.7, reflecting a classic “movie night” snack bundle. Although these patterns are rare (support ≈ 0.0012, i.e. 12 of 9,835 transactions), their high lift values show that the items co-occur far more often than they would if purchased independently.

Baking ingredients and meal components show similar patterns. Purchasing baking powder and flour makes buying sugar 16.4 times more likely than average. Similarly, ham and processed cheese often appear with white bread (lift = 15.0), suggesting sandwich-making habits. These patterns indicate that customers buy complementary items together, which argues for placing such products near each other in stores.

Patterns involving dairy products frequently connect to cooking and baking. For example, the purchase of curd, other vegetables, whipped/sour cream, and yogurt predicts cream cheese purchases (lift = 14.8), suggesting dessert or dip preparation. Additionally, baskets that combine yogurt with items such as domestic eggs, other vegetables, tropical fruit, or whipped/sour cream are associated with butter purchases (lift ≈ 11.3–12.0), indicating recipes that require multiple dairy ingredients.

Cluster Analysis

# Number of item clusters, used consistently for colouring the dendrogram,
# colouring the heatmap, cutting the tree, and printing the groups
n_clusters <- 5

# Dense incidence matrix of the transactions; it is transposed below so that
# items (the columns) become the observations being clustered
binary_matrix <- as(groceries, "matrix")

# Item-to-item distance matrix; method = "binary" is the asymmetric binary
# (Jaccard) distance on presence/absence
d <- dist(t(binary_matrix), method = "binary")

# Hierarchical clustering of the items
# NOTE(review): Ward linkage is formulated for Euclidean distances; with a
# Jaccard distance it acts as a heuristic -- confirm this choice is intended.
hc <- hclust(d, method = "ward.D2")

# Build the dendrogram
dend <- as.dendrogram(hc) %>%
  set("branches_k_color", k = n_clusters) %>%  # Color branches by cluster
  set("labels_cex", 0.6)  # Adjust label size

# Static dendrogram plot
dend %>% plot(horiz = TRUE)

# Interactive heatmap of the same Jaccard distances. Passing `dend` as Rowv
# makes the heatmap show the tree that is cut below, instead of letting
# heatmaply re-cluster a Euclidean distance matrix on its own.
heatmaply(
  as.matrix(d),
  Rowv = dend,
  dendrogram = "row",  # Show only item clustering
  k_row = n_clusters,  # Color clusters
  fontsize_row = 8,
  labRow = colnames(binary_matrix)
)
# Cut the dendrogram into n_clusters clusters
clusters <- cutree(hc, k = n_clusters)

# Print items grouped by cluster
for (i in seq_len(n_clusters)) {
  cat("\nCluster", i, "items:\n")
  print(names(clusters[clusters == i]))
}
## 
## Cluster 1 items:
##  [1] "abrasive cleaner"          "artif. sweetener"         
##  [3] "baby cosmetics"            "baby food"                
##  [5] "bags"                      "bathroom cleaner"         
##  [7] "brandy"                    "cake bar"                 
##  [9] "candles"                   "canned fruit"             
## [11] "cereals"                   "chocolate marshmallow"    
## [13] "cleaner"                   "cocoa drinks"             
## [15] "cooking chocolate"         "cookware"                 
## [17] "cream"                     "curd cheese"              
## [19] "decalcifier"               "dental care"              
## [21] "detergent"                 "female sanitary products" 
## [23] "finished products"         "fish"                     
## [25] "flower (seeds)"            "flower soil/fertilizer"   
## [27] "frozen chicken"            "frozen dessert"           
## [29] "frozen fish"               "frozen fruits"            
## [31] "frozen potato products"    "hair spray"               
## [33] "honey"                     "instant coffee"           
## [35] "ketchup"                   "kitchen towels"           
## [37] "kitchen utensil"           "light bulbs"              
## [39] "liqueur"                   "liver loaf"               
## [41] "make up remover"           "male cosmetics"           
## [43] "mayonnaise"                "mustard"                  
## [45] "nut snack"                 "nuts/prunes"              
## [47] "organic products"          "organic sausage"          
## [49] "packaged fruit/vegetables" "photo/film"               
## [51] "potato products"           "preservation products"    
## [53] "prosecco"                  "pudding powder"           
## [55] "ready soups"               "rice"                     
## [57] "roll products"             "rubbing alcohol"          
## [59] "rum"                       "salad dressing"           
## [61] "sauces"                    "seasonal products"        
## [63] "skin care"                 "snack products"           
## [65] "soap"                      "softener"                 
## [67] "sound storage medium"      "soups"                    
## [69] "specialty fat"             "specialty vegetables"     
## [71] "spices"                    "spread cheese"            
## [73] "sweet spreads"             "syrup"                    
## [75] "tea"                       "tidbits"                  
## [77] "toilet cleaner"            "vinegar"                  
## [79] "whisky"                    "white wine"               
## [81] "zwieback"                 
## 
## Cluster 2 items:
##  [1] "baking powder"          "beverages"              "butter milk"           
##  [4] "canned fish"            "canned vegetables"      "cat food"              
##  [7] "chewing gum"            "cling film/bags"        "coffee"                
## [10] "condensed milk"         "cream cheese"           "curd"                  
## [13] "dessert"                "dish cleaner"           "dishes"                
## [16] "dog food"               "flour"                  "frozen meals"          
## [19] "grapes"                 "hamburger meat"         "hard cheese"           
## [22] "herbs"                  "house keeping products" "hygiene articles"      
## [25] "ice cream"              "Instant food products"  "jam"                   
## [28] "liquor"                 "liquor (appetizer)"     "meat"                  
## [31] "meat spreads"           "misc. beverages"        "napkins"               
## [34] "oil"                    "onions"                 "pasta"                 
## [37] "pet care"               "pickled vegetables"     "pot plants"            
## [40] "red/blush wine"         "salt"                   "semi-finished bread"   
## [43] "sliced cheese"          "soft cheese"            "sparkling wine"        
## [46] "specialty cheese"       "sugar"                  "turkey"                
## [49] "UHT-milk"              
## 
## Cluster 3 items:
##  [1] "beef"               "berries"            "butter"            
##  [4] "chicken"            "citrus fruit"       "domestic eggs"     
##  [7] "frozen vegetables"  "margarine"          "other vegetables"  
## [10] "pip fruit"          "pork"               "root vegetables"   
## [13] "tropical fruit"     "whipped/sour cream" "whole milk"        
## [16] "yogurt"            
## 
## Cluster 4 items:
##  [1] "bottled beer"          "bottled water"         "brown bread"          
##  [4] "canned beer"           "frankfurter"           "fruit/vegetable juice"
##  [7] "newspapers"            "pastry"                "rolls/buns"           
## [10] "sausage"               "shopping bags"         "soda"                 
## 
## Cluster 5 items:
##  [1] "candy"                    "chocolate"               
##  [3] "ham"                      "long life bakery product"
##  [5] "popcorn"                  "processed cheese"        
##  [7] "salty snack"              "specialty bar"           
##  [9] "specialty chocolate"      "waffles"                 
## [11] "white bread"

Conclusion

These rules reveal niche yet deliberate purchasing behaviors, offering valuable insights for retailers to improve cross-selling, optimize store layouts, and customize promotions according to customer habits. Although these associations are strong (high lift), they are observed infrequently (low support: each of the top ten rules rests on only 10–19 transactions), so they should be validated on additional data before being acted on. Furthermore, external factors such as promotions or regional preferences may impact these patterns. Nevertheless, the elevated lift values indicate that these items are bought together far more often than chance alone would predict, providing retailers with opportunities to implement targeted strategies.