MBA assignment

library(arules)
library(arulesViz)
library(cluster)
library(factoextra)
library(tidyverse)
library(ggplot2)
library(tidytext)

# Raw CSV of grocery baskets hosted on GitHub.
url <- "https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/GroceryDataSet.csv"
# Read the file as comma-separated baskets into an arules transactions object.
df <- read.transactions(file = url, format = "basket", sep = ",")
# A transactions object is displayed with inspect(); show the first five baskets.
inspect(head(df, n = 5))

##     items                      
## [1] {citrus fruit,             
##      margarine,                
##      ready soups,              
##      semi-finished bread}      
## [2] {coffee,                   
##      tropical fruit,           
##      yogurt}                   
## [3] {whole milk}               
## [4] {cream cheese,             
##      meat spreads,             
##      pip fruit,                
##      yogurt}                   
## [5] {condensed milk,           
##      long life bakery product, 
##      other vegetables,         
##      whole milk}

To mine the rules:

supp = the support, defined as the minimum percentage of carts that must contain all of the items in the rule (i.e., how frequently the items must appear together in the carts). We use 0.1%.

conf = the confidence, i.e., how often item B appears in carts that already contain item A (or a set of items). We use 80% confidence.

minlen = minimum number of items in a rule (usually 2-4 is good)

maxlen = maximum number of items in a rule

# Mine association rules with apriori:
#   support    = 0.001 (the rule's items occur together in at least 0.1% of carts)
#   confidence = 0.8   (minimum confidence of 80% for a rule to be kept)
#   minlen / maxlen restrict rules to between 2 and 4 items.
rules <- apriori(
  df,
  parameter = list(
    support = 0.001,
    confidence = 0.8,
    minlen = 2,
    maxlen = 4
  ),
  control = list(verbose = FALSE)
)

To inspect the top 10 rules based on lift: {liquor, red/blush wine} has the strongest association with bottled beer (lift = 11.2).

# Show the ten rules with the highest lift. head(by = ) orders the rules by
# the given quality measure (decreasing by default) and, unlike indexing a
# sorted rule set with [1:10], does not depend on at least ten rules existing.
inspect(head(rules, n = 10, by = "lift"))

##      lhs                         rhs                  support confidence    coverage      lift count
## [1]  {liquor,                                                                                       
##       red/blush wine}         => {bottled beer}   0.001931876  0.9047619 0.002135231 11.235269    19
## [2]  {citrus fruit,                                                                                 
##       fruit/vegetable juice,                                                                        
##       grapes}                 => {tropical fruit} 0.001118454  0.8461538 0.001321810  8.063879    11
## [3]  {butter,                                                                                       
##       cream cheese,                                                                                 
##       root vegetables}        => {yogurt}         0.001016777  0.9090909 0.001118454  6.516698    10
## [4]  {pip fruit,                                                                                    
##       sausage,                                                                                      
##       sliced cheese}          => {yogurt}         0.001220132  0.8571429 0.001423488  6.144315    12
## [5]  {butter,                                                                                       
##       tropical fruit,                                                                               
##       white bread}            => {yogurt}         0.001118454  0.8461538 0.001321810  6.065542    11
## [6]  {butter,                                                                                       
##       margarine,                                                                                    
##       tropical fruit}         => {yogurt}         0.001118454  0.8461538 0.001321810  6.065542    11
## [7]  {cream cheese,                                                                                 
##       margarine,                                                                                    
##       whipped/sour cream}     => {yogurt}         0.001016777  0.8333333 0.001220132  5.973639    10
## [8]  {beef,                                                                                         
##       butter,                                                                                       
##       tropical fruit}         => {yogurt}         0.001016777  0.8333333 0.001220132  5.973639    10
## [9]  {fruit/vegetable juice,                                                                        
##       pork,                                                                                         
##       tropical fruit}         => {yogurt}         0.001016777  0.8333333 0.001220132  5.973639    10
## [10] {butter milk,                                                                                  
##       other vegetables,                                                                             
##       pastry}                 => {yogurt}         0.001220132  0.8000000 0.001525165  5.734694    12

To visualize:

# Draw an interactive graph of the ten rules with the highest lift (the rules
# discussed in the text) rather than the first ten rules in storage order.
# The engine name is spelled out in full instead of the truncated 'htmlwidge'.
plot(head(rules, n = 10, by = "lift"), method = "graph", engine = "htmlwidget")

Cluster analysis:

# Coerce the transactions object to a matrix so that distances between its
# rows can be computed.
trans_matrix <- as(object = df, Class = "matrix")

Compute Jaccard distance

# Pairwise distances between the rows of the matrix. The "binary" method
# treats each row as on/off bits; this is the Jaccard distance named in the text.
dissimilarity <- dist(x = trans_matrix, method = "binary")

Apply hierarchical clustering (Ward's method) to the distance matrix:

# Hierarchical clustering of the distance matrix using the "ward.D2" linkage.
hc <- hclust(d = dissimilarity, method = "ward.D2")

# Draw the dendrogram; the x-axis label and the subtitle are left blank.
plot(
  hc,
  main = "Dendrogram of Grocery Transactions",
  xlab = "",
  sub = ""
)

Divide the transactions into 4 clusters and count them: cluster 1 contains by far the most transactions, while clusters 2 to 4 are about the same size.

# Cut the dendrogram into 4 clusters (k = 4); `clusters` holds one cluster
# label per row of the clustered matrix.
clusters <- cutree(hc, k = 4)

# Count how many rows were assigned to each cluster.
table(clusters)

## clusters
##    1    2    3    4 
## 8810  366  340  319

Analyze clusters:

# Attach the cluster label of every row to a data-frame copy of the matrix.
clustered_data <- as.data.frame(trans_matrix)
clustered_data[["Cluster"]] <- clusters

# Mean of every item column within each cluster, i.e. the average presence
# of the item among the rows of that cluster.
list1 <- aggregate(. ~ Cluster, data = clustered_data, FUN = mean)

To visualize the top ten items in each cluster:

# Reshape the per-cluster item averages to long form and keep the ten items
# with the highest average presence in each cluster.
long_list1 <- list1 %>%
  pivot_longer(
    cols = -Cluster,
    names_to = "items",
    values_to = "average_presence"
  ) %>%
  group_by(Cluster) %>%
  # with_ties = FALSE caps every cluster at ten rows; the default keeps all
  # rows tied with the tenth value and can therefore return more than ten.
  slice_max(average_presence, n = 10, with_ties = FALSE) %>%
  # Order the items within each cluster so every facet is sorted on its own.
  mutate(items = reorder_within(items, average_presence, Cluster)) %>%
  # Drop the grouping so later steps do not silently operate per cluster.
  ungroup()

# Horizontal bar chart of the selected items, one facet per cluster; each
# facet gets its own y-axis because the items differ between clusters.
ggplot(
  long_list1,
  aes(x = average_presence, y = items, fill = as.factor(Cluster))
) +
  geom_col() +
  facet_wrap(~Cluster, scales = "free_y") +
  # Strip the "___<digits>" suffix from the reordered item labels.
  scale_y_reordered(labels = function(x) gsub("___\\d+", "", x)) +
  labs(title = "Top 10 most frequently appear items in four clusters")

Visualize the clusters:

# Plot the rows of the matrix coloured by their assigned cluster.
fviz_cluster(
  list(
    data = trans_matrix,
    cluster = clusters
  )
)

MBA assignment

Chi Hang(Philip) Cheung

2025-04-25