library(arules)
library(arulesViz)
library(cluster)
library(factoextra)
library(tidyverse)
library(ggplot2)
library(tidytext)
# Location of the raw grocery basket file (one comma-separated basket per line).
url <- "https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/GroceryDataSet.csv"
# Parse every line as one transaction.
df <- read.transactions(file = url, format = "basket", sep = ",")
# arules objects are displayed with inspect(); show the first five baskets.
inspect(head(df, n = 5))
## items
## [1] {citrus fruit,
## margarine,
## ready soups,
## semi-finished bread}
## [2] {coffee,
## tropical fruit,
## yogurt}
## [3] {whole milk}
## [4] {cream cheese,
## meat spreads,
## pip fruit,
## yogurt}
## [5] {condensed milk,
## long life bakery product,
## other vegetables,
## whole milk}
To mine the rules:
supp = the support, defined as the minimum fraction of carts that must contain all the items in the rule (i.e. how frequently the items must appear together in the carts). We use 0.1%.
conf = the confidence, i.e. how often item B appears in carts that already contain item A (or a given set of items). We use 80% confidence.
minlen = minimum number of items in a rule (usually 2-4 works well)
maxlen = maximum number of items in a rule
# Mine association rules with the thresholds described above:
# support >= 0.1%, confidence >= 80%, and 2 to 4 items per rule.
rules <- apriori(
  df,
  parameter = list(
    support = 0.001,
    confidence = 0.8,
    minlen = 2,
    maxlen = 4
  ),
  control = list(verbose = FALSE)
)
To inspect the top 10 rules ranked by lift: the itemset {liquor, red/blush wine} has the strongest association with the purchase of bottled beer (lift = 11.2).
# Show the ten rules with the highest lift.
# head() is used instead of `[1:10]` so the call still works when fewer than
# ten rules satisfy the mining thresholds.
inspect(head(sort(rules, by = "lift", decreasing = TRUE), n = 10))
## lhs rhs support confidence coverage lift count
## [1] {liquor,
## red/blush wine} => {bottled beer} 0.001931876 0.9047619 0.002135231 11.235269 19
## [2] {citrus fruit,
## fruit/vegetable juice,
## grapes} => {tropical fruit} 0.001118454 0.8461538 0.001321810 8.063879 11
## [3] {butter,
## cream cheese,
## root vegetables} => {yogurt} 0.001016777 0.9090909 0.001118454 6.516698 10
## [4] {pip fruit,
## sausage,
## sliced cheese} => {yogurt} 0.001220132 0.8571429 0.001423488 6.144315 12
## [5] {butter,
## tropical fruit,
## white bread} => {yogurt} 0.001118454 0.8461538 0.001321810 6.065542 11
## [6] {butter,
## margarine,
## tropical fruit} => {yogurt} 0.001118454 0.8461538 0.001321810 6.065542 11
## [7] {cream cheese,
## margarine,
## whipped/sour cream} => {yogurt} 0.001016777 0.8333333 0.001220132 5.973639 10
## [8] {beef,
## butter,
## tropical fruit} => {yogurt} 0.001016777 0.8333333 0.001220132 5.973639 10
## [9] {fruit/vegetable juice,
## pork,
## tropical fruit} => {yogurt} 0.001016777 0.8333333 0.001220132 5.973639 10
## [10] {butter milk,
## other vegetables,
## pastry} => {yogurt} 0.001220132 0.8000000 0.001525165 5.734694 12
To visualize:
# Interactive graph of the ten rules with the highest lift.
# The rules are sorted first: `rules[1:10]` would just take the first ten
# rules in the order apriori() returned them, not the strongest ones.
# The engine name is spelled out in full ("htmlwidget") rather than relying on
# a truncated string being accepted.
plot(
  head(sort(rules, by = "lift", decreasing = TRUE), n = 10),
  method = "graph",
  engine = "htmlwidget"
)
Cluster analysis:
# Coerce the transactions object to a plain matrix
# (one row per transaction, one column per item).
trans_matrix <- as(df, "matrix")
Compute Jaccard distance
# Pairwise distances between the rows of trans_matrix using dist()'s "binary" method.
dissimilarity <- dist(x = trans_matrix, method = "binary")
Apply hierarchical clustering to the distance matrix:
# Agglomerative hierarchical clustering on the distance matrix, Ward.D2 linkage.
hc <- hclust(d = dissimilarity, method = "ward.D2")
# Draw the dendrogram; the x-axis label and subtitle are intentionally blank.
plot(
  hc,
  main = "Dendrogram of Grocery Transactions",
  xlab = "",
  sub = ""
)
Divide the transactions into 4 clusters and count the members of each: cluster 1 is by far the largest (8,810 transactions), while clusters 2 to 4 are about the same size (roughly 320 to 370 each).
# Cut the dendrogram into 4 clusters (k = 4) and tabulate the cluster sizes.
clusters <- cutree(hc, k = 4)
table(clusters)
## clusters
## 1 2 3 4
## 8810 366 340 319
Analyze clusters:
# Turn the transaction matrix into a data frame and attach each row's cluster id.
clustered_data <- as.data.frame(trans_matrix)
clustered_data[["Cluster"]] <- clusters
# Per-cluster mean of every item column (the average presence of each item).
list1 <- aggregate(. ~ Cluster, data = clustered_data, FUN = mean)
To visualize the top ten items in each cluster:
# Reshape the per-cluster item means to long format and keep, for each cluster,
# the 10 items with the highest average presence (slice_max() keeps ties by
# default, so a cluster can return more than 10 rows).
long_list1 <- list1 %>%
  pivot_longer(
    cols = -1,
    names_to = "items",
    values_to = "average_presence"
  ) %>%
  group_by(Cluster) %>%
  slice_max(average_presence, n = 10) %>%
  # Tag each item with its cluster so bars can be ordered within each facet.
  mutate(items = reorder_within(items, average_presence, Cluster)) %>%
  # Drop the grouping so later operations are not silently applied per cluster.
  ungroup()
# Horizontal bar chart of the top items, one facet per cluster.
# Each facet gets its own y axis; the labels function strips the cluster suffix
# that reorder_within() appended to the item names.
ggplot(
  long_list1,
  aes(x = average_presence, y = items, fill = as.factor(Cluster))
) +
  geom_col() +
  facet_wrap(~Cluster, scales = "free_y") +
  scale_y_reordered(
    labels = function(x) gsub(pattern = "___\\d+", replacement = "", x = x)
  ) +
  labs(title = "Top 10 most frequently appear items in four clusters")
Visualize the clusters:
# Cluster plot built from the transaction matrix and the cutree() assignments.
fviz_cluster(
  list(
    data = trans_matrix,
    cluster = clusters
  )
)