library(tidyverse)
library(arules)
library(kableExtra)
library(igraph)
library(stats)
library(cluster)
library(factoextra)
library(reactable)
library(arulesViz)
library(NbClust)
receipt is a representation of stuff that went into a customer’s basket - and therefore ‘Market Basket Analysis’. That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item. The data set is attached. Your assignment is to use R to mine the data for association rules. You should report support, confidence and lift and your top 10 rules by lift. Extra credit: do a simple cluster analysis on the data as well. Use whichever packages you like.
grocery_list <- read.transactions("/Users/mohamedhassan/Downloads/GroceryDataSet (1).csv", sep = ",")
summary(grocery_list)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
grocery_frequency <- tibble(Item_Name = names(itemFrequency(grocery_list)),
Frequency = itemFrequency(grocery_list))
grocery_frequency %>%
arrange(desc(Frequency)) %>%
reactable()
itemFrequencyPlot(grocery_list, topN=20, type="absolute", main="Top 20 Items by Frequency", col=rainbow(20))
Whole Milk is the highest frequency item, followed by Other Vegetables, and Rolls/Buns.
summary(grocery_frequency$Frequency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001017 0.0038637 0.0104728 0.0260915 0.0310117 0.2555160
To calculate the support parameter, I multiplied the items that were
purchased at least 5 times in a day over a 30-day period and divided it
by the total transactions in the grocery list. I used the max value of
the Frequency value in grocery_frequency as the confidence
parameter. Minlen is set as minimal number of items per item set:
set.seed(1234)
frequency_per_day <- 5
days_per_period <- 30
total_transactions <- length(grocery_list)
support_value <- (frequency_per_day*days_per_period)/total_transactions # 0.01525165
conf_value <- max(grocery_frequency$Frequency) # 0.2555160
grocery_rules <- apriori(grocery_list,
parameter = list(
support = support_value,
confidence = conf_value,
minlen=2
))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.255516 0.1 1 none FALSE TRUE 5 0.01525165 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 150
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [71 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [71 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(grocery_rules)
## set of 71 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 58 13
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.183 2.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01556 Min. :0.2587 Min. :0.03437 Min. :1.205
## 1st Qu.:0.01871 1st Qu.:0.3111 1st Qu.:0.05287 1st Qu.:1.507
## Median :0.02227 Median :0.3615 Median :0.06345 Median :1.741
## Mean :0.02638 Mean :0.3633 Mean :0.07472 Mean :1.758
## 3rd Qu.:0.02913 3rd Qu.:0.4054 3rd Qu.:0.08277 3rd Qu.:1.930
## Max. :0.07483 Max. :0.5129 Max. :0.25552 Max. :3.040
## count
## Min. :153.0
## 1st Qu.:184.0
## Median :219.0
## Mean :259.5
## 3rd Qu.:286.5
## Max. :736.0
##
## mining info:
## data ntransactions support confidence
## grocery_list 9835 0.01525165 0.255516
## call
## apriori(data = grocery_list, parameter = list(support = support_value, confidence = conf_value, minlen = 2))
top.confidence <- sort(grocery_rules, decreasing = TRUE, na.last = NA, by = "confidence")
reactable(inspect(head(top.confidence, 10)))
## lhs rhs support
## [1] {other vegetables, yogurt} => {whole milk} 0.02226741
## [2] {butter} => {whole milk} 0.02755465
## [3] {curd} => {whole milk} 0.02613116
## [4] {other vegetables, root vegetables} => {whole milk} 0.02318251
## [5] {other vegetables, tropical fruit} => {whole milk} 0.01708185
## [6] {root vegetables, whole milk} => {other vegetables} 0.02318251
## [7] {domestic eggs} => {whole milk} 0.02999492
## [8] {rolls/buns, yogurt} => {whole milk} 0.01555669
## [9] {whipped/sour cream} => {whole milk} 0.03223183
## [10] {root vegetables} => {whole milk} 0.04890696
## confidence coverage lift count
## [1] 0.5128806 0.04341637 2.007235 219
## [2] 0.4972477 0.05541434 1.946053 271
## [3] 0.4904580 0.05327911 1.919481 257
## [4] 0.4892704 0.04738180 1.914833 228
## [5] 0.4759207 0.03589222 1.862587 168
## [6] 0.4740125 0.04890696 2.449770 228
## [7] 0.4727564 0.06344687 1.850203 295
## [8] 0.4526627 0.03436706 1.771563 153
## [9] 0.4496454 0.07168277 1.759754 317
## [10] 0.4486940 0.10899847 1.756031 481
top.lift <- sort(grocery_rules, decreasing = TRUE, na.last = NA, by = "lift")
reactable(inspect(head(top.lift, 10)))
## lhs rhs support confidence
## [1] {beef} => {root vegetables} 0.01738688 0.3313953
## [2] {other vegetables, whole milk} => {root vegetables} 0.02318251 0.3097826
## [3] {pip fruit} => {tropical fruit} 0.02043721 0.2701613
## [4] {root vegetables, whole milk} => {other vegetables} 0.02318251 0.4740125
## [5] {curd} => {yogurt} 0.01728521 0.3244275
## [6] {root vegetables} => {other vegetables} 0.04738180 0.4347015
## [7] {chicken} => {other vegetables} 0.01789527 0.4170616
## [8] {other vegetables, whole milk} => {yogurt} 0.02226741 0.2975543
## [9] {tropical fruit, whole milk} => {other vegetables} 0.01708185 0.4038462
## [10] {whipped/sour cream} => {other vegetables} 0.02887646 0.4028369
## coverage lift count
## [1] 0.05246568 3.040367 171
## [2] 0.07483477 2.842082 228
## [3] 0.07564820 2.574648 201
## [4] 0.04890696 2.449770 228
## [5] 0.05327911 2.325615 170
## [6] 0.10899847 2.246605 466
## [7] 0.04290798 2.155439 176
## [8] 0.07483477 2.132979 219
## [9] 0.04229792 2.087140 168
## [10] 0.07168277 2.081924 284
grocery_rules_df <- grocery_rules %>%
DATAFRAME()
reactable(grocery_rules_df)
grocery_rules_df %>%
select(lift, count) %>%
summary()
## lift count
## Min. :1.205 Min. :153.0
## 1st Qu.:1.507 1st Qu.:184.0
## Median :1.741 Median :219.0
## Mean :1.758 Mean :259.5
## 3rd Qu.:1.930 3rd Qu.:286.5
## Max. :3.040 Max. :736.0
grocery_rules_scaled <- grocery_rules_df %>%
select(lift, count) %>%
scale()
reactable(grocery_rules_scaled)
grocery_rules_scaled <- as_tibble(grocery_rules_scaled)
Determining Number of Clusters
fviz_nbclust(grocery_rules_scaled, kmeans, method = "wss")
fviz_nbclust(grocery_rules_scaled, kmeans, method = "silhouette")
fviz_nbclust(grocery_rules_scaled, kmeans, method = "gap_stat")
When looking at each method for determining the number of clusters, the Silhouette and Gap Statistics methods both recommend the number of clusters at 2, while the Elbow method appears to recommend 3 or 4 as the number of clusters. I will produce separate K-Means Clustering models with the number of clusters set at 2, 3 and 4, respectively.
Clusters = 2
set.seed(1234)
km_grocery <- kmeans(grocery_rules_scaled, centers = 2, nstart = 25)
km_grocery$size
## [1] 8 63
set.seed(1234)
clust1 <- fviz_cluster(km_grocery, data = grocery_rules_scaled, repel = TRUE)
clust1
Clusters = 3
set.seed(1234)
km_grocery2 <- kmeans(grocery_rules_scaled, centers = 3, nstart = 25)
km_grocery2$size
## [1] 39 8 24
set.seed(1234)
clust2 <- fviz_cluster(km_grocery2, data = grocery_rules_scaled, repel = TRUE)
clust2
Clusters = 4
set.seed(1234)
km_grocery3 <- kmeans(grocery_rules_scaled, centers = 4, nstart = 25)
km_grocery3$size
## [1] 29 8 5 29
set.seed(1234)
clust3 <- fviz_cluster(km_grocery3, data = grocery_rules_scaled, repel = TRUE)
clust3
It appears that the data points are mostly clustered in two areas, Cluster 1 and Cluster 4
set.seed(1234)
d <- dist(grocery_rules_scaled)
set.seed(1234)
hc <- hclust(d, method = "complete")
set.seed(1234)
plot(hc)
set.seed(1234)
fviz_dend(hc, k = 4)
set.seed(1234)
# Cut tree into 3 groups
sub_grps <- cutree(hc, k = 3)
# Visualize the result in a scatter plot
fviz_cluster(list(data = grocery_rules_scaled, cluster = sub_grps))