library(tidyverse)
library(caret)
library(arules)
# Read in file from BB posting
data <- read.transactions("https://learn-us-east-1-prod-fleet02-xythos.content.blackboardcdn.com/61aab133e7df2/15994286?X-Blackboard-S3-Bucket=learn-us-east-1-prod-fleet01-xythos&X-Blackboard-Expiration=1714154400000&X-Blackboard-Signature=EXNbsp64188aDjV5ghvpIntldSZyodmooKdARIQis%2Bs%3D&X-Blackboard-Client-Id=100211&X-Blackboard-S3-Region=us-east-1&response-cache-control=private%2C%20max-age%3D21600&response-content-disposition=inline%3B%20filename%2A%3DUTF-8%27%27GroceryDataSet.csv&response-content-type=text%2Fcsv&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEHYaCXVzLWVhc3QtMSJHMEUCIQCPzoozXPE1F4nXsIGTwWl5%2FKQ2WV1Lvj8P77kujmDfFgIgGLl3RYlXs2RPhBOEechJ2e75alzuSmnEDkIjsXAGMEoqvQUIv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARADGgw2MzU1Njc5MjQxODMiDFtXL4Jkwt65okwHxiqRBR%2Bm8NJuX3%2FeT81Xh%2BOa%2FCTSnAbnFIpY9qgdBVxBS1uvaxRGtm%2FECck%2BrXiPvFzG0z4XdNx1x4joRiWSkQgYO7QVOltxsCGSsDPj8Lre85MmyHsT25DhcTf1ieRl%2BSzKDwljW%2Bh2klESX1pMaWIKW4HOvypaNPcWrLlBqXpb44pW4yo7m4dJqnpgOmldN%2FzAEAAcoJYlPQQxdvbDo5k0naI%2Ffw5o9nJJkK76oJQF2WQ0rC5Cf3groL0HIMDtkLT%2FYItLtxsYy5n1k3or6meEKhFovEeW3jaIOH23X4Cl1dqj3X%2FGO7XwYg7ru5C9t5f84WpjGSKNO0DuJb090I1J6lT%2FFcZeoSSWgsySropjeB%2B499%2BXkSrPPKB81Zc9fRT0yiDNBdZfD0cdMEVNLy%2BmKKuS5%2BJk5sb3ZoIa0%2F8GLb9jgwqVLwELtN%2B0jJIwLxXB3fEe%2Fi2q4bcyzJ1K2za3Sfj7VAiggDcYxbK%2FH%2BdrP5Vv433C%2BgtJKCP2nLaLLWfrqUcILs6t8%2B%2B15tbDbW7jD%2FM4WvGplummBa1VRcDeT7%2Frvk8oDQU8LhB6D1fH4viBb17SzNQYIwh4Ix6KcRjIPmCAbOpV2IAT42hGgTKkoaugmKCdSxs%2F3Q20tN7ZVADsRpz34TvAMfqOWnk8jDHF7IZUrHuDp9Y%2BrrlwmVf7skK3cRZcnzJ%2FnpW267%2Fgqkl%2FcvBau1hYUZfRNRhrL9%2B4SeOMwROACwlBkIat4jDtIio7GEJOZTwIRu0R%2F5X05oBEW99QqeDtW0JRFeCbLgNE2hf9PQ2TaBqNF3KdqAtCsbkHJTLa%2B%2FbroYdmOSH6EOxFrQYzcNMtyxBcoR6TEL9yrSNZ8U0ltb0uSkN1URue%2B5nf6DDT266xBjqxAenwDmGoi%2BXKOet8wunv6iHxx9uAGmdzj9A61u86JYR25vAA3w1v2vFPlwpsbgi3hDm1EltjIGmAwpnI%2FQhmms2cxA8sYdcMbdVuNQ3N4J6CtSs76G47WaVn%2BxYxe8uJKoTEZPXPg9GwosEnqLERltXlql4RO3%2BPHDwrq4F0tS0zbFBsKVPs%2FQz5I7s%2FtmE2HRJ2WplMVXQFMshwkuUUY9Pqa40WxJVN7O96kvew7RquEg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20240426T120000Z&X-Amz-SignedHeaders=host&X-Amz-Expires=21600&X-Amz-Credential=ASIAZH6WM4PL6OFBYI4R%2F20240426%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=ccc7803bceb51d061e6b1ab78934216319096403228d1a48247b4054bd3c92a1", sep=",")
head(data)
## transactions in sparse format with
## 6 transactions (rows) and
## 169 items (columns)
# Print a summary of our transaction set
summary(data)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
Given that our dataset is a list of transactions without a label
we’re trying to predict (a very common unsupervised learning task) we
can use association rules to mine this data for helpful insights. I
found Practical
Machine Learning in R Chapter 11 to be a really helpful
resource on association rules (and unsupervised learnign more broadly).
Specifically, we can use the A Priori
Algorithm from the arules library, which is a very
commonly-used association rule mining algorithm.
rules <-
apriori(data,
parameter = list(
support = 0.0085,
confidence = 0.5,
minlen = 2
))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.0085 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 83
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [96 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [28 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules)
## set of 28 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 1 27
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 2.964 3.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.008541 Min. :0.5000 Min. :0.01434 Min. :1.957
## 1st Qu.:0.009354 1st Qu.:0.5162 1st Qu.:0.01759 1st Qu.:2.025
## Median :0.010219 Median :0.5278 Median :0.01871 Median :2.164
## Mean :0.011344 Mean :0.5425 Mean :0.02108 Mean :2.243
## 3rd Qu.:0.012405 3rd Qu.:0.5737 3rd Qu.:0.02430 3rd Qu.:2.292
## Max. :0.022267 Max. :0.6389 Max. :0.04342 Max. :3.030
## count
## Min. : 84.0
## 1st Qu.: 92.0
## Median :100.5
## Mean :111.6
## 3rd Qu.:122.0
## Max. :219.0
##
## mining info:
## data ntransactions support confidence
## data 9835 0.0085 0.5
## call
## apriori(data = data, parameter = list(support = 0.0085, confidence = 0.5, minlen = 2))
We can also use the inspect function and some dply magic
to get the top-10 rules by lift, along with each rule’s support and
confidence
rules %>%
sort(by = "lift") %>%
head(n = 10) %>%
inspect()
## lhs rhs support confidence coverage lift count
## [1] {citrus fruit,
## root vegetables} => {other vegetables} 0.010371124 0.5862069 0.01769192 3.029608 102
## [2] {root vegetables,
## tropical fruit} => {other vegetables} 0.012302999 0.5845411 0.02104728 3.020999 121
## [3] {rolls/buns,
## root vegetables} => {other vegetables} 0.012201322 0.5020921 0.02430097 2.594890 120
## [4] {root vegetables,
## whipped/sour cream} => {other vegetables} 0.008540925 0.5000000 0.01708185 2.584078 84
## [5] {root vegetables,
## yogurt} => {other vegetables} 0.012913066 0.5000000 0.02582613 2.584078 127
## [6] {butter,
## yogurt} => {whole milk} 0.009354347 0.6388889 0.01464159 2.500387 92
## [7] {domestic eggs,
## root vegetables} => {whole milk} 0.008540925 0.5957447 0.01433655 2.331536 84
## [8] {curd,
## yogurt} => {whole milk} 0.010066090 0.5823529 0.01728521 2.279125 99
## [9] {pip fruit,
## root vegetables} => {whole milk} 0.008947636 0.5751634 0.01555669 2.250988 88
## [10] {curd,
## other vegetables} => {whole milk} 0.009862735 0.5739645 0.01718353 2.246296 97
To increase sales based on this ruleset, a store manager may want to consider placing vegetables next to fruits (citrus or tropical), as those items are frequently purchased together. In addition, grouping dairy products together also would help to increase complementary purchases.
We can use the K-Means clustering method on this transaction data.
First we’ll want to convert to a format suitable for clustering. In our
case, each row will represent one transaction, with the column values
representing items and whetehr or not they were purchased
(1) or not (0) in a given transaction
# Convert the transactions dataset to a quasi-adjacency matrix, multiply by 1 to convert booleans to 0/1
df <- 1 * as(data,"matrix")
Now that we have our transactions as suitable feature vectors, we can
try various values of \(k\) to see
which parititions our transaction data best. I found this DataCamp
tutorial helpful for this step, using the kmeans
function within the stats library.
# Decide how many clusters to look at
n_clusters <- 15
# Initialize total within sum of squares error
wss <- numeric(n_clusters)
set.seed(1234)
# Look over 1 to n possible clusters
for (i in 1:n_clusters) {
# Fit the model: km.out
km.out <- kmeans(df, centers = i, nstart = 20)
# Save the within cluster sum of squares
wss[i] <- km.out$tot.withinss
}
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Produce a scree plot
wss_df <- tibble(clusters = 1:n_clusters, wss = wss)
scree_plot <- ggplot(wss_df, aes(x = clusters, y = wss, group = 1)) +
geom_point(size = 4)+
geom_line() +
scale_x_continuous(breaks = seq(2, 15, 2)) +
xlab('Number of clusters')
scree_plot
Other evaluation metrics we could use for our clusters could be the Silhouette score or the Davied-Boulding index for an unsupervised learning task