output: html_document: theme: yeti highlight: tango toc: yes toc_float: yes pdf_document: dev: cairo_pdf toc: yes —
Market basket analysis
Imagine 10000 receipts sitting on your table. Each receipt represents a transaction with items that were purchased. The receipt is a representation of stuff that went into a customer’s basket – and therefore ‘Market Basket Analysis’.
That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item.
Here is the dataset = GroceryDataSet.csv (comma separated file)
You assignment is to use R to mine the data for association rules. You should report support, confidence and lift and your top 10 rules by lift. Turn in as you would the other problems from HA and KJ. You should packaged this with your HW #2 set.
NOTE: Bonus - Use a Python Library in addition to R and get a Bonus.
# Load data from github
dfmkt_raw <- read.csv('https://raw.githubusercontent.com/klgriffen96/summer23_data624/main/hw_2/GroceryDataSet.csv', header=F)
# Add column for transaction number and reorder so that it is first
dfmkt <- dfmkt_raw
dfmkt$trans.num <- row.names(dfmkt_raw)
dfmkt <- dfmkt %>%
dplyr::select(trans.num, 1:ncol(dfmkt) - 1)
# Gather each item into its own row (i.e. make df into "long" format)
dfmkt <- gather(dfmkt, key='item.num', value='item.name', 2:ncol(dfmkt)) %>% arrange(trans.num, item.num)
# Remove items that are blank
dfmkt <- dfmkt %>%
filter(item.name != '')
# Create a variable to indicate the presence of this item for this transaction
dfmkt$val <- 1
# Spread to "wide" format; this will fill in "1" into columns that have that item
# and NAs into columns that don't have that item
dfmkt2 <- spread(dfmkt, key='item.name', value='val', fill=NA)
# Replace the NAs with zeros
dfmkt2[is.na(dfmkt2)] <- 0
# Combine rows by trnasaction num
dfmkt2 <- dfmkt2 %>%
group_by(trans.num) %>%
summarize(across(c(-item.num), list(sum))) %>%
ungroup()
# Show unique item count
print(paste0('There are ', length(unique(dfmkt$item.name)), ' unique items.'))
## [1] "There are 169 unique items."
# Get item counts
dfgrp <- dfmkt %>%
group_by(item.name) %>%
summarize(ct=n()) %>%
ungroup() %>%
arrange(desc(ct))
# Most frequently purchased items
#dfgrp %>%
# head(10) %>%
# kbl(caption='10 Most purchased items') %>%
# kable_classic(full_width=F)
# Most frequently purchased items
dfgrp %>%
arrange(desc(ct)) %>%
head(10) %>%
ggplot(aes(x=reorder(item.name, ct), y=ct)) +
geom_bar(stat='identity', width=0.5) +
geom_text(aes(label=ct), hjust=-0.1) +
coord_flip() +
xlab('Item') +
ylab('Count') +
ggtitle('Most puchased items')
# Infrequently purchased items
#dfgrp %>%
# arrange(ct) %>%
# head(10) %>%
# kbl(caption='10 Least purchased items') %>%
# kable_classic(full_width=F)
# Infrequently purchased items
dfgrp %>%
arrange(ct) %>%
head(10) %>%
ggplot(aes(x=reorder(item.name, desc(ct)), y=ct)) +
geom_bar(stat='identity', width=0.5) +
geom_text(aes(label=ct), hjust=-0.1) +
coord_flip() +
xlab('Item') +
ylab('Count') +
ggtitle('Least puchased items')
# Numbers of items purchased per transaction
data.frame(Items=rowSums(dfmkt2[,2:ncol(dfmkt2)])) %>%
ggplot() +
geom_histogram(aes(x=Items), bins=30) +
xlab('Items purchased per transaction') +
ylab('') +
ggtitle('Number of items purchased per transaction')
# Load required packages
library(arules)
## Warning: package 'arules' was built under R version 4.2.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.3
# Create transactional database from data set
trans <- read.transactions('https://raw.githubusercontent.com/klgriffen96/summer23_data624/main/hw_2/GroceryDataSet.csv',
format='basket', rm.duplicates=F, sep=',')
# Create associate rules
arules <- apriori(trans, parameter=list(supp = 0.01, conf=0.5, maxlen=10))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 98
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show top 10 rules
inspect(rev(arules[1:10])) %>%
kbl(caption='Top 10 association rules') %>%
kable_classic(full_width=F)
## lhs rhs support
## [1] {tropical fruit, yogurt} => {whole milk} 0.01514997
## [2] {root vegetables, tropical fruit} => {whole milk} 0.01199797
## [3] {root vegetables, tropical fruit} => {other vegetables} 0.01230300
## [4] {citrus fruit, root vegetables} => {other vegetables} 0.01037112
## [5] {other vegetables, pip fruit} => {whole milk} 0.01352313
## [6] {other vegetables, whipped/sour cream} => {whole milk} 0.01464159
## [7] {whipped/sour cream, yogurt} => {whole milk} 0.01087951
## [8] {domestic eggs, other vegetables} => {whole milk} 0.01230300
## [9] {butter, other vegetables} => {whole milk} 0.01148958
## [10] {curd, yogurt} => {whole milk} 0.01006609
## confidence coverage lift count
## [1] 0.5173611 0.02928317 2.024770 149
## [2] 0.5700483 0.02104728 2.230969 118
## [3] 0.5845411 0.02104728 3.020999 121
## [4] 0.5862069 0.01769192 3.029608 102
## [5] 0.5175097 0.02613116 2.025351 133
## [6] 0.5070423 0.02887646 1.984385 144
## [7] 0.5245098 0.02074225 2.052747 107
## [8] 0.5525114 0.02226741 2.162336 121
## [9] 0.5736041 0.02003050 2.244885 113
## [10] 0.5823529 0.01728521 2.279125 99
| lhs | rhs | support | confidence | coverage | lift | count | ||
|---|---|---|---|---|---|---|---|---|
| [1] | {tropical fruit, yogurt} | => | {whole milk} | 0.0151500 | 0.5173611 | 0.0292832 | 2.024770 | 149 |
| [2] | {root vegetables, tropical fruit} | => | {whole milk} | 0.0119980 | 0.5700483 | 0.0210473 | 2.230969 | 118 |
| [3] | {root vegetables, tropical fruit} | => | {other vegetables} | 0.0123030 | 0.5845411 | 0.0210473 | 3.020999 | 121 |
| [4] | {citrus fruit, root vegetables} | => | {other vegetables} | 0.0103711 | 0.5862069 | 0.0176919 | 3.029608 | 102 |
| [5] | {other vegetables, pip fruit} | => | {whole milk} | 0.0135231 | 0.5175097 | 0.0261312 | 2.025351 | 133 |
| [6] | {other vegetables, whipped/sour cream} | => | {whole milk} | 0.0146416 | 0.5070423 | 0.0288765 | 1.984385 | 144 |
| [7] | {whipped/sour cream, yogurt} | => | {whole milk} | 0.0108795 | 0.5245098 | 0.0207422 | 2.052747 | 107 |
| [8] | {domestic eggs, other vegetables} | => | {whole milk} | 0.0123030 | 0.5525114 | 0.0222674 | 2.162336 | 121 |
| [9] | {butter, other vegetables} | => | {whole milk} | 0.0114896 | 0.5736041 | 0.0200305 | 2.244885 | 113 |
| [10] | {curd, yogurt} | => | {whole milk} | 0.0100661 | 0.5823529 | 0.0172852 | 2.279125 | 99 |