output: html_document: theme: yeti highlight: tango toc: yes toc_float: yes pdf_document: dev: cairo_pdf toc: yes —

Market basket analysis

Imagine 10000 receipts sitting on your table. Each receipt represents a transaction with items that were purchased. The receipt is a representation of stuff that went into a customer’s basket – and therefore ‘Market Basket Analysis’.

That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item.

Here is the dataset = GroceryDataSet.csv (comma separated file)

You assignment is to use R to mine the data for association rules. You should report support, confidence and lift and your top 10 rules by lift. Turn in as you would the other problems from HA and KJ. You should packaged this with your HW #2 set.

NOTE: Bonus - Use a Python Library in addition to R and get a Bonus.

Load data

# Load data from github
dfmkt_raw <- read.csv('https://raw.githubusercontent.com/klgriffen96/summer23_data624/main/hw_2/GroceryDataSet.csv', header=F)

Data preparation

# Add column for transaction number and reorder so that it is first
dfmkt <- dfmkt_raw
dfmkt$trans.num <- row.names(dfmkt_raw)
dfmkt <- dfmkt %>%
    dplyr::select(trans.num, 1:ncol(dfmkt) - 1)

# Gather each item into its own row (i.e. make df into "long" format)
dfmkt <- gather(dfmkt, key='item.num', value='item.name', 2:ncol(dfmkt)) %>% arrange(trans.num, item.num)

# Remove items that are blank
dfmkt <- dfmkt %>%
    filter(item.name != '')

# Create a variable to indicate the presence of this item for this transaction
dfmkt$val <- 1

# Spread to "wide" format; this will fill in "1" into columns that have that item
# and NAs into columns that don't have that item
dfmkt2 <- spread(dfmkt, key='item.name', value='val', fill=NA)

# Replace the NAs with zeros
dfmkt2[is.na(dfmkt2)] <- 0

# Combine rows by trnasaction num
dfmkt2 <- dfmkt2 %>%
    group_by(trans.num) %>%
    summarize(across(c(-item.num), list(sum))) %>%
    ungroup()

EDA

# Show unique item count
print(paste0('There are ', length(unique(dfmkt$item.name)), ' unique items.'))
## [1] "There are 169 unique items."
# Get item counts
dfgrp <- dfmkt %>%
    group_by(item.name) %>%
    summarize(ct=n()) %>%
    ungroup() %>%
    arrange(desc(ct))

# Most frequently purchased items
#dfgrp %>%
#    head(10) %>%
#    kbl(caption='10 Most purchased items') %>%
#    kable_classic(full_width=F)

# Most frequently purchased items
dfgrp %>%
    arrange(desc(ct)) %>%
    head(10) %>%
    ggplot(aes(x=reorder(item.name, ct), y=ct)) +
    geom_bar(stat='identity', width=0.5) +
    geom_text(aes(label=ct), hjust=-0.1) +
    coord_flip() +
    xlab('Item') +
    ylab('Count') +
    ggtitle('Most puchased items')

# Infrequently purchased items
#dfgrp %>%
#    arrange(ct) %>%
#    head(10) %>%
#    kbl(caption='10 Least purchased items') %>%
#    kable_classic(full_width=F)

# Infrequently purchased items
dfgrp %>%
    arrange(ct) %>%
    head(10) %>%
    ggplot(aes(x=reorder(item.name, desc(ct)), y=ct)) +
    geom_bar(stat='identity', width=0.5) +
    geom_text(aes(label=ct), hjust=-0.1) +
    coord_flip() +
    xlab('Item') +
    ylab('Count') +
    ggtitle('Least puchased items')

# Numbers of items purchased per transaction
data.frame(Items=rowSums(dfmkt2[,2:ncol(dfmkt2)])) %>%
    ggplot() +
    geom_histogram(aes(x=Items), bins=30) +
    xlab('Items purchased per transaction') +
    ylab('') +
    ggtitle('Number of items purchased per transaction')

Association rules

# Load required packages
library(arules)
## Warning: package 'arules' was built under R version 4.2.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.3
# Create transactional database from data set
trans <- read.transactions('https://raw.githubusercontent.com/klgriffen96/summer23_data624/main/hw_2/GroceryDataSet.csv', 
    format='basket', rm.duplicates=F, sep=',')

# Create associate rules
arules <- apriori(trans, parameter=list(supp = 0.01, conf=0.5, maxlen=10))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Show top 10 rules
inspect(rev(arules[1:10])) %>%
    kbl(caption='Top 10 association rules') %>%
    kable_classic(full_width=F)
##      lhs                                       rhs                support   
## [1]  {tropical fruit, yogurt}               => {whole milk}       0.01514997
## [2]  {root vegetables, tropical fruit}      => {whole milk}       0.01199797
## [3]  {root vegetables, tropical fruit}      => {other vegetables} 0.01230300
## [4]  {citrus fruit, root vegetables}        => {other vegetables} 0.01037112
## [5]  {other vegetables, pip fruit}          => {whole milk}       0.01352313
## [6]  {other vegetables, whipped/sour cream} => {whole milk}       0.01464159
## [7]  {whipped/sour cream, yogurt}           => {whole milk}       0.01087951
## [8]  {domestic eggs, other vegetables}      => {whole milk}       0.01230300
## [9]  {butter, other vegetables}             => {whole milk}       0.01148958
## [10] {curd, yogurt}                         => {whole milk}       0.01006609
##      confidence coverage   lift     count
## [1]  0.5173611  0.02928317 2.024770 149  
## [2]  0.5700483  0.02104728 2.230969 118  
## [3]  0.5845411  0.02104728 3.020999 121  
## [4]  0.5862069  0.01769192 3.029608 102  
## [5]  0.5175097  0.02613116 2.025351 133  
## [6]  0.5070423  0.02887646 1.984385 144  
## [7]  0.5245098  0.02074225 2.052747 107  
## [8]  0.5525114  0.02226741 2.162336 121  
## [9]  0.5736041  0.02003050 2.244885 113  
## [10] 0.5823529  0.01728521 2.279125  99
Top 10 association rules
lhs rhs support confidence coverage lift count
[1] {tropical fruit, yogurt} => {whole milk} 0.0151500 0.5173611 0.0292832 2.024770 149
[2] {root vegetables, tropical fruit} => {whole milk} 0.0119980 0.5700483 0.0210473 2.230969 118
[3] {root vegetables, tropical fruit} => {other vegetables} 0.0123030 0.5845411 0.0210473 3.020999 121
[4] {citrus fruit, root vegetables} => {other vegetables} 0.0103711 0.5862069 0.0176919 3.029608 102
[5] {other vegetables, pip fruit} => {whole milk} 0.0135231 0.5175097 0.0261312 2.025351 133
[6] {other vegetables, whipped/sour cream} => {whole milk} 0.0146416 0.5070423 0.0288765 1.984385 144
[7] {whipped/sour cream, yogurt} => {whole milk} 0.0108795 0.5245098 0.0207422 2.052747 107
[8] {domestic eggs, other vegetables} => {whole milk} 0.0123030 0.5525114 0.0222674 2.162336 121
[9] {butter, other vegetables} => {whole milk} 0.0114896 0.5736041 0.0200305 2.244885 113
[10] {curd, yogurt} => {whole milk} 0.0100661 0.5823529 0.0172852 2.279125 99