Recommender Systems

Market Basket Analysis

# import data
url <- 'https://raw.githubusercontent.com/dataconsumer101/data624/main/GroceryDataSet.csv'
df <- read.csv(url, header = F, na.strings=c(""))

# convert column names to lowercase
names(df) <- lapply(names(df), tolower)

# add row index as new field
df$row <- row.names(df) %>%
  as.numeric()

# quick look at what the data looks like
head(df)[1:4]
##                 v1                  v2             v3                       v4
## 1     citrus fruit semi-finished bread      margarine              ready soups
## 2   tropical fruit              yogurt         coffee                     <NA>
## 3       whole milk                <NA>           <NA>                     <NA>
## 4        pip fruit              yogurt  cream cheese              meat spreads
## 5 other vegetables          whole milk condensed milk long life bakery product
## 6       whole milk              butter         yogurt                     rice

Exploratory Data Analysis

# df long for plots
df2 <- df %>%
  gather(item_num, item, -row) %>%
  filter(!is.na(item)) %>%
  mutate(item_num = substr(item_num, 2, nchar(item_num))) %>%
  mutate(item_num = as.numeric(item_num))
# plot item purchase frequency
df2 %>%
  group_by(item) %>%
  summarize(purchases = n()) %>%
  ggplot(aes(x = reorder(item, purchases), y = purchases)) +
  geom_col() +
  coord_flip() +
  labs(title = 'Ranked Item Purchases',
       x = 'Item',
       y = 'Purchase Count') +
  theme(axis.text.y = element_text(size = 6))

df2 %>%
  group_by(row) %>%
  summarize(basket_size = max(item_num)) %>%
  ggplot(aes(x = basket_size)) +
  geom_histogram() +
  labs(title = 'Distribution of Basket Sizes',
       x = 'Basket Size',
       y = element_blank())

tx <- read.transactions(url, sep = ",", format = "basket")

rules <- tx %>%
  apriori(parameter = list(minlen = 2, 
                           supp = 0.001, 
                           conf = 0.1
                           ),
          control = list(verbose = F)
          )

The rules above were created with low thresholds. Next we’ll sort them and look at the top ranking associations by significance, confidence, and lift.

top_signficance <- rules %>%
  sort(by = 'support', decreasing = T)

top_signficance[1:10] %>%
  inspect()
##      lhs                   rhs                support    confidence coverage 
## [1]  {other vegetables} => {whole milk}       0.07483477 0.3867578  0.1934926
## [2]  {whole milk}       => {other vegetables} 0.07483477 0.2928770  0.2555160
## [3]  {rolls/buns}       => {whole milk}       0.05663447 0.3079049  0.1839349
## [4]  {whole milk}       => {rolls/buns}       0.05663447 0.2216474  0.2555160
## [5]  {yogurt}           => {whole milk}       0.05602440 0.4016035  0.1395018
## [6]  {whole milk}       => {yogurt}           0.05602440 0.2192598  0.2555160
## [7]  {root vegetables}  => {whole milk}       0.04890696 0.4486940  0.1089985
## [8]  {whole milk}       => {root vegetables}  0.04890696 0.1914047  0.2555160
## [9]  {root vegetables}  => {other vegetables} 0.04738180 0.4347015  0.1089985
## [10] {other vegetables} => {root vegetables}  0.04738180 0.2448765  0.1934926
##      lift     count
## [1]  1.513634 736  
## [2]  1.513634 736  
## [3]  1.205032 557  
## [4]  1.205032 557  
## [5]  1.571735 551  
## [6]  1.571735 551  
## [7]  1.756031 481  
## [8]  1.756031 481  
## [9]  2.246605 466  
## [10] 2.246605 466

Items ranked by the highest support are found together most often. We should also consider that each of the items individually have high coverage, so we’re also looking at some of the most popular items. Looking at lift, we can see that all of the items on the right side are more likely to be bought when the items on the left side are purchased.

top_confidence <- rules %>%
  sort(by = 'confidence', decreasing = T)

top_confidence[1:10] %>%
  inspect()
##      lhs                     rhs                    support confidence    coverage     lift count
## [1]  {rice,                                                                                      
##       sugar}              => {whole milk}       0.001220132          1 0.001220132 3.913649    12
## [2]  {canned fish,                                                                               
##       hygiene articles}   => {whole milk}       0.001118454          1 0.001118454 3.913649    11
## [3]  {butter,                                                                                    
##       rice,                                                                                      
##       root vegetables}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [4]  {flour,                                                                                     
##       root vegetables,                                                                           
##       whipped/sour cream} => {whole milk}       0.001728521          1 0.001728521 3.913649    17
## [5]  {butter,                                                                                    
##       domestic eggs,                                                                             
##       soft cheese}        => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [6]  {citrus fruit,                                                                              
##       root vegetables,                                                                           
##       soft cheese}        => {other vegetables} 0.001016777          1 0.001016777 5.168156    10
## [7]  {butter,                                                                                    
##       hygiene articles,                                                                          
##       pip fruit}          => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [8]  {hygiene articles,                                                                          
##       root vegetables,                                                                           
##       whipped/sour cream} => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [9]  {hygiene articles,                                                                          
##       pip fruit,                                                                                 
##       root vegetables}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [10] {cream cheese,                                                                              
##       domestic eggs,                                                                             
##       sugar}              => {whole milk}       0.001118454          1 0.001118454 3.913649    11

When we rank by confidence, the top 10 rules using the low thresholds show combinations of items with a confidence of 1. The support level of these items is very low, appearing in approximately 0.1% of transactions. With the low support threshold, we’re basically seeing a handful of item combinations that were always purchased with milk or other vegetables.

Let’s increase the support threshold and take another look.

rules2 <- tx %>%
  apriori(parameter = list(minlen = 2, 
                           supp = 0.01, 
                           conf = 0.1
                           ),
          control = list(verbose = F)
          )

top_confidence2 <- rules2 %>%
  sort(by = 'confidence', decreasing = T)

top_confidence2[1:10] %>%
  inspect()
##      lhs                                 rhs                support   
## [1]  {citrus fruit,root vegetables}   => {other vegetables} 0.01037112
## [2]  {root vegetables,tropical fruit} => {other vegetables} 0.01230300
## [3]  {curd,yogurt}                    => {whole milk}       0.01006609
## [4]  {butter,other vegetables}        => {whole milk}       0.01148958
## [5]  {root vegetables,tropical fruit} => {whole milk}       0.01199797
## [6]  {root vegetables,yogurt}         => {whole milk}       0.01453991
## [7]  {domestic eggs,other vegetables} => {whole milk}       0.01230300
## [8]  {whipped/sour cream,yogurt}      => {whole milk}       0.01087951
## [9]  {rolls/buns,root vegetables}     => {whole milk}       0.01270971
## [10] {other vegetables,pip fruit}     => {whole milk}       0.01352313
##      confidence coverage   lift     count
## [1]  0.5862069  0.01769192 3.029608 102  
## [2]  0.5845411  0.02104728 3.020999 121  
## [3]  0.5823529  0.01728521 2.279125  99  
## [4]  0.5736041  0.02003050 2.244885 113  
## [5]  0.5700483  0.02104728 2.230969 118  
## [6]  0.5629921  0.02582613 2.203354 143  
## [7]  0.5525114  0.02226741 2.162336 121  
## [8]  0.5245098  0.02074225 2.052747 107  
## [9]  0.5230126  0.02430097 2.046888 125  
## [10] 0.5175097  0.02613116 2.025351 133

After increasing the support threshold to look at item combinations that were purchased in at least 1% of all transactions, we’re seeing more popular item combinations that are purchased. The coverage of the left hand side is showing that these combinations were purchased together at least 1.7% of the time, and with just over 50% confidence and 2+ lift, we see that other vegetables and milk are likely to end up in someone’s cart if they’re purchasing the items on the left.

top_lift <- rules %>%
  sort(by = 'lift', decreasing = T)

top_lift[1:10] %>%
  inspect()
##      lhs                        rhs                         support confidence    coverage     lift count
## [1]  {bottled beer,                                                                                      
##       red/blush wine}        => {liquor}                0.001931876  0.3958333 0.004880529 35.71579    19
## [2]  {hamburger meat,                                                                                    
##       soda}                  => {Instant food products} 0.001220132  0.2105263 0.005795628 26.20919    12
## [3]  {ham,                                                                                               
##       white bread}           => {processed cheese}      0.001931876  0.3800000 0.005083884 22.92822    19
## [4]  {other vegetables,                                                                                  
##       root vegetables,                                                                                   
##       whole milk,                                                                                        
##       yogurt}                => {rice}                  0.001321810  0.1688312 0.007829181 22.13939    13
## [5]  {bottled beer,                                                                                      
##       liquor}                => {red/blush wine}        0.001931876  0.4130435 0.004677173 21.49356    19
## [6]  {Instant food products,                                                                             
##       soda}                  => {hamburger meat}        0.001220132  0.6315789 0.001931876 18.99565    12
## [7]  {curd,                                                                                              
##       sugar}                 => {flour}                 0.001118454  0.3235294 0.003457041 18.60767    11
## [8]  {salty snack,                                                                                       
##       soda}                  => {popcorn}               0.001220132  0.1304348 0.009354347 18.06797    12
## [9]  {baking powder,                                                                                     
##       sugar}                 => {flour}                 0.001016777  0.3125000 0.003253686 17.97332    10
## [10] {processed cheese,                                                                                  
##       white bread}           => {ham}                   0.001931876  0.4634146 0.004168785 17.80345    19

When we rank the top ten rules by lift, we’re seeing the highest chance of items being purchased together. When someone buys bottled beer and wine, there’s a very high likelihood that they will also purchase liquor. The same idea applies to the other associations – if someone has the left hand side in their cart, they’re very likely to also buy the tiems on the right hand side.

# using support = 0.01, conf = 0.1
rules2 %>%
  plot(method = 'graph')

That’s a lot of information. Let’s try to adjust the support thresholds and plot again.

# support = 0.02
tx %>%
  apriori(parameter = list(minlen = 2, 
                           supp = 0.02,
                           conf = 0.25
                           ),
          control = list(verbose = F)
          ) %>%
  plot(method = 'graph')

# support = 0.03
tx %>%
  apriori(parameter = list(minlen = 2, 
                           supp = 0.03,
                           conf = 0.25
                           ),
          control = list(verbose = F)
          ) %>%
  plot(method = 'graph')

# support = 0.04
tx %>%
  apriori(parameter = list(minlen = 2, 
                           supp = 0.04,
                           conf = 0.25
                           ),
          control = list(verbose = F)
          ) %>%
  plot(method = 'graph')

# support = 0.05
tx %>%
  apriori(parameter = list(minlen = 2, 
                           supp = 0.05,
                           conf = 0.25
                           ),
          control = list(verbose = F)
          ) %>%
  plot(method = 'graph')

library(arules)
library(pander)
library(arulesViz)

tx <- read.transactions('https://raw.githubusercontent.com/dataconsumer101/data624/main/GroceryDataSet.csv', sep = ",", format = "basket")

summary(tx)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics
itemFrequencyPlot(tx, topN = 20, main = 'Top 20 items purchased')

crossTable(tx, measure = 'support', sort = T)[1:5, 1:5] %>%
  pander(split.table = Inf, round = 3)
  whole milk other vegetables rolls/buns soda yogurt
whole milk 0.256 0.075 0.057 0.04 0.056
other vegetables 0.075 0.193 0.043 0.033 0.043
rolls/buns 0.057 0.043 0.184 0.038 0.034
soda 0.04 0.033 0.038 0.174 0.027
yogurt 0.056 0.043 0.034 0.027 0.14
rules <- apriori(tx, control = list(verbose = F), parameter = list(support = 0.001, confidence = 0.25, minlen = 2))

rules_toplift <- sort(rules, by = 'lift', decreasing = T)[1:10]

inspect(rules_toplift)
##      lhs                                   rhs                support    
## [1]  {bottled beer,red/blush wine}      => {liquor}           0.001931876
## [2]  {ham,white bread}                  => {processed cheese} 0.001931876
## [3]  {bottled beer,liquor}              => {red/blush wine}   0.001931876
## [4]  {Instant food products,soda}       => {hamburger meat}   0.001220132
## [5]  {curd,sugar}                       => {flour}            0.001118454
## [6]  {baking powder,sugar}              => {flour}            0.001016777
## [7]  {processed cheese,white bread}     => {ham}              0.001931876
## [8]  {fruit/vegetable juice,ham}        => {processed cheese} 0.001118454
## [9]  {margarine,sugar}                  => {flour}            0.001626843
## [10] {root vegetables,sugar,whole milk} => {flour}            0.001016777
##      confidence coverage    lift     count
## [1]  0.3958333  0.004880529 35.71579 19   
## [2]  0.3800000  0.005083884 22.92822 19   
## [3]  0.4130435  0.004677173 21.49356 19   
## [4]  0.6315789  0.001931876 18.99565 12   
## [5]  0.3235294  0.003457041 18.60767 11   
## [6]  0.3125000  0.003253686 17.97332 10   
## [7]  0.4634146  0.004168785 17.80345 19   
## [8]  0.2894737  0.003863752 17.46610 11   
## [9]  0.2962963  0.005490595 17.04137 16   
## [10] 0.2941176  0.003457041 16.91606 10
rules_top_supp <- sort(rules, by = 'support', decreasing = T)[1:10]

inspect(rules_top_supp)
##      lhs                   rhs                support    confidence coverage 
## [1]  {other vegetables} => {whole milk}       0.07483477 0.3867578  0.1934926
## [2]  {whole milk}       => {other vegetables} 0.07483477 0.2928770  0.2555160
## [3]  {rolls/buns}       => {whole milk}       0.05663447 0.3079049  0.1839349
## [4]  {yogurt}           => {whole milk}       0.05602440 0.4016035  0.1395018
## [5]  {root vegetables}  => {whole milk}       0.04890696 0.4486940  0.1089985
## [6]  {root vegetables}  => {other vegetables} 0.04738180 0.4347015  0.1089985
## [7]  {yogurt}           => {other vegetables} 0.04341637 0.3112245  0.1395018
## [8]  {tropical fruit}   => {whole milk}       0.04229792 0.4031008  0.1049314
## [9]  {tropical fruit}   => {other vegetables} 0.03589222 0.3420543  0.1049314
## [10] {bottled water}    => {whole milk}       0.03436706 0.3109476  0.1105236
##      lift     count
## [1]  1.513634 736  
## [2]  1.513634 736  
## [3]  1.205032 557  
## [4]  1.571735 551  
## [5]  1.756031 481  
## [6]  2.246605 466  
## [7]  1.608457 427  
## [8]  1.577595 416  
## [9]  1.767790 353  
## [10] 1.216940 338
rules_soda_1 <- apriori(tx, 
                        control = list(verbose = F),
                        parameter = list(support = 0.001, confidence = 0.15, minlen = 2, target = 'rules'),
                        appearance = list(default = 'rhs', lhs = 'soda')
                        )

plot(rules_soda_1, method = 'graph', interactive = F, shading = NA)

rules_soda_r <- apriori(tx,
                        control = list(verbose = F),
                        parameter = list(support = 0.001, confidence = 0.5, minlen = 2, target = 'rules'),
                        appearance = list(default = 'lhs', rhs = 'soda')
                        )

inspect(sort(rules_soda_r, by = 'support', decreasing = T)[1:10])
##      lhs                                        rhs    support     confidence
## [1]  {bottled water,sausage,yogurt}          => {soda} 0.002033554 0.5128205 
## [2]  {chocolate,other vegetables,rolls/buns} => {soda} 0.001931876 0.5000000 
## [3]  {canned beer,tropical fruit}            => {soda} 0.001728521 0.5666667 
## [4]  {chewing gum,shopping bags}             => {soda} 0.001626843 0.5000000 
## [5]  {bottled water,newspapers,yogurt}       => {soda} 0.001626843 0.5000000 
## [6]  {candy,waffles}                         => {soda} 0.001525165 0.5172414 
## [7]  {canned beer,pork}                      => {soda} 0.001525165 0.5172414 
## [8]  {candy,other vegetables,rolls/buns}     => {soda} 0.001525165 0.5769231 
## [9]  {shopping bags,white bread,whole milk}  => {soda} 0.001525165 0.5357143 
## [10] {chocolate,napkins,whole milk}          => {soda} 0.001423488 0.5000000 
##      coverage    lift     count
## [1]  0.003965430 2.940869 20   
## [2]  0.003863752 2.867347 19   
## [3]  0.003050330 3.249660 17   
## [4]  0.003253686 2.867347 16   
## [5]  0.003253686 2.867347 16   
## [6]  0.002948653 2.966221 15   
## [7]  0.002948653 2.966221 15   
## [8]  0.002643620 3.308477 15   
## [9]  0.002846975 3.072157 15   
## [10] 0.002846975 2.867347 14
plot(rules_soda_r, method = 'graph', interactive = F, shading = NA)