Recommender Systems
# import data
url <- 'https://raw.githubusercontent.com/dataconsumer101/data624/main/GroceryDataSet.csv'
df <- read.csv(url, header = F, na.strings=c(""))
# convert column names to lowercase
names(df) <- lapply(names(df), tolower)
# add row index as new field
df$row <- row.names(df) %>%
as.numeric()
# quick look at what the data looks like
head(df)[1:4]## v1 v2 v3 v4
## 1 citrus fruit semi-finished bread margarine ready soups
## 2 tropical fruit yogurt coffee <NA>
## 3 whole milk <NA> <NA> <NA>
## 4 pip fruit yogurt cream cheese meat spreads
## 5 other vegetables whole milk condensed milk long life bakery product
## 6 whole milk butter yogurt rice
# df long for plots
df2 <- df %>%
gather(item_num, item, -row) %>%
filter(!is.na(item)) %>%
mutate(item_num = substr(item_num, 2, nchar(item_num))) %>%
mutate(item_num = as.numeric(item_num))# plot item purchase frequency
df2 %>%
group_by(item) %>%
summarize(purchases = n()) %>%
ggplot(aes(x = reorder(item, purchases), y = purchases)) +
geom_col() +
coord_flip() +
labs(title = 'Ranked Item Purchases',
x = 'Item',
y = 'Purchase Count') +
theme(axis.text.y = element_text(size = 6))df2 %>%
group_by(row) %>%
summarize(basket_size = max(item_num)) %>%
ggplot(aes(x = basket_size)) +
geom_histogram() +
labs(title = 'Distribution of Basket Sizes',
x = 'Basket Size',
y = element_blank())tx <- read.transactions(url, sep = ",", format = "basket")
rules <- tx %>%
apriori(parameter = list(minlen = 2,
supp = 0.001,
conf = 0.1
),
control = list(verbose = F)
)The rules above were created with low thresholds. Next we’ll sort them and look at the top ranking associations by significance, confidence, and lift.
top_signficance <- rules %>%
sort(by = 'support', decreasing = T)
top_signficance[1:10] %>%
inspect()## lhs rhs support confidence coverage
## [1] {other vegetables} => {whole milk} 0.07483477 0.3867578 0.1934926
## [2] {whole milk} => {other vegetables} 0.07483477 0.2928770 0.2555160
## [3] {rolls/buns} => {whole milk} 0.05663447 0.3079049 0.1839349
## [4] {whole milk} => {rolls/buns} 0.05663447 0.2216474 0.2555160
## [5] {yogurt} => {whole milk} 0.05602440 0.4016035 0.1395018
## [6] {whole milk} => {yogurt} 0.05602440 0.2192598 0.2555160
## [7] {root vegetables} => {whole milk} 0.04890696 0.4486940 0.1089985
## [8] {whole milk} => {root vegetables} 0.04890696 0.1914047 0.2555160
## [9] {root vegetables} => {other vegetables} 0.04738180 0.4347015 0.1089985
## [10] {other vegetables} => {root vegetables} 0.04738180 0.2448765 0.1934926
## lift count
## [1] 1.513634 736
## [2] 1.513634 736
## [3] 1.205032 557
## [4] 1.205032 557
## [5] 1.571735 551
## [6] 1.571735 551
## [7] 1.756031 481
## [8] 1.756031 481
## [9] 2.246605 466
## [10] 2.246605 466
Items ranked by the highest support are found together most often. We should also consider that each of the items individually have high coverage, so we’re also looking at some of the most popular items. Looking at lift, we can see that all of the items on the right side are more likely to be bought when the items on the left side are purchased.
top_confidence <- rules %>%
sort(by = 'confidence', decreasing = T)
top_confidence[1:10] %>%
inspect()## lhs rhs support confidence coverage lift count
## [1] {rice,
## sugar} => {whole milk} 0.001220132 1 0.001220132 3.913649 12
## [2] {canned fish,
## hygiene articles} => {whole milk} 0.001118454 1 0.001118454 3.913649 11
## [3] {butter,
## rice,
## root vegetables} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [4] {flour,
## root vegetables,
## whipped/sour cream} => {whole milk} 0.001728521 1 0.001728521 3.913649 17
## [5] {butter,
## domestic eggs,
## soft cheese} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [6] {citrus fruit,
## root vegetables,
## soft cheese} => {other vegetables} 0.001016777 1 0.001016777 5.168156 10
## [7] {butter,
## hygiene articles,
## pip fruit} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [8] {hygiene articles,
## root vegetables,
## whipped/sour cream} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [9] {hygiene articles,
## pip fruit,
## root vegetables} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [10] {cream cheese,
## domestic eggs,
## sugar} => {whole milk} 0.001118454 1 0.001118454 3.913649 11
When we rank by confidence, the top 10 rules using the low thresholds show combinations of items with a confidence of 1. The support level of these items is very low, appearing in approximately 0.1% of transactions. With the low support threshold, we’re basically seeing a handful of item combinations that were always purchased with milk or other vegetables.
Let’s increase the support threshold and take another look.
rules2 <- tx %>%
apriori(parameter = list(minlen = 2,
supp = 0.01,
conf = 0.1
),
control = list(verbose = F)
)
top_confidence2 <- rules2 %>%
sort(by = 'confidence', decreasing = T)
top_confidence2[1:10] %>%
inspect()## lhs rhs support
## [1] {citrus fruit,root vegetables} => {other vegetables} 0.01037112
## [2] {root vegetables,tropical fruit} => {other vegetables} 0.01230300
## [3] {curd,yogurt} => {whole milk} 0.01006609
## [4] {butter,other vegetables} => {whole milk} 0.01148958
## [5] {root vegetables,tropical fruit} => {whole milk} 0.01199797
## [6] {root vegetables,yogurt} => {whole milk} 0.01453991
## [7] {domestic eggs,other vegetables} => {whole milk} 0.01230300
## [8] {whipped/sour cream,yogurt} => {whole milk} 0.01087951
## [9] {rolls/buns,root vegetables} => {whole milk} 0.01270971
## [10] {other vegetables,pip fruit} => {whole milk} 0.01352313
## confidence coverage lift count
## [1] 0.5862069 0.01769192 3.029608 102
## [2] 0.5845411 0.02104728 3.020999 121
## [3] 0.5823529 0.01728521 2.279125 99
## [4] 0.5736041 0.02003050 2.244885 113
## [5] 0.5700483 0.02104728 2.230969 118
## [6] 0.5629921 0.02582613 2.203354 143
## [7] 0.5525114 0.02226741 2.162336 121
## [8] 0.5245098 0.02074225 2.052747 107
## [9] 0.5230126 0.02430097 2.046888 125
## [10] 0.5175097 0.02613116 2.025351 133
After increasing the support threshold to look at item combinations that were purchased in at least 1% of all transactions, we’re seeing more popular item combinations that are purchased. The coverage of the left hand side is showing that these combinations were purchased together at least 1.7% of the time, and with just over 50% confidence and 2+ lift, we see that other vegetables and milk are likely to end up in someone’s cart if they’re purchasing the items on the left.
top_lift <- rules %>%
sort(by = 'lift', decreasing = T)
top_lift[1:10] %>%
inspect()## lhs rhs support confidence coverage lift count
## [1] {bottled beer,
## red/blush wine} => {liquor} 0.001931876 0.3958333 0.004880529 35.71579 19
## [2] {hamburger meat,
## soda} => {Instant food products} 0.001220132 0.2105263 0.005795628 26.20919 12
## [3] {ham,
## white bread} => {processed cheese} 0.001931876 0.3800000 0.005083884 22.92822 19
## [4] {other vegetables,
## root vegetables,
## whole milk,
## yogurt} => {rice} 0.001321810 0.1688312 0.007829181 22.13939 13
## [5] {bottled beer,
## liquor} => {red/blush wine} 0.001931876 0.4130435 0.004677173 21.49356 19
## [6] {Instant food products,
## soda} => {hamburger meat} 0.001220132 0.6315789 0.001931876 18.99565 12
## [7] {curd,
## sugar} => {flour} 0.001118454 0.3235294 0.003457041 18.60767 11
## [8] {salty snack,
## soda} => {popcorn} 0.001220132 0.1304348 0.009354347 18.06797 12
## [9] {baking powder,
## sugar} => {flour} 0.001016777 0.3125000 0.003253686 17.97332 10
## [10] {processed cheese,
## white bread} => {ham} 0.001931876 0.4634146 0.004168785 17.80345 19
When we rank the top ten rules by lift, we’re seeing the highest chance of items being purchased together. When someone buys bottled beer and wine, there’s a very high likelihood that they will also purchase liquor. The same idea applies to the other associations – if someone has the left hand side in their cart, they’re very likely to also buy the tiems on the right hand side.
# using support = 0.01, conf = 0.1
rules2 %>%
plot(method = 'graph') That’s a lot of information. Let’s try to adjust the support thresholds and plot again.
# support = 0.02
tx %>%
apriori(parameter = list(minlen = 2,
supp = 0.02,
conf = 0.25
),
control = list(verbose = F)
) %>%
plot(method = 'graph')# support = 0.03
tx %>%
apriori(parameter = list(minlen = 2,
supp = 0.03,
conf = 0.25
),
control = list(verbose = F)
) %>%
plot(method = 'graph')# support = 0.04
tx %>%
apriori(parameter = list(minlen = 2,
supp = 0.04,
conf = 0.25
),
control = list(verbose = F)
) %>%
plot(method = 'graph')# support = 0.05
tx %>%
apriori(parameter = list(minlen = 2,
supp = 0.05,
conf = 0.25
),
control = list(verbose = F)
) %>%
plot(method = 'graph')library(arules)
library(pander)
library(arulesViz)
tx <- read.transactions('https://raw.githubusercontent.com/dataconsumer101/data624/main/GroceryDataSet.csv', sep = ",", format = "basket")
summary(tx)## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
itemFrequencyPlot(tx, topN = 20, main = 'Top 20 items purchased')crossTable(tx, measure = 'support', sort = T)[1:5, 1:5] %>%
pander(split.table = Inf, round = 3)| Â | whole milk | other vegetables | rolls/buns | soda | yogurt |
|---|---|---|---|---|---|
| whole milk | 0.256 | 0.075 | 0.057 | 0.04 | 0.056 |
| other vegetables | 0.075 | 0.193 | 0.043 | 0.033 | 0.043 |
| rolls/buns | 0.057 | 0.043 | 0.184 | 0.038 | 0.034 |
| soda | 0.04 | 0.033 | 0.038 | 0.174 | 0.027 |
| yogurt | 0.056 | 0.043 | 0.034 | 0.027 | 0.14 |
rules <- apriori(tx, control = list(verbose = F), parameter = list(support = 0.001, confidence = 0.25, minlen = 2))
rules_toplift <- sort(rules, by = 'lift', decreasing = T)[1:10]
inspect(rules_toplift)## lhs rhs support
## [1] {bottled beer,red/blush wine} => {liquor} 0.001931876
## [2] {ham,white bread} => {processed cheese} 0.001931876
## [3] {bottled beer,liquor} => {red/blush wine} 0.001931876
## [4] {Instant food products,soda} => {hamburger meat} 0.001220132
## [5] {curd,sugar} => {flour} 0.001118454
## [6] {baking powder,sugar} => {flour} 0.001016777
## [7] {processed cheese,white bread} => {ham} 0.001931876
## [8] {fruit/vegetable juice,ham} => {processed cheese} 0.001118454
## [9] {margarine,sugar} => {flour} 0.001626843
## [10] {root vegetables,sugar,whole milk} => {flour} 0.001016777
## confidence coverage lift count
## [1] 0.3958333 0.004880529 35.71579 19
## [2] 0.3800000 0.005083884 22.92822 19
## [3] 0.4130435 0.004677173 21.49356 19
## [4] 0.6315789 0.001931876 18.99565 12
## [5] 0.3235294 0.003457041 18.60767 11
## [6] 0.3125000 0.003253686 17.97332 10
## [7] 0.4634146 0.004168785 17.80345 19
## [8] 0.2894737 0.003863752 17.46610 11
## [9] 0.2962963 0.005490595 17.04137 16
## [10] 0.2941176 0.003457041 16.91606 10
rules_top_supp <- sort(rules, by = 'support', decreasing = T)[1:10]
inspect(rules_top_supp)## lhs rhs support confidence coverage
## [1] {other vegetables} => {whole milk} 0.07483477 0.3867578 0.1934926
## [2] {whole milk} => {other vegetables} 0.07483477 0.2928770 0.2555160
## [3] {rolls/buns} => {whole milk} 0.05663447 0.3079049 0.1839349
## [4] {yogurt} => {whole milk} 0.05602440 0.4016035 0.1395018
## [5] {root vegetables} => {whole milk} 0.04890696 0.4486940 0.1089985
## [6] {root vegetables} => {other vegetables} 0.04738180 0.4347015 0.1089985
## [7] {yogurt} => {other vegetables} 0.04341637 0.3112245 0.1395018
## [8] {tropical fruit} => {whole milk} 0.04229792 0.4031008 0.1049314
## [9] {tropical fruit} => {other vegetables} 0.03589222 0.3420543 0.1049314
## [10] {bottled water} => {whole milk} 0.03436706 0.3109476 0.1105236
## lift count
## [1] 1.513634 736
## [2] 1.513634 736
## [3] 1.205032 557
## [4] 1.571735 551
## [5] 1.756031 481
## [6] 2.246605 466
## [7] 1.608457 427
## [8] 1.577595 416
## [9] 1.767790 353
## [10] 1.216940 338
rules_soda_1 <- apriori(tx,
control = list(verbose = F),
parameter = list(support = 0.001, confidence = 0.15, minlen = 2, target = 'rules'),
appearance = list(default = 'rhs', lhs = 'soda')
)
plot(rules_soda_1, method = 'graph', interactive = F, shading = NA)rules_soda_r <- apriori(tx,
control = list(verbose = F),
parameter = list(support = 0.001, confidence = 0.5, minlen = 2, target = 'rules'),
appearance = list(default = 'lhs', rhs = 'soda')
)
inspect(sort(rules_soda_r, by = 'support', decreasing = T)[1:10])## lhs rhs support confidence
## [1] {bottled water,sausage,yogurt} => {soda} 0.002033554 0.5128205
## [2] {chocolate,other vegetables,rolls/buns} => {soda} 0.001931876 0.5000000
## [3] {canned beer,tropical fruit} => {soda} 0.001728521 0.5666667
## [4] {chewing gum,shopping bags} => {soda} 0.001626843 0.5000000
## [5] {bottled water,newspapers,yogurt} => {soda} 0.001626843 0.5000000
## [6] {candy,waffles} => {soda} 0.001525165 0.5172414
## [7] {canned beer,pork} => {soda} 0.001525165 0.5172414
## [8] {candy,other vegetables,rolls/buns} => {soda} 0.001525165 0.5769231
## [9] {shopping bags,white bread,whole milk} => {soda} 0.001525165 0.5357143
## [10] {chocolate,napkins,whole milk} => {soda} 0.001423488 0.5000000
## coverage lift count
## [1] 0.003965430 2.940869 20
## [2] 0.003863752 2.867347 19
## [3] 0.003050330 3.249660 17
## [4] 0.003253686 2.867347 16
## [5] 0.003253686 2.867347 16
## [6] 0.002948653 2.966221 15
## [7] 0.002948653 2.966221 15
## [8] 0.002643620 3.308477 15
## [9] 0.002846975 3.072157 15
## [10] 0.002846975 2.867347 14
plot(rules_soda_r, method = 'graph', interactive = F, shading = NA)