library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(arules)
## Warning: package 'arules' was built under R version 4.5.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.5.3
Exploratory analysis
# Load the raw basket file: one transaction per row, one item per column.
# Blank and placeholder cells are read as NA so they are not counted as items.
# `na.strings` is spelled out in full rather than relying on partial matching
# of `na` through read.csv()'s `...`.
items <- read.csv(
  "https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/GroceryDataSet.csv",
  sep = ",",
  na.strings = c("", "N/A", "missing", "—"),
  header = FALSE
)
How many items are in an average basket? (Per the summary below, the largest basket has 32 items.)
# Basket size = total columns minus the NA (empty) cells in each row.
item_counts <- ncol(items) - rowSums(is.na(items))
summary(item_counts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
The median number of items is 3 and the average is 4.4. The third quartile is 6, so three-quarters of baskets contain 6 or fewer items.
How many of each basket size?
# Wrap the basket sizes in a one-column data frame, then tally how many
# baskets there are of each size.
item_counts <- data.frame(item_counts = item_counts)
count(item_counts, item_counts)
## item_counts n
## 1 1 2159
## 2 2 1643
## 3 3 1299
## 4 4 1005
## 5 5 855
## 6 6 645
## 7 7 545
## 8 8 438
## 9 9 350
## 10 10 246
## 11 11 182
## 12 12 117
## 13 13 78
## 14 14 77
## 15 15 55
## 16 16 46
## 17 17 29
## 18 18 14
## 19 19 14
## 20 20 9
## 21 21 11
## 22 22 4
## 23 23 6
## 24 24 1
## 25 26 1
## 26 27 1
## 27 28 1
## 28 29 3
## 29 32 1
About 7,000 of the 9,835 baskets (6,961, or roughly 71%) contain 5 or fewer items.
What are the most popular items?
# Pool every cell into one vector, count each item (table() leaves out the NA
# padding by default), and rank from most to least purchased.
item_popularity <- items |>
  unlist() |>
  table() |>
  sort(decreasing = TRUE)
head(item_popularity, n = 20)
##
## whole milk other vegetables rolls/buns
## 2513 1903 1809
## soda yogurt bottled water
## 1715 1372 1087
## root vegetables tropical fruit shopping bags
## 1072 1032 969
## sausage pastry citrus fruit
## 924 875 814
## bottled beer newspapers canned beer
## 792 785 764
## pip fruit fruit/vegetable juice whipped/sour cream
## 744 711 705
## brown bread domestic eggs
## 638 624
Whole milk, other vegetables, rolls/buns, soda, and yogurt are among the most popular items.
Create a transactions object instead of a data frame. Remove duplicates (e.g., two cereals purchased in the same transaction)
# Re-read the same file as an arules transactions object (basket format,
# comma-separated), removing items repeated within a single basket.
groceries <- read.transactions(
  file = "https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/GroceryDataSet.csv",
  format = "basket",
  sep = ",",
  rm.duplicates = TRUE
)
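Specifying: Support - the items in a rule must appear together in at least 0.1% of transactions. Confidence - the rule must hold in at least 5% of the transactions that contain its left-hand side. Min length - a rule must contain at least 2 items (left- and right-hand sides combined).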
# Mine association rules: support >= 0.1%, confidence >= 5%, at least 2 items.
rules <- apriori(
  data = groceries,
  parameter = list(supp = 0.001, conf = 0.05, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [37937 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
# Overview of the mined rules: how many there are, the distribution of rule
# lengths, and the range of each quality measure.
summary(rules)
## set of 37937 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 3814 19639 12544 1880 60
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 3.334 4.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001017 Min. :0.0500 Min. :0.001017 Min. : 0.4193
## 1st Qu.:0.001118 1st Qu.:0.1358 1st Qu.:0.003864 1st Qu.: 1.8719
## Median :0.001423 Median :0.2273 Median :0.006812 Median : 2.4718
## Mean :0.002065 Mean :0.2797 Mean :0.012163 Mean : 2.7437
## 3rd Qu.:0.002034 3rd Qu.:0.3846 3rd Qu.:0.013218 3rd Qu.: 3.2970
## Max. :0.074835 Max. :1.0000 Max. :0.255516 Max. :35.7158
## count
## Min. : 10.00
## 1st Qu.: 11.00
## Median : 14.00
## Mean : 20.31
## 3rd Qu.: 20.00
## Max. :736.00
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.001 0.05
## call
## apriori(data = groceries, parameter = list(supp = 0.001, conf = 0.05, minlen = 2))
# Exploring results: the ten rules with the highest confidence.
rules |>
  sort(by = "confidence", decreasing = TRUE) |>
  head(n = 10) |>
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {rice,
## sugar} => {whole milk} 0.001220132 1 0.001220132 3.913649 12
## [2] {canned fish,
## hygiene articles} => {whole milk} 0.001118454 1 0.001118454 3.913649 11
## [3] {butter,
## rice,
## root vegetables} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [4] {flour,
## root vegetables,
## whipped/sour cream} => {whole milk} 0.001728521 1 0.001728521 3.913649 17
## [5] {butter,
## domestic eggs,
## soft cheese} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [6] {citrus fruit,
## root vegetables,
## soft cheese} => {other vegetables} 0.001016777 1 0.001016777 5.168156 10
## [7] {butter,
## hygiene articles,
## pip fruit} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [8] {hygiene articles,
## root vegetables,
## whipped/sour cream} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [9] {hygiene articles,
## pip fruit,
## root vegetables} => {whole milk} 0.001016777 1 0.001016777 3.913649 10
## [10] {cream cheese,
## domestic eggs,
## sugar} => {whole milk} 0.001118454 1 0.001118454 3.913649 11
This gives us a set of rules with very high confidence (a confidence of 1 means the right-hand item was purchased every time the left-hand items were), but low support: each combination appears in only ~.1% of shopping baskets. Given that these sets are 3+ items, each was purchased together only 10 to 17 times (see the count column).
Let’s try with sorting by support. Maybe there are more frequent combinations that don’t happen in 100% of cases.
# The twenty rules with the highest support.
rules |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 20) |>
  inspect()
## lhs rhs support confidence coverage
## [1] {other vegetables} => {whole milk} 0.07483477 0.3867578 0.1934926
## [2] {whole milk} => {other vegetables} 0.07483477 0.2928770 0.2555160
## [3] {rolls/buns} => {whole milk} 0.05663447 0.3079049 0.1839349
## [4] {whole milk} => {rolls/buns} 0.05663447 0.2216474 0.2555160
## [5] {yogurt} => {whole milk} 0.05602440 0.4016035 0.1395018
## [6] {whole milk} => {yogurt} 0.05602440 0.2192598 0.2555160
## [7] {root vegetables} => {whole milk} 0.04890696 0.4486940 0.1089985
## [8] {whole milk} => {root vegetables} 0.04890696 0.1914047 0.2555160
## [9] {root vegetables} => {other vegetables} 0.04738180 0.4347015 0.1089985
## [10] {other vegetables} => {root vegetables} 0.04738180 0.2448765 0.1934926
## [11] {yogurt} => {other vegetables} 0.04341637 0.3112245 0.1395018
## [12] {other vegetables} => {yogurt} 0.04341637 0.2243826 0.1934926
## [13] {rolls/buns} => {other vegetables} 0.04260295 0.2316197 0.1839349
## [14] {other vegetables} => {rolls/buns} 0.04260295 0.2201787 0.1934926
## [15] {tropical fruit} => {whole milk} 0.04229792 0.4031008 0.1049314
## [16] {whole milk} => {tropical fruit} 0.04229792 0.1655392 0.2555160
## [17] {soda} => {whole milk} 0.04006101 0.2297376 0.1743772
## [18] {whole milk} => {soda} 0.04006101 0.1567847 0.2555160
## [19] {soda} => {rolls/buns} 0.03833249 0.2198251 0.1743772
## [20] {rolls/buns} => {soda} 0.03833249 0.2084024 0.1839349
## lift count
## [1] 1.5136341 736
## [2] 1.5136341 736
## [3] 1.2050318 557
## [4] 1.2050318 557
## [5] 1.5717351 551
## [6] 1.5717351 551
## [7] 1.7560310 481
## [8] 1.7560310 481
## [9] 2.2466049 466
## [10] 2.2466049 466
## [11] 1.6084566 427
## [12] 1.6084566 427
## [13] 1.1970465 419
## [14] 1.1970465 419
## [15] 1.5775950 416
## [16] 1.5775950 416
## [17] 0.8991124 394
## [18] 0.8991124 394
## [19] 1.1951242 377
## [20] 1.1951242 377
These are a lot of combinations of the popular items we saw above. Let’s dig a little deeper.
# Widen the view to the fifty rules with the highest support.
rules |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 50) |>
  inspect()
## lhs rhs support confidence
## [1] {other vegetables} => {whole milk} 0.07483477 0.3867578
## [2] {whole milk} => {other vegetables} 0.07483477 0.2928770
## [3] {rolls/buns} => {whole milk} 0.05663447 0.3079049
## [4] {whole milk} => {rolls/buns} 0.05663447 0.2216474
## [5] {yogurt} => {whole milk} 0.05602440 0.4016035
## [6] {whole milk} => {yogurt} 0.05602440 0.2192598
## [7] {root vegetables} => {whole milk} 0.04890696 0.4486940
## [8] {whole milk} => {root vegetables} 0.04890696 0.1914047
## [9] {root vegetables} => {other vegetables} 0.04738180 0.4347015
## [10] {other vegetables} => {root vegetables} 0.04738180 0.2448765
## [11] {yogurt} => {other vegetables} 0.04341637 0.3112245
## [12] {other vegetables} => {yogurt} 0.04341637 0.2243826
## [13] {rolls/buns} => {other vegetables} 0.04260295 0.2316197
## [14] {other vegetables} => {rolls/buns} 0.04260295 0.2201787
## [15] {tropical fruit} => {whole milk} 0.04229792 0.4031008
## [16] {whole milk} => {tropical fruit} 0.04229792 0.1655392
## [17] {soda} => {whole milk} 0.04006101 0.2297376
## [18] {whole milk} => {soda} 0.04006101 0.1567847
## [19] {soda} => {rolls/buns} 0.03833249 0.2198251
## [20] {rolls/buns} => {soda} 0.03833249 0.2084024
## [21] {tropical fruit} => {other vegetables} 0.03589222 0.3420543
## [22] {other vegetables} => {tropical fruit} 0.03589222 0.1854966
## [23] {bottled water} => {whole milk} 0.03436706 0.3109476
## [24] {whole milk} => {bottled water} 0.03436706 0.1345006
## [25] {yogurt} => {rolls/buns} 0.03436706 0.2463557
## [26] {rolls/buns} => {yogurt} 0.03436706 0.1868436
## [27] {pastry} => {whole milk} 0.03324860 0.3737143
## [28] {whole milk} => {pastry} 0.03324860 0.1301234
## [29] {soda} => {other vegetables} 0.03274021 0.1877551
## [30] {other vegetables} => {soda} 0.03274021 0.1692065
## [31] {whipped/sour cream} => {whole milk} 0.03223183 0.4496454
## [32] {whole milk} => {whipped/sour cream} 0.03223183 0.1261441
## [33] {sausage} => {rolls/buns} 0.03060498 0.3257576
## [34] {rolls/buns} => {sausage} 0.03060498 0.1663903
## [35] {citrus fruit} => {whole milk} 0.03050330 0.3685504
## [36] {whole milk} => {citrus fruit} 0.03050330 0.1193792
## [37] {pip fruit} => {whole milk} 0.03009659 0.3978495
## [38] {whole milk} => {pip fruit} 0.03009659 0.1177875
## [39] {domestic eggs} => {whole milk} 0.02999492 0.4727564
## [40] {whole milk} => {domestic eggs} 0.02999492 0.1173896
## [41] {sausage} => {whole milk} 0.02989324 0.3181818
## [42] {whole milk} => {sausage} 0.02989324 0.1169916
## [43] {tropical fruit} => {yogurt} 0.02928317 0.2790698
## [44] {yogurt} => {tropical fruit} 0.02928317 0.2099125
## [45] {bottled water} => {soda} 0.02897814 0.2621895
## [46] {soda} => {bottled water} 0.02897814 0.1661808
## [47] {whipped/sour cream} => {other vegetables} 0.02887646 0.4028369
## [48] {other vegetables} => {whipped/sour cream} 0.02887646 0.1492380
## [49] {citrus fruit} => {other vegetables} 0.02887646 0.3488943
## [50] {other vegetables} => {citrus fruit} 0.02887646 0.1492380
## coverage lift count
## [1] 0.19349263 1.5136341 736
## [2] 0.25551601 1.5136341 736
## [3] 0.18393493 1.2050318 557
## [4] 0.25551601 1.2050318 557
## [5] 0.13950178 1.5717351 551
## [6] 0.25551601 1.5717351 551
## [7] 0.10899847 1.7560310 481
## [8] 0.25551601 1.7560310 481
## [9] 0.10899847 2.2466049 466
## [10] 0.19349263 2.2466049 466
## [11] 0.13950178 1.6084566 427
## [12] 0.19349263 1.6084566 427
## [13] 0.18393493 1.1970465 419
## [14] 0.19349263 1.1970465 419
## [15] 0.10493137 1.5775950 416
## [16] 0.25551601 1.5775950 416
## [17] 0.17437722 0.8991124 394
## [18] 0.25551601 0.8991124 394
## [19] 0.17437722 1.1951242 377
## [20] 0.18393493 1.1951242 377
## [21] 0.10493137 1.7677896 353
## [22] 0.19349263 1.7677896 353
## [23] 0.11052364 1.2169396 338
## [24] 0.25551601 1.2169396 338
## [25] 0.13950178 1.3393633 338
## [26] 0.18393493 1.3393633 338
## [27] 0.08896797 1.4625865 327
## [28] 0.25551601 1.4625865 327
## [29] 0.17437722 0.9703476 322
## [30] 0.19349263 0.9703476 322
## [31] 0.07168277 1.7597542 317
## [32] 0.25551601 1.7597542 317
## [33] 0.09395018 1.7710480 301
## [34] 0.18393493 1.7710480 301
## [35] 0.08276563 1.4423768 300
## [36] 0.25551601 1.4423768 300
## [37] 0.07564820 1.5570432 296
## [38] 0.25551601 1.5570432 296
## [39] 0.06344687 1.8502027 295
## [40] 0.25551601 1.8502027 295
## [41] 0.09395018 1.2452520 294
## [42] 0.25551601 1.2452520 294
## [43] 0.10493137 2.0004746 288
## [44] 0.13950178 2.0004746 288
## [45] 0.11052364 1.5035766 285
## [46] 0.17437722 1.5035766 285
## [47] 0.07168277 2.0819237 284
## [48] 0.19349263 2.0819237 284
## [49] 0.08276563 1.8031403 284
## [50] 0.19349263 1.8031403 284
Rules 33/34 pair sausage with rolls/buns. Rules 43-48 are sort of interesting. We’re seeing the fruit/yogurt combination mentioned in the Springer article. Other vegetables + whipped/sour cream (hopefully it’s sour cream) could be a vegetable dip (crudite platters are alive and well!). Rules 49 and 50 just show fruit and vegetables together, which is expected. These each cover around 3% of transactions, far more than the high-confidence rules listed earlier. Confidence is lower because these are pairs of popular items, frequently purchased along with others.
Looking at trios using the same parameters and sorting by support.
# Same thresholds as before, but keep only rules with three or more items.
rules_2 <- apriori(
  data = groceries,
  parameter = list(supp = 0.001, conf = 0.05, minlen = 3)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.001 3
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [34123 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Overview of the 3+ item rules: count, rule-length distribution, and the
# range of each quality measure.
summary(rules_2)
## set of 34123 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4 5 6
## 19639 12544 1880 60
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 3.000 3.483 4.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001017 Min. :0.0500 Min. :0.001017 Min. : 0.4193
## 1st Qu.:0.001118 1st Qu.:0.1512 1st Qu.:0.003559 1st Qu.: 1.9788
## Median :0.001322 Median :0.2473 Median :0.006202 Median : 2.5841
## Mean :0.001745 Mean :0.2960 Mean :0.008789 Mean : 2.8463
## 3rd Qu.:0.001830 3rd Qu.:0.4062 3rd Qu.:0.010778 3rd Qu.: 3.3975
## Max. :0.023183 Max. :1.0000 Max. :0.074835 Max. :35.7158
## count
## Min. : 10.00
## 1st Qu.: 11.00
## Median : 13.00
## Mean : 17.16
## 3rd Qu.: 18.00
## Max. :228.00
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.001 0.05
## call
## apriori(data = groceries, parameter = list(supp = 0.001, conf = 0.05, minlen = 3))
# The fifteen 3+ item rules with the highest support.
rules_2 |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 15) |>
  inspect()
## lhs rhs support
## [1] {other vegetables, root vegetables} => {whole milk} 0.02318251
## [2] {root vegetables, whole milk} => {other vegetables} 0.02318251
## [3] {other vegetables, whole milk} => {root vegetables} 0.02318251
## [4] {other vegetables, yogurt} => {whole milk} 0.02226741
## [5] {whole milk, yogurt} => {other vegetables} 0.02226741
## [6] {other vegetables, whole milk} => {yogurt} 0.02226741
## [7] {other vegetables, rolls/buns} => {whole milk} 0.01789527
## [8] {rolls/buns, whole milk} => {other vegetables} 0.01789527
## [9] {other vegetables, whole milk} => {rolls/buns} 0.01789527
## [10] {other vegetables, tropical fruit} => {whole milk} 0.01708185
## [11] {tropical fruit, whole milk} => {other vegetables} 0.01708185
## [12] {other vegetables, whole milk} => {tropical fruit} 0.01708185
## [13] {rolls/buns, yogurt} => {whole milk} 0.01555669
## [14] {whole milk, yogurt} => {rolls/buns} 0.01555669
## [15] {rolls/buns, whole milk} => {yogurt} 0.01555669
## confidence coverage lift count
## [1] 0.4892704 0.04738180 1.914833 228
## [2] 0.4740125 0.04890696 2.449770 228
## [3] 0.3097826 0.07483477 2.842082 228
## [4] 0.5128806 0.04341637 2.007235 219
## [5] 0.3974592 0.05602440 2.054131 219
## [6] 0.2975543 0.07483477 2.132979 219
## [7] 0.4200477 0.04260295 1.643919 176
## [8] 0.3159785 0.05663447 1.633026 176
## [9] 0.2391304 0.07483477 1.300082 176
## [10] 0.4759207 0.03589222 1.862587 168
## [11] 0.4038462 0.04229792 2.087140 168
## [12] 0.2282609 0.07483477 2.175335 168
## [13] 0.4526627 0.03436706 1.771563 153
## [14] 0.2776770 0.05602440 1.509648 153
## [15] 0.2746858 0.05663447 1.969049 153
We still have more than 34,000 rules (34,123), even though we set the minimum rule length to three items. Tons of common items here.
# Widen the view to the forty 3+ item rules with the highest support.
rules_2 |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 40) |>
  inspect()
## lhs rhs support
## [1] {other vegetables, root vegetables} => {whole milk} 0.02318251
## [2] {root vegetables, whole milk} => {other vegetables} 0.02318251
## [3] {other vegetables, whole milk} => {root vegetables} 0.02318251
## [4] {other vegetables, yogurt} => {whole milk} 0.02226741
## [5] {whole milk, yogurt} => {other vegetables} 0.02226741
## [6] {other vegetables, whole milk} => {yogurt} 0.02226741
## [7] {other vegetables, rolls/buns} => {whole milk} 0.01789527
## [8] {rolls/buns, whole milk} => {other vegetables} 0.01789527
## [9] {other vegetables, whole milk} => {rolls/buns} 0.01789527
## [10] {other vegetables, tropical fruit} => {whole milk} 0.01708185
## [11] {tropical fruit, whole milk} => {other vegetables} 0.01708185
## [12] {other vegetables, whole milk} => {tropical fruit} 0.01708185
## [13] {rolls/buns, yogurt} => {whole milk} 0.01555669
## [14] {whole milk, yogurt} => {rolls/buns} 0.01555669
## [15] {rolls/buns, whole milk} => {yogurt} 0.01555669
## [16] {tropical fruit, yogurt} => {whole milk} 0.01514997
## [17] {tropical fruit, whole milk} => {yogurt} 0.01514997
## [18] {whole milk, yogurt} => {tropical fruit} 0.01514997
## [19] {other vegetables, whipped/sour cream} => {whole milk} 0.01464159
## [20] {whipped/sour cream, whole milk} => {other vegetables} 0.01464159
## [21] {other vegetables, whole milk} => {whipped/sour cream} 0.01464159
## [22] {root vegetables, yogurt} => {whole milk} 0.01453991
## [23] {root vegetables, whole milk} => {yogurt} 0.01453991
## [24] {whole milk, yogurt} => {root vegetables} 0.01453991
## [25] {other vegetables, soda} => {whole milk} 0.01392984
## [26] {soda, whole milk} => {other vegetables} 0.01392984
## [27] {other vegetables, whole milk} => {soda} 0.01392984
## [28] {other vegetables, pip fruit} => {whole milk} 0.01352313
## [29] {pip fruit, whole milk} => {other vegetables} 0.01352313
## [30] {other vegetables, whole milk} => {pip fruit} 0.01352313
## [31] {citrus fruit, other vegetables} => {whole milk} 0.01301474
## [32] {citrus fruit, whole milk} => {other vegetables} 0.01301474
## [33] {other vegetables, whole milk} => {citrus fruit} 0.01301474
## [34] {root vegetables, yogurt} => {other vegetables} 0.01291307
## [35] {other vegetables, root vegetables} => {yogurt} 0.01291307
## [36] {other vegetables, yogurt} => {root vegetables} 0.01291307
## [37] {rolls/buns, root vegetables} => {whole milk} 0.01270971
## [38] {root vegetables, whole milk} => {rolls/buns} 0.01270971
## [39] {rolls/buns, whole milk} => {root vegetables} 0.01270971
## [40] {domestic eggs, other vegetables} => {whole milk} 0.01230300
## confidence coverage lift count
## [1] 0.4892704 0.04738180 1.914833 228
## [2] 0.4740125 0.04890696 2.449770 228
## [3] 0.3097826 0.07483477 2.842082 228
## [4] 0.5128806 0.04341637 2.007235 219
## [5] 0.3974592 0.05602440 2.054131 219
## [6] 0.2975543 0.07483477 2.132979 219
## [7] 0.4200477 0.04260295 1.643919 176
## [8] 0.3159785 0.05663447 1.633026 176
## [9] 0.2391304 0.07483477 1.300082 176
## [10] 0.4759207 0.03589222 1.862587 168
## [11] 0.4038462 0.04229792 2.087140 168
## [12] 0.2282609 0.07483477 2.175335 168
## [13] 0.4526627 0.03436706 1.771563 153
## [14] 0.2776770 0.05602440 1.509648 153
## [15] 0.2746858 0.05663447 1.969049 153
## [16] 0.5173611 0.02928317 2.024770 149
## [17] 0.3581731 0.04229792 2.567516 149
## [18] 0.2704174 0.05602440 2.577089 149
## [19] 0.5070423 0.02887646 1.984385 144
## [20] 0.4542587 0.03223183 2.347679 144
## [21] 0.1956522 0.07483477 2.729417 144
## [22] 0.5629921 0.02582613 2.203354 143
## [23] 0.2972973 0.04890696 2.131136 143
## [24] 0.2595281 0.05602440 2.381025 143
## [25] 0.4254658 0.03274021 1.665124 137
## [26] 0.3477157 0.04006101 1.797049 137
## [27] 0.1861413 0.07483477 1.067463 137
## [28] 0.5175097 0.02613116 2.025351 133
## [29] 0.4493243 0.03009659 2.322178 133
## [30] 0.1807065 0.07483477 2.388775 133
## [31] 0.4507042 0.02887646 1.763898 128
## [32] 0.4266667 0.03050330 2.205080 128
## [33] 0.1739130 0.07483477 2.101271 128
## [34] 0.5000000 0.02582613 2.584078 127
## [35] 0.2725322 0.04738180 1.953611 127
## [36] 0.2974239 0.04341637 2.728698 127
## [37] 0.5230126 0.02430097 2.046888 125
## [38] 0.2598753 0.04890696 1.412865 125
## [39] 0.2244165 0.05663447 2.058896 125
## [40] 0.5525114 0.02226741 2.162336 121
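Sorting by lift. Here are the 10 rules with the highest lift, i.e., combinations that occur together far more often than the items’ individual popularity would predict: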
# The ten rules with the highest lift.
rules |>
  sort(by = "lift", decreasing = TRUE) |>
  head(n = 10) |>
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {bottled beer,
## red/blush wine} => {liquor} 0.001931876 0.3958333 0.004880529 35.71579 19
## [2] {hamburger meat,
## soda} => {Instant food products} 0.001220132 0.2105263 0.005795628 26.20919 12
## [3] {ham,
## white bread} => {processed cheese} 0.001931876 0.3800000 0.005083884 22.92822 19
## [4] {other vegetables,
## root vegetables,
## whole milk,
## yogurt} => {rice} 0.001321810 0.1688312 0.007829181 22.13939 13
## [5] {bottled beer,
## liquor} => {red/blush wine} 0.001931876 0.4130435 0.004677173 21.49356 19
## [6] {Instant food products,
## soda} => {hamburger meat} 0.001220132 0.6315789 0.001931876 18.99565 12
## [7] {curd,
## sugar} => {flour} 0.001118454 0.3235294 0.003457041 18.60767 11
## [8] {salty snack,
## soda} => {popcorn} 0.001220132 0.1304348 0.009354347 18.06797 12
## [9] {baking powder,
## sugar} => {flour} 0.001016777 0.3125000 0.003253686 17.97332 10
## [10] {processed cheese,
## white bread} => {ham} 0.001931876 0.4634146 0.004168785 17.80345 19
We’re seeing ham and cheese sandwiches; alcohol products; hamburger meat, soda, and instant food; snack products (salty snack, soda, popcorn); baking products. Support is low-ish for all of these, at roughly .1% to .2% of transactions, and confidence varies from .13 to .63. For the ham sandwich combination (confidence of .38 to .46), the rule holds in a little under half of the transactions with those items.
Plotting by lift:
# Graph view of the ten highest-lift rules.
rules |>
  sort(by = "lift", decreasing = TRUE) |>
  head(n = 10) |>
  plot(method = "graph")
Cluster analysis with kmeans:
# Turn the transactions into a basket-by-item matrix; multiplying by 1 makes
# the entries numeric so they can be averaged per cluster.
basket_matrix <- as(groceries, "matrix") * 1
# Starting point: 4 clusters, with a fixed seed ahead of kmeans().
set.seed(1122)
clusters <- kmeans(basket_matrix, centers = 4)
# Record each basket's cluster label alongside the raw rows.
items$cluster <- clusters$cluster
# Cluster sizes.
table(clusters$cluster)
##
## 1 2 3 4
## 5903 2319 520 1093
# they are enormous
# Most frequent items in cluster 1: share of the cluster's baskets containing
# each item, top 10. drop = FALSE keeps a one-basket cluster as a matrix so
# colMeans() does not fail on a dropped vector.
colMeans(basket_matrix[clusters$cluster == 1, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## soda rolls/buns canned beer bottled water yogurt
## 0.18228020 0.16347620 0.10113502 0.09740810 0.09571404
## shopping bags bottled beer sausage pastry tropical fruit
## 0.09232594 0.08250042 0.07403015 0.06793156 0.06776215
There are a lot of baskets in each cluster. Let’s refine this a little by using more, smaller clusters.
# Second pass: 8 clusters, with the same fixed seed ahead of kmeans().
set.seed(1122)
clusters <- kmeans(x = basket_matrix, centers = 8)
# Overwrite the per-basket cluster label with the 8-cluster assignment.
items[["cluster"]] <- clusters[["cluster"]]
# Cluster sizes.
table(clusters[["cluster"]])
##
## 1 2 3 4 5 6 7 8
## 3719 1190 431 1076 746 877 855 941
# they are still pretty big
# Most frequent items in cluster 1 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 1, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## canned beer bottled beer shopping bags bottled water pastry
## 0.12207583 0.08819575 0.08362463 0.08281796 0.06184458
## newspapers tropical fruit sausage coffee root vegetables
## 0.05888680 0.05619790 0.05404679 0.05216456 0.05189567
# Most frequent items in cluster 2 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 2, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## whole milk rolls/buns root vegetables tropical fruit pastry
## 1.00000000 0.20924370 0.12521008 0.11512605 0.11008403
## bottled water newspapers sausage domestic eggs brown bread
## 0.10924370 0.10924370 0.08907563 0.08487395 0.08067227
# Most frequent items in cluster 3 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 3, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## napkins whole milk yogurt rolls/buns
## 1.0000000 0.3016241 0.2320186 0.2088167
## soda tropical fruit other vegetables root vegetables
## 0.1972158 0.1856148 0.1809745 0.1740139
## bottled water shopping bags
## 0.1531323 0.1415313
# Most frequent items in cluster 4 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 4, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## other vegetables root vegetables rolls/buns yogurt
## 1.0000000 0.1988848 0.1960967 0.1728625
## soda tropical fruit sausage citrus fruit
## 0.1486989 0.1486989 0.1356877 0.1338290
## shopping bags bottled water
## 0.1328996 0.1171004
Trying 20 clusters
# Third pass: 20 clusters.
# NOTE(review): unlike the 4- and 8-cluster runs, no set.seed() call precedes
# this kmeans(), so the assignment depends on the RNG state left by the code
# that ran before it; reseed here if this step is ever run on its own.
clusters <- kmeans(basket_matrix, centers = 20)
# Overwrite the per-basket cluster label with the 20-cluster assignment.
items$cluster <- clusters$cluster
#cluster sizes
table(clusters$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 286 2535 279 205 493 387 169 395 580 315 705 655 95 358 466 224
## 17 18 19 20
## 410 517 378 383
# they are still pretty big
# Most frequent items in cluster 1 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 1, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## rolls/buns whole milk tropical fruit sausage bottled water
## 1.0000000 1.0000000 0.1573427 0.1538462 0.1503497
## root vegetables pastry newspapers margarine soda
## 0.1468531 0.1363636 0.1153846 0.1118881 0.1083916
# Most frequent items in cluster 2 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 2, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## soda bottled beer bottled water
## 0.18343195 0.11360947 0.09743590
## newspapers fruit/vegetable juice citrus fruit
## 0.06429980 0.04654832 0.03984221
## chocolate pip fruit specialty chocolate
## 0.03944773 0.03826430 0.03668639
## misc. beverages
## 0.03629191
# Most frequent items in cluster 3 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 3, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## brown bread whole milk pastry
## 1.0000000 0.5519713 0.1863799
## other vegetables newspapers soda
## 0.1433692 0.1397849 0.1397849
## tropical fruit yogurt root vegetables
## 0.1326165 0.1182796 0.1111111
## fruit/vegetable juice
## 0.1003584
# Most frequent items in cluster 4 (share of the cluster's baskets, top 10).
# drop = FALSE keeps a one-basket cluster as a matrix so colMeans() works.
colMeans(basket_matrix[clusters$cluster == 4, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## UHT-milk soda bottled water other vegetables
## 1.0000000 0.2926829 0.2682927 0.1804878
## yogurt brown bread root vegetables newspapers
## 0.1658537 0.1414634 0.1414634 0.1365854
## shopping bags citrus fruit
## 0.1317073 0.1219512