library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(arules)
## Warning: package 'arules' was built under R version 4.5.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.5.3

Exploratory analysis

# Read the raw basket file as a plain data frame with no header row. Empty
# cells and the listed placeholder strings are read as NA so that they are
# not counted as purchased items later on.
items <- read.csv(
  "https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/GroceryDataSet.csv",
  header = FALSE,
  sep = ",",
  # Spelled out in full: the abbreviated `na =` only worked through partial
  # argument matching against read.table()'s `na.strings`.
  na.strings = c("", "N/A", "missing", "—")
)

How many items are in an average basket? The largest basket has 32 items (see the maximum in the summary below).

# Basket size per transaction: total item slots minus the empty (NA) ones.
item_counts <- ncol(items) - rowSums(is.na(items))
summary(item_counts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000

The median number of items is 3 and the average is 4.4. The third quartile is 6, so three-quarters of baskets contain 6 or fewer items.

How many of each basket size?

# Wrap the basket sizes in a one-column data frame so they can be tallied.
item_counts <- data.frame(item_counts = item_counts)

# Number of baskets observed at each basket size.
count(item_counts, item_counts)
##    item_counts    n
## 1            1 2159
## 2            2 1643
## 3            3 1299
## 4            4 1005
## 5            5  855
## 6            6  645
## 7            7  545
## 8            8  438
## 9            9  350
## 10          10  246
## 11          11  182
## 12          12  117
## 13          13   78
## 14          14   77
## 15          15   55
## 16          16   46
## 17          17   29
## 18          18   14
## 19          19   14
## 20          20    9
## 21          21   11
## 22          22    4
## 23          23    6
## 24          24    1
## 25          26    1
## 26          27    1
## 27          28    1
## 28          29    3
## 29          32    1

About 7,000 of the 9,835 baskets (roughly 71%) contain 5 or fewer items.

What are the most popular items?

# Flatten every cell of the basket table into one vector, tally how often
# each item name occurs, and order the tallies from most to least frequent.
item_popularity <- items |>
  unlist() |>
  table() |>
  sort(decreasing = TRUE)

# The 20 most frequently purchased items.
head(item_popularity, n = 20)
## 
##            whole milk      other vegetables            rolls/buns 
##                  2513                  1903                  1809 
##                  soda                yogurt         bottled water 
##                  1715                  1372                  1087 
##       root vegetables        tropical fruit         shopping bags 
##                  1072                  1032                   969 
##               sausage                pastry          citrus fruit 
##                   924                   875                   814 
##          bottled beer            newspapers           canned beer 
##                   792                   785                   764 
##             pip fruit fruit/vegetable juice    whipped/sour cream 
##                   744                   711                   705 
##           brown bread         domestic eggs 
##                   638                   624

Whole milk, other vegetables, rolls/buns, soda, and yogurt are among the most popular items.

Create a transactions object instead of a data frame. Remove duplicates (e.g., two cereals purchased in the same transaction)

# Re-read the same file as an arules transactions object in "basket" format
# with comma-separated items. rm.duplicates = TRUE removes items that are
# listed more than once within the same transaction.
groceries <- read.transactions(
  file = "https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/GroceryDataSet.csv",
  format = "basket",
  sep = ",",
  rm.duplicates = TRUE
)

Specifying the mining parameters. Support: the itemset must appear in at least 0.1% of transactions. Confidence: the rule must hold in at least 5% of the transactions that contain its left-hand side. Min length: a rule must contain at least 2 items.

# Mine association rules with apriori(): minimum support 0.001, minimum
# confidence 0.05, and at least 2 items per rule (lhs + rhs combined).
rules <- apriori(groceries, parameter = list(supp = 0.001, conf = 0.05, minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.05    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [37937 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].
# Overview of the mined rules: rule-length distribution and quality measures.
rules |>
  summary()
## set of 37937 rules
## 
## rule length distribution (lhs + rhs):sizes
##     2     3     4     5     6 
##  3814 19639 12544  1880    60 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   3.334   4.000   6.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001017   Min.   :0.0500   Min.   :0.001017   Min.   : 0.4193  
##  1st Qu.:0.001118   1st Qu.:0.1358   1st Qu.:0.003864   1st Qu.: 1.8719  
##  Median :0.001423   Median :0.2273   Median :0.006812   Median : 2.4718  
##  Mean   :0.002065   Mean   :0.2797   Mean   :0.012163   Mean   : 2.7437  
##  3rd Qu.:0.002034   3rd Qu.:0.3846   3rd Qu.:0.013218   3rd Qu.: 3.2970  
##  Max.   :0.074835   Max.   :1.0000   Max.   :0.255516   Max.   :35.7158  
##      count       
##  Min.   : 10.00  
##  1st Qu.: 11.00  
##  Median : 14.00  
##  Mean   : 20.31  
##  3rd Qu.: 20.00  
##  Max.   :736.00  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.001       0.05
##                                                                                call
##  apriori(data = groceries, parameter = list(supp = 0.001, conf = 0.05, minlen = 2))
# Explore the results: the 10 rules with the highest confidence.
rules |>
  sort(by = "confidence", decreasing = TRUE) |>
  head(n = 10) |>
  inspect()
##      lhs                     rhs                    support confidence    coverage     lift count
## [1]  {rice,                                                                                      
##       sugar}              => {whole milk}       0.001220132          1 0.001220132 3.913649    12
## [2]  {canned fish,                                                                               
##       hygiene articles}   => {whole milk}       0.001118454          1 0.001118454 3.913649    11
## [3]  {butter,                                                                                    
##       rice,                                                                                      
##       root vegetables}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [4]  {flour,                                                                                     
##       root vegetables,                                                                           
##       whipped/sour cream} => {whole milk}       0.001728521          1 0.001728521 3.913649    17
## [5]  {butter,                                                                                    
##       domestic eggs,                                                                             
##       soft cheese}        => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [6]  {citrus fruit,                                                                              
##       root vegetables,                                                                           
##       soft cheese}        => {other vegetables} 0.001016777          1 0.001016777 5.168156    10
## [7]  {butter,                                                                                    
##       hygiene articles,                                                                          
##       pip fruit}          => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [8]  {hygiene articles,                                                                          
##       root vegetables,                                                                           
##       whipped/sour cream} => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [9]  {hygiene articles,                                                                          
##       pip fruit,                                                                                 
##       root vegetables}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [10] {cream cheese,                                                                              
##       domestic eggs,                                                                             
##       sugar}              => {whole milk}       0.001118454          1 0.001118454 3.913649    11

This gives us a set of rules with very high confidence (a confidence of 1 means every basket containing the left-hand items also contained the right-hand item), but low support: each combination appears in only ~0.1% of shopping baskets. And, given that these sets are 3+ items, the count column shows they were purchased together only 10 to 17 times.

Let’s try with sorting by support. Maybe there are more frequent combinations that don’t happen in 100% of cases.

# The 20 rules with the highest support.
rules |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 20) |>
  inspect()
##      lhs                   rhs                support    confidence coverage 
## [1]  {other vegetables} => {whole milk}       0.07483477 0.3867578  0.1934926
## [2]  {whole milk}       => {other vegetables} 0.07483477 0.2928770  0.2555160
## [3]  {rolls/buns}       => {whole milk}       0.05663447 0.3079049  0.1839349
## [4]  {whole milk}       => {rolls/buns}       0.05663447 0.2216474  0.2555160
## [5]  {yogurt}           => {whole milk}       0.05602440 0.4016035  0.1395018
## [6]  {whole milk}       => {yogurt}           0.05602440 0.2192598  0.2555160
## [7]  {root vegetables}  => {whole milk}       0.04890696 0.4486940  0.1089985
## [8]  {whole milk}       => {root vegetables}  0.04890696 0.1914047  0.2555160
## [9]  {root vegetables}  => {other vegetables} 0.04738180 0.4347015  0.1089985
## [10] {other vegetables} => {root vegetables}  0.04738180 0.2448765  0.1934926
## [11] {yogurt}           => {other vegetables} 0.04341637 0.3112245  0.1395018
## [12] {other vegetables} => {yogurt}           0.04341637 0.2243826  0.1934926
## [13] {rolls/buns}       => {other vegetables} 0.04260295 0.2316197  0.1839349
## [14] {other vegetables} => {rolls/buns}       0.04260295 0.2201787  0.1934926
## [15] {tropical fruit}   => {whole milk}       0.04229792 0.4031008  0.1049314
## [16] {whole milk}       => {tropical fruit}   0.04229792 0.1655392  0.2555160
## [17] {soda}             => {whole milk}       0.04006101 0.2297376  0.1743772
## [18] {whole milk}       => {soda}             0.04006101 0.1567847  0.2555160
## [19] {soda}             => {rolls/buns}       0.03833249 0.2198251  0.1743772
## [20] {rolls/buns}       => {soda}             0.03833249 0.2084024  0.1839349
##      lift      count
## [1]  1.5136341 736  
## [2]  1.5136341 736  
## [3]  1.2050318 557  
## [4]  1.2050318 557  
## [5]  1.5717351 551  
## [6]  1.5717351 551  
## [7]  1.7560310 481  
## [8]  1.7560310 481  
## [9]  2.2466049 466  
## [10] 2.2466049 466  
## [11] 1.6084566 427  
## [12] 1.6084566 427  
## [13] 1.1970465 419  
## [14] 1.1970465 419  
## [15] 1.5775950 416  
## [16] 1.5775950 416  
## [17] 0.8991124 394  
## [18] 0.8991124 394  
## [19] 1.1951242 377  
## [20] 1.1951242 377

These are a lot of combinations of the popular items we saw above. Let’s dig a little deeper.

# Widen the view to the 50 rules with the highest support.
rules |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 50) |>
  inspect()
##      lhs                     rhs                  support    confidence
## [1]  {other vegetables}   => {whole milk}         0.07483477 0.3867578 
## [2]  {whole milk}         => {other vegetables}   0.07483477 0.2928770 
## [3]  {rolls/buns}         => {whole milk}         0.05663447 0.3079049 
## [4]  {whole milk}         => {rolls/buns}         0.05663447 0.2216474 
## [5]  {yogurt}             => {whole milk}         0.05602440 0.4016035 
## [6]  {whole milk}         => {yogurt}             0.05602440 0.2192598 
## [7]  {root vegetables}    => {whole milk}         0.04890696 0.4486940 
## [8]  {whole milk}         => {root vegetables}    0.04890696 0.1914047 
## [9]  {root vegetables}    => {other vegetables}   0.04738180 0.4347015 
## [10] {other vegetables}   => {root vegetables}    0.04738180 0.2448765 
## [11] {yogurt}             => {other vegetables}   0.04341637 0.3112245 
## [12] {other vegetables}   => {yogurt}             0.04341637 0.2243826 
## [13] {rolls/buns}         => {other vegetables}   0.04260295 0.2316197 
## [14] {other vegetables}   => {rolls/buns}         0.04260295 0.2201787 
## [15] {tropical fruit}     => {whole milk}         0.04229792 0.4031008 
## [16] {whole milk}         => {tropical fruit}     0.04229792 0.1655392 
## [17] {soda}               => {whole milk}         0.04006101 0.2297376 
## [18] {whole milk}         => {soda}               0.04006101 0.1567847 
## [19] {soda}               => {rolls/buns}         0.03833249 0.2198251 
## [20] {rolls/buns}         => {soda}               0.03833249 0.2084024 
## [21] {tropical fruit}     => {other vegetables}   0.03589222 0.3420543 
## [22] {other vegetables}   => {tropical fruit}     0.03589222 0.1854966 
## [23] {bottled water}      => {whole milk}         0.03436706 0.3109476 
## [24] {whole milk}         => {bottled water}      0.03436706 0.1345006 
## [25] {yogurt}             => {rolls/buns}         0.03436706 0.2463557 
## [26] {rolls/buns}         => {yogurt}             0.03436706 0.1868436 
## [27] {pastry}             => {whole milk}         0.03324860 0.3737143 
## [28] {whole milk}         => {pastry}             0.03324860 0.1301234 
## [29] {soda}               => {other vegetables}   0.03274021 0.1877551 
## [30] {other vegetables}   => {soda}               0.03274021 0.1692065 
## [31] {whipped/sour cream} => {whole milk}         0.03223183 0.4496454 
## [32] {whole milk}         => {whipped/sour cream} 0.03223183 0.1261441 
## [33] {sausage}            => {rolls/buns}         0.03060498 0.3257576 
## [34] {rolls/buns}         => {sausage}            0.03060498 0.1663903 
## [35] {citrus fruit}       => {whole milk}         0.03050330 0.3685504 
## [36] {whole milk}         => {citrus fruit}       0.03050330 0.1193792 
## [37] {pip fruit}          => {whole milk}         0.03009659 0.3978495 
## [38] {whole milk}         => {pip fruit}          0.03009659 0.1177875 
## [39] {domestic eggs}      => {whole milk}         0.02999492 0.4727564 
## [40] {whole milk}         => {domestic eggs}      0.02999492 0.1173896 
## [41] {sausage}            => {whole milk}         0.02989324 0.3181818 
## [42] {whole milk}         => {sausage}            0.02989324 0.1169916 
## [43] {tropical fruit}     => {yogurt}             0.02928317 0.2790698 
## [44] {yogurt}             => {tropical fruit}     0.02928317 0.2099125 
## [45] {bottled water}      => {soda}               0.02897814 0.2621895 
## [46] {soda}               => {bottled water}      0.02897814 0.1661808 
## [47] {whipped/sour cream} => {other vegetables}   0.02887646 0.4028369 
## [48] {other vegetables}   => {whipped/sour cream} 0.02887646 0.1492380 
## [49] {citrus fruit}       => {other vegetables}   0.02887646 0.3488943 
## [50] {other vegetables}   => {citrus fruit}       0.02887646 0.1492380 
##      coverage   lift      count
## [1]  0.19349263 1.5136341 736  
## [2]  0.25551601 1.5136341 736  
## [3]  0.18393493 1.2050318 557  
## [4]  0.25551601 1.2050318 557  
## [5]  0.13950178 1.5717351 551  
## [6]  0.25551601 1.5717351 551  
## [7]  0.10899847 1.7560310 481  
## [8]  0.25551601 1.7560310 481  
## [9]  0.10899847 2.2466049 466  
## [10] 0.19349263 2.2466049 466  
## [11] 0.13950178 1.6084566 427  
## [12] 0.19349263 1.6084566 427  
## [13] 0.18393493 1.1970465 419  
## [14] 0.19349263 1.1970465 419  
## [15] 0.10493137 1.5775950 416  
## [16] 0.25551601 1.5775950 416  
## [17] 0.17437722 0.8991124 394  
## [18] 0.25551601 0.8991124 394  
## [19] 0.17437722 1.1951242 377  
## [20] 0.18393493 1.1951242 377  
## [21] 0.10493137 1.7677896 353  
## [22] 0.19349263 1.7677896 353  
## [23] 0.11052364 1.2169396 338  
## [24] 0.25551601 1.2169396 338  
## [25] 0.13950178 1.3393633 338  
## [26] 0.18393493 1.3393633 338  
## [27] 0.08896797 1.4625865 327  
## [28] 0.25551601 1.4625865 327  
## [29] 0.17437722 0.9703476 322  
## [30] 0.19349263 0.9703476 322  
## [31] 0.07168277 1.7597542 317  
## [32] 0.25551601 1.7597542 317  
## [33] 0.09395018 1.7710480 301  
## [34] 0.18393493 1.7710480 301  
## [35] 0.08276563 1.4423768 300  
## [36] 0.25551601 1.4423768 300  
## [37] 0.07564820 1.5570432 296  
## [38] 0.25551601 1.5570432 296  
## [39] 0.06344687 1.8502027 295  
## [40] 0.25551601 1.8502027 295  
## [41] 0.09395018 1.2452520 294  
## [42] 0.25551601 1.2452520 294  
## [43] 0.10493137 2.0004746 288  
## [44] 0.13950178 2.0004746 288  
## [45] 0.11052364 1.5035766 285  
## [46] 0.17437722 1.5035766 285  
## [47] 0.07168277 2.0819237 284  
## [48] 0.19349263 2.0819237 284  
## [49] 0.08276563 1.8031403 284  
## [50] 0.19349263 1.8031403 284

33/34 - sausage and rolls/buns. Rules 43-48 are sort of interesting. We’re seeing the fruit/yogurt combination mentioned in the Springer article. Other vegetables + whipped/sour cream (hopefully it’s sour cream) could be a vegetable dip (crudite platters are alive and well!). 49 and 50 just show fruit and vegetables together, which is expected. These are all around 3% of transactions each, which is more significant than the above list. Confidence is lower because these are pairs of popular items, frequently purchased along with others.

Looking at trios using the same parameters and sorting by support.

# Mine rules with minimum support 0.001 and minimum confidence 0.05, keeping
# only rules with at least 3 items (lhs + rhs combined).
rules_2 <- apriori(groceries, parameter = list(supp = 0.001, conf = 0.05, minlen = 3))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.05    0.1    1 none FALSE            TRUE       5   0.001      3
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [34123 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Overview of the 3+ item rules: length distribution and quality measures.
rules_2 |>
  summary()
## set of 34123 rules
## 
## rule length distribution (lhs + rhs):sizes
##     3     4     5     6 
## 19639 12544  1880    60 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   3.000   3.000   3.483   4.000   6.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001017   Min.   :0.0500   Min.   :0.001017   Min.   : 0.4193  
##  1st Qu.:0.001118   1st Qu.:0.1512   1st Qu.:0.003559   1st Qu.: 1.9788  
##  Median :0.001322   Median :0.2473   Median :0.006202   Median : 2.5841  
##  Mean   :0.001745   Mean   :0.2960   Mean   :0.008789   Mean   : 2.8463  
##  3rd Qu.:0.001830   3rd Qu.:0.4062   3rd Qu.:0.010778   3rd Qu.: 3.3975  
##  Max.   :0.023183   Max.   :1.0000   Max.   :0.074835   Max.   :35.7158  
##      count       
##  Min.   : 10.00  
##  1st Qu.: 11.00  
##  Median : 13.00  
##  Mean   : 17.16  
##  3rd Qu.: 18.00  
##  Max.   :228.00  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.001       0.05
##                                                                                call
##  apriori(data = groceries, parameter = list(supp = 0.001, conf = 0.05, minlen = 3))
# The 15 rules of length 3 or more with the highest support.
rules_2 |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 15) |>
  inspect()
##      lhs                                    rhs                support   
## [1]  {other vegetables, root vegetables} => {whole milk}       0.02318251
## [2]  {root vegetables, whole milk}       => {other vegetables} 0.02318251
## [3]  {other vegetables, whole milk}      => {root vegetables}  0.02318251
## [4]  {other vegetables, yogurt}          => {whole milk}       0.02226741
## [5]  {whole milk, yogurt}                => {other vegetables} 0.02226741
## [6]  {other vegetables, whole milk}      => {yogurt}           0.02226741
## [7]  {other vegetables, rolls/buns}      => {whole milk}       0.01789527
## [8]  {rolls/buns, whole milk}            => {other vegetables} 0.01789527
## [9]  {other vegetables, whole milk}      => {rolls/buns}       0.01789527
## [10] {other vegetables, tropical fruit}  => {whole milk}       0.01708185
## [11] {tropical fruit, whole milk}        => {other vegetables} 0.01708185
## [12] {other vegetables, whole milk}      => {tropical fruit}   0.01708185
## [13] {rolls/buns, yogurt}                => {whole milk}       0.01555669
## [14] {whole milk, yogurt}                => {rolls/buns}       0.01555669
## [15] {rolls/buns, whole milk}            => {yogurt}           0.01555669
##      confidence coverage   lift     count
## [1]  0.4892704  0.04738180 1.914833 228  
## [2]  0.4740125  0.04890696 2.449770 228  
## [3]  0.3097826  0.07483477 2.842082 228  
## [4]  0.5128806  0.04341637 2.007235 219  
## [5]  0.3974592  0.05602440 2.054131 219  
## [6]  0.2975543  0.07483477 2.132979 219  
## [7]  0.4200477  0.04260295 1.643919 176  
## [8]  0.3159785  0.05663447 1.633026 176  
## [9]  0.2391304  0.07483477 1.300082 176  
## [10] 0.4759207  0.03589222 1.862587 168  
## [11] 0.4038462  0.04229792 2.087140 168  
## [12] 0.2282609  0.07483477 2.175335 168  
## [13] 0.4526627  0.03436706 1.771563 153  
## [14] 0.2776770  0.05602440 1.509648 153  
## [15] 0.2746858  0.05663447 1.969049 153

We still have 34,123 rules, even though we set the minimum rule length to three items. Tons of common items here.

# Widen the view to the 40 highest-support rules of length 3 or more.
rules_2 |>
  sort(by = "support", decreasing = TRUE) |>
  head(n = 40) |>
  inspect()
##      lhs                                       rhs                  support   
## [1]  {other vegetables, root vegetables}    => {whole milk}         0.02318251
## [2]  {root vegetables, whole milk}          => {other vegetables}   0.02318251
## [3]  {other vegetables, whole milk}         => {root vegetables}    0.02318251
## [4]  {other vegetables, yogurt}             => {whole milk}         0.02226741
## [5]  {whole milk, yogurt}                   => {other vegetables}   0.02226741
## [6]  {other vegetables, whole milk}         => {yogurt}             0.02226741
## [7]  {other vegetables, rolls/buns}         => {whole milk}         0.01789527
## [8]  {rolls/buns, whole milk}               => {other vegetables}   0.01789527
## [9]  {other vegetables, whole milk}         => {rolls/buns}         0.01789527
## [10] {other vegetables, tropical fruit}     => {whole milk}         0.01708185
## [11] {tropical fruit, whole milk}           => {other vegetables}   0.01708185
## [12] {other vegetables, whole milk}         => {tropical fruit}     0.01708185
## [13] {rolls/buns, yogurt}                   => {whole milk}         0.01555669
## [14] {whole milk, yogurt}                   => {rolls/buns}         0.01555669
## [15] {rolls/buns, whole milk}               => {yogurt}             0.01555669
## [16] {tropical fruit, yogurt}               => {whole milk}         0.01514997
## [17] {tropical fruit, whole milk}           => {yogurt}             0.01514997
## [18] {whole milk, yogurt}                   => {tropical fruit}     0.01514997
## [19] {other vegetables, whipped/sour cream} => {whole milk}         0.01464159
## [20] {whipped/sour cream, whole milk}       => {other vegetables}   0.01464159
## [21] {other vegetables, whole milk}         => {whipped/sour cream} 0.01464159
## [22] {root vegetables, yogurt}              => {whole milk}         0.01453991
## [23] {root vegetables, whole milk}          => {yogurt}             0.01453991
## [24] {whole milk, yogurt}                   => {root vegetables}    0.01453991
## [25] {other vegetables, soda}               => {whole milk}         0.01392984
## [26] {soda, whole milk}                     => {other vegetables}   0.01392984
## [27] {other vegetables, whole milk}         => {soda}               0.01392984
## [28] {other vegetables, pip fruit}          => {whole milk}         0.01352313
## [29] {pip fruit, whole milk}                => {other vegetables}   0.01352313
## [30] {other vegetables, whole milk}         => {pip fruit}          0.01352313
## [31] {citrus fruit, other vegetables}       => {whole milk}         0.01301474
## [32] {citrus fruit, whole milk}             => {other vegetables}   0.01301474
## [33] {other vegetables, whole milk}         => {citrus fruit}       0.01301474
## [34] {root vegetables, yogurt}              => {other vegetables}   0.01291307
## [35] {other vegetables, root vegetables}    => {yogurt}             0.01291307
## [36] {other vegetables, yogurt}             => {root vegetables}    0.01291307
## [37] {rolls/buns, root vegetables}          => {whole milk}         0.01270971
## [38] {root vegetables, whole milk}          => {rolls/buns}         0.01270971
## [39] {rolls/buns, whole milk}               => {root vegetables}    0.01270971
## [40] {domestic eggs, other vegetables}      => {whole milk}         0.01230300
##      confidence coverage   lift     count
## [1]  0.4892704  0.04738180 1.914833 228  
## [2]  0.4740125  0.04890696 2.449770 228  
## [3]  0.3097826  0.07483477 2.842082 228  
## [4]  0.5128806  0.04341637 2.007235 219  
## [5]  0.3974592  0.05602440 2.054131 219  
## [6]  0.2975543  0.07483477 2.132979 219  
## [7]  0.4200477  0.04260295 1.643919 176  
## [8]  0.3159785  0.05663447 1.633026 176  
## [9]  0.2391304  0.07483477 1.300082 176  
## [10] 0.4759207  0.03589222 1.862587 168  
## [11] 0.4038462  0.04229792 2.087140 168  
## [12] 0.2282609  0.07483477 2.175335 168  
## [13] 0.4526627  0.03436706 1.771563 153  
## [14] 0.2776770  0.05602440 1.509648 153  
## [15] 0.2746858  0.05663447 1.969049 153  
## [16] 0.5173611  0.02928317 2.024770 149  
## [17] 0.3581731  0.04229792 2.567516 149  
## [18] 0.2704174  0.05602440 2.577089 149  
## [19] 0.5070423  0.02887646 1.984385 144  
## [20] 0.4542587  0.03223183 2.347679 144  
## [21] 0.1956522  0.07483477 2.729417 144  
## [22] 0.5629921  0.02582613 2.203354 143  
## [23] 0.2972973  0.04890696 2.131136 143  
## [24] 0.2595281  0.05602440 2.381025 143  
## [25] 0.4254658  0.03274021 1.665124 137  
## [26] 0.3477157  0.04006101 1.797049 137  
## [27] 0.1861413  0.07483477 1.067463 137  
## [28] 0.5175097  0.02613116 2.025351 133  
## [29] 0.4493243  0.03009659 2.322178 133  
## [30] 0.1807065  0.07483477 2.388775 133  
## [31] 0.4507042  0.02887646 1.763898 128  
## [32] 0.4266667  0.03050330 2.205080 128  
## [33] 0.1739130  0.07483477 2.101271 128  
## [34] 0.5000000  0.02582613 2.584078 127  
## [35] 0.2725322  0.04738180 1.953611 127  
## [36] 0.2974239  0.04341637 2.728698 127  
## [37] 0.5230126  0.02430097 2.046888 125  
## [38] 0.2598753  0.04890696 1.412865 125  
## [39] 0.2244165  0.05663447 2.058896 125  
## [40] 0.5525114  0.02226741 2.162336 121

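Sorting by lift. Here are the 10 rules with the highest lift, i.e., combinations that occur together far more often than the items' individual popularity would predict: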

# The 10 rules with the highest lift.
rules |>
  sort(by = "lift", decreasing = TRUE) |>
  head(n = 10) |>
  inspect()
##      lhs                         rhs                         support confidence    coverage     lift count
## [1]  {bottled beer,                                                                                       
##       red/blush wine}         => {liquor}                0.001931876  0.3958333 0.004880529 35.71579    19
## [2]  {hamburger meat,                                                                                     
##       soda}                   => {Instant food products} 0.001220132  0.2105263 0.005795628 26.20919    12
## [3]  {ham,                                                                                                
##       white bread}            => {processed cheese}      0.001931876  0.3800000 0.005083884 22.92822    19
## [4]  {other vegetables,                                                                                   
##       root vegetables,                                                                                    
##       whole milk,                                                                                         
##       yogurt}                 => {rice}                  0.001321810  0.1688312 0.007829181 22.13939    13
## [5]  {bottled beer,                                                                                       
##       liquor}                 => {red/blush wine}        0.001931876  0.4130435 0.004677173 21.49356    19
## [6]  {Instant food products,                                                                              
##       soda}                   => {hamburger meat}        0.001220132  0.6315789 0.001931876 18.99565    12
## [7]  {curd,                                                                                               
##       sugar}                  => {flour}                 0.001118454  0.3235294 0.003457041 18.60767    11
## [8]  {salty snack,                                                                                        
##       soda}                   => {popcorn}               0.001220132  0.1304348 0.009354347 18.06797    12
## [9]  {baking powder,                                                                                      
##       sugar}                  => {flour}                 0.001016777  0.3125000 0.003253686 17.97332    10
## [10] {processed cheese,                                                                                   
##       white bread}            => {ham}                   0.001931876  0.4634146 0.004168785 17.80345    19

We’re seeing ham and cheese sandwiches; alcohol products; hamburger meat, soda, and instant food; snack products (popcorn, soda, salty snack); baking products. Support is low-ish for all of these, at roughly 0.1% to 0.2% of transactions, and confidence varies from .13 to .63. For the ham sandwich combination (.38 to .46), this represents a bit under half of the transactions containing those items.

Plotting by lift:

# Draw the 10 highest-lift rules as a graph.
rules |>
  sort(by = "lift", decreasing = TRUE) |>
  head(n = 10) |>
  plot(method = "graph")

Cluster analysis with kmeans:

# Turn the transactions into a matrix and multiply by 1 so the entries are
# numeric, which is what kmeans() is given below.
basket_matrix <- as(groceries, "matrix") * 1

# Fix the RNG seed so the clustering is reproducible.
set.seed(1122)
# Starting point: partition the baskets into 4 clusters.
clusters <- kmeans(basket_matrix, centers = 4)

# Store each basket's cluster assignment alongside the raw item data.
items[["cluster"]] <- clusters[["cluster"]]

# Number of baskets assigned to each cluster.
table(clusters[["cluster"]])
## 
##    1    2    3    4 
## 5903 2319  520 1093
#they are enormous

# Most frequent items in cluster 1: the column means of the cluster's rows,
# largest first, top 10. drop = FALSE keeps the subset a matrix even when
# the cluster holds a single basket, which colMeans() requires.
basket_matrix[clusters$cluster == 1, , drop = FALSE] |>
  colMeans() |>
  sort(decreasing = TRUE) |>
  head(10)
##           soda     rolls/buns    canned beer  bottled water         yogurt 
##     0.18228020     0.16347620     0.10113502     0.09740810     0.09571404 
##  shopping bags   bottled beer        sausage         pastry tropical fruit 
##     0.09232594     0.08250042     0.07403015     0.06793156     0.06776215

There are a lot of items in each cluster. Let’s refine this a little with smaller clusters.

# Re-seed so this run is reproducible on its own.
set.seed(1122)
# Repeat the clustering with 8 centers.
clusters <- kmeans(basket_matrix, centers = 8)

# Overwrite the stored assignment with the 8-cluster labels.
items[["cluster"]] <- clusters[["cluster"]]

# Number of baskets assigned to each cluster.
table(clusters[["cluster"]])
## 
##    1    2    3    4    5    6    7    8 
## 3719 1190  431 1076  746  877  855  941
#they are still pretty big

# Per-item column means within cluster 1, ten highest first.
# drop = FALSE keeps the subset a matrix even for a one-basket cluster,
# which colMeans() requires.
colMeans(basket_matrix[clusters$cluster == 1, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##     canned beer    bottled beer   shopping bags   bottled water          pastry 
##      0.12207583      0.08819575      0.08362463      0.08281796      0.06184458 
##      newspapers  tropical fruit         sausage          coffee root vegetables 
##      0.05888680      0.05619790      0.05404679      0.05216456      0.05189567
# Cluster 2: per-item column means, ten highest first.
# drop = FALSE guards against a one-basket cluster collapsing to a vector.
colMeans(basket_matrix[clusters$cluster == 2, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##      whole milk      rolls/buns root vegetables  tropical fruit          pastry 
##      1.00000000      0.20924370      0.12521008      0.11512605      0.11008403 
##   bottled water      newspapers         sausage   domestic eggs     brown bread 
##      0.10924370      0.10924370      0.08907563      0.08487395      0.08067227
# Cluster 3: per-item column means, ten highest first.
# drop = FALSE guards against a one-basket cluster collapsing to a vector.
colMeans(basket_matrix[clusters$cluster == 3, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##          napkins       whole milk           yogurt       rolls/buns 
##        1.0000000        0.3016241        0.2320186        0.2088167 
##             soda   tropical fruit other vegetables  root vegetables 
##        0.1972158        0.1856148        0.1809745        0.1740139 
##    bottled water    shopping bags 
##        0.1531323        0.1415313
# Cluster 4: per-item column means, ten highest first.
# drop = FALSE guards against a one-basket cluster collapsing to a vector.
colMeans(basket_matrix[clusters$cluster == 4, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
## other vegetables  root vegetables       rolls/buns           yogurt 
##        1.0000000        0.1988848        0.1960967        0.1728625 
##             soda   tropical fruit          sausage     citrus fruit 
##        0.1486989        0.1486989        0.1356877        0.1338290 
##    shopping bags    bottled water 
##        0.1328996        0.1171004

Trying 20 clusters

# Seed the RNG here as the 4- and 8-cluster runs do, so this fit does not
# depend on whatever random state earlier chunks left behind.
# NOTE(review): re-seeding changes the cluster assignment relative to an
# unseeded run; re-knit so the printed results below match.
set.seed(1122)
# Third pass: 20 clusters
clusters <- kmeans(basket_matrix, centers = 20)

# Overwrite the stored labels with the 20-cluster assignment
items$cluster <- clusters$cluster

#cluster sizes
table(clusters$cluster)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  286 2535  279  205  493  387  169  395  580  315  705  655   95  358  466  224 
##   17   18   19   20 
##  410  517  378  383
#they are still pretty big

# Per-item column means within cluster 1, ten highest first.
# With 20 clusters a one-basket cluster is more plausible; drop = FALSE keeps
# the subset a matrix so colMeans() still works in that case.
colMeans(basket_matrix[clusters$cluster == 1, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##      rolls/buns      whole milk  tropical fruit         sausage   bottled water 
##       1.0000000       1.0000000       0.1573427       0.1538462       0.1503497 
## root vegetables          pastry      newspapers       margarine            soda 
##       0.1468531       0.1363636       0.1153846       0.1118881       0.1083916
# Cluster 2: per-item column means, ten highest first.
# drop = FALSE guards against a one-basket cluster collapsing to a vector.
colMeans(basket_matrix[clusters$cluster == 2, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##                  soda          bottled beer         bottled water 
##            0.18343195            0.11360947            0.09743590 
##            newspapers fruit/vegetable juice          citrus fruit 
##            0.06429980            0.04654832            0.03984221 
##             chocolate             pip fruit   specialty chocolate 
##            0.03944773            0.03826430            0.03668639 
##       misc. beverages 
##            0.03629191
# Cluster 3: per-item column means, ten highest first.
# drop = FALSE guards against a one-basket cluster collapsing to a vector.
colMeans(basket_matrix[clusters$cluster == 3, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##           brown bread            whole milk                pastry 
##             1.0000000             0.5519713             0.1863799 
##      other vegetables            newspapers                  soda 
##             0.1433692             0.1397849             0.1397849 
##        tropical fruit                yogurt       root vegetables 
##             0.1326165             0.1182796             0.1111111 
## fruit/vegetable juice 
##             0.1003584
# Cluster 4: per-item column means, ten highest first.
# drop = FALSE guards against a one-basket cluster collapsing to a vector.
colMeans(basket_matrix[clusters$cluster == 4, , drop = FALSE]) |>
  sort(decreasing = TRUE) |>
  head(10)
##         UHT-milk             soda    bottled water other vegetables 
##        1.0000000        0.2926829        0.2682927        0.1804878 
##           yogurt      brown bread  root vegetables       newspapers 
##        0.1658537        0.1414634        0.1414634        0.1365854 
##    shopping bags     citrus fruit 
##        0.1317073        0.1219512