We will use purchase data from one month of operation at a grocery store. The data contain 9,835 transactions, or about 328 transactions per day. If we ignore brands and consider only the product type, we are left with a total of 169 items. Any guesses about which types of items might be purchased together? Will wine and cheese be a common pairing? Bread and butter? Milk and eggs? Let’s dig into this data and see if we can confirm our guesses.
library(arules)
library(arulesViz)
library(colorspace)
# Load the Groceries transactions, keep a data-frame view of them for
# printing, then echo the sparse transactions object itself.
data("Groceries")
groceries <- as(Groceries, "data.frame")
Groceries
## transactions in sparse format with
## 9835 transactions (rows) and
## 169 items (columns)
# Peek at the first rows of the data-frame view (one basket of items per row).
head(groceries)
## items
## 1 {citrus fruit,semi-finished bread,margarine,ready soups}
## 2 {tropical fruit,yogurt,coffee}
## 3 {whole milk}
## 4 {pip fruit,yogurt,cream cheese ,meat spreads}
## 5 {other vegetables,whole milk,condensed milk,long life bakery product}
## 6 {whole milk,butter,yogurt,rice,abrasive cleaner}
Let us find the items that occur most frequently in the transaction data.
# Summarise the transactions: matrix density, most frequent items and the
# distribution of transaction sizes.
summary(Groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels level2 level1
## 1 frankfurter sausage meet and sausage
## 2 sausage sausage meet and sausage
## 3 liver loaf sausage meet and sausage
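The density tells us that 9835 * 169 * 0.02609146 ≈ 43,367 items were purchased in total, so the average transaction contains 43367/9835 = 4.409456 items.
itemFrequency(Groceries[,1:5])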
## frankfurter sausage liver loaf ham meat
## 0.058973055 0.093950178 0.005083884 0.026029487 0.025826131
# Item-frequency bar charts: items with at least 10% support, items with at
# least 5% support, and the 20 most frequent items.
itemFrequencyPlot(Groceries, support = 0.1)
itemFrequencyPlot(Groceries, support = 0.05)
itemFrequencyPlot(Groceries, topN = 20)

# Sparse-matrix views: the first five transactions, then a random sample of
# 100 transactions.
image(Groceries[1:5])
image(sample(Groceries, 100))
We will apply the Apriori algorithm to find associations among shopping-cart items.
# Run Apriori with its default parameters; the log printed below reports
# support = 0.1 and confidence = 0.8, which yields no rules on this data.
apriori(Groceries)
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.8 0.1 1 none FALSE TRUE 0.1 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 0 rules
Because the default support is 0.1, an item must have appeared in at least 0.1 * 9835 = 983.5 (that is, 984) transactions to be considered. Only 8 items appeared that many times, so no rules were generated.
We can choose a more suitable support by thinking about the minimum number of transactions we would need before a pattern becomes interesting; a support of 0.009 corresponds to roughly 0.009 * 9835 ≈ 88 transactions.
We also use a confidence threshold of 0.25, which means that in order to be included in the results, the rule has to be correct at least 25 percent of the time.
In addition, we set minlen = 2 to eliminate rules that contain fewer than two items.
# Mine association rules with a lower support threshold, a 25% confidence
# floor and at least two items per rule.
grules <- apriori(
  Groceries,
  parameter = list(support = 0.009, confidence = 0.25, minlen = 2)
)
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.25 0.1 1 none FALSE TRUE 0.009 2 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [93 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [224 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Print the rule set, then summarise its rule lengths and quality measures
# (support, confidence, lift).
grules
## set of 224 rules
summary(grules)
## set of 224 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 111 113
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.504 3.000 3.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.009049 Min. :0.2513 Min. :0.9932
## 1st Qu.:0.010066 1st Qu.:0.2974 1st Qu.:1.5767
## Median :0.012303 Median :0.3603 Median :1.8592
## Mean :0.016111 Mean :0.3730 Mean :1.9402
## 3rd Qu.:0.018480 3rd Qu.:0.4349 3rd Qu.:2.2038
## Max. :0.074835 Max. :0.6389 Max. :3.7969
##
## mining info:
## data ntransactions support confidence
## Groceries 9835 0.009 0.25
# Display the first ten rules in the order they are stored in the rule set.
inspect(grules[1:10])
## lhs rhs support confidence lift
## 1 {baking powder} => {whole milk} 0.009252669 0.5229885 2.046793
## 2 {grapes} => {other vegetables} 0.009049314 0.4045455 2.090754
## 3 {meat} => {other vegetables} 0.009964413 0.3858268 1.994013
## 4 {meat} => {whole milk} 0.009964413 0.3858268 1.509991
## 5 {frozen meals} => {whole milk} 0.009862735 0.3476703 1.360659
## 6 {hard cheese} => {other vegetables} 0.009456024 0.3858921 1.994350
## 7 {hard cheese} => {whole milk} 0.010066090 0.4107884 1.607682
## 8 {butter milk} => {other vegetables} 0.010371124 0.3709091 1.916916
## 9 {butter milk} => {whole milk} 0.011591256 0.4145455 1.622385
## 10 {ham} => {other vegetables} 0.009150991 0.3515625 1.816930
A common approach is to take the result of learning association rules and divide them into three categories: actionable, trivial, and inexplicable.
# Sort the rules by lift and display the first ten of the sorted set.
inspect(sort(grules, by = "lift")[1:10])
## lhs rhs support confidence lift
## 1 {berries} => {whipped/sour cream} 0.009049314 0.2721713 3.796886
## 2 {tropical fruit,
## other vegetables} => {pip fruit} 0.009456024 0.2634561 3.482649
## 3 {pip fruit,
## other vegetables} => {tropical fruit} 0.009456024 0.3618677 3.448613
## 4 {citrus fruit,
## other vegetables} => {root vegetables} 0.010371124 0.3591549 3.295045
## 5 {tropical fruit,
## other vegetables} => {root vegetables} 0.012302999 0.3427762 3.144780
## 6 {tropical fruit,
## other vegetables} => {citrus fruit} 0.009049314 0.2521246 3.046248
## 7 {beef} => {root vegetables} 0.017386884 0.3313953 3.040367
## 8 {citrus fruit,
## root vegetables} => {other vegetables} 0.010371124 0.5862069 3.029608
## 9 {tropical fruit,
## root vegetables} => {other vegetables} 0.012302999 0.5845411 3.020999
## 10 {citrus fruit,
## other vegetables} => {tropical fruit} 0.009049314 0.3133803 2.986526
# Keep only the rules that mention beef or sausage anywhere in the rule,
# then display the first ten after sorting by lift.
beefrules <- subset(grules, items %in% c("beef", "sausage"))
inspect(sort(beefrules, by = "lift")[1:10])
## lhs rhs support confidence lift
## 1 {beef} => {root vegetables} 0.017386884 0.3313953 3.040367
## 2 {rolls/buns,
## soda} => {sausage} 0.009659380 0.2519894 2.682160
## 3 {beef,
## whole milk} => {other vegetables} 0.009252669 0.4354067 2.250250
## 4 {sausage,
## soda} => {rolls/buns} 0.009659380 0.3974895 2.161034
## 5 {beef} => {other vegetables} 0.019725470 0.3759690 1.943066
## 6 {beef,
## other vegetables} => {whole milk} 0.009252669 0.4690722 1.835784
## 7 {sausage,
## rolls/buns} => {soda} 0.009659380 0.3156146 1.809953
## 8 {sausage} => {rolls/buns} 0.030604982 0.3257576 1.771048
## 9 {sausage,
## whole milk} => {other vegetables} 0.010167768 0.3401361 1.757876
## 10 {sausage,
## whole milk} => {rolls/buns} 0.009354347 0.3129252 1.701282
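The subset() function can be used with several keywords and operators:
- The keyword items matches an item appearing anywhere in the rule. To limit the match to the left-hand or right-hand side only, use lhs and rhs instead.
- The operator %in% means that at least one of the items must be found in the list you defined.
- Additional operators are available for partial matching (%pin%) and complete matching (%ain%).
- Subsets can also be limited by support, confidence, or lift.
- Matches can be combined with standard R logical operators such as and (&), or (|), and not (!).
plot(grules)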
# Scatter plots of the full rule set: support vs. lift shaded by confidence,
# then a plot shaded by rule order (titled "Two-key plot").
plot(grules, measure = c("support", "lift"), shading = "confidence")
plot(grules, shading = "order", control = list(main = "Two-key plot"))

# Interactive version of the support/lift scatter plot.
# NOTE(review): newer arulesViz releases reportedly deprecate `interactive`
# in favour of `engine = "interactive"` -- confirm against the installed
# version before changing the call.
plot(
  grules,
  measure = c("support", "lift"),
  shading = "confidence",
  interactive = TRUE
)

# Matrix view of lift for the beef/sausage rules.
plot(beefrules, method = "matrix", measure = "lift")
## Itemsets in Antecedent (LHS)
## [1] "{beef}" "{sausage}"
## [3] "{beef,other vegetables}" "{beef,whole milk}"
## [5] "{sausage,soda}" "{sausage,rolls/buns}"
## [7] "{rolls/buns,soda}" "{sausage,whole milk}"
## [9] "{sausage,other vegetables}"
## Itemsets in Consequent (RHS)
## [1] "{root vegetables}" "{rolls/buns}" "{other vegetables}"
## [4] "{whole milk}" "{soda}" "{sausage}"
# Matrix view of lift for the beef/sausage rules with reordering enabled.
plot(
  beefrules,
  method = "matrix",
  measure = "lift",
  control = list(reorder = TRUE)
)
## Itemsets in Antecedent (LHS)
## [1] "{beef}" "{beef,whole milk}"
## [3] "{sausage,whole milk}" "{sausage,soda}"
## [5] "{sausage}" "{sausage,rolls/buns}"
## [7] "{beef,other vegetables}" "{sausage,other vegetables}"
## [9] "{rolls/buns,soda}"
## Itemsets in Consequent (RHS)
## [1] "{sausage}" "{soda}" "{whole milk}"
## [4] "{root vegetables}" "{other vegetables}" "{rolls/buns}"
# Matrix view showing lift and confidence together, reordered, drawn with a
# 200-step sequential HCL palette.
plot(
  beefrules,
  method = "matrix",
  measure = c("lift", "confidence"),
  control = list(reorder = TRUE, col = sequential_hcl(200))
)
## Itemsets in Antecedent (LHS)
## [1] "{beef}" "{sausage}"
## [3] "{sausage,rolls/buns}" "{beef,other vegetables}"
## [5] "{sausage,other vegetables}" "{sausage,soda}"
## [7] "{sausage,whole milk}" "{beef,whole milk}"
## [9] "{rolls/buns,soda}"
## Itemsets in Consequent (RHS)
## [1] "{sausage}" "{soda}" "{whole milk}"
## [4] "{root vegetables}" "{other vegetables}" "{rolls/buns}"
# Matrix view showing support and confidence together, reordered, drawn with
# a 200-step sequential HCL palette.
plot(
  beefrules,
  method = "matrix",
  measure = c("support", "confidence"),
  control = list(reorder = TRUE, col = sequential_hcl(200))
)
## Itemsets in Antecedent (LHS)
## [1] "{sausage}" "{beef}"
## [3] "{sausage,rolls/buns}" "{sausage,other vegetables}"
## [5] "{beef,other vegetables}" "{rolls/buns,soda}"
## [7] "{beef,whole milk}" "{sausage,whole milk}"
## [9] "{sausage,soda}"
## Itemsets in Consequent (RHS)
## [1] "{root vegetables}" "{sausage}" "{soda}"
## [4] "{rolls/buns}" "{other vegetables}" "{whole milk}"
# Grouped views of the beef/sausage rules, once by support and once by
# confidence, each with a 100-step sequential HCL palette.
plot(
  beefrules,
  method = "grouped",
  measure = "support",
  control = list(col = sequential_hcl(100))
)
plot(
  beefrules,
  method = "grouped",
  measure = "confidence",
  control = list(col = sequential_hcl(100))
)

# Graph view (item-based) and parallel-coordinates view of the same rules.
plot(beefrules, method = "graph", control = list(type = "items"))
plot(beefrules, method = "paracoord", control = list(reorder = TRUE))

# Export the full rule set to a comma-separated file.
write(grules, file = "grules.csv", sep = ",", quote = TRUE, row.names = FALSE)