##### Chapter 8: Association Rules -------------------
## Example: Identifying Frequently-Purchased Groceries ----
## Step 2: Exploring and preparing the data ----
# load the grocery data into a sparse matrix
library(arules)
groceries <- read.transactions("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml13/groceries.csv", sep = ",")
summary(groceries)
transactions as itemMatrix in sparse format with
9835 rows (elements/itemsets/transactions) and
169 columns (items) and a density of 0.02609145577
most frequent items:
whole milk other vegetables rolls/buns soda yogurt
2513 1903 1809 1715 1372
(Other)
34055
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46 29 14
19 20 21 22 23 24 26 27 28 29 32
14 9 11 4 6 1 1 1 1 3 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000000 2.000000 3.000000 4.409456 6.000000 32.000000
includes extended item information - examples:
labels
1 abrasive cleaner
2 artif. sweetener
3 baby cosmetics
# look at the first five transactions
inspect(groceries[1:5])
items
[1] {citrus fruit,
margarine,
ready soups,
semi-finished bread}
[2] {coffee,
tropical fruit,
yogurt}
[3] {whole milk}
[4] {cream cheese,
meat spreads,
pip fruit,
yogurt}
[5] {condensed milk,
long life bakery product,
other vegetables,
whole milk}
# examine the frequency of items
itemFrequency(groceries[, 1:3])
abrasive cleaner artif. sweetener baby cosmetics
0.0035587188612 0.0032536858160 0.0006100660905
# plot the frequency of items
itemFrequencyPlot(groceries, support = 0.1)

itemFrequencyPlot(groceries, topN = 20)

# a visualization of the sparse matrix for the first five transactions
image(groceries[1:5])

# visualization of a random sample of 100 transactions
image(sample(groceries, 100))

## Step 3: Training a model on the data ----
library(arules)
# default settings result in zero rules learned
apriori(groceries)
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.8 0.1 1 none FALSE TRUE 5 0.1 1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 983
set item appearances ...[0 item(s)] done [0.01s].
set transactions ...[169 item(s), 9835 transaction(s)] done [0.02s].
sorting and recoding items ... [8 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.01s].
writing ... [0 rule(s)] done [0.01s].
creating S4 object ... done [0.01s].
set of 0 rules
# set better support and confidence levels to learn more rules
groceryrules <- apriori(groceries, parameter = list(support =
0.006, confidence = 0.25, minlen = 2))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.25 0.1 1 none FALSE TRUE 5 0.006 2 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 59
set item appearances ...[0 item(s)] done [0.01s].
set transactions ...[169 item(s), 9835 transaction(s)] done [0.02s].
sorting and recoding items ... [109 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.02s].
writing ... [463 rule(s)] done [0.01s].
creating S4 object ... done [0.01s].
groceryrules
set of 463 rules
## Step 4: Evaluating model performance ----
# summary of grocery association rules
summary(groceryrules)
set of 463 rules
rule length distribution (lhs + rhs):sizes
2 3 4
150 297 16
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000000 2.000000 3.000000 2.710583 3.000000 4.000000
summary of quality measures:
support confidence lift
Min. :0.006100661 Min. :0.2500000 Min. :0.9932367
1st Qu.:0.007117438 1st Qu.:0.2970711 1st Qu.:1.6229230
Median :0.008744281 Median :0.3553719 Median :1.9332351
Mean :0.011539429 Mean :0.3785573 Mean :2.0350922
3rd Qu.:0.012302999 3rd Qu.:0.4494849 3rd Qu.:2.3564791
Max. :0.074834774 Max. :0.6600000 Max. :3.9564774
mining info:
data ntransactions support confidence
groceries 9835 0.006 0.25
# look at the first three rules
inspect(groceryrules[1:3])
lhs rhs support confidence lift
[1] {potted plants} => {whole milk} 0.006914082359 0.4000000000 1.565459610
[2] {pasta} => {whole milk} 0.006100660905 0.4054054054 1.586614470
[3] {herbs} => {root vegetables} 0.007015760041 0.4312500000 3.956477379
## Step 5: Improving model performance ----
# sorting grocery rules by lift
inspect(sort(groceryrules, by = "lift")[1:5])
lhs rhs support confidence lift
[1] {herbs} => {root vegetables} 0.007015760041 0.4312500000 3.956477379
[2] {berries} => {whipped/sour cream} 0.009049313676 0.2721712538 3.796885505
[3] {other vegetables,
tropical fruit,
whole milk} => {root vegetables} 0.007015760041 0.4107142857 3.768073694
[4] {beef,
other vegetables} => {root vegetables} 0.007930859176 0.4020618557 3.688692491
[5] {other vegetables,
tropical fruit} => {pip fruit} 0.009456024403 0.2634560907 3.482648725
# finding subsets of rules containing any berry items
berryrules <- subset(groceryrules, items %in% "berries")
inspect(berryrules)
lhs rhs support confidence lift
[1] {berries} => {whipped/sour cream} 0.009049313676 0.2721712538 3.796885505
[2] {berries} => {yogurt} 0.010574478902 0.3180428135 2.279847719
[3] {berries} => {other vegetables} 0.010269445857 0.3088685015 1.596280459
[4] {berries} => {whole milk} 0.011794611083 0.3547400612 1.388328095
# writing the rules to a CSV file
write(groceryrules, file = "groceryrules.csv",
sep = ",", quote = TRUE, row.names = FALSE)
# converting the rule set to a data frame
groceryrules_df <- as(groceryrules, "data.frame")
str(groceryrules_df)
'data.frame': 463 obs. of 4 variables:
$ rules : Factor w/ 463 levels "{baking powder} => {other vegetables}",..: 340 302 207 206 208 341 402 21 139 140 ...
$ support : num 0.00691 0.0061 0.00702 0.00773 0.00773 ...
$ confidence: num 0.4 0.405 0.431 0.475 0.475 ...
$ lift : num 1.57 1.59 3.96 2.45 1.86 ...
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQNCiAgd29yZF9kb2N1bWVudDogZGVmYXVsdA0KLS0tDQoNCmBgYHtyIHNldHVwLCBpbmNsdWRlPVRSVUV9DQojIyMjIyBDaGFwdGVyIDg6IEFzc29jaWF0aW9uIFJ1bGVzIC0tLS0tLS0tLS0tLS0tLS0tLS0NCg0KIyMgRXhhbXBsZTogSWRlbnRpZnlpbmcgRnJlcXVlbnRseS1QdXJjaGFzZWQgR3JvY2VyaWVzIC0tLS0NCiMjIFN0ZXAgMjogRXhwbG9yaW5nIGFuZCBwcmVwYXJpbmcgdGhlIGRhdGEgLS0tLQ0KDQojIGxvYWQgdGhlIGdyb2NlcnkgZGF0YSBpbnRvIGEgc3BhcnNlIG1hdHJpeA0KbGlicmFyeShhcnVsZXMpDQpncm9jZXJpZXMgPC0gcmVhZC50cmFuc2FjdGlvbnMoImh0dHA6Ly93d3cuc2NpLmNzdWVhc3RiYXkuZWR1L35lc3Vlc3MvY2xhc3Nlcy9TdGF0aXN0aWNzXzY2MjAvUHJlc2VudGF0aW9ucy9tbDEzL2dyb2Nlcmllcy5jc3YiLCBzZXAgPSAiLCIpDQpzdW1tYXJ5KGdyb2NlcmllcykNCg0KIyBsb29rIGF0IHRoZSBmaXJzdCBmaXZlIHRyYW5zYWN0aW9ucw0KaW5zcGVjdChncm9jZXJpZXNbMTo1XSkNCg0KIyBleGFtaW5lIHRoZSBmcmVxdWVuY3kgb2YgaXRlbXMNCml0ZW1GcmVxdWVuY3koZ3JvY2VyaWVzWywgMTozXSkNCg0KIyBwbG90IHRoZSBmcmVxdWVuY3kgb2YgaXRlbXMNCml0ZW1GcmVxdWVuY3lQbG90KGdyb2Nlcmllcywgc3VwcG9ydCA9IDAuMSkNCml0ZW1GcmVxdWVuY3lQbG90KGdyb2NlcmllcywgdG9wTiA9IDIwKQ0KDQojIGEgdmlzdWFsaXphdGlvbiBvZiB0aGUgc3BhcnNlIG1hdHJpeCBmb3IgdGhlIGZpcnN0IGZpdmUgdHJhbnNhY3Rpb25zDQppbWFnZShncm9jZXJpZXNbMTo1XSkNCg0KIyB2aXN1YWxpemF0aW9uIG9mIGEgcmFuZG9tIHNhbXBsZSBvZiAxMDAgdHJhbnNhY3Rpb25zDQppbWFnZShzYW1wbGUoZ3JvY2VyaWVzLCAxMDApKQ0KDQojIyBTdGVwIDM6IFRyYWluaW5nIGEgbW9kZWwgb24gdGhlIGRhdGEgLS0tLQ0KbGlicmFyeShhcnVsZXMpDQoNCiMgZGVmYXVsdCBzZXR0aW5ncyByZXN1bHQgaW4gemVybyBydWxlcyBsZWFybmVkDQphcHJpb3JpKGdyb2NlcmllcykNCg0KIyBzZXQgYmV0dGVyIHN1cHBvcnQgYW5kIGNvbmZpZGVuY2UgbGV2ZWxzIHRvIGxlYXJuIG1vcmUgcnVsZXMNCmdyb2NlcnlydWxlcyA8LSBhcHJpb3JpKGdyb2NlcmllcywgcGFyYW1ldGVyID0gbGlzdChzdXBwb3J0ID0NCiAgICAgICAgICAgICAgICAgICAgICAgICAgMC4wMDYsIGNvbmZpZGVuY2UgPSAwLjI1LCBtaW5sZW4gPSAyKSkNCmdyb2NlcnlydWxlcw0KDQojIyBTdGVwIDQ6IEV2YWx1YXRpbmcgbW9kZWwgcGVyZm9ybWFuY2UgLS0tLQ0KIyBzdW1tYXJ5IG9mIGdyb2NlcnkgYXNzb2NpYXRpb24gcnVsZXMNCnN1bW1hcnkoZ3JvY2VyeXJ1bGVzKQ0KDQojIGxvb2sgYXQgdGhlIGZpcnN0IHRocmVlIHJ1bGVzDQppbnNwZWN0KGdyb2NlcnlydWxlc1sxOjNdKQ0KDQojIyBTdGVwIDU6IEltcHJvdmluZyBtb2RlbCBwZXJmb3JtYW5jZSAtLS0tDQoNCiMgc29ydGluZyBncm9jZXJ5IHJ1bGVzIGJ5IGxpZnQNCmluc3BlY3Qoc29ydChncm9jZXJ5cnVsZXMsIGJ5ID0gImxpZnQiKVsxOjVdKQ0KDQojIGZpbmRpbmcgc3Vic2V0cyBvZiBydWxlcyBjb250YWluaW5nIGFueSBiZXJyeSBpdGVtcw0KYmVycnlydWxlcyA8LSBzdWJzZXQoZ3JvY2VyeXJ1bGVzLCBpdGVtcyAlaW4lICJiZXJyaWVzIikNCmluc3BlY3QoYmVycnlydWxlcykNCg0KIyB3cml0aW5nIHRoZSBydWxlcyB0byBhIENTViBmaWxlDQp3cml0ZShncm9jZXJ5cnVsZXMsIGZpbGUgPSAiZ3JvY2VyeXJ1bGVzLmNzdiIsDQogICAgICBzZXAgPSAiLCIsIHF1b3RlID0gVFJVRSwgcm93Lm5hbWVzID0gRkFMU0UpDQoNCiMgY29udmVydGluZyB0aGUgcnVsZSBzZXQgdG8gYSBkYXRhIGZyYW1lDQpncm9jZXJ5cnVsZXNfZGYgPC0gYXMoZ3JvY2VyeXJ1bGVzLCAiZGF0YS5mcmFtZSIpDQpzdHIoZ3JvY2VyeXJ1bGVzX2RmKQ0KYGBgDQoNCg==