# Association Rules (Market Basket Analysis: Identifying
# Frequently-Purchased Groceries)
# install arules only when it is missing; an explicit CRAN mirror is given
# because install.packages() stops with "trying to use CRAN without setting a
# mirror" when no mirror has been configured for the session
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules", repos = "https://cloud.r-project.org")
}
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
## %in%, write
# load the grocery data into a sparse matrix
# (groceries.csv is read relative to the current working directory)
groceries <- read.transactions("groceries.csv", sep = ",")
## Step 2: Exploring and preparing the data ----
# overview of the transactions: dimensions, density, most frequent items and
# the distribution of transaction sizes
summary(groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 3.00 4.41 6.00 32.00
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# list the items contained in each of the first five transactions
inspect(groceries[seq_len(5)])
## items
## 1 {citrus fruit,
## margarine,
## ready soups,
## semi-finished bread}
## 2 {coffee,
## tropical fruit,
## yogurt}
## 3 {whole milk}
## 4 {cream cheese,
## meat spreads,
## pip fruit,
## yogurt}
## 5 {condensed milk,
## long life bakery product,
## other vegetables,
## whole milk}
# frequency of the first three items (columns of the sparse matrix)
itemFrequency(groceries[, seq_len(3)])
## abrasive cleaner artif. sweetener baby cosmetics
## 0.0035587 0.0032537 0.0006101
# item frequency plots: first restricted by support = 0.1, then limited to
# the 20 most frequent items
itemFrequencyPlot(groceries, support = 0.1)
itemFrequencyPlot(groceries, topN = 20)
# plot the sparse matrix for the first five transactions
image(groceries[seq_len(5)])
# plot the sparse matrix for a random sample of 100 transactions
# (no seed is set, so the sample differs between runs)
image(sample(groceries, 100))
## Step 3: Training a model on the data ----
# arules is attached once near the top of the script, so it is not loaded
# again here
# default settings (support = 0.1, confidence = 0.8) result in zero rules
# learned; the result is only printed, not stored
apriori(groceries)
##
## parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.8 0.1 1 none FALSE TRUE 0.1 1 10
## target ext
## rules FALSE
##
## algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.01s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 0 rules
# set better support and confidence levels to learn more rules:
# support = 0.006 is roughly 59 of the 9835 transactions, and minlen = 2
# excludes rules that contain fewer than two items
groceryrules <- apriori(
  groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)
##
## parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.25 0.1 1 none FALSE TRUE 0.006 2 10
## target ext
## rules FALSE
##
## algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# printing the rules object reports how many rules were learned
groceryrules
## set of 463 rules
## Step 4: Evaluating model performance ----
# summary of grocery association rules: rule lengths plus the spread of
# support, confidence and lift
summary(groceryrules)
## set of 463 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 150 297 16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.00 3.00 2.71 3.00 4.00
##
## summary of quality measures:
## support confidence lift
## Min. :0.00610 Min. :0.250 Min. :0.993
## 1st Qu.:0.00712 1st Qu.:0.297 1st Qu.:1.623
## Median :0.00874 Median :0.355 Median :1.933
## Mean :0.01154 Mean :0.379 Mean :2.035
## 3rd Qu.:0.01230 3rd Qu.:0.450 3rd Qu.:2.356
## Max. :0.07483 Max. :0.660 Max. :3.956
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.006 0.25
# show the first three rules with their quality measures
inspect(groceryrules[seq_len(3)])
## lhs rhs support confidence lift
## 1 {potted plants} => {whole milk} 0.006914 0.4000 1.565
## 2 {pasta} => {whole milk} 0.006101 0.4054 1.587
## 3 {herbs} => {root vegetables} 0.007016 0.4313 3.956
## Step 5: Improving model performance ----
# the five rules with the highest lift
inspect(sort(groceryrules, by = "lift")[seq_len(5)])
## lhs rhs support confidence lift
## 1 {herbs} => {root vegetables} 0.007016 0.4313 3.956
## 2 {berries} => {whipped/sour cream} 0.009049 0.2722 3.797
## 3 {other vegetables,
## tropical fruit,
## whole milk} => {root vegetables} 0.007016 0.4107 3.768
## 4 {beef,
## other vegetables} => {root vegetables} 0.007931 0.4021 3.689
## 5 {other vegetables,
## tropical fruit} => {pip fruit} 0.009456 0.2635 3.483
# keep only the rules whose items include "berries"
berryrules <- subset(groceryrules, subset = items %in% "berries")
inspect(berryrules)
## lhs rhs support confidence lift
## 1 {berries} => {whipped/sour cream} 0.009049 0.2722 3.797
## 2 {berries} => {yogurt} 0.010574 0.3180 2.280
## 3 {berries} => {other vegetables} 0.010269 0.3089 1.596
## 4 {berries} => {whole milk} 0.011795 0.3547 1.388
# keep only the rules with soda on the right-hand side
# (%pin% is arules' partial-matching operator on item labels)
sodarules <- subset(groceryrules, subset = rhs %pin% "soda")
inspect(sodarules)
## lhs rhs support confidence lift
## 1 {specialty bar} => {soda} 0.007219 0.2639 1.514
## 2 {misc. beverages} => {soda} 0.007321 0.2581 1.480
## 3 {candy} => {soda} 0.008643 0.2891 1.658
## 4 {dessert} => {soda} 0.009863 0.2658 1.524
## 5 {chocolate} => {soda} 0.013523 0.2725 1.563
## 6 {fruit/vegetable juice} => {soda} 0.018404 0.2546 1.460
## 7 {sausage} => {soda} 0.024301 0.2587 1.483
## 8 {bottled water} => {soda} 0.028978 0.2622 1.504
## 9 {rolls/buns,
## shopping bags} => {soda} 0.006304 0.3229 1.852
## 10 {shopping bags,
## whole milk} => {soda} 0.006812 0.2780 1.594
## 11 {rolls/buns,
## sausage} => {soda} 0.009659 0.3156 1.810
## 12 {other vegetables,
## sausage} => {soda} 0.007219 0.2679 1.536
## 13 {bottled water,
## yogurt} => {soda} 0.007422 0.3230 1.852
## 14 {bottled water,
## rolls/buns} => {soda} 0.006812 0.2815 1.614
## 15 {rolls/buns,
## yogurt} => {soda} 0.008643 0.2515 1.442
# the five soda rules with the highest lift
top.soda.rules <- head(sort(sodarules, by = "lift"), n = 5)
inspect(top.soda.rules)
## lhs rhs support confidence lift
## 1 {bottled water,
## yogurt} => {soda} 0.007422 0.3230 1.852
## 2 {rolls/buns,
## shopping bags} => {soda} 0.006304 0.3229 1.852
## 3 {rolls/buns,
## sausage} => {soda} 0.009659 0.3156 1.810
## 4 {candy} => {soda} 0.008643 0.2891 1.658
## 5 {bottled water,
## rolls/buns} => {soda} 0.006812 0.2815 1.614
# export every rule to a comma-separated file in the working directory
write(
  groceryrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)
# coerce the rule set to a data frame: one row per rule, with the rule text
# and its support, confidence and lift
groceryrules_df <- as(groceryrules, Class = "data.frame")
str(groceryrules_df)
## 'data.frame': 463 obs. of 4 variables:
## $ rules : Factor w/ 463 levels "{baking powder} => {other vegetables}",..: 340 302 207 206 208 341 402 21 139 140 ...
## $ support : num 0.00691 0.0061 0.00702 0.00773 0.00773 ...
## $ confidence: num 0.4 0.405 0.431 0.475 0.475 ...
## $ lift : num 1.57 1.59 3.96 2.45 1.86 ...