Association rule mining is the study of “what goes with what” — for example, “customers who bought X also bought Y”, or which symptoms go with which diagnosis. Association rules are transaction-based (or event-based). The technique is also called “market basket analysis” or “affinity analysis”; it originated in the study of customer transaction databases, where the goal is to determine associations among the items purchased.
I’m using arules, an open-source package available from the Comprehensive R Archive Network (CRAN). It is a powerful tool set for mining association rules in transactional databases.
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Load the Groceries transactions data set bundled with arules, then print an
# overview of it (dimensions, most frequent items, basket-size distribution).
data("Groceries", package = "arules")
summary(Groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels level2 level1
## 1 frankfurter sausage meat and sausage
## 2 sausage sausage meat and sausage
## 3 liver loaf sausage meat and sausage
The summary shows that Groceries contains 9835 transactions and 169 items.
# Peek at the first 20 item labels. head() is used instead of `[1:20]` so that
# a data set with fewer than 20 items is not padded with NA entries.
head(itemLabels(Groceries), n = 20)
## [1] "frankfurter" "sausage" "liver loaf"
## [4] "ham" "meat" "finished products"
## [7] "organic sausage" "chicken" "turkey"
## [10] "pork" "beef" "hamburger meat"
## [13] "fish" "citrus fruit" "tropical fruit"
## [16] "pip fruit" "grapes" "berries"
## [19] "nuts/prunes" "root vegetables"
To plot the 10 most frequent items, we can use `itemFrequencyPlot()`:
# Most frequent items ----
# Draw the most frequent items twice, side by side: relative frequency on the
# left and absolute counts on the right.
top_n <- 10  # can be changed to the number of interest

# par() returns the settings it replaces; keep them so the two-panel layout
# can be undone once both charts are drawn.
old_par <- par(mfrow = c(1, 2))
itemFrequencyPlot(
  Groceries,
  type = "relative",
  topN = top_n,
  horiz = TRUE,
  col = "steelblue3",
  xlab = "",
  main = "Item frequency, relative"
)
itemFrequencyPlot(
  Groceries,
  type = "absolute",
  topN = top_n,
  horiz = TRUE,
  col = "steelblue3",
  xlab = "",
  main = "Item frequency, absolute"
)
par(old_par)  # restore the previous graphics layout
# Least frequent items ----
# Count how often each item occurs across all transactions, sorted ascending,
# so the rarest items come first. Computed once and reused by both charts.
item_counts <- sort(table(unlist(LIST(Groceries))))

# Keep (at most) the ten rarest items; seq_len() avoids NA padding when the
# data set has fewer than ten distinct items.
least_frequent <- item_counts[seq_len(min(10, length(item_counts)))]

# Number of transactions, taken from the data instead of a hard-coded 9835.
n_transactions <- length(Groceries)

# Wide left margin leaves room for the horizontal item labels; the previous
# graphics settings are saved so they can be restored afterwards.
old_par <- par(mar = c(2, 10, 2, 2), mfrow = c(1, 2))
barplot(
  least_frequent / n_transactions,
  horiz = TRUE,
  las = 1,
  col = "steelblue3",
  xlab = "",
  main = "Frequency, relative"
)
barplot(
  least_frequent,
  horiz = TRUE,
  las = 1,
  col = "steelblue3",
  xlab = "",
  main = "Frequency, absolute"
)
par(old_par)  # restore the previous graphics settings
# Mine frequent itemsets (not rules) containing at least two items.
itemsets <- apriori(
  Groceries,
  parameter = list(
    support = 0.001,     # minimum support threshold
    minlen = 2,          # ignore single-item sets
    target = "frequent"  # to mine for itemsets
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [13335 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summarise the mined itemsets: how many were found, their size distribution,
# and the range of their support values.
summary(itemsets)
## set of 13335 itemsets
##
## most frequent items:
## whole milk other vegetables yogurt root vegetables
## 3764 3341 2401 1958
## tropical fruit (Other)
## 1796 27683
##
## element (itemset/transaction) length distribution:sizes
## 2 3 4 5 6
## 2981 6831 3137 376 10
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 3.00 3.00 3.07 4.00 6.00
##
## summary of quality measures:
## support count
## Min. :0.001017 Min. : 10.00
## 1st Qu.:0.001118 1st Qu.: 11.00
## Median :0.001423 Median : 14.00
## Mean :0.002259 Mean : 22.22
## 3rd Qu.:0.002237 3rd Qu.: 22.00
## Max. :0.074835 Max. :736.00
##
## includes transaction ID lists: FALSE
##
## mining info:
## data ntransactions support confidence
## Groceries 9835 0.001 1
# Show the five itemsets with the highest support.
# `TRUE` replaces the reassignable shorthand `T`, and head() replaces `[1:5]`
# so the call still works when fewer than five itemsets were mined.
inspect(head(sort(itemsets, by = "support", decreasing = TRUE), n = 5))
## items support count
## [1] {other vegetables,whole milk} 0.07483477 736
## [2] {whole milk,rolls/buns} 0.05663447 557
## [3] {whole milk,yogurt} 0.05602440 551
## [4] {root vegetables,whole milk} 0.04890696 481
## [5] {root vegetables,other vegetables} 0.04738180 466
# Add lift as an extra quality measure on the itemsets. The transactions are
# passed by name so the argument cannot be mismatched positionally.
quality(itemsets)$lift <- interestMeasure(
  itemsets,
  measure = "lift",
  transactions = Groceries
)

# Show the five itemsets with the highest lift; head() tolerates results with
# fewer than five itemsets, unlike `[1:5]`.
inspect(head(sort(itemsets, by = "lift", decreasing = TRUE), n = 5))
## items support count lift
## [1] {tropical fruit,
## root vegetables,
## other vegetables,
## whole milk,
## yogurt,
## oil} 0.001016777 10 459.3068
## [2] {tropical fruit,
## other vegetables,
## whole milk,
## butter,
## yogurt,
## domestic eggs} 0.001016777 10 399.6002
## [3] {tropical fruit,
## root vegetables,
## other vegetables,
## whole milk,
## butter,
## yogurt} 0.001118454 11 255.8634
## [4] {other vegetables,
## curd,
## yogurt,
## whipped/sour cream,
## cream cheese } 0.001016777 10 248.7251
## [5] {root vegetables,
## other vegetables,
## whole milk,
## yogurt,
## rice} 0.001321810 13 230.5682
# Mine frequent itemsets again, this time restricted to exactly two items
# (minlen = maxlen = 2), i.e. item pairs only.
itemsets <- apriori(
  Groceries,
  parameter = list(
    support = 0.001,
    minlen = 2,
    maxlen = 2,
    target = "frequent"  # to mine for itemsets
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 2 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2
## Warning in apriori(Groceries, parameter = list(support = 0.001, minlen =
## 2, : Mining stopped (maxlen reached). Only patterns up to a length of 2
## returned!
## done [0.00s].
## writing ... [2981 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Add lift as an extra quality measure on the item pairs. The transactions
# are passed by name so the argument cannot be mismatched positionally.
quality(itemsets)$lift <- interestMeasure(
  itemsets,
  measure = "lift",
  transactions = Groceries
)

# Show the ten pairs with the highest lift; head() tolerates results with
# fewer than ten itemsets, unlike `[1:10]`.
inspect(head(sort(itemsets, by = "lift", decreasing = TRUE), n = 10))
## items support count lift
## [1] {mayonnaise,mustard} 0.001423488 14 12.965160
## [2] {hamburger meat,Instant food products} 0.003050330 30 11.421438
## [3] {detergent,softener} 0.001118454 11 10.600137
## [4] {liquor,red/blush wine} 0.002135231 21 10.025484
## [5] {flour,sugar} 0.004982206 49 8.463112
## [6] {salty snack,popcorn} 0.002236909 22 8.192110
## [7] {ham,processed cheese} 0.003050330 30 7.070792
## [8] {hamburger meat,sauces} 0.001220132 12 6.683656
## [9] {cream cheese ,meat spreads} 0.001118454 11 6.604701
## [10] {detergent,house keeping products} 0.001016777 10 6.345980
# Mine association rules with at least two items (lhs + rhs) that meet the
# minimum support and a minimum confidence of 0.5.
rules <- apriori(
  Groceries,
  parameter = list(
    support = 0.001,
    confidence = 0.5,
    minlen = 2,
    target = "rules"  # to mine for rules
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [5668 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summarise the mined rules: how many were found, their length distribution,
# and the ranges of support, confidence, lift and count.
summary(rules)
## set of 5668 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 11 1461 3211 939 46
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 3.00 4.00 3.92 4.00 6.00
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001017 Min. :0.5000 Min. : 1.957 Min. : 10.0
## 1st Qu.:0.001118 1st Qu.:0.5455 1st Qu.: 2.464 1st Qu.: 11.0
## Median :0.001322 Median :0.6000 Median : 2.899 Median : 13.0
## Mean :0.001668 Mean :0.6250 Mean : 3.262 Mean : 16.4
## 3rd Qu.:0.001729 3rd Qu.:0.6842 3rd Qu.: 3.691 3rd Qu.: 17.0
## Max. :0.022267 Max. :1.0000 Max. :18.996 Max. :219.0
##
## mining info:
## data ntransactions support confidence
## Groceries 9835 0.001 0.5
# Show the five rules with the highest lift.
# `TRUE` replaces the reassignable shorthand `T`, and head() replaces `[1:5]`
# so the call still works when fewer than five rules were mined.
inspect(head(sort(rules, by = "lift", decreasing = TRUE), n = 5))
## lhs rhs support confidence lift count
## [1] {Instant food products,
## soda} => {hamburger meat} 0.001220132 0.6315789 18.99565 12
## [2] {soda,
## popcorn} => {salty snack} 0.001220132 0.6315789 16.69779 12
## [3] {flour,
## baking powder} => {sugar} 0.001016777 0.5555556 16.40807 10
## [4] {ham,
## processed cheese} => {white bread} 0.001931876 0.6333333 15.04549 19
## [5] {whole milk,
## Instant food products} => {hamburger meat} 0.001525165 0.5000000 15.03823 15
# Add the chi-squared measure to the rules' quality measures.
# - The measure is spelled out in full ("chiSquared") instead of relying on
#   partial matching of "chi".
# - `transactions` is passed by name; previously it was an unnamed argument
#   placed after a named one.
# - NOTE(review): with significance = TRUE the stored values look like
#   p-values of the test rather than the raw statistic (they are on the order
#   of 1e-34 and smaller) - confirm against the interestMeasure documentation.
#   The column name `chi` is kept so any later code that reads it still works.
quality(rules)$chi <- interestMeasure(
  rules,
  measure = "chiSquared",
  transactions = Groceries,
  significance = TRUE
)

# Show the five rules with the highest lift, now including the `chi` column.
inspect(head(sort(rules, by = "lift", decreasing = TRUE), n = 5))
## lhs rhs support confidence lift count chi
## [1] {Instant food products,
## soda} => {hamburger meat} 0.001220132 0.6315789 18.99565 12 4.966566e-48
## [2] {soda,
## popcorn} => {salty snack} 0.001220132 0.6315789 16.69779 12 5.279336e-42
## [3] {flour,
## baking powder} => {sugar} 0.001016777 0.5555556 16.40807 10 1.702941e-34
## [4] {ham,
## processed cheese} => {white bread} 0.001931876 0.6333333 15.04549 19 1.108502e-58
## [5] {whole milk,
## Instant food products} => {hamburger meat} 0.001525165 0.5000000 15.03823 15 2.865097e-46
# Mine a stricter rule set: minimum confidence raised to 0.7 and rules capped
# at five items. `minlen` is not set here, so apriori's default applies.
rules <- apriori(
  Groceries,
  parameter = list(
    support = 0.001,
    confidence = 0.7,
    maxlen = 5,
    target = "rules"  # to mine for rules
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 5 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5
## Warning in apriori(Groceries, parameter = list(support = 0.001, confidence
## = 0.7, : Mining stopped (maxlen reached). Only patterns up to a length of 5
## returned!
## done [0.01s].
## writing ... [1255 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].