Association Rules

Association rules are the study of “what goes with what”, for example “customers who bought X also bought Y” or “what symptoms go with what diagnosis”. These rules are transaction-based or event-based. The technique is also called “market basket analysis” or “affinity analysis”; it originated with the study of customer transaction databases to determine associations among the items purchased.

I’m using arules, an open-source package available from the Comprehensive R Archive Network (CRAN). It is a powerful tool-set for mining association rules in transactional databases.

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# Load the Groceries transactions and print an overview of the data set
# (dimensions, most frequent items, transaction-size distribution).
data(list = "Groceries")
summary(Groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##        labels  level2           level1
## 1 frankfurter sausage meat and sausage
## 2     sausage sausage meat and sausage
## 3  liver loaf sausage meat and sausage

The summary shows that Groceries contains 9835 transactions over 169 items.

# Preview the first 20 item labels. head() returns fewer elements instead of
# padding the result with NA when a data set has fewer than 20 items.
head(itemLabels(Groceries), n = 20L)
##  [1] "frankfurter"       "sausage"           "liver loaf"       
##  [4] "ham"               "meat"              "finished products"
##  [7] "organic sausage"   "chicken"           "turkey"           
## [10] "pork"              "beef"              "hamburger meat"   
## [13] "fish"              "citrus fruit"      "tropical fruit"   
## [16] "pip fruit"         "grapes"            "berries"          
## [19] "nuts/prunes"       "root vegetables"

To plot the 10 most frequent items, we can use:

# Most frequent items: draw the top-10 bar charts side by side, first with
# relative and then with absolute frequencies.
par(mfrow = c(1, 2))

for (freq_type in c("relative", "absolute")) {
  itemFrequencyPlot(
    Groceries,
    type = freq_type,
    topN = 10, # can be changed to the number of interest
    horiz = TRUE,
    col = "steelblue3",
    xlab = "",
    main = paste0("Item frequency, ", freq_type)
  )
}

# Least frequent items: relative and absolute frequencies side by side.
# The wide left margin leaves room for the horizontal item labels.
par(mar = c(2, 10, 2, 2), mfrow = c(1, 2))

# Count every item occurrence once, sort ascending and keep the 10 rarest
# items; both plots reuse this table instead of recomputing it.
least_frequent <- sort(table(unlist(LIST(Groceries))))[1:10]

# Divide by the actual number of transactions rather than a hard-coded
# 9835, so the plot stays correct if the data set changes.
barplot(least_frequent / length(Groceries),
        horiz = TRUE,
        las = 1,
        col = "steelblue3",
        xlab = "",
        main = "Frequency, relative")

barplot(least_frequent,
        horiz = TRUE,
        las = 1,
        col = "steelblue3",
        xlab = "",
        main = "Frequency, absolute")

# Mine frequent itemsets (not rules) that contain at least two items and
# occur in at least 0.1% of the transactions.
itemsets <- apriori(
  Groceries,
  parameter = list(
    target = "frequent", # to mine for itemsets
    support = 0.001,
    minlen = 2
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [13335 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the mined itemsets: number of sets, size distribution and the
# range of their quality measures.
summary(itemsets)
## set of 13335 itemsets
## 
## most frequent items:
##       whole milk other vegetables           yogurt  root vegetables 
##             3764             3341             2401             1958 
##   tropical fruit          (Other) 
##             1796            27683 
## 
## element (itemset/transaction) length distribution:sizes
##    2    3    4    5    6 
## 2981 6831 3137  376   10 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    3.00    3.00    3.07    4.00    6.00 
## 
## summary of quality measures:
##     support             count       
##  Min.   :0.001017   Min.   : 10.00  
##  1st Qu.:0.001118   1st Qu.: 11.00  
##  Median :0.001423   Median : 14.00  
##  Mean   :0.002259   Mean   : 22.22  
##  3rd Qu.:0.002237   3rd Qu.: 22.00  
##  Max.   :0.074835   Max.   :736.00  
## 
## includes transaction ID lists: FALSE 
## 
## mining info:
##       data ntransactions support confidence
##  Groceries          9835   0.001          1
# Show the five itemsets with the highest support.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
inspect(sort(itemsets, by = "support", decreasing = TRUE)[1:5])
##     items                              support    count
## [1] {other vegetables,whole milk}      0.07483477 736  
## [2] {whole milk,rolls/buns}            0.05663447 557  
## [3] {whole milk,yogurt}                0.05602440 551  
## [4] {root vegetables,whole milk}       0.04890696 481  
## [5] {root vegetables,other vegetables} 0.04738180 466
# Add lift as an extra quality measure for every itemset. The transactions
# are passed by name so the call does not depend on argument position.
quality(itemsets)$lift <- interestMeasure(
  itemsets,
  measure = "lift",
  transactions = Groceries
)

# Show the five itemsets with the highest lift.
inspect(sort(itemsets, by = "lift", decreasing = TRUE)[1:5])
##     items                    support count     lift
## [1] {tropical fruit,                               
##      root vegetables,                              
##      other vegetables,                             
##      whole milk,                                   
##      yogurt,                                       
##      oil}                0.001016777    10 459.3068
## [2] {tropical fruit,                               
##      other vegetables,                             
##      whole milk,                                   
##      butter,                                       
##      yogurt,                                       
##      domestic eggs}      0.001016777    10 399.6002
## [3] {tropical fruit,                               
##      root vegetables,                              
##      other vegetables,                             
##      whole milk,                                   
##      butter,                                       
##      yogurt}             0.001118454    11 255.8634
## [4] {other vegetables,                             
##      curd,                                         
##      yogurt,                                       
##      whipped/sour cream,                           
##      cream cheese }      0.001016777    10 248.7251
## [5] {root vegetables,                              
##      other vegetables,                             
##      whole milk,                                   
##      yogurt,                                       
##      rice}               0.001321810    13 230.5682
# Mine frequent itemsets again, this time restricted to exactly two items
# (minlen = maxlen = 2). apriori() warns that mining stopped at maxlen.
itemsets <- apriori(
  Groceries,
  parameter = list(
    target = "frequent", # to mine for itemsets
    support = 0.001,
    minlen = 2,
    maxlen = 2
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen            target   ext
##       2 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2
## Warning in apriori(Groceries, parameter = list(support = 0.001, minlen =
## 2, : Mining stopped (maxlen reached). Only patterns up to a length of 2
## returned!
##  done [0.00s].
## writing ... [2981 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Add lift as an extra quality measure for the two-item itemsets. The
# transactions are passed by name so the call does not depend on position.
quality(itemsets)$lift <- interestMeasure(
  itemsets,
  measure = "lift",
  transactions = Groceries
)

# Show the ten item pairs with the highest lift.
inspect(sort(itemsets, by = "lift", decreasing = TRUE)[1:10])
##      items                                  support     count lift     
## [1]  {mayonnaise,mustard}                   0.001423488 14    12.965160
## [2]  {hamburger meat,Instant food products} 0.003050330 30    11.421438
## [3]  {detergent,softener}                   0.001118454 11    10.600137
## [4]  {liquor,red/blush wine}                0.002135231 21    10.025484
## [5]  {flour,sugar}                          0.004982206 49     8.463112
## [6]  {salty snack,popcorn}                  0.002236909 22     8.192110
## [7]  {ham,processed cheese}                 0.003050330 30     7.070792
## [8]  {hamburger meat,sauces}                0.001220132 12     6.683656
## [9]  {cream cheese ,meat spreads}           0.001118454 11     6.604701
## [10] {detergent,house keeping products}     0.001016777 10     6.345980
# Mine association rules with at least two items (lhs + rhs), a support of
# at least 0.1% and a confidence of at least 50%.
rules <- apriori(
  Groceries,
  parameter = list(
    target = "rules", # to mine for rules
    support = 0.001,
    confidence = 0.5,
    minlen = 2
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [5668 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summarise the mined rules: number of rules, rule length distribution and
# the range of support, confidence, lift and count.
summary(rules)
## set of 5668 rules
## 
## rule length distribution (lhs + rhs):sizes
##    2    3    4    5    6 
##   11 1461 3211  939   46 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    3.00    4.00    3.92    4.00    6.00 
## 
## summary of quality measures:
##     support           confidence          lift            count      
##  Min.   :0.001017   Min.   :0.5000   Min.   : 1.957   Min.   : 10.0  
##  1st Qu.:0.001118   1st Qu.:0.5455   1st Qu.: 2.464   1st Qu.: 11.0  
##  Median :0.001322   Median :0.6000   Median : 2.899   Median : 13.0  
##  Mean   :0.001668   Mean   :0.6250   Mean   : 3.262   Mean   : 16.4  
##  3rd Qu.:0.001729   3rd Qu.:0.6842   3rd Qu.: 3.691   3rd Qu.: 17.0  
##  Max.   :0.022267   Max.   :1.0000   Max.   :18.996   Max.   :219.0  
## 
## mining info:
##       data ntransactions support confidence
##  Groceries          9835   0.001        0.5
# Show the five rules with the highest lift.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
inspect(sort(rules, by = "lift", decreasing = TRUE)[1:5])
##     lhs                        rhs                  support confidence     lift count
## [1] {Instant food products,                                                          
##      soda}                  => {hamburger meat} 0.001220132  0.6315789 18.99565    12
## [2] {soda,                                                                           
##      popcorn}               => {salty snack}    0.001220132  0.6315789 16.69779    12
## [3] {flour,                                                                          
##      baking powder}         => {sugar}          0.001016777  0.5555556 16.40807    10
## [4] {ham,                                                                            
##      processed cheese}      => {white bread}    0.001931876  0.6333333 15.04549    19
## [5] {whole milk,                                                                     
##      Instant food products} => {hamburger meat} 0.001525165  0.5000000 15.03823    15
# Add a chi-squared test result for every rule. With significance = TRUE
# the stored value is the p-value of the test, not the test statistic.
# The measure name is spelled out instead of relying on partial matching.
quality(rules)$chi <- interestMeasure(
  rules,
  measure = "chiSquared",
  transactions = Groceries,
  significance = TRUE
)

# Show the five rules with the highest lift, now including the chi column.
inspect(sort(rules, by = "lift", decreasing = TRUE)[1:5])
##     lhs                        rhs                  support confidence     lift count          chi
## [1] {Instant food products,                                                                       
##      soda}                  => {hamburger meat} 0.001220132  0.6315789 18.99565    12 4.966566e-48
## [2] {soda,                                                                                        
##      popcorn}               => {salty snack}    0.001220132  0.6315789 16.69779    12 5.279336e-42
## [3] {flour,                                                                                       
##      baking powder}         => {sugar}          0.001016777  0.5555556 16.40807    10 1.702941e-34
## [4] {ham,                                                                                         
##      processed cheese}      => {white bread}    0.001931876  0.6333333 15.04549    19 1.108502e-58
## [5] {whole milk,                                                                                  
##      Instant food products} => {hamburger meat} 0.001525165  0.5000000 15.03823    15 2.865097e-46
# Mine a stricter rule set: confidence of at least 70% and at most five
# items per rule. apriori() warns that mining stopped at maxlen.
rules <- apriori(
  Groceries,
  parameter = list(
    target = "rules", # to mine for rules
    support = 0.001,
    confidence = 0.7,
    maxlen = 5
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##       5  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5
## Warning in apriori(Groceries, parameter = list(support = 0.001, confidence
## = 0.7, : Mining stopped (maxlen reached). Only patterns up to a length of 5
## returned!
##  done [0.01s].
## writing ... [1255 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].