Load packages for Association Rules

We will utilize purchase data from one month of operation at a grocery store. The data contain 9,835 transactions, or about 328 transactions per day. If we ignore brands and consider only product type, there are 169 distinct items in total. Any guesses about which types of items might be purchased together? Will wine and cheese be a common pairing? Bread and butter? Milk and eggs? Let’s dig into this data and see if we can confirm our guesses.

library(arules)
library(arulesViz)
library(colorspace)
# Load the Groceries transactions, keep a data.frame copy of the item
# sets for row-wise viewing, then print the sparse object itself.
data("Groceries")
groceries <- as(Groceries, "data.frame")
Groceries
## transactions in sparse format with
##  9835 transactions (rows) and
##  169 items (columns)
# Peek at the first rows of the data.frame view (one item set per row).
head(groceries)
##                                                                   items
## 1              {citrus fruit,semi-finished bread,margarine,ready soups}
## 2                                        {tropical fruit,yogurt,coffee}
## 3                                                          {whole milk}
## 4                         {pip fruit,yogurt,cream cheese ,meat spreads}
## 5 {other vegetables,whole milk,condensed milk,long life bakery product}
## 6                      {whole milk,butter,yogurt,rice,abrasive cleaner}

Let us find the items that occur most often in the transaction data

# Overall picture: matrix density, most frequent items, and the
# distribution of transaction lengths.
summary(Groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##        labels  level2           level1
## 1 frankfurter sausage meet and sausage
## 2     sausage sausage meet and sausage
## 3  liver loaf sausage meet and sausage

Observations

Find item frequency

# Relative frequency (share of transactions) of the first five items.
itemFrequency(Groceries[, seq_len(5)])
## frankfurter     sausage  liver loaf         ham        meat 
## 0.058973055 0.093950178 0.005083884 0.026029487 0.025826131

Visualize item support - item frequency plots

# Items whose support is at least 10%.
itemFrequencyPlot(Groceries, support = 0.1)

# Lower the bar to 5% to bring more items into the chart.
itemFrequencyPlot(Groceries, support = 0.05)

# Alternatively, show the 20 most frequent items.
itemFrequencyPlot(Groceries, topN = 20)

Visualizing the first five transactions

# Sparse-matrix picture of the first five transactions.
image(Groceries[seq_len(5)])

Visualizing a random sample of 100 transactions

# Same picture for 100 randomly drawn transactions. No seed is set at
# this point, so the sample differs between runs unless one was set
# earlier.
image(sample(Groceries, size = 100))

Implementation of Apriori algorithm

We will run the Apriori algorithm to find associations among shopping-cart items

# First attempt with the default thresholds (support 0.1, confidence 0.8
# per the parameter listing printed below); this run finds no rules.
apriori(Groceries)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.1      1     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## set of 0 rules

Support

  • One way to choose a support threshold is to think about the minimum number of transactions an item set must appear in to be interesting.
  • For example, if an item is purchased three times a day (about 90 times over the month), it may be worth a look.
  • The support threshold is then 90 out of 9,835 transactions, i.e. about 0.009.

Confidence

  • We will set a confidence threshold of 0.25, which means that in order to be included in the results, the rule has to be correct at least 25 percent of the time.
  • This will eliminate the most unreliable rules while allowing some room for us to modify behavior with targeted promotions.

In addition, we set minlen = 2 to eliminate rules that contain fewer than two items.

New apriori algorithm

# Re-run with the relaxed support and confidence thresholds, and require
# at least two items per rule.
grules <- apriori(
  Groceries,
  parameter = list(support = 0.009, confidence = 0.25, minlen = 2)
)
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##        0.25    0.1    1 none FALSE            TRUE   0.009      2     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [93 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [224 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Print the rule set to see how many rules were found.
grules
## set of 224 rules

Evaluating performance

# Rule-length distribution plus summary statistics for support,
# confidence and lift.
summary(grules)
## set of 224 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3 
## 111 113 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.504   3.000   3.000 
## 
## summary of quality measures:
##     support           confidence          lift       
##  Min.   :0.009049   Min.   :0.2513   Min.   :0.9932  
##  1st Qu.:0.010066   1st Qu.:0.2974   1st Qu.:1.5767  
##  Median :0.012303   Median :0.3603   Median :1.8592  
##  Mean   :0.016111   Mean   :0.3730   Mean   :1.9402  
##  3rd Qu.:0.018480   3rd Qu.:0.4349   3rd Qu.:2.2038  
##  Max.   :0.074835   Max.   :0.6389   Max.   :3.7969  
## 
## mining info:
##       data ntransactions support confidence
##  Groceries          9835   0.009       0.25

Take a look at some rules

# Show the first ten rules in their stored order.
inspect(grules[seq_len(10)])
##    lhs                rhs                    support confidence     lift
## 1  {baking powder} => {whole milk}       0.009252669  0.5229885 2.046793
## 2  {grapes}        => {other vegetables} 0.009049314  0.4045455 2.090754
## 3  {meat}          => {other vegetables} 0.009964413  0.3858268 1.994013
## 4  {meat}          => {whole milk}       0.009964413  0.3858268 1.509991
## 5  {frozen meals}  => {whole milk}       0.009862735  0.3476703 1.360659
## 6  {hard cheese}   => {other vegetables} 0.009456024  0.3858921 1.994350
## 7  {hard cheese}   => {whole milk}       0.010066090  0.4107884 1.607682
## 8  {butter milk}   => {other vegetables} 0.010371124  0.3709091 1.916916
## 9  {butter milk}   => {whole milk}       0.011591256  0.4145455 1.622385
## 10 {ham}           => {other vegetables} 0.009150991  0.3515625 1.816930

Understanding association rules

A common approach is to take the result of learning association rules and divide them into three categories: actionable, trivial, and inexplicable.

Deeper look - sorting the set of association rules

# Ten rules with the highest lift.
inspect(sort(grules, by = "lift")[seq_len(10)])
##    lhs                   rhs                      support confidence     lift
## 1  {berries}          => {whipped/sour cream} 0.009049314  0.2721713 3.796886
## 2  {tropical fruit,                                                          
##     other vegetables} => {pip fruit}          0.009456024  0.2634561 3.482649
## 3  {pip fruit,                                                               
##     other vegetables} => {tropical fruit}     0.009456024  0.3618677 3.448613
## 4  {citrus fruit,                                                            
##     other vegetables} => {root vegetables}    0.010371124  0.3591549 3.295045
## 5  {tropical fruit,                                                          
##     other vegetables} => {root vegetables}    0.012302999  0.3427762 3.144780
## 6  {tropical fruit,                                                          
##     other vegetables} => {citrus fruit}       0.009049314  0.2521246 3.046248
## 7  {beef}             => {root vegetables}    0.017386884  0.3313953 3.040367
## 8  {citrus fruit,                                                            
##     root vegetables}  => {other vegetables}   0.010371124  0.5862069 3.029608
## 9  {tropical fruit,                                                          
##     root vegetables}  => {other vegetables}   0.012302999  0.5845411 3.020999
## 10 {citrus fruit,                                                            
##     other vegetables} => {tropical fruit}     0.009049314  0.3133803 2.986526

Subset of association rules

# Keep only rules that mention beef or sausage on either side, then list
# the ten with the highest lift. Despite its name, `beefrules` also holds
# the sausage rules.
beefrules <- subset(grules, items %in% c("beef", "sausage"))
inspect(sort(beefrules, by = "lift")[seq_len(10)])
##    lhs                   rhs                    support confidence     lift
## 1  {beef}             => {root vegetables}  0.017386884  0.3313953 3.040367
## 2  {rolls/buns,                                                            
##     soda}             => {sausage}          0.009659380  0.2519894 2.682160
## 3  {beef,                                                                  
##     whole milk}       => {other vegetables} 0.009252669  0.4354067 2.250250
## 4  {sausage,                                                               
##     soda}             => {rolls/buns}       0.009659380  0.3974895 2.161034
## 5  {beef}             => {other vegetables} 0.019725470  0.3759690 1.943066
## 6  {beef,                                                                  
##     other vegetables} => {whole milk}       0.009252669  0.4690722 1.835784
## 7  {sausage,                                                               
##     rolls/buns}       => {soda}             0.009659380  0.3156146 1.809953
## 8  {sausage}          => {rolls/buns}       0.030604982  0.3257576 1.771048
## 9  {sausage,                                                               
##     whole milk}       => {other vegetables} 0.010167768  0.3401361 1.757876
## 10 {sausage,                                                               
##     whole milk}       => {rolls/buns}       0.009354347  0.3129252 1.701282

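The subset() function can be used with several keywords and operators: the keyword items matches an item anywhere in a rule, while lhs and rhs restrict the match to one side; %in% matches any of the listed items, %pin% does partial name matching, and %ain% requires all of the listed items; conditions can be combined with the logical operators & (and), | (or), and ! (not).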

Association rules - Scatter Plot

# Scatter plot of the rules with the plotting defaults.
plot(grules)

# Support against lift, with points shaded by confidence.
plot(grules, measure = c("support", "lift"), shading = "confidence")

Shading by order (number of items contained in the rule)

# Shade points by rule order (number of items) and title the plot.
plot(grules, shading = "order", control = list(main = "Two-key plot"))

Interactive Scatter Plot

# Interactive version of the support/lift scatter plot.
# NOTE(review): later arulesViz releases document `interactive` as
# deprecated in favour of `engine` -- confirm against the installed version.
plot(
  grules,
  measure = c("support", "lift"),
  shading = "confidence",
  interactive = TRUE
)

Matrix-based visualization

# Matrix view of the beef/sausage rules, using lift as the measure.
plot(beefrules, method = "matrix", measure = "lift")
## Itemsets in Antecedent (LHS)
## [1] "{beef}"                     "{sausage}"                 
## [3] "{beef,other vegetables}"    "{beef,whole milk}"         
## [5] "{sausage,soda}"             "{sausage,rolls/buns}"      
## [7] "{rolls/buns,soda}"          "{sausage,whole milk}"      
## [9] "{sausage,other vegetables}"
## Itemsets in Consequent (RHS)
## [1] "{root vegetables}"  "{rolls/buns}"       "{other vegetables}"
## [4] "{whole milk}"       "{soda}"             "{sausage}"

# Same matrix view, with reordering switched on.
plot(
  beefrules,
  method = "matrix",
  measure = "lift",
  control = list(reorder = TRUE)
)
## Itemsets in Antecedent (LHS)
## [1] "{beef}"                     "{beef,whole milk}"         
## [3] "{sausage,whole milk}"       "{sausage,soda}"            
## [5] "{sausage}"                  "{sausage,rolls/buns}"      
## [7] "{beef,other vegetables}"    "{sausage,other vegetables}"
## [9] "{rolls/buns,soda}"         
## Itemsets in Consequent (RHS)
## [1] "{sausage}"          "{soda}"             "{whole milk}"      
## [4] "{root vegetables}"  "{other vegetables}" "{rolls/buns}"

# Two measures at once (lift and confidence), coloured with a 200-step
# sequential HCL palette.
plot(
  beefrules,
  method = "matrix",
  measure = c("lift", "confidence"),
  control = list(reorder = TRUE, col = sequential_hcl(200))
)
## Itemsets in Antecedent (LHS)
## [1] "{beef}"                     "{sausage}"                 
## [3] "{sausage,rolls/buns}"       "{beef,other vegetables}"   
## [5] "{sausage,other vegetables}" "{sausage,soda}"            
## [7] "{sausage,whole milk}"       "{beef,whole milk}"         
## [9] "{rolls/buns,soda}"         
## Itemsets in Consequent (RHS)
## [1] "{sausage}"          "{soda}"             "{whole milk}"      
## [4] "{root vegetables}"  "{other vegetables}" "{rolls/buns}"

# Two measures at once (support and confidence), coloured with a 200-step
# sequential HCL palette.
plot(
  beefrules,
  method = "matrix",
  measure = c("support", "confidence"),
  control = list(reorder = TRUE, col = sequential_hcl(200))
)
## Itemsets in Antecedent (LHS)
## [1] "{sausage}"                  "{beef}"                    
## [3] "{sausage,rolls/buns}"       "{sausage,other vegetables}"
## [5] "{beef,other vegetables}"    "{rolls/buns,soda}"         
## [7] "{beef,whole milk}"          "{sausage,whole milk}"      
## [9] "{sausage,soda}"            
## Itemsets in Consequent (RHS)
## [1] "{root vegetables}"  "{sausage}"          "{soda}"            
## [4] "{rolls/buns}"       "{other vegetables}" "{whole milk}"

Grouped Matrix based visualization

# Grouped matrix of the beef/sausage rules, measured by support.
plot(
  beefrules,
  method = "grouped",
  measure = "support",
  control = list(col = sequential_hcl(100))
)

# Grouped matrix of the same rules, measured by confidence.
plot(
  beefrules,
  method = "grouped",
  measure = "confidence",
  control = list(col = sequential_hcl(100))
)

Graph based visualizations

# Graph layout of the beef/sausage rules, using the "items" graph type.
plot(beefrules, method = "graph", control = list(type = "items"))

Parallel coordinates plot

# Parallel-coordinates view of the beef/sausage rules, with reordering
# switched on.
plot(beefrules, method = "paracoord", control = list(reorder = TRUE))

Saving Association rules

# Export the full rule set to a quoted, comma-separated file without
# row names.
write(
  grules,
  file = "grules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

Summary