library(openxlsx)
library(tm)
library(car)
library(foreign) 
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)
library(arules)

This project explores a basic application of market basket analysis using association rule algorithms. The data are used for practice and were drawn from the “grocery” data and the text “Machine Learning with R”.

Apriori

# read the comma-separated basket file into a sparse transactions object
groceries <- read.transactions(
  file = "C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\ML\\grocery.txt",
  sep = ","
)
## Warning in readLines(file, encoding = encoding): incomplete final line found on
## 'C:\Users\Jaire\OneDrive\Desktop\Exploratory Research\ML\grocery.txt'
# check data: prints the matrix dimensions and density, the most frequent
# items, and the distribution of transaction sizes
summary(groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics
# list the items in the first ten transactions
inspect(groceries[seq_len(10)])
##      items                      
## [1]  {citrus fruit,             
##       margarine,                
##       ready soups,              
##       semi-finished bread}      
## [2]  {coffee,                   
##       tropical fruit,           
##       yogurt}                   
## [3]  {whole milk}               
## [4]  {cream cheese,             
##       meat spreads,             
##       pip fruit,                
##       yogurt}                   
## [5]  {condensed milk,           
##       long life bakery product, 
##       other vegetables,         
##       whole milk}               
## [6]  {abrasive cleaner,         
##       butter,                   
##       rice,                     
##       whole milk,               
##       yogurt}                   
## [7]  {rolls/buns}               
## [8]  {bottled beer,             
##       liquor (appetizer),       
##       other vegetables,         
##       rolls/buns,               
##       UHT-milk}                 
## [9]  {pot plants}               
## [10] {cereals,                  
##       whole milk}
# relative frequency (support) of the first ten item columns
itemFrequency(groceries[, seq_len(10)], type = "relative")
## abrasive cleaner artif. sweetener   baby cosmetics        baby food 
##     0.0035587189     0.0032536858     0.0006100661     0.0001016777 
##             bags    baking powder bathroom cleaner             beef 
##     0.0004067107     0.0176919166     0.0027452974     0.0524656838 
##          berries        beverages 
##     0.0332486019     0.0260294865
# visually assess the sparse matrix: item support, ranking, and transactions
# items whose support is at least 6%
itemFrequencyPlot(groceries, support = 0.06)

# the ten most frequent items
itemFrequencyPlot(groceries, topN = 10)

# item-by-transaction matrix for the first ten transactions
image(groceries[1:10])

# same view for a random sample of 500 transactions; the seed is fixed so the
# sampled plot is reproducible from run to run
set.seed(123)
image(sample(groceries, 500))

# baseline run with apriori's defaults (support = 0.1, confidence = 0.8),
# before any parameters are tuned
apriori(data = groceries)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 983 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## set of 0 rules

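No rules were learned: at the default support of 0.1 (an absolute minimum count of 983 transactions), only 8 of the 169 items are frequent enough, and no rule built from them meets the default thresholds.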

# train the model with looser thresholds: rules need support >= 0.006,
# confidence >= 0.25, and at least two items (minlen = 2)
groceryrules <- apriori(
  data = groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5   0.006      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 59 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# summarize the mined rules: rule-length distribution and quality measures
# (support, confidence, coverage, lift, count)
summary(groceryrules)
## set of 463 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3   4 
## 150 297  16 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.711   3.000   4.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.006101   Min.   :0.2500   Min.   :0.009964   Min.   :0.9932  
##  1st Qu.:0.007117   1st Qu.:0.2971   1st Qu.:0.018709   1st Qu.:1.6229  
##  Median :0.008744   Median :0.3554   Median :0.024809   Median :1.9332  
##  Mean   :0.011539   Mean   :0.3786   Mean   :0.032608   Mean   :2.0351  
##  3rd Qu.:0.012303   3rd Qu.:0.4495   3rd Qu.:0.035892   3rd Qu.:2.3565  
##  Max.   :0.074835   Max.   :0.6600   Max.   :0.255516   Max.   :3.9565  
##      count      
##  Min.   : 60.0  
##  1st Qu.: 70.0  
##  Median : 86.0  
##  Mean   :113.5  
##  3rd Qu.:121.0  
##  Max.   :736.0  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.006       0.25
##                                                                                         call
##  apriori(data = groceries, parameter = list(support = 0.006, confidence = 0.25, minlen = 2))

The object contains 463 rules.

# show the first ten rules in their stored order
inspect(groceryrules[seq_len(10)])
##      lhs                      rhs                support     confidence
## [1]  {pot plants}          => {whole milk}       0.006914082 0.4000000 
## [2]  {pasta}               => {whole milk}       0.006100661 0.4054054 
## [3]  {herbs}               => {root vegetables}  0.007015760 0.4312500 
## [4]  {herbs}               => {other vegetables} 0.007727504 0.4750000 
## [5]  {herbs}               => {whole milk}       0.007727504 0.4750000 
## [6]  {processed cheese}    => {whole milk}       0.007015760 0.4233129 
## [7]  {semi-finished bread} => {whole milk}       0.007117438 0.4022989 
## [8]  {beverages}           => {whole milk}       0.006812405 0.2617188 
## [9]  {detergent}           => {other vegetables} 0.006405694 0.3333333 
## [10] {detergent}           => {whole milk}       0.008947636 0.4656085 
##      coverage   lift     count
## [1]  0.01728521 1.565460 68   
## [2]  0.01504830 1.586614 60   
## [3]  0.01626843 3.956477 69   
## [4]  0.01626843 2.454874 76   
## [5]  0.01626843 1.858983 76   
## [6]  0.01657346 1.656698 69   
## [7]  0.01769192 1.574457 70   
## [8]  0.02602949 1.024275 67   
## [9]  0.01921708 1.722719 63   
## [10] 0.01921708 1.822228 88

Some of the rules are a bit trivial, but others seem actionable.

# rank the rules by lift (highest first) and show the ten strongest
inspect(sort(groceryrules, by = "lift", decreasing = TRUE)[seq_len(10)])
##      lhs                    rhs                      support confidence   coverage     lift count
## [1]  {herbs}             => {root vegetables}    0.007015760  0.4312500 0.01626843 3.956477    69
## [2]  {berries}           => {whipped/sour cream} 0.009049314  0.2721713 0.03324860 3.796886    89
## [3]  {other vegetables,                                                                          
##       tropical fruit,                                                                            
##       whole milk}        => {root vegetables}    0.007015760  0.4107143 0.01708185 3.768074    69
## [4]  {beef,                                                                                      
##       other vegetables}  => {root vegetables}    0.007930859  0.4020619 0.01972547 3.688692    78
## [5]  {other vegetables,                                                                          
##       tropical fruit}    => {pip fruit}          0.009456024  0.2634561 0.03589222 3.482649    93
## [6]  {beef,                                                                                      
##       whole milk}        => {root vegetables}    0.008032537  0.3779904 0.02125064 3.467851    79
## [7]  {other vegetables,                                                                          
##       pip fruit}         => {tropical fruit}     0.009456024  0.3618677 0.02613116 3.448613    93
## [8]  {pip fruit,                                                                                 
##       yogurt}            => {tropical fruit}     0.006405694  0.3559322 0.01799695 3.392048    63
## [9]  {citrus fruit,                                                                              
##       other vegetables}  => {root vegetables}    0.010371124  0.3591549 0.02887646 3.295045   102
## [10] {other vegetables,                                                                          
##       whole milk,                                                                                
##       yogurt}            => {tropical fruit}     0.007625826  0.3424658 0.02226741 3.263712    75
# pull out rules that mention specific items to see what else is in the basket
# rules that involve "herbs"
herbsrules <- subset(groceryrules, items %in% "herbs")
# rules that involve "herbs" or "root vegetables"
# (%in% keeps a rule when it contains any of the listed items)
herbs_veggiesrules <- subset(
  groceryrules,
  items %in% c("herbs", "root vegetables")
)
inspect(herbsrules)
##     lhs        rhs                support     confidence coverage   lift    
## [1] {herbs} => {root vegetables}  0.007015760 0.43125    0.01626843 3.956477
## [2] {herbs} => {other vegetables} 0.007727504 0.47500    0.01626843 2.454874
## [3] {herbs} => {whole milk}       0.007727504 0.47500    0.01626843 1.858983
##     count
## [1] 69   
## [2] 76   
## [3] 76
# five highest-lift rules among those involving herbs or root vegetables
inspect(sort(herbs_veggiesrules, by = "lift", decreasing = TRUE)[seq_len(5)])
##     lhs                    rhs                   support confidence   coverage     lift count
## [1] {herbs}             => {root vegetables} 0.007015760  0.4312500 0.01626843 3.956477    69
## [2] {other vegetables,                                                                       
##      tropical fruit,                                                                         
##      whole milk}        => {root vegetables} 0.007015760  0.4107143 0.01708185 3.768074    69
## [3] {beef,                                                                                   
##      other vegetables}  => {root vegetables} 0.007930859  0.4020619 0.01972547 3.688692    78
## [4] {beef,                                                                                   
##      whole milk}        => {root vegetables} 0.008032537  0.3779904 0.02125064 3.467851    79
## [5] {citrus fruit,                                                                           
##      other vegetables}  => {root vegetables} 0.010371124  0.3591549 0.02887646 3.295045   102

Herbs are frequently purchased together with root vegetables, other vegetables, and whole milk. I also examine rules that involve root vegetables.

# store rules as csv or df
# write(groceryrules, file = "groceryrules.csv", sep = ",", quote = TRUE, row.names = FALSE)
# coerce the rules object to a data frame
groceryrules_df <- as(object = groceryrules, Class = "data.frame")
# print the structure of the resulting rule data frame
str(object = groceryrules_df)
## 'data.frame':    463 obs. of  6 variables:
##  $ rules     : chr  "{pot plants} => {whole milk}" "{pasta} => {whole milk}" "{herbs} => {root vegetables}" "{herbs} => {other vegetables}" ...
##  $ support   : num  0.00691 0.0061 0.00702 0.00773 0.00773 ...
##  $ confidence: num  0.4 0.405 0.431 0.475 0.475 ...
##  $ coverage  : num  0.0173 0.015 0.0163 0.0163 0.0163 ...
##  $ lift      : num  1.57 1.59 3.96 2.45 1.86 ...
##  $ count     : int  68 60 69 76 76 69 70 67 63 88 ...