# Association Rules (Market Basket Analysis: Identifying
# Frequently-Purchased Groceries)

# install arules only when it is missing, and name a CRAN mirror explicitly:
# a bare install.packages("arules") stops with
# "trying to use CRAN without setting a mirror" when no mirror is configured
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules", repos = "https://cloud.r-project.org")
}

library(arules)

## Loading required package: Matrix
## 
## Attaching package: 'arules'
## 
## The following objects are masked from 'package:base':
## 
##     %in%, write

# load the grocery data into a sparse matrix; each line of the CSV file is
# one transaction whose items are separated by commas
groceries <- read.transactions("groceries.csv", sep = ",")

## Step 2: Exploring and preparing the data ----
# summarize the transaction data: dimensions, density, the most frequent
# items, and the distribution of transaction sizes
summary(groceries)

## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    3.00    4.41    6.00   32.00 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics


# look at the first five transactions; inspect() prints the item labels
# contained in each selected transaction
inspect(groceries[1:5])

##   items                     
## 1 {citrus fruit,            
##    margarine,               
##    ready soups,             
##    semi-finished bread}     
## 2 {coffee,                  
##    tropical fruit,          
##    yogurt}                  
## 3 {whole milk}              
## 4 {cream cheese,            
##    meat spreads,            
##    pip fruit,               
##    yogurt}                  
## 5 {condensed milk,          
##    long life bakery product,
##    other vegetables,        
##    whole milk}


# examine the frequency of items: the share of transactions that contain
# each of the first three items (columns of the sparse matrix are items)
itemFrequency(groceries[, 1:3])

## abrasive cleaner artif. sweetener   baby cosmetics 
##        0.0035587        0.0032537        0.0006101


# plot the frequency of items that appear in at least 10 percent of the
# transactions
itemFrequencyPlot(groceries, support = 0.1)

## plot of chunk unnamed-chunk-1

# plot the frequency of the 20 most frequent items
itemFrequencyPlot(groceries, topN = 20)

## plot of chunk unnamed-chunk-1


# a visualization of the sparse matrix for the first five transactions
image(groceries[1:5])

## plot of chunk unnamed-chunk-1


# visualization of a random sample of 100 transactions
# NOTE(review): no seed is set before this call in the visible script, so the
# sampled transactions (and the picture) will differ from run to run
image(sample(groceries, 100))

## plot of chunk unnamed-chunk-1


## Step 3: Training a model on the data ----
# arules is already attached earlier in this script; repeating the call is
# harmless
library(arules)

# default settings result in zero rules learned: the defaults echoed in the
# parameter specification below (support = 0.1, confidence = 0.8) are too
# strict for this data set
apriori(groceries)

## 
## parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.1      1     10
##  target   ext
##   rules FALSE
## 
## algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.01s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

## set of 0 rules


# relax the thresholds so that rules can actually be learned:
#   support    - an itemset must occur in at least 0.6 percent of the
#                transactions
#   confidence - a rule must be correct at least 25 percent of the time
#   minlen     - a rule must contain at least two items
groceryrules <- apriori(
  groceries,
  parameter = list(
    support = 0.006,
    confidence = 0.25,
    minlen = 2
  )
)

## 
## parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##        0.25    0.1    1 none FALSE            TRUE   0.006      2     10
##  target   ext
##   rules FALSE
## 
## algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# print a one-line description of the learned rule set
groceryrules

## set of 463 rules


## Step 4: Evaluating model performance ----
# summary of grocery association rules: the rule length distribution and the
# range of the support, confidence, and lift measures
summary(groceryrules)

## set of 463 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3   4 
## 150 297  16 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.00    3.00    2.71    3.00    4.00 
## 
## summary of quality measures:
##     support          confidence         lift      
##  Min.   :0.00610   Min.   :0.250   Min.   :0.993  
##  1st Qu.:0.00712   1st Qu.:0.297   1st Qu.:1.623  
##  Median :0.00874   Median :0.355   Median :1.933  
##  Mean   :0.01154   Mean   :0.379   Mean   :2.035  
##  3rd Qu.:0.01230   3rd Qu.:0.450   3rd Qu.:2.356  
##  Max.   :0.07483   Max.   :0.660   Max.   :3.956  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.006       0.25


# look at the first three rules together with their support, confidence,
# and lift
inspect(groceryrules[1:3])

##   lhs                rhs                support confidence  lift
## 1 {potted plants} => {whole milk}      0.006914     0.4000 1.565
## 2 {pasta}         => {whole milk}      0.006101     0.4054 1.587
## 3 {herbs}         => {root vegetables} 0.007016     0.4313 3.956


## Step 5: Improving model performance ----

# sorting grocery rules by lift and showing the first five of the sorted set;
# the output below lists them from the highest lift downwards
inspect(sort(groceryrules, by = "lift")[1:5])

##   lhs                   rhs                   support confidence  lift
## 1 {herbs}            => {root vegetables}    0.007016     0.4313 3.956
## 2 {berries}          => {whipped/sour cream} 0.009049     0.2722 3.797
## 3 {other vegetables,                                                  
##    tropical fruit,                                                    
##    whole milk}       => {root vegetables}    0.007016     0.4107 3.768
## 4 {beef,                                                              
##    other vegetables} => {root vegetables}    0.007931     0.4021 3.689
## 5 {other vegetables,                                                  
##    tropical fruit}   => {pip fruit}          0.009456     0.2635 3.483


# finding subsets of rules containing any berry items: keep the rules that
# include the item "berries", then display them
# NOTE(review): `items` is understood to match against both sides of a rule;
# confirm against the arules subset() documentation
berryrules <- subset(groceryrules, items %in% "berries")
inspect(berryrules)

##   lhs          rhs                   support confidence  lift
## 1 {berries} => {whipped/sour cream} 0.009049     0.2722 3.797
## 2 {berries} => {yogurt}             0.010574     0.3180 2.280
## 3 {berries} => {other vegetables}   0.010269     0.3089 1.596
## 4 {berries} => {whole milk}         0.011795     0.3547 1.388



# finding subsets of rules that precede soda purchases, i.e. rules whose
# right-hand side matches "soda"
# NOTE(review): %pin% is understood to match item labels partially, so any
# label containing "soda" would qualify; in the output below only {soda}
# appears on the right-hand side - confirm that partial matching is intended
sodarules <- subset(groceryrules, rhs %pin% "soda")
inspect(sodarules)

##    lhs                        rhs     support confidence  lift
## 1  {specialty bar}         => {soda} 0.007219     0.2639 1.514
## 2  {misc. beverages}       => {soda} 0.007321     0.2581 1.480
## 3  {candy}                 => {soda} 0.008643     0.2891 1.658
## 4  {dessert}               => {soda} 0.009863     0.2658 1.524
## 5  {chocolate}             => {soda} 0.013523     0.2725 1.563
## 6  {fruit/vegetable juice} => {soda} 0.018404     0.2546 1.460
## 7  {sausage}               => {soda} 0.024301     0.2587 1.483
## 8  {bottled water}         => {soda} 0.028978     0.2622 1.504
## 9  {rolls/buns,                                               
##     shopping bags}         => {soda} 0.006304     0.3229 1.852
## 10 {shopping bags,                                            
##     whole milk}            => {soda} 0.006812     0.2780 1.594
## 11 {rolls/buns,                                               
##     sausage}               => {soda} 0.009659     0.3156 1.810
## 12 {other vegetables,                                         
##     sausage}               => {soda} 0.007219     0.2679 1.536
## 13 {bottled water,                                            
##     yogurt}                => {soda} 0.007422     0.3230 1.852
## 14 {bottled water,                                            
##     rolls/buns}            => {soda} 0.006812     0.2815 1.614
## 15 {rolls/buns,                                               
##     yogurt}                => {soda} 0.008643     0.2515 1.442

# keep the first five soda rules after sorting by lift (the highest-lift
# rules, as the output below shows) and display them
top.soda.rules <- head(sort(sodarules, by = "lift"), 5)
inspect(top.soda.rules)

##   lhs                rhs     support confidence  lift
## 1 {bottled water,                                    
##    yogurt}        => {soda} 0.007422     0.3230 1.852
## 2 {rolls/buns,                                       
##    shopping bags} => {soda} 0.006304     0.3229 1.852
## 3 {rolls/buns,                                       
##    sausage}       => {soda} 0.009659     0.3156 1.810
## 4 {candy}         => {soda} 0.008643     0.2891 1.658
## 5 {bottled water,                                    
##    rolls/buns}    => {soda} 0.006812     0.2815 1.614


# export the rule set as a comma-separated file with quoted fields and no
# row names
write(
  groceryrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

# coerce the rule set to a data frame and show its structure
groceryrules_df <- as(groceryrules, "data.frame")
str(groceryrules_df)

## 'data.frame':    463 obs. of  4 variables:
##  $ rules     : Factor w/ 463 levels "{baking powder} => {other vegetables}",..: 340 302 207 206 208 341 402 21 139 140 ...
##  $ support   : num  0.00691 0.0061 0.00702 0.00773 0.00773 ...
##  $ confidence: num  0.4 0.405 0.431 0.475 0.475 ...
##  $ lift      : num  1.57 1.59 3.96 2.45 1.86 ...