Association-Rules-IP-Week-3.knit

# loading the dataset using the fread function
library(data.table)

# Import the raw basket file for a quick look at its contents.
# Each line holds one transaction with a varying number of comma-separated
# items, and the file appears to have no header row, so:
#   header = FALSE  keeps the first transaction as data instead of column names
#   fill = TRUE     pads shorter transactions with blank fields instead of
#                   stopping at the first line with a different field count
# NOTE(review): the output recorded below predates this change; re-knit to
# refresh it.
df <- fread(
  "C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Sales_Dataset II.csv",
  header = FALSE,
  sep = ",",
  fill = TRUE
)

## Warning in fread("C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\
## \Supermarket_Sales_Dataset II.csv"): Detected 2 column names but the data has
## 3 columns (i.e. invalid file). Added 1 extra default column name for the first
## column which is guessed to be row names or an index. Use setnames() afterwards
## if this guess is not correct, or fix the file write command that created the
## file to create a valid file.

## Warning in fread("C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\
## \Supermarket_Sales_Dataset II.csv"): Stopped early on line 10. Expected 3 fields
## but found 1. Consider fill=TRUE and comment.char=. First discarded non-empty
## line: <<french fries>>

# Show the first six rows of the imported table
head(df, n = 6L)

##                   V1 whole wheat pasta french fries
## 1:              soup       light cream      shallot
## 2: frozen vegetables         spaghetti    green tea

# loading the arules library
library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

# Load the comma-separated basket file as an arules "transactions" object:
# one line per transaction, one item per comma-separated field.
# rm.duplicates = TRUE removes items repeated within a transaction up front,
# instead of relying on the coercion to drop them with a warning.
path <- "C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Sales_Dataset II.csv"
dfa <- read.transactions(
  path,
  format = "basket",
  sep = ",",
  rm.duplicates = TRUE
)

## Warning in asMethod(object): removing duplicated items in transactions

# Print the transactions object to see how many transactions and items it holds
dfa

## transactions in sparse format with
##  7501 transactions (rows) and
##  119 items (columns)

# we have 7501 transactions (rows) and 119 distinct items (columns)

# Verifying the object's class
# ---
# The association-rule functions used later operate on arules "transactions"
# objects, so confirm that the import produced one.
# ---
# 
class(dfa)

## [1] "transactions"
## attr(,"package")
## [1] "arules"

# List the items contained in each of the first five transactions
#
inspect(dfa[seq_len(5)])

##     items               
## [1] {almonds,           
##      antioxydant juice, 
##      avocado,           
##      cottage cheese,    
##      energy drink,      
##      frozen smoothie,   
##      green grapes,      
##      green tea,         
##      honey,             
##      low fat yogurt,    
##      mineral water,     
##      olive oil,         
##      salad,             
##      salmon,            
##      shrimp,            
##      spinach,           
##      tomato juice,      
##      vegetables mix,    
##      whole weat flour,  
##      yams}              
## [2] {burgers,           
##      eggs,              
##      meatballs}         
## [3] {chutney}           
## [4] {avocado,           
##      turkey}            
## [5] {energy bar,        
##      green tea,         
##      milk,              
##      mineral water,     
##      whole wheat rice}

# Collect the item labels of the dataset in a one-column data frame named
# "Item" and show the first ten of them
dfb <- data.frame(Item = itemLabels(dfa))
head(dfb, n = 10)

##                 Item
## 1            almonds
## 2  antioxydant juice
## 3          asparagus
## 4            avocado
## 5        babies food
## 6              bacon
## 7     barbecue sauce
## 8          black tea
## 9        blueberries
## 10        body spray

# Summarise the transactions: dimensions and density, the most frequent
# items, and the distribution of transaction sizes (items per transaction)
summary(dfa)

## transactions as itemMatrix in sparse format with
##  7501 rows (elements/itemsets/transactions) and
##  119 columns (items) and a density of 0.03288973 
## 
## most frequent items:
## mineral water          eggs     spaghetti  french fries     chocolate 
##          1788          1348          1306          1282          1229 
##       (Other) 
##         22405 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17    4 
##   18   19   20 
##    1    2    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.914   5.000  20.000 
## 
## includes extended item information - examples:
##              labels
## 1           almonds
## 2 antioxydant juice
## 3         asparagus

# mineral water, eggs, spaghetti, french fries and chocolate are Carrefour's five most frequently purchased items

# Exploring item frequencies: first the absolute number of transactions
# containing each of items 6 to 10, then (next call) the relative frequency of
# items 30 to 40 expressed as a percentage of all transactions

itemFrequency(dfa, type = "absolute")[6:10]

##          bacon barbecue sauce      black tea    blueberries     body spray 
##             65             81            107             69             86

# Share of all transactions (in percent, 2 decimals) containing items 30 to 40
round(100 * itemFrequency(dfa, type = "relative")[30:40], digits = 2)

##        cookies    cooking oil           corn cottage cheese          cream 
##           8.04           5.11           0.48           3.19           0.09 
##   dessert wine       eggplant           eggs     energy bar   energy drink 
##           0.44           1.32          17.97           2.71           2.67 
##       escalope 
##           7.93

# Chart item frequencies in two side-by-side panels:
#   left  - the 10 most common items in the Carrefour dataset
#   right - every item whose relative support is at least 10%

# par() returns the settings it replaces; keep them so the 1 x 2 layout can be
# undone once both panels are drawn.
old_par <- par(mfrow = c(1, 2))

# left panel: top 10 items by frequency
itemFrequencyPlot(dfa, topN = 10, col = "darkblue")
## mineral water, eggs, spaghetti, french fries and chocolate are among carrefour's most purchased items as displayed in the plot
# right panel: items with a support of at least 0.1
itemFrequencyPlot(dfa, support = 0.1, col = "darkred")

# restore the previous layout so later plots are not drawn into this grid
par(old_par)

# Mine association rules with the Apriori algorithm.
#   support = 0.001   an itemset must appear in at least 0.1% of the 7501
#                     transactions, i.e. in 8 or more of them
#   confidence = 0.6  the rhs must be present in at least 60% of the
#                     transactions that contain the lhs
# Parameter names are spelled out in full rather than abbreviated.
rules2 <- apriori(dfa, parameter = list(support = 0.001, confidence = 0.6))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 7 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [545 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Print the rule set to see how many rules were generated
rules2

## set of 545 rules

# this gives us 545 rules to work with

# Summarise the rules: distribution of rule lengths (lhs + rhs), the range of
# the quality measures (support, confidence, coverage, lift, count) and the
# mining parameters that were used
summary(rules2)

## set of 545 rules
## 
## rule length distribution (lhs + rhs):sizes
##   3   4   5   6 
## 146 329  67   3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   3.000   4.000   3.866   4.000   6.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.001067   Min.   :0.6000   Min.   :0.001067   Min.   : 2.517  
##  1st Qu.:0.001067   1st Qu.:0.6250   1st Qu.:0.001600   1st Qu.: 2.797  
##  Median :0.001200   Median :0.6667   Median :0.001866   Median : 3.446  
##  Mean   :0.001409   Mean   :0.6893   Mean   :0.002081   Mean   : 3.889  
##  3rd Qu.:0.001466   3rd Qu.:0.7273   3rd Qu.:0.002266   3rd Qu.: 4.177  
##  Max.   :0.005066   Max.   :1.0000   Max.   :0.007999   Max.   :34.970  
##      count      
##  Min.   : 8.00  
##  1st Qu.: 8.00  
##  Median : 9.00  
##  Mean   :10.57  
##  3rd Qu.:11.00  
##  Max.   :38.00  
## 
## mining info:
##  data ntransactions support confidence
##   dfa          7501   0.001        0.6
##                                                             call
##  apriori(data = dfa, parameter = list(supp = 0.001, conf = 0.6))

# most of our rules contain 3 or 4 items (lhs + rhs combined); 67 rules contain 5 items and only 3 rules contain 6 items

# Observing rules built in our model i.e. first 10 model rules
# ---
# head() returns at most 10 rules, so unlike rules2[1:10] it does not fail
# when the rule set holds fewer than 10 rules.
inspect(head(rules2, n = 10))

##      lhs                           rhs              support     confidence
## [1]  {cookies, shallot}         => {low fat yogurt} 0.001199840 0.6000000 
## [2]  {low fat yogurt, shallot}  => {cookies}        0.001199840 0.6923077 
## [3]  {cookies, shallot}         => {green tea}      0.001199840 0.6000000 
## [4]  {cookies, shallot}         => {french fries}   0.001199840 0.6000000 
## [5]  {low fat yogurt, shallot}  => {french fries}   0.001066524 0.6153846 
## [6]  {burger sauce, chicken}    => {mineral water}  0.001066524 0.6666667 
## [7]  {frozen smoothie, spinach} => {mineral water}  0.001066524 0.8888889 
## [8]  {milk, spinach}            => {mineral water}  0.001066524 0.6666667 
## [9]  {spaghetti, spinach}       => {mineral water}  0.001333156 0.7142857 
## [10] {olive oil, strong cheese} => {spaghetti}      0.001066524 0.7272727 
##      coverage    lift     count
## [1]  0.001999733 7.840767  9   
## [2]  0.001733102 8.611940  9   
## [3]  0.001999733 4.541473  9   
## [4]  0.001999733 3.510608  9   
## [5]  0.001733102 3.600624  8   
## [6]  0.001599787 2.796793  8   
## [7]  0.001199840 3.729058  8   
## [8]  0.001599787 2.796793  8   
## [9]  0.001866418 2.996564 10   
## [10] 0.001466471 4.177085  8

# interpreting the 7th rule: 88.89% of the transactions that contain both frozen smoothie and spinach also contain mineral water (confidence = 0.8889)

# Order the rules by lift (strongest association first) and keep that order in
# rules2 for the steps that follow, then look at the first 10 rules.
rules2 <- sort(rules2, by = "lift", decreasing = TRUE)
# head() returns at most 10 rules, so this does not fail on a smaller rule set
inspect(head(rules2, n = 10))

##      lhs                        rhs                        support confidence    coverage      lift count
## [1]  {escalope,                                                                                          
##       french fries,                                                                                      
##       pasta}                 => {mushroom cream sauce} 0.001066524  0.6666667 0.001599787 34.969697     8
## [2]  {fresh tuna,                                                                                        
##       fromage blanc}         => {honey}                0.001599787  0.6666667 0.002399680 14.046816    12
## [3]  {eggs,                                                                                              
##       mineral water,                                                                                     
##       pasta}                 => {shrimp}               0.001333156  0.9090909 0.001466471 12.722185    10
## [4]  {french fries,                                                                                      
##       mushroom cream sauce,                                                                              
##       pasta}                 => {escalope}             0.001066524  1.0000000 0.001066524 12.606723     8
## [5]  {milk,                                                                                              
##       pasta}                 => {shrimp}               0.001599787  0.8571429 0.001866418 11.995203    12
## [6]  {mushroom cream sauce,                                                                              
##       pasta}                 => {escalope}             0.002532996  0.9500000 0.002666311 11.976387    19
## [7]  {chocolate,                                                                                         
##       light cream,                                                                                       
##       mineral water}         => {chicken}              0.001199840  0.6428571 0.001866418 10.715714     9
## [8]  {mineral water,                                                                                     
##       pasta}                 => {shrimp}               0.001599787  0.7500000 0.002133049 10.495802    12
## [9]  {black tea,                                                                                         
##       eggs,                                                                                              
##       spaghetti}             => {turkey}               0.001066524  0.6153846 0.001733102  9.842217     8
## [10] {eggs,                                                                                              
##       pasta}                 => {shrimp}               0.001866418  0.6666667 0.002799627  9.329602    14

# our first 10 rules have high lift values, indicating a strong association between the items; when arranging items in Carrefour's aisles, associated items should therefore be placed together, prioritised by lift value in descending order as displayed above.
# If Carrefour's marketing team employs this strategy, it should help increase total sales, as desired by the company, since items with a high association will most likely be bought together

# To increase shrimp sales via a promotion, find the rules whose consequent
# (rhs) is shrimp; the antecedent (lhs) items are the ones to place close to it.
# %in% matches the item label exactly, whereas %pin% matches partially and
# would also select any other item whose label merely contains "shrimp".
shrimp <- subset(rules2, subset = rhs %in% "shrimp")

# Then order by lift
shrimp <- sort(shrimp, by = "lift", decreasing = TRUE)
# head() returns at most 5 rules, so this still works when fewer than 5 match
inspect(head(shrimp, n = 5))

##     lhs                             rhs      support     confidence coverage   
## [1] {eggs, mineral water, pasta} => {shrimp} 0.001333156 0.9090909  0.001466471
## [2] {milk, pasta}                => {shrimp} 0.001599787 0.8571429  0.001866418
## [3] {mineral water, pasta}       => {shrimp} 0.001599787 0.7500000  0.002133049
## [4] {eggs, pasta}                => {shrimp} 0.001866418 0.6666667  0.002799627
## [5] {burgers, pasta}             => {shrimp} 0.001466471 0.6470588  0.002266364
##     lift      count
## [1] 12.722185 10   
## [2] 11.995203 12   
## [3] 10.495802 12   
## [4]  9.329602 14   
## [5]  9.055202 11