# loading the dataset using the fread function
library(data.table)
# import data
# The file has no header row and every line is one transaction with a varying
# number of comma-separated items. With fread()'s defaults a header row and a
# 3-column layout were guessed and reading stopped at the first line that did
# not fit, so only two rows were loaded. header = FALSE keeps every line as
# data and fill = TRUE pads the shorter transactions instead of stopping.
df <- fread(
  "C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Sales_Dataset II.csv",
  sep = ",",
  header = FALSE,
  fill = TRUE
)
# previewing the dataset: one row per transaction, with the items spread over
# the default columns V1, V2, ...
# (recorded output omitted: re-run to refresh it for the corrected import)
head(df)
# loading the arules library
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# loading the data from our comma separated file as arules transactions
# (basket format: one transaction per line, items separated by commas).
# rm.duplicates = TRUE removes items repeated within a single transaction
# explicitly, instead of relying on the implicit removal that only surfaced
# as a "removing duplicated items in transactions" warning.
# NOTE(review): with rm.duplicates = TRUE read.transactions() may also print
# how many transactions contained duplicates - confirm on the next run.
path <- "C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Sales_Dataset II.csv"
dfa <- read.transactions(
  path,
  format = "basket",
  sep = ",",
  rm.duplicates = TRUE
)
# printing the object reports the number of transactions and items
dfa
## transactions in sparse format with
## 7501 transactions (rows) and
## 119 items (columns)
# we have 7501 rows and 119 columns
# Checking the class of the imported object
# ---
# The result should be "transactions" (from the arules package), which is
# the type of data that we will need
# ---
class(dfa)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
# Listing the items contained in each of the first 5 transactions
inspect(dfa[seq_len(5)])
## items
## [1] {almonds,
## antioxydant juice,
## avocado,
## cottage cheese,
## energy drink,
## frozen smoothie,
## green grapes,
## green tea,
## honey,
## low fat yogurt,
## mineral water,
## olive oil,
## salad,
## salmon,
## shrimp,
## spinach,
## tomato juice,
## vegetables mix,
## whole weat flour,
## yams}
## [2] {burgers,
## eggs,
## meatballs}
## [3] {chutney}
## [4] {avocado,
## turkey}
## [5] {energy bar,
## green tea,
## milk,
## mineral water,
## whole wheat rice}
# collect the item labels of the dataset in a one-column data frame
# (column "Item") and show the first 10 of them
dfb <- data.frame(Item = itemLabels(dfa))
head(dfb, n = 10)
## Item
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
## 4 avocado
## 5 babies food
## 6 bacon
## 7 barbecue sauce
## 8 black tea
## 9 blueberries
## 10 body spray
# getting the summary of our dataset: its dimensions and density, the most
# frequent items and the distribution of transaction sizes
summary(dfa)
## transactions as itemMatrix in sparse format with
## 7501 rows (elements/itemsets/transactions) and
## 119 columns (items) and a density of 0.03288973
##
## most frequent items:
## mineral water eggs spaghetti french fries chocolate
## 1788 1348 1306 1282 1229
## (Other)
## 22405
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17 4
## 18 19 20
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.914 5.000 20.000
##
## includes extended item information - examples:
## labels
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
# mineral water, eggs, spaghetti, french fries and chocolate are Carrefour's five most purchased items
# Exploring the frequency of a few items, selected by their column position:
# absolute counts (number of transactions) for items 6 to 10 ...
itemFrequency(dfa[, seq(6, 10)], type = "absolute")
## bacon barbecue sauce black tea blueberries body spray
## 65 81 107 69 86
# ... and, for items 30 to 40, the relative frequency expressed as a
# percentage of the total transactions, rounded to 2 decimal places
round(100 * itemFrequency(dfa[, seq(30, 40)], type = "relative"), digits = 2)
## cookies cooking oil corn cottage cheese cream
## 8.04 5.11 0.48 3.19 0.09
## dessert wine eggplant eggs energy bar energy drink
## 0.44 1.32 17.97 2.71 2.67
## escalope
## 7.93
# Producing charts of item frequencies, either keeping only the top x items or
# keeping only items that reach a minimum support
# Displaying the 10 most common items in the Carrefour dataset (left panel) and
# the items whose relative frequency is at least 10% (right panel)
# par() returns the settings it replaces; they are kept so that the two-panel
# layout can be undone once both charts are drawn
old_par <- par(mfrow = c(1, 2))
# plot the frequency of the 10 most frequent items
itemFrequencyPlot(dfa, topN = 10, col = "darkblue")
## mineral water, eggs, spaghetti, french fries and chocolate are among carrefour's most purchased items as displayed in the plot
# plot the items with a support of at least 0.1
itemFrequencyPlot(dfa, support = 0.1, col = "darkred")
# restore the previous graphics settings so that later plots are not squeezed
# into half of the device
par(old_par)

# Mining association rules with a minimum support of 0.001, i.e. itemsets that
# occur in at least 0.1% of the 7501 transactions (0.001 * 7501 = 7.5, so at
# least 8 transactions), and a minimum confidence of 0.6
rules2 <- apriori(
  dfa,
  parameter = list(supp = 0.001, conf = 0.6)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [545 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# printing the rule set shows how many rules were generated
rules2
## set of 545 rules
# this gives us 545 rules to work with
# getting the summary information of our rules: the distribution of rule
# lengths, summary statistics of the quality measures and the mining settings
summary(rules2)
## set of 545 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4 5 6
## 146 329 67 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 4.000 3.866 4.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001067 Min. :0.6000 Min. :0.001067 Min. : 2.517
## 1st Qu.:0.001067 1st Qu.:0.6250 1st Qu.:0.001600 1st Qu.: 2.797
## Median :0.001200 Median :0.6667 Median :0.001866 Median : 3.446
## Mean :0.001409 Mean :0.6893 Mean :0.002081 Mean : 3.889
## 3rd Qu.:0.001466 3rd Qu.:0.7273 3rd Qu.:0.002266 3rd Qu.: 4.177
## Max. :0.005066 Max. :1.0000 Max. :0.007999 Max. :34.970
## count
## Min. : 8.00
## 1st Qu.: 8.00
## Median : 9.00
## Mean :10.57
## 3rd Qu.:11.00
## Max. :38.00
##
## mining info:
## data ntransactions support confidence
## dfa 7501 0.001 0.6
## call
## apriori(data = dfa, parameter = list(supp = 0.001, conf = 0.6))
# most of our rules contain 3 or 4 items, some contain 5, and 3 of them contain 6 items
# Observing rules built in our model i.e. first 10 model rules
# ---
# head() is used instead of rules2[1:10] so that the preview still works
# (showing fewer rules) if the rule set holds fewer than 10 rules
inspect(head(rules2, n = 10))
## lhs rhs support confidence
## [1] {cookies, shallot} => {low fat yogurt} 0.001199840 0.6000000
## [2] {low fat yogurt, shallot} => {cookies} 0.001199840 0.6923077
## [3] {cookies, shallot} => {green tea} 0.001199840 0.6000000
## [4] {cookies, shallot} => {french fries} 0.001199840 0.6000000
## [5] {low fat yogurt, shallot} => {french fries} 0.001066524 0.6153846
## [6] {burger sauce, chicken} => {mineral water} 0.001066524 0.6666667
## [7] {frozen smoothie, spinach} => {mineral water} 0.001066524 0.8888889
## [8] {milk, spinach} => {mineral water} 0.001066524 0.6666667
## [9] {spaghetti, spinach} => {mineral water} 0.001333156 0.7142857
## [10] {olive oil, strong cheese} => {spaghetti} 0.001066524 0.7272727
## coverage lift count
## [1] 0.001999733 7.840767 9
## [2] 0.001733102 8.611940 9
## [3] 0.001999733 4.541473 9
## [4] 0.001999733 3.510608 9
## [5] 0.001733102 3.600624 8
## [6] 0.001599787 2.796793 8
## [7] 0.001199840 3.729058 8
## [8] 0.001599787 2.796793 8
## [9] 0.001866418 2.996564 10
## [10] 0.001466471 4.177085 8
# interpreting the 7th rule: 88.89% of the transactions that contain both frozen smoothie and spinach also contain mineral water (confidence = 0.8889)
# Ordering these rules by the lift criterion (highest lift first), then looking
# at the first 10 rules.
rules2 <- sort(rules2, by = "lift", decreasing = TRUE)
# head() returns at most 10 rules, so this does not fail on a smaller rule set
# the way rules2[1:10] would
inspect(head(rules2, n = 10))
## lhs rhs support confidence coverage lift count
## [1] {escalope,
## french fries,
## pasta} => {mushroom cream sauce} 0.001066524 0.6666667 0.001599787 34.969697 8
## [2] {fresh tuna,
## fromage blanc} => {honey} 0.001599787 0.6666667 0.002399680 14.046816 12
## [3] {eggs,
## mineral water,
## pasta} => {shrimp} 0.001333156 0.9090909 0.001466471 12.722185 10
## [4] {french fries,
## mushroom cream sauce,
## pasta} => {escalope} 0.001066524 1.0000000 0.001066524 12.606723 8
## [5] {milk,
## pasta} => {shrimp} 0.001599787 0.8571429 0.001866418 11.995203 12
## [6] {mushroom cream sauce,
## pasta} => {escalope} 0.002532996 0.9500000 0.002666311 11.976387 19
## [7] {chocolate,
## light cream,
## mineral water} => {chicken} 0.001199840 0.6428571 0.001866418 10.715714 9
## [8] {mineral water,
## pasta} => {shrimp} 0.001599787 0.7500000 0.002133049 10.495802 12
## [9] {black tea,
## eggs,
## spaghetti} => {turkey} 0.001066524 0.6153846 0.001733102 9.842217 8
## [10] {eggs,
## pasta} => {shrimp} 0.001866418 0.6666667 0.002799627 9.329602 14
# our first 10 rules have high lift values, indicating a strong association between the items; when arranging items in Carrefour's aisles, the item groups should therefore be prioritised by their lift values in descending order, as displayed above.
# If Carrefour's marketing team employs this strategy, it should help increase total sales, as the company desires, since items with a high association are likely to be bought together
# if they would like to increase their shrimp sales via a promotion they should
# arrange the shrimp close to the items on the left-hand side of the rules
# selected below.
# %in% matches the item label "shrimp" exactly, whereas %pin% would also select
# rules for any item whose label merely contains "shrimp"
shrimp <- subset(rules2, subset = rhs %in% "shrimp")
# Then order by lift
shrimp <- sort(shrimp, by = "lift", decreasing = TRUE)
# head() shows at most 5 rules, so this still works when fewer than 5 rules
# have shrimp on the right-hand side (shrimp[1:5] would fail in that case)
inspect(head(shrimp, n = 5))
## lhs rhs support confidence coverage
## [1] {eggs, mineral water, pasta} => {shrimp} 0.001333156 0.9090909 0.001466471
## [2] {milk, pasta} => {shrimp} 0.001599787 0.8571429 0.001866418
## [3] {mineral water, pasta} => {shrimp} 0.001599787 0.7500000 0.002133049
## [4] {eggs, pasta} => {shrimp} 0.001866418 0.6666667 0.002799627
## [5] {burgers, pasta} => {shrimp} 0.001466471 0.6470588 0.002266364
## lift count
## [1] 12.722185 10
## [2] 11.995203 12
## [3] 10.495802 12
## [4] 9.329602 14
## [5] 9.055202 11