Create association rules that enable the identification of relationships between variables in the dataset. A separate dataset comprising groups of items that will be associated with one another is provided for this section. Insights from the analysis are required.
Check whether there are any anomalies in the given sales dataset; the objective of this task is fraud detection.
For Part 3: create association rules that will allow you to identify relationships between variables in the dataset.
For Part 4: check whether there are any anomalies in the given sales dataset.
The project is centered on the retail sector. Exploring the marketing dataset with unsupervised learning techniques will provide relevant insights that support informed marketing decisions.
For this analysis, I will perform the following actions:
Loading the Data.
Reading the Data.
Checking the Dataset.
Association Analysis.
Anomaly Detection.
Conclusion.
Recommendation.
This data is relevant because it provides insights that will help Carrefour Kenya make informed marketing decisions.
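All of the code below relies on the arules, anomalize, tidyverse, and timetk packages; assuming they are not yet installed, a minimal one-time setup sketch would be:
# One-time installation of the packages used in this analysis (skip if already installed)
install.packages(c("arules", "anomalize", "tidyverse", "timetk"))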
# Loading the arules library which is needed for Association Analysis
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Loading our file from CSV using the read.transactions function, which converts it to the transactions class required when working with association rule models
# Defining the path
path <-"http://bit.ly/SupermarketDatasetII"
Supermarket<-read.transactions(path, sep = ",")
## Warning in asMethod(object): removing duplicated items in transactions
Supermarket
## transactions in sparse format with
## 7501 transactions (rows) and
## 119 items (columns)
# Checking the class of our object
class(Supermarket)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
# Previewing the first ten items that make up our dataset
items<-as.data.frame(itemLabels(Supermarket))
colnames(items) <- "Item"
head(items, 10)
## Item
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
## 4 avocado
## 5 babies food
## 6 bacon
## 7 barbecue sauce
## 8 black tea
## 9 blueberries
## 10 body spray
# Previewing the first five transactions
#inspect(Supermarket[1:5])
# Checking the summary of our dataset
summary(Supermarket)
## transactions as itemMatrix in sparse format with
## 7501 rows (elements/itemsets/transactions) and
## 119 columns (items) and a density of 0.03288973
##
## most frequent items:
## mineral water eggs spaghetti french fries chocolate
## 1788 1348 1306 1282 1229
## (Other)
## 22405
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17 4
## 18 19 20
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.914 5.000 20.000
##
## includes extended item information - examples:
## labels
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
# Exploring the frequency of some items
# We pick the items in columns 5 to 8 and report their frequency
# in absolute counts and as a percentage of total transactions
itemFrequency(Supermarket[, 5:8],type = "absolute")
## babies food bacon barbecue sauce black tea
## 34 65 81 107
round(itemFrequency(Supermarket[, 5:8],type = "relative")*100,2)
## babies food bacon barbecue sauce black tea
## 0.45 0.87 1.08 1.43
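As a quick sanity check, the relative figures are simply the absolute counts divided by the total number of transactions; for example, for babies food:
# 34 occurrences of babies food out of 7501 transactions
round(34 / 7501 * 100, 2) # gives 0.45, matching the relative frequency above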
# Producing a chart of frequencies and filtering
# to consider only items with a minimum percentage
# of support / considering a top x of items
#
par(mfrow = c(1, 2))
# plot the frequency of items
itemFrequencyPlot(Supermarket, topN = 10,col="darkblue")
itemFrequencyPlot(Supermarket, support = 0.1,col="lightblue")
# Building a model based on association rules
# using the apriori function
#
rules <- apriori (Supermarket, parameter = list(supp = 0.001, conf = 0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [74 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules
## set of 74 rules
# Building an apriori model with Min Support as 0.002 and confidence as 0.8.
rules2 <- apriori (Supermarket,parameter = list(supp = 0.002, conf = 0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.002 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 15
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [115 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Building an apriori model with Min Support as 0.001 and confidence as 0.6.
rules3 <- apriori (Supermarket, parameter = list(supp = 0.001, conf = 0.6))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [545 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules2
## set of 2 rules
rules3
## set of 545 rules
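To compare the three parameter settings side by side, a small sketch using the rule sets already built could tabulate the counts:
# Summarising how the support/confidence thresholds affect the number of rules mined
data.frame(support = c(0.001, 0.002, 0.001),
           confidence = c(0.8, 0.8, 0.6),
           rules = c(length(rules), length(rules2), length(rules3))) # 74, 2 and 545 rules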
# We can perform an exploration of our model
# through the use of the summary function as shown
summary(rules)
## set of 74 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4 5 6
## 15 42 16 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 4.000 4.000 4.041 4.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001067 Min. :0.8000 Min. :0.001067 Min. : 3.356
## 1st Qu.:0.001067 1st Qu.:0.8000 1st Qu.:0.001333 1st Qu.: 3.432
## Median :0.001133 Median :0.8333 Median :0.001333 Median : 3.795
## Mean :0.001256 Mean :0.8504 Mean :0.001479 Mean : 4.823
## 3rd Qu.:0.001333 3rd Qu.:0.8889 3rd Qu.:0.001600 3rd Qu.: 4.877
## Max. :0.002533 Max. :1.0000 Max. :0.002666 Max. :12.722
## count
## Min. : 8.000
## 1st Qu.: 8.000
## Median : 8.500
## Mean : 9.419
## 3rd Qu.:10.000
## Max. :19.000
##
## mining info:
## data ntransactions support confidence
## Supermarket 7501 0.001 0.8
## call
## apriori(data = Supermarket, parameter = list(supp = 0.001, conf = 0.8))
# Observing the rules built in our model, i.e. the first 5 rules
# ---
#
inspect(rules[1:5])
## lhs rhs support confidence
## [1] {frozen smoothie, spinach} => {mineral water} 0.001066524 0.8888889
## [2] {bacon, pancakes} => {spaghetti} 0.001733102 0.8125000
## [3] {nonfat milk, turkey} => {mineral water} 0.001199840 0.8181818
## [4] {ground beef, nonfat milk} => {mineral water} 0.001599787 0.8571429
## [5] {mushroom cream sauce, pasta} => {escalope} 0.002532996 0.9500000
## coverage lift count
## [1] 0.001199840 3.729058 8
## [2] 0.002133049 4.666587 13
## [3] 0.001466471 3.432428 9
## [4] 0.001866418 3.595877 12
## [5] 0.002666311 11.976387 19
Interpretation of the rules:
1: If someone buys frozen smoothie and spinach, they are 89% likely to buy mineral water too.
2: If someone buys bacon and pancakes, they are 81% likely to buy spaghetti too.
3: If someone buys nonfat milk and turkey, they are 82% likely to buy mineral water too.
4: If someone buys ground beef and nonfat milk, they are 86% likely to buy mineral water too.
5: If someone buys mushroom cream sauce and pasta, they are 95% likely to buy escalope too.
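These percentages are the rule confidences, i.e. the support of the whole rule divided by the support of its left-hand side, while lift further divides by the support of the right-hand side. A short sketch reproducing the figures reported for rule [1] above:
# Rule [1]: {frozen smoothie, spinach} => {mineral water}
rule_support <- 0.001066524 # support of the full rule
lhs_support <- 0.001199840 # coverage (support of the left-hand side)
rhs_support <- 1788 / 7501 # mineral water appears in 1788 of the 7501 transactions
rule_support / lhs_support # confidence, approximately 0.889
(rule_support / lhs_support) / rhs_support # lift, approximately 3.73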
# If we're interested in making a promotion relating to the sale of eggs,
# we could create a subset of rules concerning these products
# ---
# This would tell us the items that the customers bought before purchasing eggs
# ---
#
eggs <- subset(rules, subset = rhs %pin% "eggs")
# Then order by confidence
eggs<-sort(eggs, by="confidence", decreasing=TRUE)
inspect(eggs)
## lhs rhs support confidence coverage
## [1] {black tea, spaghetti, turkey} => {eggs} 0.001066524 0.8888889 0.001199840
## [2] {mineral water, pasta, shrimp} => {eggs} 0.001333156 0.8333333 0.001599787
## lift count
## [1] 4.946258 8
## [2] 4.637117 10
Interpretation of the rules:
1: If someone buys black tea, spaghetti, and turkey, they are 89% likely to buy eggs.
2: If someone buys mineral water, pasta, and shrimp, they are 83% likely to buy eggs.
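The same subsetting pattern works for any other promotion target; for example, assuming we wanted to promote mineral water instead, a sketch would be:
# Rules with mineral water on the right-hand side, ordered by confidence
water <- subset(rules, subset = rhs %pin% "mineral water")
water <- sort(water, by = "confidence", decreasing = TRUE)
inspect(head(water, 5))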
# What if we wanted to determine items that customers might buy
# who have previously bought eggs?
# ---
#
# Subset the rules
eggs <- subset(rules, subset = lhs %pin% "eggs")
# Order by confidence
eggs<-sort(eggs, by="confidence", decreasing=TRUE)
# inspect top 5
inspect(eggs)
## lhs rhs support confidence coverage lift count
## [1] {eggs,
## mineral water,
## pasta} => {shrimp} 0.001333156 0.9090909 0.001466471 12.722185 10
## [2] {brownies,
## eggs,
## ground beef} => {mineral water} 0.001066524 0.8888889 0.001199840 3.729058 8
## [3] {chocolate,
## eggs,
## frozen vegetables,
## ground beef} => {mineral water} 0.001466471 0.8461538 0.001733102 3.549776 11
## [4] {chocolate,
## eggs,
## olive oil,
## spaghetti} => {mineral water} 0.001199840 0.8181818 0.001466471 3.432428 9
## [5] {cooking oil,
## eggs,
## olive oil} => {mineral water} 0.001066524 0.8000000 0.001333156 3.356152 8
## [6] {cake,
## eggs,
## milk,
## turkey} => {mineral water} 0.001066524 0.8000000 0.001333156 3.356152 8
## [7] {chocolate,
## eggs,
## milk,
## olive oil} => {mineral water} 0.001066524 0.8000000 0.001333156 3.356152 8
Interpretation of the rules:
1: If someone buys eggs, mineral water, and pasta, they are 91% likely to buy shrimp.
2: If someone buys brownies, eggs, and ground beef, they are 89% likely to buy mineral water.
3: If someone buys chocolate, eggs, frozen vegetables, and ground beef, they are 85% likely to buy mineral water.
4: If someone buys chocolate, eggs, olive oil, and spaghetti, they are 82% likely to buy mineral water too.
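For a visual overview of these relationships, the arulesViz package (not loaded above, so this is only a sketch assuming it is installed) can draw the egg rules as a graph:
# Plotting the egg rules as a graph: items are nodes, rules are the connecting edges
library(arulesViz)
plot(eggs, method = "graph")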
# Loading the anomalize library, which is needed for anomaly detection
library(anomalize) # tidy anomaly detection
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(tidyverse) # tidyverse packages like dplyr, ggplot2, tidyr
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(timetk) # time series helper functions
# Loading the dataset
anom <- read.csv('Supermarket_Forecasting.csv', header = TRUE)
# Previewing the first six entries in our dataset
head(anom)
## Date Sales
## 1 1/5/2019 548.9715
## 2 3/8/2019 80.2200
## 3 3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5 2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Checking for missing values
sum(is.na(anom))
## [1] 0
# Checking for duplicated rows
sum(duplicated(anom))
## [1] 0
# Checking the dimensions of the dataset
dim(anom)
## [1] 1000 2
# Converting Date to the appropriate format (the raw dates carry four-digit years)
anom$Date <- as.Date(anom$Date, format = "%m/%d/%Y")
# Converting to POSIXct type
anom$Date <- as.POSIXct(anom$Date)
# Changing to a tibble
anom <- as_tibble(anom)
head(anom)
## # A tibble: 6 × 2
## Date Sales
## <dttm> <dbl>
## 1 2020-01-05 03:00:00 549.
## 2 2020-03-08 03:00:00 80.2
## 3 2020-03-03 03:00:00 341.
## 4 2020-01-27 03:00:00 489.
## 5 2020-02-08 03:00:00 634.
## 6 2020-03-25 03:00:00 628.
# Using an alpha of 0.05 and max_anoms of 0.2
anom %>%
time_decompose(Sales, method = "stl", frequency = "auto", trend = "auto") %>%
anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
plot_anomaly_decomposition()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
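The 'Index not ordered' notes above indicate that the rows are not sorted by date; a minimal sketch that orders the series first (using dplyr's arrange and the same decomposition settings) would be:
# Arranging by Date so tibbletime sees an ascending index before decomposition
anom %>%
  arrange(Date) %>%
  time_decompose(Sales, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
  plot_anomaly_decomposition()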
# Adjusting alpha from 0.05 to 0.1
anom %>%
time_decompose(Sales, method = 'stl', frequency = 'auto', trend = 'auto') %>%
anomalize(remainder, method = 'gesd', alpha = 0.1, max_anoms = 0.2) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
# Adjusting max anoms from 0.2 to 0.05
anom %>%
time_decompose(Sales, method = 'stl', frequency = 'auto', trend = 'auto') %>%
anomalize(remainder, method = 'gesd', alpha = 0.05, max_anoms = 0.05) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
# Using IQR Method in place of GESD and using alpha of 0.05 and max_anoms of 0.2
anom %>%
time_decompose(Sales, method = 'stl', frequency = 'auto', trend = 'auto') %>%
anomalize(remainder, method = 'iqr', alpha = 0.05, max_anoms = 0.2) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
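To list the flagged observations rather than only plotting them, a sketch filtering on the anomaly column that anomalize() adds (here with the IQR settings used above) would be:
# Extracting the rows flagged as anomalous by the IQR method
anom %>%
  time_decompose(Sales, method = 'stl', frequency = 'auto', trend = 'auto') %>%
  anomalize(remainder, method = 'iqr', alpha = 0.05, max_anoms = 0.2) %>%
  time_recompose() %>%
  filter(anomaly == 'Yes')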