Part 3: Association Rules
# Loading Libraries
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(arulesViz)
# Loading dataset
url <- "http://bit.ly/SupermarketDatasetII"
sales <- read.transactions(url, sep = ',')
## Warning in asMethod(object): removing duplicated items in transactions
head(sales)
## transactions in sparse format with
## 6 transactions (rows) and
## 119 items (columns)
# Inspecting the first four transactions
inspect(sales[1:4])
## items
## [1] {almonds,
## antioxydant juice,
## avocado,
## cottage cheese,
## energy drink,
## frozen smoothie,
## green grapes,
## green tea,
## honey,
## low fat yogurt,
## mineral water,
## olive oil,
## salad,
## salmon,
## shrimp,
## spinach,
## tomato juice,
## vegetables mix,
## whole weat flour,
## yams}
## [2] {burgers,
## eggs,
## meatballs}
## [3] {chutney}
## [4] {avocado,
## turkey}
# Previewing summary statistics of the transactions
summary(sales)
## transactions as itemMatrix in sparse format with
## 7501 rows (elements/itemsets/transactions) and
## 119 columns (items) and a density of 0.03288973
##
## most frequent items:
## mineral water eggs spaghetti french fries chocolate
## 1788 1348 1306 1282 1229
## (Other)
## 22405
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17 4
## 18 19 20
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.914 5.000 20.000
##
## includes extended item information - examples:
## labels
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
# Previewing the labels of items
items <- as.data.frame(itemLabels(sales))
colnames(items) <- "Item"
head(items, 4)
## Item
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
## 4 avocado
tail(items,4)
## Item
## 116 whole wheat rice
## 117 yams
## 118 yogurt cake
## 119 zucchini
# Checking the absolute frequency of the first 10 items
itemFrequency(sales[, 1:10], type = "absolute")
## almonds antioxydant juice asparagus avocado
## 153 67 36 250
## babies food bacon barbecue sauce black tea
## 34 65 81 107
## blueberries body spray
## 69 86
# Plotting the ten most frequent items and all items with support of at least 10%
itemFrequencyPlot(sales, topN = 10, col = "green")
itemFrequencyPlot(sales, support = 0.1, col = "blue")
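As a numeric complement to these plots, the relative support of the most frequent items can be printed directly; a minimal sketch using the same sales object:
# Relative support (share of transactions) of the ten most frequent items
top_items <- sort(itemFrequency(sales, type = "relative"), decreasing = TRUE)
round(head(top_items, 10), 3)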
# Generating association rules with minimum support 0.001 and minimum confidence 0.8
rules <- apriori(sales, parameter = list(supp = 0.001, conf = 0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.01s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [74 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules
## set of 74 rules
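Some of these 74 rules may be redundant (a longer rule that adds no confidence over a more general one). An optional pruning step, sketched here with arules' is.redundant():
# Optional: drop redundant rules before inspection
rules_pruned <- rules[!is.redundant(rules)]
rules_pruned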
# Inspecting the first 5 rules
inspect(rules[1:5])
## lhs rhs support confidence
## [1] {frozen smoothie, spinach} => {mineral water} 0.001066524 0.8888889
## [2] {bacon, pancakes} => {spaghetti} 0.001733102 0.8125000
## [3] {nonfat milk, turkey} => {mineral water} 0.001199840 0.8181818
## [4] {ground beef, nonfat milk} => {mineral water} 0.001599787 0.8571429
## [5] {mushroom cream sauce, pasta} => {escalope} 0.002532996 0.9500000
## coverage lift count
## [1] 0.001199840 3.729058 8
## [2] 0.002133049 4.666587 13
## [3] 0.001466471 3.432428 9
## [4] 0.001866418 3.595877 12
## [5] 0.002666311 11.976387 19
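Confidence alone tends to favour rules whose consequent is simply very common (such as mineral water), so ranking by lift can surface associations that are strong relative to chance; a short sketch:
# Inspecting the five rules with the highest lift
inspect(head(sort(rules, by = "lift"), 5))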
# Visualising the top 10 rules by confidence
subrules <- head(rules, n = 10, by = "confidence")
plot(subrules, method = "graph" , engine = "htmlwidget")
plot(subrules, method = "graph" , engine = "default")
plot(subrules, method = "paracoord")
Part 4: Anomaly Detection
1. Loading libraries and dataset
# Loading libraries
library(tidyverse)
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(dplyr)
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:arules':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyverse)
library(tibbletime)
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
library(dplyr)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(devtools)
## Loading required package: usethis
library(Rcpp)
library(anomaly)
##
## Attaching package: 'anomaly'
## The following object is masked from 'package:stats':
##
## simulate
library(ggplot2)
# Loading dataset
url <- "http://bit.ly/CarreFourSalesDataset"
sales_forecast <- read.csv(url)
2. Checking the data
# Previewing the top of the dataset
head(sales_forecast)
## Date Sales
## 1 1/5/2019 548.9715
## 2 3/8/2019 80.2200
## 3 3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5 2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing bottom of the dataset
tail(sales_forecast)
## Date Sales
## 995 2/18/2019 63.9975
## 996 1/29/2019 42.3675
## 997 3/2/2019 1022.4900
## 998 2/9/2019 33.4320
## 999 2/22/2019 69.1110
## 1000 2/18/2019 649.2990
# Previewing shape
cat("The dataset has", nrow(sales_forecast), "rows", "and", ncol(sales_forecast), "columns")
## The dataset has 1000 rows and 2 columns
# Checking Data types
str(sales_forecast)
## 'data.frame': 1000 obs. of 2 variables:
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Sales: num 549 80.2 340.5 489 634.4 ...
# Converting the Date column from character to Date type
sales_forecast$Date <- as.Date(sales_forecast$Date, format = "%m/%d/%Y")
head(sales_forecast)
## Date Sales
## 1 2019-01-05 548.9715
## 2 2019-03-08 80.2200
## 3 2019-03-03 340.5255
## 4 2019-01-27 489.0480
## 5 2019-02-08 634.3785
## 6 2019-03-25 627.6165
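Because as.Date() silently returns NA for strings it cannot parse, a quick check that no dates were lost in the conversion:
# Confirming that no dates failed to parse
sum(is.na(sales_forecast$Date))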
# Listing the unique dates in the dataset
unique(sales_forecast$Date)
## [1] "2019-01-05" "2019-03-08" "2019-03-03" "2019-01-27" "2019-02-08"
## [6] "2019-03-25" "2019-02-25" "2019-02-24" "2019-01-10" "2019-02-20"
## [11] "2019-02-06" "2019-03-09" "2019-02-12" "2019-02-07" "2019-03-29"
## [16] "2019-01-15" "2019-03-11" "2019-01-01" "2019-01-21" "2019-03-05"
## [21] "2019-03-15" "2019-02-17" "2019-03-02" "2019-03-22" "2019-03-10"
## [26] "2019-01-25" "2019-01-28" "2019-01-07" "2019-03-23" "2019-01-17"
## [31] "2019-02-02" "2019-03-04" "2019-03-16" "2019-02-27" "2019-02-10"
## [36] "2019-03-19" "2019-02-03" "2019-03-07" "2019-02-28" "2019-03-27"
## [41] "2019-01-20" "2019-03-12" "2019-02-15" "2019-03-06" "2019-02-14"
## [46] "2019-03-13" "2019-01-24" "2019-01-06" "2019-02-11" "2019-01-22"
## [51] "2019-01-13" "2019-01-09" "2019-01-12" "2019-01-26" "2019-01-23"
## [56] "2019-02-23" "2019-01-02" "2019-02-09" "2019-03-26" "2019-03-01"
## [61] "2019-02-01" "2019-03-28" "2019-03-24" "2019-02-05" "2019-01-19"
## [66] "2019-01-16" "2019-01-08" "2019-02-18" "2019-01-18" "2019-02-16"
## [71] "2019-02-22" "2019-01-29" "2019-01-04" "2019-03-30" "2019-01-30"
## [76] "2019-01-03" "2019-03-21" "2019-02-13" "2019-01-14" "2019-03-18"
## [81] "2019-03-20" "2019-02-21" "2019-01-31" "2019-01-11" "2019-02-26"
## [86] "2019-03-17" "2019-03-14" "2019-02-04" "2019-02-19"
# Converting to a tbl_time object indexed on Date for the time-series functions
sales_forecast <- as_tbl_time(sales_forecast, index = Date)
3. Data Cleaning
# Checking the column names
colnames(sales_forecast)
## [1] "Date" "Sales"
# Checking for number of missing values
length(which(is.na(sales_forecast)))
## [1] 0
# Checking for duplicates
sum(duplicated(sales_forecast))
## [1] 0
# Checking for outliers
boxplot(sales_forecast$Sales)
# Listing the outliers
boxplot.stats(sales_forecast$Sales)$out
## [1] 1003.590 1039.290 1042.650 1002.120 1020.705 1034.460 1023.750 1022.385
## [9] 1022.490
4. EDA
# Statistical summary of the Sales column
describe(sales_forecast$Sales)
## vars n mean sd median trimmed mad min max range skew
## X1 1 1000 322.97 245.89 253.85 293.91 233.78 10.68 1042.65 1031.97 0.89
## kurtosis se
## X1 -0.09 7.78
# Plotting the sales data
ggplot(sales_forecast, aes(x=Date, y=Sales, color=Sales)) + geom_line()
# Frequency distribution of sales
hist(sales_forecast$Sales, col = 'gold', ylim = c(0, 250))
5. Anomaly Detection
# Detecting anomalies: decomposing daily total sales (STL) and applying the GESD test on the remainder
anomaly.detect <- sales_forecast %>%
  group_by(Date) %>% summarise(totalsales = sum(Sales)) %>% ungroup() %>%
  time_decompose(totalsales) %>%
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
  plot_anomaly_decomposition()
## frequency = 7 days
## trend = 30 days
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
anomaly.detect
# Recomposing the series and plotting the observed values with the flagged anomalies
sales_forecast %>%
  group_by(Date) %>% summarise(totalsales = sum(Sales), .groups = "drop") %>%
  time_decompose(totalsales, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd") %>% time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 6, color_no = "red", color_yes = "green", fill_ribbon = "yellow")
## frequency = 7 days
## trend = 30 days
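Beyond the plot, the flagged observations can be pulled out as a table to read off the exact dates; a minimal sketch reusing the same decomposition pipeline:
# Listing the dates flagged as anomalous
sales_forecast %>%
  group_by(Date) %>% summarise(totalsales = sum(Sales), .groups = "drop") %>%
  time_decompose(totalsales, method = "stl") %>%
  anomalize(remainder, method = "gesd") %>%
  time_recompose() %>%
  filter(anomaly == "Yes")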
6. Conclusion
7. Recommendation