# loading the dataset using the fread function
library(data.table)
#import data
df <- fread("C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Sales_Forecasting - Sales.csv")
# previewing the first 6 rows
head(df)
## Date Sales
## 1: 1/5/2019 548.9715
## 2: 3/8/2019 80.2200
## 3: 3/3/2019 340.5255
## 4: 1/27/2019 489.0480
## 5: 2/8/2019 634.3785
## 6: 3/25/2019 627.6165
# previewing the last 6 rows
tail(df)
## Date Sales
## 1: 2/18/2019 63.9975
## 2: 1/29/2019 42.3675
## 3: 3/2/2019 1022.4900
## 4: 2/9/2019 33.4320
## 5: 2/22/2019 69.1110
## 6: 2/18/2019 649.2990
# checking the shape of our dataset
dim(df)
## [1] 1000 2
# we have 1000 rows and 2 columns
# checking for missing values per column
colSums(is.na(df))
## Date Sales
## 0 0
# no missing values exist in the dataset
# checking for duplicates in the dataset
dup <- df[duplicated(df),]
dup
## Empty data.table (0 rows and 2 cols): Date,Sales
# there are no duplicates in the dataset
# checking the datatypes of the columns
str(df)
## Classes 'data.table' and 'data.frame': 1000 obs. of 2 variables:
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Sales: num 549 80.2 340.5 489 634.4 ...
## - attr(*, ".internal.selfref")=<externalptr>
# changing our date column to the correct data type
df$Date <- as.Date(df$Date, format = "%m/%d/%y")
#Convertion to POCIXct type
df$Date <- as.POSIXct(df$Date)
# previewing the changes
head(df)
## Date Sales
## 1: 2020-01-05 03:00:00 548.9715
## 2: 2020-03-08 03:00:00 80.2200
## 3: 2020-03-03 03:00:00 340.5255
## 4: 2020-01-27 03:00:00 489.0480
## 5: 2020-02-08 03:00:00 634.3785
## 6: 2020-03-25 03:00:00 627.6165
# checking the summary statistics of our dataset
summary(df)
## Date Sales
## Min. :2020-01-01 03:00:00.0 Min. : 10.68
## 1st Qu.:2020-01-24 03:00:00.0 1st Qu.: 124.42
## Median :2020-02-13 03:00:00.0 Median : 253.85
## Mean :2020-02-14 11:22:33.5 Mean : 322.97
## 3rd Qu.:2020-03-08 03:00:00.0 3rd Qu.: 471.35
## Max. :2020-03-30 03:00:00.0 Max. :1042.65
# we can clearly get the mean, median and quantile values for our sales column as shown below
# Load tidyverse and anomalize
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# visualising outliers using boxplot
library(lattice)
boxplot(df$Sales, xlab="Sales")

# outliers are clearly present in the sales column
# loading relevant libraries
library(tibbletime)
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
library(timetk)
##
## Attaching package: 'timetk'
## The following object is masked from 'package:data.table':
##
## :=
# converting our dataframe to a tibble and previewing the first 6 rows
dfd <- as_tibble(df)
head(dfd)
## # A tibble: 6 × 2
## Date Sales
## <dttm> <dbl>
## 1 2020-01-05 03:00:00 549.
## 2 2020-03-08 03:00:00 80.2
## 3 2020-03-03 03:00:00 341.
## 4 2020-01-27 03:00:00 489.
## 5 2020-02-08 03:00:00 634.
## 6 2020-03-25 03:00:00 628.
# DECOMPOSING THE TIME SERIES USING THE STL detection package
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
dfe <- dfd %>%
time_decompose(Sales, method = "stl", frequency = "24 hour", trend = "1 day") %>%
anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
plot_anomaly_decomposition()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
dfe

# we have been assigned a frequency of 12 seconds and a trend of 12 seconds by default. We assign our model a frequency of 24 hour and 1 day trend
# decomposing the time series using twitter detection package
dff <- dfd %>%
time_decompose(Sales, method = "Twitter", frequency = "24 hour", trend = "1 day") %>%
anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
plot_anomaly_decomposition()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## median_span = 12 seconds
dff

# when compared to each other the stl package detects more anomalies over a 24 hr frequency and 1 day trend than twitter
# RECOMPOSING THE TIME SERIES USING stl
dfi <- dfd %>%
time_decompose(Sales, method = "stl", frequency = "24 hour", trend = "1 day") %>%
anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
dfi

x = dfd %>%
time_decompose(Sales, method = "stl", frequency = "24 hour", trend = "1 day") %>%
anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
time_recompose() %>%
filter(anomaly == 'Yes')
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 12 seconds
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 12 seconds
x
## # A time tibble: 9 × 10
## # Index: Date
## Date observed season trend remainder remainder_l1 remainder_l2
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-03-29 03:00:00 923. -14.4 203. 734. -738. 714.
## 2 2020-02-15 03:00:00 1043. 28.7 286. 728. -738. 714.
## 3 2020-02-08 03:00:00 1021. 28.7 201. 791. -738. 714.
## 4 2020-03-08 03:00:00 952. 12.7 217. 722. -738. 714.
## 5 2020-01-30 03:00:00 1034. 10.4 209. 815. -738. 714.
## 6 2020-03-09 03:00:00 935. -34.6 211. 759. -738. 714.
## 7 2020-01-15 03:00:00 1022. -14.4 209. 828. -738. 714.
## 8 2020-02-19 03:00:00 932. -34.6 189. 778. -738. 714.
## 9 2020-03-02 03:00:00 1022. -14.4 91.6 945. -738. 714.
## # … with 3 more variables: anomaly <chr>, recomposed_l1 <dbl>,
## # recomposed_l2 <dbl>
# The anomalies detected are as shown by the dates in the code cell below which the marketing team should further investigate.