knitr::opts_chunk$set(error = TRUE)
# Load tidyverse and anomalize
# ---
#
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tibbletime)
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(vctrs)
##
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
##
## data_frame
## The following object is masked from 'package:tibble':
##
## data_frame
library(tidyr)
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df <- read_csv("/home/oppy/Downloads/Supermarket_Sales_Forecasting - Sales.csv")
## Rows: 1000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (1): Sales
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 2
## Date Sales
## <chr> <dbl>
## 1 1/5/2019 549.
## 2 3/8/2019 80.2
## 3 3/3/2019 341.
## 4 1/27/2019 489.
## 5 2/8/2019 634.
## 6 3/25/2019 628.
Checking the datatypes
#checking the datatypes
str(df)
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : chr [1:1000] "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_character(),
## .. Sales = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Change the date column datatype
#changing the datatype
df$Date <- as.Date(df$Date, "%m%d%y")
#POSIXlt" and "POSIXct" representing calendar dates and times.
df$Date <- as.POSIXct(df$Date)
df$Sales <- as.numeric(df$Sales)
Confirm the changes made
#checking the datatypes
str(df)
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : POSIXct[1:1000], format: NA NA ...
## $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_character(),
## .. Sales = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Checking for null and duplicate values
sum(is.na(df))
## [1] 1000
df1 <- df[complete.cases(df),]
df1[complete.cases(df),]
## Error:
## ! Must subset rows with a valid subscript vector.
## ℹ Logical subscripts must match the size of the indexed input.
## x Input has size 0 but subscript `complete.cases(df)` has size 1000.
colSums(is.na(df1))
## Date Sales
## 0 0
head(df1)
## # A tibble: 0 × 2
## # … with 2 variables: Date <dttm>, Sales <dbl>
Confirming the changes
sum(is.na(df1))
## [1] 0
dff <- as.tibble(df1)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
dff
## # A tibble: 0 × 2
## # … with 2 variables: Date <dttm>, Sales <dbl>
anom = dff %>%
time_decompose(Sales, merge = TRUE) %>%
anomalize(remainder) %>%
time_recompose = TRUE
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Error in `reconstruct()`:
## ! Problem while computing `Date = collapse_index(...)`.
## Caused by error in `if (defaults$d == 0) ...`:
## ! missing value where TRUE/FALSE needed
anom%>% glimpse()
## Error in glimpse(.): object 'anom' not found
#visualize the anomalies using the plot_anomalies() function.
anom %>%
plot_anomalies(ncol = 3, alpha_dots = 0.25)
## Error in plot_anomalies(., ncol = 3, alpha_dots = 0.25): object 'anom' not found
Based on the labels, there are no anomalies detected
trend and seasonality are fundamental to time series analysis and specifically time series decomposition.
#plotting the trend
anom_plot <- anom %>%
plot_anomaly_decomposition()+
ggtitle("Plotting Anomalies")
## Error in plot_anomaly_decomposition(.): object 'anom' not found
anom_plot
## Error in eval(expr, envir, enclos): object 'anom_plot' not found
Parameter Tuning
We will use the max anoms and alpha parameters for tuning
#visualize the anomalies using the plot_anomalies() function.
anom%>%
time_decompose(Sales)%>%
anomalize(remainder, alpha = 0.09, max_anoms = 0.10)%>%
time_recompose()%>%
plot_anomalies(time_recompose = T)+
ggtitle("alpha = 0.09")
## Error in time_decompose(., Sales): object 'anom' not found
#Tuning the Alpha parameter
anom%>%
time_decompose(Sales)%>%
anomalize(remainder, alpha = 0.5, max_anoms = 0.5)%>%
time_recompose()%>%
plot_anomalies(time_recompose = T)+
ggtitle("alpha = 0.5")
## Error in time_decompose(., Sales): object 'anom' not found
When the alpha level and max_anoms are increased, more anomalies are observed
In IQR a distribution is taken and 25% and 75% inner quartile range to establish the distribution of the remainder. Limits are set by default to a factor of 3 times above, and below the inner quartile range, any remainder beyond the limit is considered as an anomaly.
iqr<-anom %>%
time_decompose(Sales, method = "stl") %>%
anomalize(remainder, method = "iqr")
## Error in time_decompose(., Sales, method = "stl"): object 'anom' not found
plot_anomaly_decomposition(iqr)
## Error: Object is not of class `tbl_time`.
In GESD anomalies are progressively evaluated removing the worst offenders and recalculating the test statistics and critical values, or more simply you can say that a range is recalculated after identifying the anomalies in an iterative way.
gesd <-anom %>%
time_decompose(Sales, method = "stl") %>%
anomalize(remainder, method = "gesd")
## Error in time_decompose(., Sales, method = "stl"): object 'anom' not found
plot_anomaly_decomposition(gesd)
## Error: Object is not of class `tbl_time`.
GESD is more accurate since it detects more anomalies than IQR with the same hyperparameters