knitr::opts_chunk$set(error = TRUE)
# Load tidyverse and anomalize
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(vctrs)
## 
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
## 
##     data_frame
## The following object is masked from 'package:tibble':
## 
##     data_frame
library(tidyr)
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df <- read_csv("/home/oppy/Downloads/Supermarket_Sales_Forecasting - Sales.csv")
## Rows: 1000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (1): Sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 2
##   Date      Sales
##   <chr>     <dbl>
## 1 1/5/2019  549. 
## 2 3/8/2019   80.2
## 3 3/3/2019  341. 
## 4 1/27/2019 489. 
## 5 2/8/2019  634. 
## 6 3/25/2019 628.
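
The parsing message above suggests declaring the column types; a quieter, equivalent read (a sketch using the spec reported by read_csv) would be:

df <- read_csv(
  "/home/oppy/Downloads/Supermarket_Sales_Forecasting - Sales.csv",
  col_types = cols(Date = col_character(), Sales = col_double())
)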

Checking the data types

# Check the data types
str(df)
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date : chr [1:1000] "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Convert the Date column to a date-time type

# Convert Date from character to Date; the format string must match
# values like "1/5/2019", i.e. month/day/four-digit year
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
# POSIXct represents calendar dates and times, which anomalize can
# use as the time index
df$Date <- as.POSIXct(df$Date)
df$Sales <- as.numeric(df$Sales)
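
Since lubridate is already attached, mdy() is an equivalent and more forgiving parser (it infers the separator). A quick illustration:

# lubridate::mdy() parses month/day/year strings regardless of separator
as.POSIXct(mdy(c("1/5/2019", "3/8/2019")))
## [1] "2019-01-05 UTC" "2019-03-08 UTC"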

Confirm the changes made

# Confirm the new column types
str(df)
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date : POSIXct[1:1000], format: "2019-01-05" "2019-03-08" ...
##  $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Checking for missing values

# Count missing values; with the dates parsed correctly this should be 0
sum(is.na(df))
## [1] 0
# Keep complete rows only, in case any value failed to parse
df1 <- df[complete.cases(df), ]
colSums(is.na(df1))
##  Date Sales 
##     0     0
head(df1)
## # A tibble: 6 × 2
##   Date                Sales
##   <dttm>              <dbl>
## 1 2019-01-05 00:00:00 549. 
## 2 2019-03-08 00:00:00  80.2
## 3 2019-03-03 00:00:00 341. 
## 4 2019-01-27 00:00:00 489. 
## 5 2019-02-08 00:00:00 634. 
## 6 2019-03-25 00:00:00 628. 

Confirming the changes

sum(is.na(df1))
## [1] 0
# as_tibble() replaces the deprecated as.tibble()
dff <- as_tibble(df1)
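
anomalize converts the tibble to a tbl_time automatically (note the messages below), but since tibbletime is loaded the index can also be made explicit. A small sketch, not required for the pipeline:

# Equivalent to the conversion anomalize performs on the fly below
as_tbl_time(dff, index = Date) %>% class()
## [1] "tbl_time"   "tbl_df"     "tbl"        "data.frame"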
anom <- dff %>%
  time_decompose(Sales, merge = TRUE) %>%
  anomalize(remainder) %>%
  time_recompose()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
anom %>% glimpse()
# Visualize the anomalies with plot_anomalies()
anom %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25)

Based on the labels, no anomalies are detected under the default settings.
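
Rather than reading this off the plot, the flag can be tabulated directly; anomalize() adds an anomaly column with values "Yes"/"No":

# Tabulate the anomaly flag added by anomalize()
anom %>% as_tibble() %>% count(anomaly)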

Trend and seasonality are fundamental to time series analysis, and in particular to time series decomposition: time_decompose() splits the observed series into trend, seasonal, and remainder components, and anomalies are flagged on the remainder.
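
For reference, the decomposition windows can be set explicitly rather than auto-detected; the values below are the package defaults, shown for illustration:

# "auto" lets anomalize pick the seasonal frequency and trend window
dff %>%
  time_decompose(Sales, method = "stl", frequency = "auto", trend = "auto")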

# Plot the full decomposition, with anomalies highlighted in the remainder
anom_plot <- anom %>%
  plot_anomaly_decomposition() +
  ggtitle("Plotting Anomalies")
anom_plot

Parameter Tuning

We will tune the alpha and max_anoms parameters: alpha sets the width of the "normal" band (a larger alpha narrows the band), while max_anoms caps the maximum fraction of points that may be flagged as anomalies.

# Re-run the full pipeline on the raw data with tuned parameters
dff %>%
  time_decompose(Sales) %>%
  anomalize(remainder, alpha = 0.09, max_anoms = 0.10) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE) +
  ggtitle("alpha = 0.09")
# Increase both alpha and max_anoms
dff %>%
  time_decompose(Sales) %>%
  anomalize(remainder, alpha = 0.5, max_anoms = 0.5) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE) +
  ggtitle("alpha = 0.5")

When alpha and max_anoms are increased, more anomalies are observed: a larger alpha narrows the band of "normal" values, and a larger max_anoms allows a greater share of points to be flagged.
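
The effect of alpha is easy to quantify: in anomalize's IQR method the band multiplier works out to 0.15 / alpha (so the default alpha = 0.05 gives a 3x factor; a detail worth verifying against your installed version):

# Band multipliers implied by the alphas used above
# alpha = 0.05 -> 3x, alpha = 0.09 -> ~1.67x, alpha = 0.5 -> 0.3x
0.15 / c(0.05, 0.09, 0.5)
## [1] 3.000000 1.666667 0.300000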

The IQR method takes the 25th and 75th percentiles of the remainder to establish its interquartile range. Limits are set, by default, at 3 times the IQR above and below those quartiles, and any remainder beyond the limits is treated as an anomaly. A minimal sketch of that rule follows.
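
The sketch below applies the rule to a toy remainder vector; it mirrors the idea, not anomalize's exact internals:

# Toy remainder with one obvious outlier
remainder <- c(-2, -1, 0, 1, 2, 50)
q <- quantile(remainder, c(0.25, 0.75))
iq_range <- q[[2]] - q[[1]]
# Default limits: 3 times the IQR beyond each quartile
limits <- c(lower = q[[1]] - 3 * iq_range, upper = q[[2]] + 3 * iq_range)
remainder[remainder < limits[["lower"]] | remainder > limits[["upper"]]]
## [1] 50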

iqr <- dff %>%
  time_decompose(Sales, method = "stl") %>%
  anomalize(remainder, method = "iqr")
plot_anomaly_decomposition(iqr)

In GESD (Generalized Extreme Studentized Deviate), anomalies are evaluated progressively: the worst offender is removed, the test statistic and critical value are recalculated, and the process repeats. Put simply, the limits are recalculated iteratively as anomalies are identified.
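
A rough sketch of that iteration on toy data (a simplification: real GESD uses t-distribution critical values rather than simply taking the worst point):

# Repeatedly remove the largest studentized deviation, recomputing
# the mean and sd on the points that remain
x <- c(1, 2, 2, 3, 3, 4, 25, 40)
for (i in 1:2) {
  z <- abs(x - mean(x)) / sd(x)
  worst <- which.max(z)
  cat("iteration", i, ": removing", x[worst], "\n")
  x <- x[-worst]
}
## iteration 1 : removing 40
## iteration 2 : removing 25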

gesd <- dff %>%
  time_decompose(Sales, method = "stl") %>%
  anomalize(remainder, method = "gesd")
plot_anomaly_decomposition(gesd)

With the same hyperparameters, GESD detects more anomalies than IQR: because its limits are recalculated after each offender is removed, it is the more sensitive, and generally the more accurate (if slower), of the two methods.
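
One way to put numbers behind that comparison, reusing the iqr and gesd objects from above:

# Count flagged points per method
bind_rows(
  iqr %>% as_tibble() %>% count(anomaly) %>% mutate(method = "iqr"),
  gesd %>% as_tibble() %>% count(anomaly) %>% mutate(method = "gesd")
)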