knitr::opts_chunk$set(error = TRUE)
# Load tidyverse and anomalize
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(vctrs)
## 
## Attaching package: 'vctrs'
## The following object is masked from 'package:dplyr':
## 
##     data_frame
## The following object is masked from 'package:tibble':
## 
##     data_frame
library(tidyr)
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df <- read_csv("/home/oppy/Downloads/Supermarket_Sales_Forecasting - Sales.csv")
## Rows: 1000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (1): Sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 2
##   Date      Sales
##   <chr>     <dbl>
## 1 1/5/2019  549. 
## 2 3/8/2019   80.2
## 3 3/3/2019  341. 
## 4 1/27/2019 489. 
## 5 2/8/2019  634. 
## 6 3/25/2019 628.
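
The parsing message above suggests declaring the column types; a quieter, equivalent read (a sketch using the spec reported by read_csv) would be:

df <- read_csv(
  "/home/oppy/Downloads/Supermarket_Sales_Forecasting - Sales.csv",
  col_types = cols(Date = col_character(), Sales = col_double())
)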

Checking the data types

# Check the data types
str(df)
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date : chr [1:1000] "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Convert the Date column to a date-time type

# Convert Date from character to Date; the format string must match
# values like "1/5/2019", i.e. month/day/four-digit year
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
# POSIXct represents calendar dates and times, which anomalize can
# use as the time index
df$Date <- as.POSIXct(df$Date)
df$Sales <- as.numeric(df$Sales)
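
Since lubridate is already attached, mdy() is an equivalent and more forgiving parser (it infers the separator). A quick illustration:

# lubridate::mdy() parses month/day/year strings regardless of separator
as.POSIXct(mdy(c("1/5/2019", "3/8/2019")))
## [1] "2019-01-05 UTC" "2019-03-08 UTC"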

Confirm the changes made

# Confirm the new column types
str(df)
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date : POSIXct[1:1000], format: "2019-01-05" "2019-03-08" ...
##  $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Checking for missing values

# Count missing values; with the dates parsed correctly this should be 0
sum(is.na(df))
## [1] 0
# Keep complete rows only, in case any value failed to parse
df1 <- df[complete.cases(df), ]
colSums(is.na(df1))
##  Date Sales 
##     0     0
head(df1)
## # A tibble: 6 × 2
##   Date                Sales
##   <dttm>              <dbl>
## 1 2019-01-05 00:00:00 549. 
## 2 2019-03-08 00:00:00  80.2
## 3 2019-03-03 00:00:00 341. 
## 4 2019-01-27 00:00:00 489. 
## 5 2019-02-08 00:00:00 634. 
## 6 2019-03-25 00:00:00 628. 

Confirming the changes

sum(is.na(df1))
## [1] 0
# as_tibble() replaces the deprecated as.tibble()
dff <- as_tibble(df1)
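
anomalize converts the tibble to a tbl_time automatically (note the messages below), but since tibbletime is loaded the index can also be made explicit. A small sketch, not required for the pipeline:

# Equivalent to the conversion anomalize performs on the fly below
as_tbl_time(dff, index = Date) %>% class()
## [1] "tbl_time"   "tbl_df"     "tbl"        "data.frame"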
anom <- dff %>%
  time_decompose(Sales, merge = TRUE) %>%
  anomalize(remainder) %>%
  time_recompose()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
anom %>% glimpse()
# Visualize the anomalies with plot_anomalies()
anom %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25)

Based on the labels, no anomalies are detected under the default settings.
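
Rather than reading this off the plot, the flag can be tabulated directly; anomalize() adds an anomaly column with values "Yes"/"No":

# Tabulate the anomaly flag added by anomalize()
anom %>% as_tibble() %>% count(anomaly)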

Trend and seasonality are fundamental to time series analysis, and in particular to time series decomposition: time_decompose() splits the observed series into trend, seasonal, and remainder components, and anomalies are flagged on the remainder.
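
For reference, the decomposition windows can be set explicitly rather than auto-detected; the values below are the package defaults, shown for illustration:

# "auto" lets anomalize pick the seasonal frequency and trend window
dff %>%
  time_decompose(Sales, method = "stl", frequency = "auto", trend = "auto")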

# Plot the full decomposition, with anomalies highlighted in the remainder
anom_plot <- anom %>%
  plot_anomaly_decomposition() +
  ggtitle("Plotting Anomalies")
anom_plot

Parameter Tuning

We will tune the alpha and max_anoms parameters: alpha sets the width of the "normal" band (a larger alpha narrows the band), while max_anoms caps the maximum fraction of points that may be flagged as anomalies.

# Re-run the full pipeline on the raw data with tuned parameters
dff %>%
  time_decompose(Sales) %>%
  anomalize(remainder, alpha = 0.09, max_anoms = 0.10) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE) +
  ggtitle("alpha = 0.09")
# Increase both alpha and max_anoms
dff %>%
  time_decompose(Sales) %>%
  anomalize(remainder, alpha = 0.5, max_anoms = 0.5) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE) +
  ggtitle("alpha = 0.5")

When alpha and max_anoms are increased, more anomalies are observed: a larger alpha narrows the band of "normal" values, and a larger max_anoms allows a greater share of points to be flagged.
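
The effect of alpha is easy to quantify: in anomalize's IQR method the band multiplier works out to 0.15 / alpha (so the default alpha = 0.05 gives a 3x factor; a detail worth verifying against your installed version):

# Band multipliers implied by the alphas used above
# alpha = 0.05 -> 3x, alpha = 0.09 -> ~1.67x, alpha = 0.5 -> 0.3x
0.15 / c(0.05, 0.09, 0.5)
## [1] 3.000000 1.666667 0.300000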

The IQR method takes the 25th and 75th percentiles of the remainder to establish its interquartile range. Limits are set, by default, at 3 times the IQR above and below those quartiles, and any remainder beyond the limits is treated as an anomaly. A minimal sketch of that rule follows.
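
The sketch below applies the rule to a toy remainder vector; it mirrors the idea, not anomalize's exact internals:

# Toy remainder with one obvious outlier
remainder <- c(-2, -1, 0, 1, 2, 50)
q <- quantile(remainder, c(0.25, 0.75))
iq_range <- q[[2]] - q[[1]]
# Default limits: 3 times the IQR beyond each quartile
limits <- c(lower = q[[1]] - 3 * iq_range, upper = q[[2]] + 3 * iq_range)
remainder[remainder < limits[["lower"]] | remainder > limits[["upper"]]]
## [1] 50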

iqr <- dff %>%
  time_decompose(Sales, method = "stl") %>%
  anomalize(remainder, method = "iqr")
plot_anomaly_decomposition(iqr)

In GESD (Generalized Extreme Studentized Deviate), anomalies are evaluated progressively: the worst offender is removed, the test statistic and critical value are recalculated, and the process repeats. Put simply, the limits are recalculated iteratively as anomalies are identified.
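
A rough sketch of that iteration on toy data (a simplification: real GESD uses t-distribution critical values rather than simply taking the worst point):

# Repeatedly remove the largest studentized deviation, recomputing
# the mean and sd on the points that remain
x <- c(1, 2, 2, 3, 3, 4, 25, 40)
for (i in 1:2) {
  z <- abs(x - mean(x)) / sd(x)
  worst <- which.max(z)
  cat("iteration", i, ": removing", x[worst], "\n")
  x <- x[-worst]
}
## iteration 1 : removing 40
## iteration 2 : removing 25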

gesd <- dff %>%
  time_decompose(Sales, method = "stl") %>%
  anomalize(remainder, method = "gesd")
plot_anomaly_decomposition(gesd)

With the same hyperparameters, GESD detects more anomalies than IQR: because its limits are recalculated after each offender is removed, it is the more sensitive, and generally the more accurate (if slower), of the two methods.
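
One way to put numbers behind that comparison, reusing the iqr and gesd objects from above:

# Count flagged points per method
bind_rows(
  iqr %>% as_tibble() %>% count(anomaly) %>% mutate(method = "iqr"),
  gesd %>% as_tibble() %>% count(anomaly) %>% mutate(method = "gesd")
)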