Anomaly-Detection-IP-Week-3.knit

# loading the dataset using the fread function
library(data.table)

#import data
df <- fread("C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Sales_Forecasting - Sales.csv")

# previewing the first 6 rows
head(df)

##         Date    Sales
## 1:  1/5/2019 548.9715
## 2:  3/8/2019  80.2200
## 3:  3/3/2019 340.5255
## 4: 1/27/2019 489.0480
## 5:  2/8/2019 634.3785
## 6: 3/25/2019 627.6165

# previewing the last 6 rows
tail(df)

##         Date     Sales
## 1: 2/18/2019   63.9975
## 2: 1/29/2019   42.3675
## 3:  3/2/2019 1022.4900
## 4:  2/9/2019   33.4320
## 5: 2/22/2019   69.1110
## 6: 2/18/2019  649.2990

# checking the shape of our dataset
dim(df)

## [1] 1000    2

# we have 1000 rows and 2 columns

# checking for missing values per column
colSums(is.na(df))

##  Date Sales 
##     0     0

# no missing values exist in the dataset

# checking for duplicates in the dataset
dup <- df[duplicated(df),]
dup

## Empty data.table (0 rows and 2 cols): Date,Sales

# there are no duplicates in the dataset

# checking the datatypes of the columns
str(df)

## Classes 'data.table' and 'data.frame':   1000 obs. of  2 variables:
##  $ Date : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num  549 80.2 340.5 489 634.4 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# changing our date column to the correct data type
df$Date <- as.Date(df$Date, format = "%m/%d/%y")
#Convertion to POCIXct type
df$Date <- as.POSIXct(df$Date)
# previewing the changes
head(df)

##                   Date    Sales
## 1: 2020-01-05 03:00:00 548.9715
## 2: 2020-03-08 03:00:00  80.2200
## 3: 2020-03-03 03:00:00 340.5255
## 4: 2020-01-27 03:00:00 489.0480
## 5: 2020-02-08 03:00:00 634.3785
## 6: 2020-03-25 03:00:00 627.6165

# checking the summary statistics of our dataset
summary(df)

##       Date                           Sales        
##  Min.   :2020-01-01 03:00:00.0   Min.   :  10.68  
##  1st Qu.:2020-01-24 03:00:00.0   1st Qu.: 124.42  
##  Median :2020-02-13 03:00:00.0   Median : 253.85  
##  Mean   :2020-02-14 11:22:33.5   Mean   : 322.97  
##  3rd Qu.:2020-03-08 03:00:00.0   3rd Qu.: 471.35  
##  Max.   :2020-03-30 03:00:00.0   Max.   :1042.65

# we can clearly get the mean, median and quantile values for our sales column as shown below

# Load tidyverse and anomalize
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()

library(anomalize)

## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>

# visualising outliers using boxplot
library(lattice)
boxplot(df$Sales, xlab="Sales")

# outliers are clearly present in the sales column

# loading relevant libraries
library(tibbletime)

## 
## Attaching package: 'tibbletime'

## The following object is masked from 'package:stats':
## 
##     filter

library(timetk)

## 
## Attaching package: 'timetk'

## The following object is masked from 'package:data.table':
## 
##     :=

# converting our dataframe to a tibble and previewing the first 6 rows
dfd <- as_tibble(df)
head(dfd)

## # A tibble: 6 × 2
##   Date                Sales
##   <dttm>              <dbl>
## 1 2020-01-05 03:00:00 549. 
## 2 2020-03-08 03:00:00  80.2
## 3 2020-03-03 03:00:00 341. 
## 4 2020-01-27 03:00:00 489. 
## 5 2020-02-08 03:00:00 634. 
## 6 2020-03-25 03:00:00 628.

# DECOMPOSING THE TIME SERIES USING THE STL detection package
library(dplyr)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

dfe <- dfd %>%
  time_decompose(Sales, method = "stl",  frequency = "24 hour", trend = "1 day") %>%  
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>% 
  plot_anomaly_decomposition()

## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## frequency = 12 seconds

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## trend = 12 seconds

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

dfe

# we have been assigned a frequency of 12 seconds and a trend of 12 seconds by default. We assign our model a frequency of 24 hour and 1 day trend

# decomposing the time series using twitter detection package
dff <- dfd %>%
  time_decompose(Sales, method = "Twitter",  frequency = "24 hour", trend = "1 day") %>%  
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>% 
  plot_anomaly_decomposition()

## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## frequency = 12 seconds

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## median_span = 12 seconds

dff

# when compared to each other the stl package detects more anomalies over a 24 hr frequency and 1 day trend than twitter

# RECOMPOSING THE TIME SERIES USING stl
dfi <- dfd %>% 
  time_decompose(Sales, method = "stl",  frequency = "24 hour", trend = "1 day") %>% 
  anomalize(remainder,  method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
  time_recompose() %>%  
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)

## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## frequency = 12 seconds

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## trend = 12 seconds

dfi

x = dfd %>% 
  time_decompose(Sales, method = "stl",  frequency = "24 hour", trend = "1 day") %>%  
  anomalize(remainder,  method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%  
  time_recompose() %>%  
  filter(anomaly == 'Yes')

## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## frequency = 12 seconds

## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.

## trend = 12 seconds

## # A time tibble: 9 × 10
## # Index: Date
##   Date                observed season trend remainder remainder_l1 remainder_l2
##   <dttm>                 <dbl>  <dbl> <dbl>     <dbl>        <dbl>        <dbl>
## 1 2020-03-29 03:00:00     923.  -14.4 203.       734.        -738.         714.
## 2 2020-02-15 03:00:00    1043.   28.7 286.       728.        -738.         714.
## 3 2020-02-08 03:00:00    1021.   28.7 201.       791.        -738.         714.
## 4 2020-03-08 03:00:00     952.   12.7 217.       722.        -738.         714.
## 5 2020-01-30 03:00:00    1034.   10.4 209.       815.        -738.         714.
## 6 2020-03-09 03:00:00     935.  -34.6 211.       759.        -738.         714.
## 7 2020-01-15 03:00:00    1022.  -14.4 209.       828.        -738.         714.
## 8 2020-02-19 03:00:00     932.  -34.6 189.       778.        -738.         714.
## 9 2020-03-02 03:00:00    1022.  -14.4  91.6      945.        -738.         714.
## # … with 3 more variables: anomaly <chr>, recomposed_l1 <dbl>,
## #   recomposed_l2 <dbl>

# The anomalies detected are as shown by the dates in the code cell below which the marketing team should further investigate.