In this section, I will check if there are any anomalies in the sales data. The objective of this exercise is fraud detection.

# calling the libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter

Loading the Dataset

# loading the dataset
df <- read_csv('http://bit.ly/CarreFourSalesDataset')
## Rows: 1000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (1): Sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# checking the class of the dataset
df %>% class()
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

Previewing the Data

# Determining the number of rows and columns in the dataset
cat('The dataset has', nrow(df), 'rows and', ncol(df), 'columns')
## The dataset has 1000 rows and 2 columns
# previewing the top of the dataset
df %>% head()
## # A tibble: 6 × 2
##   Date      Sales
##   <chr>     <dbl>
## 1 1/5/2019  549. 
## 2 3/8/2019   80.2
## 3 3/3/2019  341. 
## 4 1/27/2019 489. 
## 5 2/8/2019  634. 
## 6 3/25/2019 628.
# checking the structure of the dataset and datatype of each column
df %>% str()
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date : chr [1:1000] "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Sales already has the appropriate data type; I will convert the Date column from character to a proper Date type using lubridate.

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df$Date <- mdy(df$Date)
# checking if the changes are implemented
df %>% str()
## spec_tbl_df [1,000 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date : Date[1:1000], format: "2019-01-05" "2019-03-08" ...
##  $ Sales: num [1:1000] 549 80.2 340.5 489 634.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data Cleaning and Manipulation

a) Validation

# Checking value validity
df %>% summary()
##       Date                Sales        
##  Min.   :2019-01-01   Min.   :  10.68  
##  1st Qu.:2019-01-24   1st Qu.: 124.42  
##  Median :2019-02-13   Median : 253.85  
##  Mean   :2019-02-14   Mean   : 322.97  
##  3rd Qu.:2019-03-08   3rd Qu.: 471.35  
##  Max.   :2019-03-30   Max.   :1042.65

There is no entry below 0, so all the records are valid.
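
As a quick programmatic confirmation (a minimal sketch; it simply counts non-positive sales values, which should be zero):

# counting sales entries that are zero or negative (expected to be 0)
sum(df$Sales <= 0)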

b) Completeness

# Checking for missing values
sum(is.na(df))
## [1] 0

There are no missing values.

c) Consistency

# Check for duplicates
sum(duplicated(df))
## [1] 0

There are no duplicate entries.

d) Uniformity

# Checking uniformity in column names 
colnames(df)
## [1] "Date"  "Sales"

The column names are uniform (single words in title case), so no renaming is needed.

e) Outliers

boxplot(df$Sales)

There are a few outliers, but I will not remove them; unusually large sales are exactly the kind of observations the anomaly detection step should assess.
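
To quantify "a few", the sketch below counts the values the boxplot draws as outliers (the standard 1.5 × IQR whisker rule):

# counting the values flagged as outliers by the 1.5*IQR whisker rule
length(boxplot.stats(df$Sales)$out)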

Exploration Analysis

# Statistical description of the variables

library(dplyr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe(df$Sales)
##    vars    n   mean     sd median trimmed    mad   min     max   range skew
## X1    1 1000 322.97 245.89 253.85  293.91 233.78 10.68 1042.65 1031.97 0.89
##    kurtosis   se
## X1    -0.09 7.78

The Sales column is positively skewed (skewness of 0.89) and slightly platykurtic (excess kurtosis of -0.09).

# frequency distribution of the sales column
hist(df$Sales, col = 'darkred', ylim = c(0, 250))

The most frequent sales values fall between 0 and 200.
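
As a rough check of this reading of the histogram (a sketch; the 200 cutoff is simply the visual bin boundary):

# proportion of transactions with sales of at most 200
mean(df$Sales <= 200)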

# Sales distribution over time
library(ggplot2)
ggplot(data = df, aes(x = Date, y = Sales)) +
      geom_bar(stat = "identity", fill = "#f37735") +
      labs(title = "Sales distribution",
           x = "Date", y = "Sales(ksh)")

A few days in February and March recorded total daily sales above 6,000.
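
The bars above sum the sales for each date, so this reading can be verified with a quick aggregation (a sketch using the assumed 6,000 cutoff taken from the plot):

# daily total sales, keeping only the dates whose total exceeds 6,000
df %>%
  group_by(Date) %>%
  summarise(daily_total = sum(Sales)) %>%
  filter(daily_total > 6000)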

#Ordering the data by Date
df = df %>% arrange(Date)
df %>% head(5)
## # A tibble: 5 × 2
##   Date       Sales
##   <date>     <dbl>
## 1 2019-01-01  457.
## 2 2019-01-01  400.
## 3 2019-01-01  471.
## 4 2019-01-01  388.
## 5 2019-01-01  133.

The data contains several sales records per day. Feeding multiple observations per time point to the anomaly detection functions will throw an error, so I will first average the sales for each day.

# Calculating the average 
df = aggregate(Sales ~ Date , df , mean)
head(df)
##         Date    Sales
## 1 2019-01-01 395.4318
## 2 2019-01-02 243.1879
## 3 2019-01-03 259.7661
## 4 2019-01-04 270.6148
## 5 2019-01-05 294.7236
## 6 2019-01-06 401.5783
# Converting data frame to a tibble time (tbl_time)
# tbl_time objects have a time index that records which column
# should be used for time-based subsetting and other time-based manipulation

df = tbl_time(df, Date)
class(df)
## [1] "tbl_time"   "tbl_df"     "tbl"        "data.frame"

Anomaly Detection

# Using the 3 main functions for anomaly detection:
# 1. time_decompose(): Separates the time series into seasonal, trend, and remainder components
# 2. anomalize(): Applies anomaly detection methods to the remainder component.
# 3. time_recompose(): Calculates limits that separate the “normal” data from the anomalies

df_anomalized <- df %>%
    time_decompose(Sales) %>%
    anomalize(remainder) %>%
    time_recompose()
## frequency = 7 days
## trend = 30 days
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
df_anomalized %>% glimpse()
## Rows: 89
## Columns: 10
## $ Date          <date> 2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01…
## $ observed      <dbl> 395.4318, 243.1879, 259.7661, 270.6148, 294.7236, 401.57…
## $ season        <dbl> 13.288685, -15.022564, 3.885046, -7.121344, 23.706308, 1…
## $ trend         <dbl> 313.5412, 315.4142, 317.2872, 319.1602, 320.6628, 322.16…
## $ remainder     <dbl> 68.601843, -57.203762, -61.406165, -41.424068, -49.64549…
## $ remainder_l1  <dbl> -376.4995, -376.4995, -376.4995, -376.4995, -376.4995, -…
## $ remainder_l2  <dbl> 385.2714, 385.2714, 385.2714, 385.2714, 385.2714, 385.27…
## $ anomaly       <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "N…
## $ recomposed_l1 <dbl> -49.66955, -76.10782, -55.32723, -64.46063, -32.13033, -…
## $ recomposed_l2 <dbl> 712.1013, 685.6630, 706.4436, 697.3102, 729.6405, 726.17…
# Visualizing the anomalies
df_anomalized %>%
    plot_anomalies(ncol = 1, alpha_dots = 0.25)

There are no anomalies in the dataset.
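
A quick tabulation of the anomaly flag confirms this reading of the plot (a minimal sketch; anomalize() labels flagged points "Yes" in the anomaly column):

# counting the observations flagged as anomalies (expected to be 0)
df_anomalized %>%
    filter(anomaly == "Yes") %>%
    nrow()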

Conclusion