Load the IR page-click data

montana <- read.csv('montana_state_university_RAMP_pc_daily_clicks.csv')
head(montana)

##         date clicks            repository_id
## 1 2017-02-01    676 montana_state_university
## 2 2017-02-02    654 montana_state_university
## 3 2017-02-03    623 montana_state_university
## 4 2017-02-04    473 montana_state_university
## 5 2017-02-05    593 montana_state_university
## 6 2017-02-06    793 montana_state_university

Review data type

str(montana)

## 'data.frame':    1526 obs. of  3 variables:
##  $ date         : chr  "2017-02-01" "2017-02-02" "2017-02-03" "2017-02-04" ...
##  $ clicks       : int  676 654 623 473 593 793 765 636 898 1122 ...
##  $ repository_id: chr  "montana_state_university" "montana_state_university" "montana_state_university" "montana_state_university" ...

Convert date to standard date class in R format

montana$date <- as.Date(montana$date)
# as.Date(montana$date, "%m/%d/%Y") if the input file have a different format

Drop repository_id

montana <- subset(montana, select = -repository_id)

head(montana)

##         date clicks
## 1 2017-02-01    676
## 2 2017-02-02    654
## 3 2017-02-03    623
## 4 2017-02-04    473
## 5 2017-02-05    593
## 6 2017-02-06    793

Date has to be sorted first.

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.0.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

montana <- montana %>% arrange(montana$date)

str(montana)

## 'data.frame':    1526 obs. of  2 variables:
##  $ date  : Date, format: "2017-01-01" "2017-01-02" ...
##  $ clicks: int  636 898 1122 1076 1030 964 706 946 1250 1180 ...

Method 1: use base R ts() function and plot.ts()

montana_series <- ts(montana)

class(montana_series)

## [1] "mts"    "ts"     "matrix"

plot.ts(montana_series)

The ts() function is very limited in terms of handling daily values in our usage data.

Method 2: using the logic in R Statistics Cookbook, Chapter 7 by Juretig, 2019, on anomaly detection

library(tscount)

## Warning: package 'tscount' was built under R version 4.0.5

library(dplyr)
library(anomalize)

## Warning: package 'anomalize' was built under R version 4.0.5

## == Use anomalize to improve your Forecasts by 50%! =============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>

library(tibbletime)

## Warning: package 'tibbletime' was built under R version 4.0.5

## 
## Attaching package: 'tibbletime'

## The following object is masked from 'package:stats':
## 
##     filter

Calculate days difference from the date column.

difference = montana %>% 
  mutate(diff_days = as.numeric(date-lag(date))) #append a new column diff_days by subtracting the date difference

difference[is.na(difference)]  = 0  #Convert all na values in dataframe to zero

head(difference)

##         date clicks diff_days
## 1 2017-01-01    636         0
## 2 2017-01-02    898         1
## 3 2017-01-03   1122         1
## 4 2017-01-04   1076         1
## 5 2017-01-05   1030         1
## 6 2017-01-06    964         1

Anomaly Detection

Transform the dataset into tibbletime object and assigned date as index

montana_anomalies = as_tbl_time(difference,index = date) 
str(montana_anomalies)

## tbl_time [1,526 x 3] (S3: tbl_time/tbl_df/tbl/data.frame)
##  $ date     : Date[1:1526], format: "2017-01-01" "2017-01-02" ...
##  $ clicks   : int [1:1526] 636 898 1122 1076 1030 964 706 946 1250 1180 ...
##  $ diff_days: num [1:1526] 0 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "index_quo")= language ~date
##   ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  - attr(*, "index_time_zone")= chr "UTC"

head(montana_anomalies)

## # A time tibble: 6 x 3
## # Index: date
##   date       clicks diff_days
##   <date>      <int>     <dbl>
## 1 2017-01-01    636         0
## 2 2017-01-02    898         1
## 3 2017-01-03   1122         1
## 4 2017-01-04   1076         1
## 5 2017-01-05   1030         1
## 6 2017-01-06    964         1

Decompose the time series into trend, seasonality, and remainder components

We assumed the IR usage data is seasonal. Decomposition will estimate the trend component, seasonal component, and irregular component

montana_anomalies %>% 
  time_decompose(clicks, merge = TRUE) #time_decompose from anomalize library

## frequency = 7 days

## trend = 91 days

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Warning: `type_convert()` only converts columns of type 'character'.
## - `df` has no columns of type 'character'

## # A time tibble: 1,526 x 7
## # Index: date
##    date       clicks diff_days observed season trend remainder
##    <date>      <int>     <dbl>    <dbl>  <dbl> <dbl>     <dbl>
##  1 2017-01-01    636         0      636   51.3  934.   -349.  
##  2 2017-01-02    898         1      898  -76.3  933.     41.6 
##  3 2017-01-03   1122         1     1122 -204.   932.    394.  
##  4 2017-01-04   1076         1     1076  -56.7  930.    202.  
##  5 2017-01-05   1030         1     1030  110.   929.     -8.78
##  6 2017-01-06    964         1      964   92.7  928.    -56.5 
##  7 2017-01-07    706         1      706   83.4  927.   -304.  
##  8 2017-01-08    946         1      946   51.3  925.    -30.7 
##  9 2017-01-09   1250         1     1250  -76.3  924.    402.  
## 10 2017-01-10   1180         1     1180 -204.   923.    461.  
## # ... with 1,516 more rows

# It gives you an output of how many months in frequency and trend calculation
# Four columns are displayed for time tibble: observed, season, trend, and remainder

Detect the anomalies using the remainder

montana_anomalies %>% 
  time_decompose(clicks, merge = TRUE)%>%
  anomalize(remainder, method = "iqr")

## frequency = 7 days

## trend = 91 days

## Warning: `type_convert()` only converts columns of type 'character'.
## - `df` has no columns of type 'character'

## # A time tibble: 1,526 x 10
## # Index: date
##    date       clicks diff_days observed season trend remainder remainder_l1
##    <date>      <int>     <dbl>    <dbl>  <dbl> <dbl>     <dbl>        <dbl>
##  1 2017-01-01    636         0      636   51.3  934.   -349.          -539.
##  2 2017-01-02    898         1      898  -76.3  933.     41.6         -539.
##  3 2017-01-03   1122         1     1122 -204.   932.    394.          -539.
##  4 2017-01-04   1076         1     1076  -56.7  930.    202.          -539.
##  5 2017-01-05   1030         1     1030  110.   929.     -8.78        -539.
##  6 2017-01-06    964         1      964   92.7  928.    -56.5         -539.
##  7 2017-01-07    706         1      706   83.4  927.   -304.          -539.
##  8 2017-01-08    946         1      946   51.3  925.    -30.7         -539.
##  9 2017-01-09   1250         1     1250  -76.3  924.    402.          -539.
## 10 2017-01-10   1180         1     1180 -204.   923.    461.          -539.
## # ... with 1,516 more rows, and 2 more variables: remainder_l2 <dbl>,
## #   anomaly <chr>

Putting the code sequence together and assigned to dataframe

results = 
  montana_anomalies %>%
  time_decompose(clicks, method = "stl", merge = TRUE) %>%
  anomalize(remainder, method = "iqr")

## frequency = 7 days

## trend = 91 days

## Warning: `type_convert()` only converts columns of type 'character'.
## - `df` has no columns of type 'character'

head(results)

## # A time tibble: 6 x 10
## # Index: date
##   date       clicks diff_days observed season trend remainder remainder_l1
##   <date>      <int>     <dbl>    <dbl>  <dbl> <dbl>     <dbl>        <dbl>
## 1 2017-01-01    636         0      636   51.3  934.   -349.          -539.
## 2 2017-01-02    898         1      898  -76.3  933.     41.6         -539.
## 3 2017-01-03   1122         1     1122 -204.   932.    394.          -539.
## 4 2017-01-04   1076         1     1076  -56.7  930.    202.          -539.
## 5 2017-01-05   1030         1     1030  110.   929.     -8.78        -539.
## 6 2017-01-06    964         1      964   92.7  928.    -56.5         -539.
## # ... with 2 more variables: remainder_l2 <dbl>, anomaly <chr>

Finally, join everything back together

montana_result <- results%>% time_recompose()

Plot the result

montana_result %>% plot_anomaly_decomposition(ncol=2,alpha_dots = 0.3)

Times Series - pilot on anomaly detection

Cary

9/23/2021