# Load the daily click counts from the RAMP export for the Montana State
# University repository and preview the first rows.
montana <- read.csv(file = "montana_state_university_RAMP_pc_daily_clicks.csv")
head(montana)
## date clicks repository_id
## 1 2017-02-01 676 montana_state_university
## 2 2017-02-02 654 montana_state_university
## 3 2017-02-03 623 montana_state_university
## 4 2017-02-04 473 montana_state_university
## 5 2017-02-05 593 montana_state_university
## 6 2017-02-06 793 montana_state_university
# Inspect the column types of the freshly loaded data.
str(object = montana)
## 'data.frame': 1526 obs. of 3 variables:
## $ date : chr "2017-02-01" "2017-02-02" "2017-02-03" "2017-02-04" ...
## $ clicks : int 676 654 623 473 593 793 765 636 898 1122 ...
## $ repository_id: chr "montana_state_university" "montana_state_university" "montana_state_university" "montana_state_university" ...
# Parse the date strings into Date objects.
# Use as.Date(montana$date, "%m/%d/%Y") if the input file has a different format.
montana[["date"]] <- as.Date(montana[["date"]])
# Drop the repository_id column by position; every other column is kept.
montana <- montana[, -match("repository_id", names(montana)), drop = FALSE]
head(montana)
## date clicks
## 1 2017-02-01 676
## 2 2017-02-02 654
## 3 2017-02-03 623
## 4 2017-02-04 473
## 5 2017-02-05 593
## 6 2017-02-06 793
The rows have to be sorted by date first.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Order the rows chronologically. The column is referenced by its bare name so
# that dplyr resolves it inside the data flowing through the pipe, rather than
# reaching back to the global `montana` object with `$`.
montana <- montana %>%
  arrange(date)
str(montana)
## 'data.frame': 1526 obs. of 2 variables:
## $ date : Date, format: "2017-01-01" "2017-01-02" ...
## $ clicks: int 636 898 1122 1076 1030 964 706 946 1250 1180 ...
# Coerce the whole data frame into a base time-series object, check which
# classes it receives, and plot it.
montana_series <- stats::ts(data = montana)
class(montana_series)
## [1] "mts" "ts" "matrix"
stats::plot.ts(x = montana_series)
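The ts() function is poorly suited to daily data such as our usage counts: it describes time only by a start, an end, and a frequency, so the calendar dates are reduced to plain numbers.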
library(tscount)
## Warning: package 'tscount' was built under R version 4.0.5
library(dplyr)
library(anomalize)
## Warning: package 'anomalize' was built under R version 4.0.5
## == Use anomalize to improve your Forecasts by 50%! =============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(tibbletime)
## Warning: package 'tibbletime' was built under R version 4.0.5
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
# Add diff_days: the gap, in days, between each observation and the one before
# it. The first row has no predecessor, so its gap is NA and is set to 0.
# Only diff_days is filled; missing values in date or clicks are left as NA
# instead of being silently rewritten to 0.
difference <- montana %>%
  mutate(
    diff_days = as.numeric(date - dplyr::lag(date), units = "days"),
    diff_days = coalesce(diff_days, 0)
  )
head(difference)
## date clicks diff_days
## 1 2017-01-01 636 0
## 2 2017-01-02 898 1
## 3 2017-01-03 1122 1
## 4 2017-01-04 1076 1
## 5 2017-01-05 1030 1
## 6 2017-01-06 964 1
# Wrap the data frame as a time-aware tibble indexed by the date column,
# then inspect its structure.
montana_anomalies <- as_tbl_time(x = difference, index = date)
str(montana_anomalies)
## tbl_time [1,526 x 3] (S3: tbl_time/tbl_df/tbl/data.frame)
## $ date : Date[1:1526], format: "2017-01-01" "2017-01-02" ...
## $ clicks : int [1:1526] 636 898 1122 1076 1030 964 706 946 1250 1180 ...
## $ diff_days: num [1:1526] 0 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "index_quo")= language ~date
## ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## - attr(*, "index_time_zone")= chr "UTC"
# Preview the first rows of the time tibble.
head(montana_anomalies, n = 6L)
## # A time tibble: 6 x 3
## # Index: date
## date clicks diff_days
## <date> <int> <dbl>
## 1 2017-01-01 636 0
## 2 2017-01-02 898 1
## 3 2017-01-03 1122 1
## 4 2017-01-04 1076 1
## 5 2017-01-05 1030 1
## 6 2017-01-06 964 1
We assume the IR usage data is seasonal. Decomposition estimates the trend, seasonal, and irregular (remainder) components of the series.
# Decompose the clicks series with time_decompose() from the anomalize package.
# merge = TRUE keeps the original columns next to the decomposition columns.
time_decompose(montana_anomalies, target = clicks, merge = TRUE)
## frequency = 7 days
## trend = 91 days
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: `type_convert()` only converts columns of type 'character'.
## - `df` has no columns of type 'character'
## # A time tibble: 1,526 x 7
## # Index: date
## date clicks diff_days observed season trend remainder
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2017-01-01 636 0 636 51.3 934. -349.
## 2 2017-01-02 898 1 898 -76.3 933. 41.6
## 3 2017-01-03 1122 1 1122 -204. 932. 394.
## 4 2017-01-04 1076 1 1076 -56.7 930. 202.
## 5 2017-01-05 1030 1 1030 110. 929. -8.78
## 6 2017-01-06 964 1 964 92.7 928. -56.5
## 7 2017-01-07 706 1 706 83.4 927. -304.
## 8 2017-01-08 946 1 946 51.3 925. -30.7
## 9 2017-01-09 1250 1 1250 -76.3 924. 402.
## 10 2017-01-10 1180 1 1180 -204. 923. 461.
## # ... with 1,516 more rows
# The messages report the frequency (seasonal period) and trend span used for the decomposition, expressed in days here
# Four columns are added to the time tibble: observed, season, trend, and remainder
# Decompose the clicks series, then flag anomalies in the remainder with the
# interquartile-range method.
anomalize(
  time_decompose(montana_anomalies, target = clicks, merge = TRUE),
  target = remainder,
  method = "iqr"
)
## frequency = 7 days
## trend = 91 days
## Warning: `type_convert()` only converts columns of type 'character'.
## - `df` has no columns of type 'character'
## # A time tibble: 1,526 x 10
## # Index: date
## date clicks diff_days observed season trend remainder remainder_l1
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2017-01-01 636 0 636 51.3 934. -349. -539.
## 2 2017-01-02 898 1 898 -76.3 933. 41.6 -539.
## 3 2017-01-03 1122 1 1122 -204. 932. 394. -539.
## 4 2017-01-04 1076 1 1076 -56.7 930. 202. -539.
## 5 2017-01-05 1030 1 1030 110. 929. -8.78 -539.
## 6 2017-01-06 964 1 964 92.7 928. -56.5 -539.
## 7 2017-01-07 706 1 706 83.4 927. -304. -539.
## 8 2017-01-08 946 1 946 51.3 925. -30.7 -539.
## 9 2017-01-09 1250 1 1250 -76.3 924. 402. -539.
## 10 2017-01-10 1180 1 1180 -204. 923. 461. -539.
## # ... with 1,516 more rows, and 2 more variables: remainder_l2 <dbl>,
## # anomaly <chr>
# Keep the STL decomposition together with the IQR anomaly flags for the
# later recomposition and plotting steps.
results <- anomalize(
  time_decompose(
    montana_anomalies,
    target = clicks,
    method = "stl",
    merge = TRUE
  ),
  target = remainder,
  method = "iqr"
)
## frequency = 7 days
## trend = 91 days
## Warning: `type_convert()` only converts columns of type 'character'.
## - `df` has no columns of type 'character'
# Preview the first rows of the decomposition and anomaly table.
head(results, n = 6L)
## # A time tibble: 6 x 10
## # Index: date
## date clicks diff_days observed season trend remainder remainder_l1
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2017-01-01 636 0 636 51.3 934. -349. -539.
## 2 2017-01-02 898 1 898 -76.3 933. 41.6 -539.
## 3 2017-01-03 1122 1 1122 -204. 932. 394. -539.
## 4 2017-01-04 1076 1 1076 -56.7 930. 202. -539.
## 5 2017-01-05 1030 1 1030 110. 929. -8.78 -539.
## 6 2017-01-06 964 1 964 92.7 928. -56.5 -539.
## # ... with 2 more variables: remainder_l2 <dbl>, anomaly <chr>
# Recompose the anomaly results, then plot the decomposition in a two-column
# layout with semi-transparent points.
montana_result <- time_recompose(results)
plot_anomaly_decomposition(montana_result, ncol = 2, alpha_dots = 0.3)