install_and_load_pkgs <- function(pkg){
  # Load a vector of packages, installing any that are missing first.
  # CODE SOURCE: https://gist.github.com/stevenworthington/3178163
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE, quietly = TRUE, warn.conflicts = TRUE)
}
# list the packages here
pkgs <- c("dplyr", "tidyr", "forecast", "ggplot2", "xts", "doMC", "kableExtra", "openxlsx")
install_and_load_pkgs(pkgs)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'forecast' was built under R version 3.4.4
## Warning: package 'xts' was built under R version 3.4.4
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Warning: package 'kableExtra' was built under R version 3.4.4
## Warning: package 'openxlsx' was built under R version 3.4.4
# if you are using Windows, use this command:
# install.packages("doMC", repos="http://R-Forge.R-project.org")
source('Source/project1_jg.R')
source('local_cv_test.R')
On initial inspection, the SeriesInd values show small, regularly occurring gaps across all data points. The consecutive runs of four and five index values suggest that the series may be based on weekday reporting; however, there is no obvious origin date that would link the indices to actual datetimes. The indices were therefore treated as consecutive, ignoring the gaps, much as weekday data are typically handled, but without any indication of frequency (e.g. weekly, monthly, or yearly periods). Because the frequency is unknown, it is treated as a hyperparameter and varied during training, which is not typically done.
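Since candidate frequencies are simply imposed when the series objects are rebuilt during tuning, a minimal sketch of the idea is shown below; the value of 5 and the object names are illustrative assumptions, not the tuned settings.
# Hypothetical sketch: treat the SeriesInd values as consecutive observations and
# impose a candidate frequency while tuning (5 here is only an illustration).
candidate_freq <- 5
s03_v05_candidate_ts <- ts(s03_v05_df$Var05, frequency = candidate_freq, start = 1)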
As a first step, the data were checked for missing values, which were present in each of the measured variables (Var01 through Var07); SeriesInd and group were complete. The missing values were filled in by interpolation.
#check nulls
sapply(excel_df, function(x){sum(is.na(x))})
## SeriesInd group Var01 Var02 Var03 Var05 Var07
## 0 0 14 2 26 26 26
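The imputation itself is performed in the sourced project script; one plausible implementation, sketched below for a single series, would use forecast::na.interp (the actual code may differ).
# Hypothetical sketch of the interpolation step (the report's s03_v01_full_ts is
# built in Source/project1_jg.R): na.interp() fills NAs by interpolation.
s03_v01_interp_sketch_ts <- na.interp(s03_v01_ts)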
#Graph imputations
autoplot(s03_v01_full_ts, series='interpolated') +
autolayer(s03_v01_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v02_full_ts, series='interpolated') +
autolayer(s03_v02_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v03_full_ts, series='interpolated') +
autolayer(s03_v03_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v05_full_ts, series='interpolated') +
autolayer(s03_v05_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v07_full_ts, series='interpolated') +
autolayer(s03_v07_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
Next, the data were analyzed for outliers. A preliminary inspection showed one value in S06 (Var01, Var03, Var05, Var07) and one value in S02 (Var01, Var03, Var05, Var07) that were far removed from all other measurements; these values were therefore removed.
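The cleaned data frame used below (excel_no_outlier_df) is built in the sourced scripts; a rough sketch of how such single extreme points could be blanked out, to be re-interpolated afterwards, is shown here with an illustrative IQR fence.
# Hypothetical sketch of the outlier removal described above (the report's
# excel_no_outlier_df comes from the sourced scripts): blank out values far outside
# each group's interquartile range and leave them for re-interpolation.
remove_extreme <- function(x) {
  bounds <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  fence  <- 5 * diff(bounds)  # wide fence so only the single extreme points are caught
  ifelse(x < bounds[1] - fence | x > bounds[2] + fence, NA, x)
}
excel_no_outlier_sketch_df <- excel_df %>%
  group_by(group) %>%
  mutate_at(vars(Var01, Var03, Var05, Var07), remove_extreme) %>%
  ungroup()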
ggplot_df <- excel_df %>%
gather('Var', 'Value', 3:7)
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) +
geom_line() +
facet_wrap(~Var, scales = 'free', ncol = 2) +
ggtitle('Series Plots With Outliers') +
theme(plot.title = element_text(hjust = 0.5))
#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) +
geom_boxplot() +
facet_wrap(~Var, scales = 'free', ncol = 2) +
ggtitle('Series Box Plots With Outliers') +
theme(plot.title = element_text(hjust = 0.5))
ggplot_df <- excel_no_outlier_df %>%
gather('Var', 'Value', 3:7) #%>%
#use this to filter and graph subgroups
#filter(group=='S03' & Var=='Var05' & (SeriesInd > as.Date('2015-01-01') & SeriesInd < as.Date('2017-01-01')))
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) +
geom_line() +
facet_wrap(~Var, scales = 'free', ncol = 2) +
ggtitle('Series Plots Without Outliers') +
theme(plot.title = element_text(hjust = 0.5))
#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) +
geom_boxplot() +
facet_wrap(~Var, scales = 'free', ncol = 2) +
ggtitle('Series Box Plots Without Outliers') +
theme(plot.title = element_text(hjust = 0.5))
s03_v01_max_points_df <- s03_v01_df %>% filter(
SeriesInd<41600 & Var01 == max(s03_v01_df %>% filter(SeriesInd<41600) %>% select(Var01)) |
SeriesInd<42500 & Var01 == max(s03_v01_df %>% filter(SeriesInd<42500) %>% select(Var01))
)
s03_v01_freq <- (s03_v01_max_points_df[2,1] - s03_v01_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v01_df, aes(x=SeriesInd, y=Var01)) +
geom_line() +
geom_point(s03_v01_max_points_df,
mapping = aes(x=SeriesInd, y=Var01),
size=3,
color='blue')
#s03_v02_freq <- 125
#freq_points_df <- s03_v02_df[seq(1, nrow(s03_v02_df), s03_v02_freq), ]
ggplot(s03_v02_df, aes(x=SeriesInd, y=Var02)) +
geom_line() #+
#geom_point(freq_points_df,
# mapping = aes(x=SeriesInd, Var02),
# size=3,
# color='blue')
s03_v03_max_points_df <- s03_v03_df %>% filter(
SeriesInd<41600 & Var03 == max(s03_v03_df %>% filter(SeriesInd<41600) %>% select(Var03)) |
SeriesInd<42500 & Var03 == max(s03_v03_df %>% filter(SeriesInd<42500) %>% select(Var03))
)
s03_v03_freq <- (s03_v03_max_points_df[2,1] - s03_v03_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v03_df, aes(x=SeriesInd, y=Var03)) +
geom_line() +
ggtitle('S03 Var03 Data Plot') +
theme(plot.title = element_text(hjust = 0.5)) +
geom_point(s03_v03_max_points_df,
mapping = aes(x=SeriesInd, y=Var03),
size=3,
color='blue')
There is an obvious upward trend in the data for S03 Var05 and Var07, possibly with some cyclical behavior. The two variables are highly correlated with each other, so the method selected for one is expected to work for the other. For a series with no obvious trend-cycle, simple exponential smoothing would work best; given the apparent upward trend, however, Holt's method, which was developed specifically for trended series, is likely a better fit. Holt-Winters and STL decomposition models will also be examined in case there is an unseen seasonal component. Additionally, given the short-term erratic point-to-point behavior, an ARIMA model will be attempted, with a drift component included to capture the upward trend. Finally, a simple naive model will be constructed to gauge the effectiveness of the other models.
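For reference, a condensed sketch of the candidate model families using the forecast package is given below; the actual fits, with their tuned frequencies and settings, come from the sourced scripts, and the frequency of 5 and object names used here are illustrative assumptions.
# Hypothetical sketch of the candidate families (illustrative names and frequency;
# the real fits and tuned hyperparameters come from the sourced scripts).
seasonal_sketch_ts <- ts(s03_v05_full_ts, frequency = 5)      # assumed frequency
naive_cand <- naive(s03_v05_full_ts, h = 140)                 # benchmark
ses_cand   <- ses(s03_v05_full_ts, h = 140)                   # simple exponential smoothing
holt_cand  <- holt(s03_v05_full_ts, h = 140)                  # Holt's linear trend
hw_cand    <- hw(seasonal_sketch_ts, h = 140)                 # Holt-Winters (needs a seasonal frequency)
stl_cand   <- stlf(seasonal_sketch_ts, h = 140)               # STL decomposition + ETS
arima_cand <- forecast(auto.arima(s03_v05_full_ts), h = 140)  # ARIMA; drift included automatically if warranted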
For each of the models above, the hyperparameters were tuned to minimize the root mean square error (RMSE) at the 140th step ahead (the maximum prediction length). To ensure the selected model fits the series as a whole rather than just the last few points, the 140-step-ahead error was computed from every forecast origin between the 1000th point and the end of the data set, and the RMSE was calculated over those results. The mean absolute percentage error (MAPE) was also computed for additional insight. Only valid candidate models are shown below.
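The evaluation itself is implemented in the sourced local_cv_test.R; conceptually, the rolling-origin scoring works like the sketch below (the helper and the naive example are illustrative, not the project's tsCV_jg).
# Hypothetical sketch of the rolling-origin evaluation described above: refit from
# every origin at or after the 1000th point, forecast 140 steps ahead, and keep
# only the 140th-step error; RMSE and MAPE are then computed over all origins.
rolling_h_step_errors <- function(y, fit_fun, h = 140, first_origin = 1000) {
  origins <- seq(first_origin, length(y) - h)
  sapply(origins, function(o) {
    fc <- fit_fun(subset(y, end = o), h)
    y[o + h] - fc$mean[h]  # error at exactly h steps ahead
  })
}
e <- rolling_h_step_errors(s03_v05_full_ts, function(train, h) naive(train, h = h))
actuals <- s03_v05_full_ts[seq(1000 + 140, length(s03_v05_full_ts))]
sqrt(mean(e^2, na.rm = TRUE))               # RMSE_140
mean(abs(e / actuals), na.rm = TRUE) * 100  # MAPE_140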
s03_v05_max_points_df <- s03_v05_df %>% filter(
SeriesInd<41600 & Var05 == max(s03_v05_df %>% filter(SeriesInd<41600) %>% select(Var05)) |
SeriesInd<42500 & Var05 == max(s03_v05_df %>% filter(SeriesInd<42500) %>% select(Var05))
)
s03_v05_freq <- (s03_v05_max_points_df[2,1] - s03_v05_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v05_df, aes(x=SeriesInd, y=Var05)) +
geom_line() +
ggtitle('S03 Var05 Data Plot') +
theme(plot.title = element_text(hjust = 0.5)) +
geom_point(s03_v05_max_points_df,
mapping = aes(x=SeriesInd, y=Var05),
size=3,
color='blue')
s03_v07_max_points_df <- s03_v07_df %>% filter(
SeriesInd<41600 & Var07 == max(s03_v07_df %>% filter(SeriesInd<41600) %>% select(Var07)) |
SeriesInd<42500 & Var07 == max(s03_v07_df %>% filter(SeriesInd<42500) %>% select(Var07))
)
s03_v07_freq <- (s03_v07_max_points_df[2,1] - s03_v07_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v07_df, aes(x=SeriesInd, y=Var07)) +
geom_line() +
ggtitle('S03 Var07 Data Plot') +
theme(plot.title = element_text(hjust = 0.5)) +
geom_point(s03_v07_max_points_df,
mapping = aes(x=SeriesInd, y=Var07),
size=3,
color='blue')
#ggsubseriesplot(s03_v05_full_ts)
library(GGally)
ggpairs(base_df %>% select(-SeriesInd, -group))
gglagplot(s03_v05_ts) +
ggtitle('S03 Var05 Lag Plot') +
theme(plot.title = element_text(hjust = 0.5))
ggAcf(s03_v05_full_ts, lag.max = 1000) +
ggtitle('S03 Var05 ACF Plot') +
theme(plot.title = element_text(hjust = 0.5))
#train/test set
test_ratio <- 0.95
cv_h <- 140
cv_start <- 1000
prediction_length <- 140
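The fitted objects plotted below (e.g. s03_v05_naive_fit, s03_v05_naive_test_fit) are built in the sourced scripts; roughly, the 95/5 split and the naive fits look like this sketch, though the actual construction may differ.
# Hypothetical sketch of the train/test split and naive fits used in the plots
# below (the report's objects are created in the sourced scripts).
n_obs       <- length(s03_v05_full_ts)
train_end   <- floor(test_ratio * n_obs)
test_length <- n_obs - train_end
naive_train_sketch_ts <- subset(s03_v05_full_ts, end = train_end)
naive_test_sketch_ts  <- subset(s03_v05_full_ts, start = train_end + 1)
naive_test_fit_sketch <- naive(naive_train_sketch_ts, h = test_length)   # evaluated against the held-out window
naive_final_sketch    <- naive(s03_v05_full_ts, h = prediction_length)   # final forecast on the full series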
checkresiduals(s03_v05_naive_fit)
autoplot(s03_v05_naive_fit) +
autolayer(s03_v05_naive_test_fit) +
autolayer(s03_v05_naive_ts) +
ggtitle('S03 Var05 Naive Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_naive_fit)
autoplot(s03_v07_naive_fit) +
autolayer(s03_v07_naive_test_fit) +
autolayer(s03_v07_naive_ts) +
ggtitle('S03 Var07 Naive Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v05_stl_fit)
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 409.12, df = 322.4, p-value = 0.0007483
##
## Model df: 2. Total lags used: 324.4
autoplot(s03_v05_stl_fit) +
autolayer(s03_v05_stl_test_fit, series = 'STL_Test') +
autolayer(s03_v05_stl_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_stl_train_ts, h = length(s03_v05_stl_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var05 STL Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_stl_fit)
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 275.4, df = 178, p-value = 3.839e-06
##
## Model df: 2. Total lags used: 180
autoplot(s03_v07_stl_fit) +
autolayer(s03_v07_stl_test_fit, series = 'STL_Test') +
autolayer(s03_v07_stl_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_stl_train_ts, h = length(s03_v07_stl_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var07 STL Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v05_holt)
s03_v05_holt_fit <- s03_v05_holt %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v05_holt_fit %>%
autoplot() +
autolayer(s03_v05_holt_test_fit, series = 'Holt_Test') +
autolayer(s03_v05_holt_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_holt_train_ts, h = length(s03_v05_holt_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var05 Holt Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_holt)
s03_v07_holt_fit <- s03_v07_holt %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v07_holt_fit %>%
autoplot() +
autolayer(s03_v07_holt_test_fit, series = 'Holt_Test') +
autolayer(s03_v07_holt_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_holt_train_ts, h = length(s03_v07_holt_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var07 Holt Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v05_hw)
s03_v05_hw_fit <- s03_v05_hw %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v05_hw_fit %>%
autoplot() +
autolayer(s03_v05_hw_test_fit, series = 'HoltWinter_Test') +
autolayer(s03_v05_hw_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_hw_train_ts, h = length(s03_v05_hw_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var05 Holt Winter Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_hw)
s03_v07_hw_fit <- s03_v07_hw %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v07_hw_fit %>%
autoplot() +
autolayer(s03_v07_hw_test_fit, series = 'HoltWinter_Test') +
autolayer(s03_v07_hw_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_hw_train_ts, h = length(s03_v07_hw_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var07 Holt Winter Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v05_arima_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with drift
## Q* = 262.86, df = 175, p-value = 1.924e-05
##
## Model df: 5. Total lags used: 180
s03_v05_arima_fit %>%
autoplot() +
autolayer(s03_v05_arima_test_fit, series = 'Arima_Test') +
autolayer(s03_v05_arima_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_arima_train_ts, h = length(s03_v05_arima_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var05 Arima Plot') +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_arima_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0) with drift
## Q* = 243.34, df = 176, p-value = 0.000583
##
## Model df: 4. Total lags used: 180
s03_v07_arima_fit %>%
autoplot() +
autolayer(s03_v07_arima_test_fit, series = 'Arima_Test') +
autolayer(s03_v07_arima_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_arima_train_ts, h = length(s03_v07_arima_test_ts)), series="Naive_Test", PI=FALSE) +
scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) +
ggtitle('S03 Var07 Arima Plot') +
theme(plot.title = element_text(hjust = 0.5))
The hyperparameter training results indicate that the STL and ARIMA models both outperformed the naive benchmark, trading places as the best model at different measurement horizons. On visual inspection, the STL model tracks the trend more reliably, while the ARIMA model substantially over-forecasts on the presented train/test split. The STL model appears more responsive to the cycle component of the trend-cycle and is therefore the model selected. If the prediction length were much longer than 140 periods, the ARIMA model would likely fit better because it captures the trend component more faithfully. It is noted that the STL model's residuals do not pass the traditional statistical tests (the Ljung-Box results above); however, it remains the best of the attempted fits and is the selected model for S03 Var05. For S03 Var07, the same model type with a slightly different set of hyperparameters was chosen, for similar reasons and with similar results.
model_comparison_s03_v05_df %>%
arrange(RMSE_140) %>%
kable('html', caption = 'S03 Var05 Model Comparison') %>%
kable_styling()
| Name | RMSE_10 | RMSE_100 | RMSE_140 | MAPE_10 | MAPE_100 | MAPE_140 |
|---|---|---|---|---|---|---|
| stl | 5.394094 | 15.52126 | 19.89764 | 4.107568 | 12.99470 | 16.73123 |
| arima | 5.087217 | 15.87391 | 19.91363 | 3.778246 | 13.52621 | 16.77458 |
| naive | 5.063947 | 16.27119 | 21.32072 | 3.789333 | 13.77596 | 17.78273 |
| holt | 5.092060 | 16.31039 | 21.51643 | 3.804666 | 13.78571 | 17.98299 |
| hw | 5.132639 | 16.80882 | 22.43504 | 3.835250 | 14.10881 | 18.38663 |
model_comparison_s03_v07_df %>%
arrange(RMSE_140) %>%
kable('html', caption = 'S03 Var07 Model Comparison') %>%
kable_styling()
| Name | RMSE_10 | RMSE_100 | RMSE_140 | MAPE_10 | MAPE_100 | MAPE_140 |
|---|---|---|---|---|---|---|
| stl | 5.004000 | 15.09197 | 19.61776 | 3.761986 | 12.39642 | 16.11634 |
| arima | 5.104626 | 15.76577 | 19.71367 | 3.785808 | 13.41122 | 16.63635 |
| naive | 5.084478 | 16.19112 | 21.23420 | 3.815446 | 13.69470 | 17.71861 |
| holt | 5.095034 | 16.24039 | 21.50517 | 3.815808 | 13.67977 | 17.87598 |
| hw | 5.219941 | 16.31255 | 22.15851 | 3.905713 | 13.21275 | 17.85656 |
forecast_fun_reg <- function(y, h, xreg = NULL) {
  # Dynamic harmonic regression: ARIMA errors with Fourier terms as regressors.
  # The model is fit on in-sample Fourier terms; the forecast step needs
  # *future* Fourier terms of length h.
  fit_xreg <- fourier(y, K = 9)
  auto.arima(y, lambda = 0, xreg = fit_xreg) %>%
    forecast(xreg = fourier(y, K = 9, h = h))
}
source('local_cv_test.R')
#try cv instead
e <- tsCV_jg(y = s03_v05_arima_ts,
forecastfunction = forecast_fun_no_reg,
h=2,
initial=1600)
sqrt(colMeans(e^2, na.rm = TRUE))
ets_frequency <- 18
s03_v05_ets_ts <- ts(s03_v05_full_ts, frequency = ets_frequency, start = 1)
s03_v05_ets_train_ts <- subset(s03_v05_ets_ts, start = 1, end = floor(test_ratio * 1622))
s03_v05_ets_test_ts <- subset(s03_v05_ets_ts, start = floor(test_ratio * 1622))
s03_v05_ets_test <- ets(s03_v05_ets_train_ts,
'AAN',
lambda = NULL)
s03_v05_ets_test
checkresiduals(s03_v05_ets_test)
autoplot(s03_v05_ets_test)
s03_v05_ets_test_fit <- s03_v05_ets_test %>% forecast(h=test_length)
s03_v05_ets_test_fit %>%
autoplot() +
autolayer(s03_v05_ets_test_ts)
s03_v05_tbats_ts <- ts(s03_v05_full_ts, frequency = 30, start = min(excel_df$SeriesInd))
s03_v05_tbats_train_ts <- subset(s03_v05_tbats_ts, start = 1, end = length(s03_v05_tbats_ts)-test_length)
s03_v05_tbats_test_ts <- subset(s03_v05_tbats_ts, start = length(s03_v05_tbats_ts)-test_length+1)
s03_v05_tbats_test <- tbats(s03_v05_tbats_train_ts)
s03_v05_tbats_test
checkresiduals(s03_v05_tbats_test)
s03_v05_tbats_test_fit <- s03_v05_tbats_test %>% forecast(h=test_length)
s03_v05_tbats_test_fit %>%
autoplot() +
autolayer(s03_v05_tbats_test_ts)