install_and_load_pkgs <- function(pkg){
# Load a vector of packages and install them if needed.
# CODE SOURCE: https://gist.github.com/stevenworthington/3178163
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg)) install.packages(new.pkg, dependencies = T)
sapply(pkg, require, character.only = T, quietly = T, warn.conflicts = T)
}
# list the packages here
pkgs <- c("dplyr", "tidyr", "forecast", "ggplot2", "xts", "doMC", "kableExtra", "openxlsx")
install_and_load_pkgs(pkgs)
# if you are using windows, use this command:
# install.packages("doMC", repos="http://R-Forge.R-project.org")
source('Source/project1_jg.R')
source('local_cv_test.R')
On initial inspection, the SeriesInd appear to have minor regularly occuring gaps for all data points. The consecutive runs of four and five index values do indicate that this time series might be based on weekday reporting; however, there did not appear to be a noticeable origin date to use in order to link the indices to actual datetimes. Therefore, it was decided that the indices were to be treated as if they were consecutive, ignoring the gaps, much like how weekday data would be processed anyways, but without an indication of frequency (e.g. week, month, year time periods). In fact, due to the unknown nature of the frequency of values, this feature will be varied in hyperparameter training, which is typically not the case.
As a first step, the data were checked for missing values, which were present in all of the variables. These values were filled in by interpolation.
#check nulls
sapply(excel_df, function(x){sum(is.na(x))})
## SeriesInd group Var01 Var02 Var03 Var05 Var07
## 0 0 14 2 26 26 26
#Graph imputations
autoplot(s03_v01_full_ts, series='interpolated') +
autolayer(s03_v01_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v02_full_ts, series='interpolated') +
autolayer(s03_v02_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v03_full_ts, series='interpolated') +
autolayer(s03_v03_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v05_full_ts, series='interpolated') +
autolayer(s03_v05_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
autoplot(s03_v07_full_ts, series='interpolated') +
autolayer(s03_v07_ts, series='original') +
scale_color_manual(
values = c(`interpolated`='red',`original`='gray')
)
Next, the data were analyzed for outliers. After a preliminary inspection, it appears that there was one value in S06 (Var01, Var03, Var05, Var07) and one value in S02 (Var01, Var03, Var05, Var07) that is much different that any other measurements, and were therefore removed.
ggplot_df <- excel_df %>%
gather('Var', 'Value', 3:7)
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) + geom_line() + facet_wrap(~Var, scales = 'free', ncol = 2)
#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) + geom_boxplot() + facet_wrap(~Var, scales = 'free', ncol = 2)
## Warning: Removed 94 rows containing non-finite values (stat_boxplot).
ggplot_df <- excel_no_outlier_df %>%
gather('Var', 'Value', 3:7) #%>%
#use this to filter and graph subgroups
#filter(group=='S03' & Var=='Var05' & (SeriesInd > as.Date('2015-01-01') & SeriesInd < as.Date('2017-01-01')))
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) + geom_line() + facet_wrap(~Var, scales = 'free', ncol = 2)
#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) + geom_boxplot() + facet_wrap(~Var, scales = 'free', ncol = 2)
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).
s03_v01_max_points_df <- s03_v01_df %>% filter(
SeriesInd<41600 & Var01 == max(s03_v01_df %>% filter(SeriesInd<41600) %>% select(Var01)) |
SeriesInd<42500 & Var01 == max(s03_v01_df %>% filter(SeriesInd<42500) %>% select(Var01))
)
s03_v01_freq <- (s03_v01_max_points_df[2,1] - s03_v01_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v01_df, aes(x=SeriesInd, y=Var01)) +
geom_line() +
geom_point(s03_v01_max_points_df,
mapping = aes(x=SeriesInd, Var01),
size=3,
color='blue')
#s03_v02_freq <- 125
#freq_points_df <- s03_v02_df[seq(1, nrow(s03_v02_df), s03_v02_freq), ]
ggplot(s03_v02_df, aes(x=SeriesInd, y=Var02)) +
geom_line() #+
#geom_point(freq_points_df,
# mapping = aes(x=SeriesInd, Var02),
# size=3,
# color='blue')
s03_v03_max_points_df <- s03_v03_df %>% filter(
SeriesInd<41600 & Var03 == max(s03_v03_df %>% filter(SeriesInd<41600) %>% select(Var03)) |
SeriesInd<42500 & Var03 == max(s03_v03_df %>% filter(SeriesInd<42500) %>% select(Var03))
)
s03_v03_freq <- (s03_v03_max_points_df[2,1] - s03_v03_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v03_df, aes(x=SeriesInd, y=Var03)) +
geom_line() +
geom_point(s03_v03_max_points_df,
mapping = aes(x=SeriesInd, Var03),
size=3,
color='blue')
There is an obvious upward trend in the data for S03 Var05 and Var07 with possibly some cyclical nature. Both are highly correlated with each other so it appears the final selected method for one will work with the other. For time series data with no obvious trend-cycle, a simple exponential smooothing method would work the best; however, since there is an apparent upward trend, then it is possible that the holt method would be a good fit, which was specifically developed to address these types of time series. A holt-winter and STL decomposition model will also be studied in case there is any unseen seasonal action being performed. Additionally, given the short-term erratic nature of the values from point to point, an ARIMA model will be attempted. This model will specifically include a drift component given that the data show an upward trend. Finally, a simple naive model will be constructed in order to gauge the effectiveness of the other models.
All the models mentioned above will have various hyperparameters trained and optimized to have the lowest root mean square error on the 140th observation (maximum prediction length). In order to ensure that the model that best fits the series, and not the last few points, the 140th step-ahead error was calculated from the 1000th point to the end of the data set and the RMSE was calculated on these results. Additionally, the mean average percentage error (MAPE) was taken to provide further insight.
s03_v05_max_points_df <- s03_v05_df %>% filter(
SeriesInd<41600 & Var05 == max(s03_v05_df %>% filter(SeriesInd<41600) %>% select(Var05)) |
SeriesInd<42500 & Var05 == max(s03_v05_df %>% filter(SeriesInd<42500) %>% select(Var05))
)
s03_v05_freq <- (s03_v05_max_points_df[2,1] - s03_v05_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v05_df, aes(x=SeriesInd, y=Var05)) +
geom_line() +
geom_point(s03_v05_max_points_df,
mapping = aes(x=SeriesInd, Var05),
size=3,
color='blue')
s03_v07_max_points_df <- s03_v07_df %>% filter(
SeriesInd<41600 & Var07 == max(s03_v07_df %>% filter(SeriesInd<41600) %>% select(Var07)) |
SeriesInd<42500 & Var07 == max(s03_v07_df %>% filter(SeriesInd<42500) %>% select(Var07))
)
s03_v07_freq <- (s03_v07_max_points_df[2,1] - s03_v07_max_points_df[1,1])[[1]]
#graph and examine possible frequencies
ggplot(s03_v07_df, aes(x=SeriesInd, y=Var07)) +
geom_line() +
geom_point(s03_v07_max_points_df,
mapping = aes(x=SeriesInd, Var07),
size=3,
color='blue')
#ggsubseriesplot(s03_v05_full_ts)
library(GGally)
ggpairs(base_df %>% select(-SeriesInd, -group))
gglagplot(s03_v05_ts)
ggAcf(s03_v05_full_ts, lag = 1000)
#train/test set
test_ratio <- 0.95
cv_h <- 140
cv_start <- 1000
prediction_length <- 140
naive_frequency = 1
s03_v05_naive_ts <- s03_v05_full_ts
s03_v05_naive_train_ts <- window(s03_v05_naive_ts,
start = 1,
end = floor(test_ratio * 1622/naive_frequency))
s03_v05_naive_test_ts <- window(s03_v05_naive_ts,
start = floor(test_ratio * 1622/naive_frequency))
s03_v05_naive_test_fit <- naive(s03_v05_naive_train_ts, h = length(s03_v05_naive_test_ts))
#checkresiduals(s03_v05_naive_test_fit)
#autoplot(s03_v05_naive_test_fit) +
# autolayer(s03_v05_naive_test_ts)
s03_v05_naive_fit <- naive(s03_v05_naive_ts, h = prediction_length)
checkresiduals(s03_v05_naive_fit)
##
## Ljung-Box test
##
## data: Residuals from Naive method
## Q* = 61.076, df = 10, p-value = 2.266e-09
##
## Model df: 0. Total lags used: 10
autoplot(s03_v05_naive_fit) +
autolayer(s03_v05_naive_test_fit) +
autolayer(s03_v05_naive_ts)
forecast_fun_no_reg <- function(y, h = h, xreg=NULL) {
x <- naive(y, h = h)
return(x)
}
#Do CV on final model
e_naive_s03_v05 <- tsCV_jg(y = s03_v05_naive_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_naive_s03_v05, file = 'Models/project1_jg_e_naive_s03_v05.RData')
naive_frequency = 1
s03_v07_naive_ts <- s03_v07_full_ts
s03_v07_naive_train_ts <- window(s03_v07_naive_ts,
start = 1,
end = floor(test_ratio * 1622/naive_frequency))
s03_v07_naive_test_ts <- window(s03_v07_naive_ts,
start = floor(test_ratio * 1622/naive_frequency))
s03_v07_naive_test_fit <- naive(s03_v07_naive_train_ts, h = length(s03_v07_naive_test_ts))
#checkresiduals(s03_v07_naive_test_fit)
#autoplot(s03_v07_naive_test_fit) +
# autolayer(s03_v07_naive_test_ts)
s03_v07_naive_fit <- naive(s03_v07_naive_ts, h = prediction_length)
checkresiduals(s03_v07_naive_fit)
##
## Ljung-Box test
##
## data: Residuals from Naive method
## Q* = 7.4084, df = 10, p-value = 0.6864
##
## Model df: 0. Total lags used: 10
autoplot(s03_v07_naive_fit) +
autolayer(s03_v07_naive_test_fit) +
autolayer(s03_v07_naive_ts)
forecast_fun_no_reg <- function(y, h = h, xreg=NULL) {
x <- naive(y, h = h)
return(x)
}
#Do CV on final model
e_naive_s03_v07 <- tsCV_jg(y = s03_v07_naive_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_naive_s03_v07, file = 'Models/project1_jg_e_naive_s03_v07.RData')
#STL Model
stl_frequency <- 180
s03_v05_stl_ts <- ts(s03_v05_full_ts, frequency = stl_frequency, start = 1)
s03_v05_stl_train_ts <- window(s03_v05_stl_ts,
start = 1,
end = floor(test_ratio * 1622/stl_frequency))
s03_v05_stl_test_ts <- window(s03_v05_stl_ts,
start = floor(test_ratio * 1622/stl_frequency))
s03_v05_stl_test_fit <- stlf(s03_v05_stl_train_ts,
method='ets',
t.window = 11,
s.window = 101,
robust = TRUE,
h = length(s03_v05_stl_test_ts))
#checkresiduals(s03_v05_stl_test_fit)
#autoplot(s03_v05_stl_test_fit) +
# autolayer(s03_v05_stl_test_ts)
#check full model
s03_v05_stl_fit <- stlf(s03_v05_stl_ts,
method='ets',
t.window = 11,
s.window = 101,
robust = TRUE,
h = prediction_length)
checkresiduals(s03_v05_stl_fit)
## Warning in checkresiduals(s03_v05_stl_fit): The fitted degrees of freedom
## is based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 409.12, df = 322.4, p-value = 0.0007483
##
## Model df: 2. Total lags used: 324.4
autoplot(s03_v05_stl_fit) +
autolayer(s03_v05_stl_test_fit, series = 'STL_Test') +
autolayer(s03_v05_stl_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_stl_train_ts, h = length(s03_v05_stl_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, xreg=NULL) {
x <- stlf(y,
method='ets',
t.window = 11,
s.window = 101,
robust = TRUE,
h = h)
return(x)
}
e_stl_s03_v05 <- tsCV_jg(y = s03_v05_stl_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_stl_s03_v05, file = 'Models/project1_jg_e_stl_s03_v05.RData')
stl_frequency <- 90
s03_v07_stl_ts <- ts(s03_v07_full_ts, frequency = stl_frequency, start = 1)
s03_v07_stl_train_ts <- window(s03_v07_stl_ts,
start = 1,
end = floor(test_ratio * 1622/stl_frequency))
s03_v07_stl_test_ts <- window(s03_v07_stl_ts,
start = floor(test_ratio * 1622/stl_frequency))
s03_v07_stl_test_fit <- stlf(s03_v07_stl_train_ts,
method='ets',
t.window = 1,
s.window = 7,
robust = TRUE,
h = length(s03_v07_stl_test_ts))
#checkresiduals(s03_v07_stl_test_fit)
#autoplot(s03_v07_stl_test_fit) +
# autolayer(s03_v07_stl_test_ts)
#check full model
s03_v07_stl_fit <- stlf(s03_v07_stl_ts,
method='ets',
t.window = 1,
s.window = 7,
robust = TRUE,
h = prediction_length)
checkresiduals(s03_v07_stl_fit)
## Warning in checkresiduals(s03_v07_stl_fit): The fitted degrees of freedom
## is based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 275.4, df = 178, p-value = 3.839e-06
##
## Model df: 2. Total lags used: 180
autoplot(s03_v07_stl_fit) +
autolayer(s03_v07_stl_test_fit, series = 'STL_Test') +
autolayer(s03_v07_stl_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_stl_train_ts, h = length(s03_v07_stl_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, xreg=NULL) {
x <- stlf(y,
method='ets',
t.window = 1,
s.window = 7,
robust = TRUE,
h = h)
return(x)
}
e_stl_s03_v07 <- tsCV_jg(y = s03_v07_stl_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_stl_s03_v07, file = 'Models/project1_jg_e_stl_s03_v07.RData')
#Holt Model
holt_frequency <- 1
s03_v05_holt_ts <- ts(s03_v05_full_ts, frequency = holt_frequency, start = 1)
s03_v05_holt_train_ts <- window(s03_v05_holt_ts,
start = 1,
end = floor(test_ratio * 1622/holt_frequency))
s03_v05_holt_test_ts <- window(s03_v05_holt_ts,
start = floor(test_ratio * 1622/holt_frequency))
s03_v05_holt_test <- holt(s03_v05_holt_train_ts,
lambda = NULL,
level = c(85, 90),
h=length(s03_v05_holt_test_ts))
#checkresiduals(s03_v05_holt_test)
s03_v05_holt_test_fit <- s03_v05_holt_test %>%
forecast(h=length(s03_v05_holt_test_ts), lambda=NULL, level = c(85, 90))
#s03_v05_holt_test_fit %>%
# autoplot() +
# autolayer(s03_v05_holt_test_ts)
#full model
s03_v05_holt <- holt(s03_v05_holt_ts,
lambda = NULL,
level = c(85, 90),
h=prediction_length)
checkresiduals(s03_v05_holt)
##
## Ljung-Box test
##
## data: Residuals from Holt's method
## Q* = 13.339, df = 6, p-value = 0.03795
##
## Model df: 4. Total lags used: 10
s03_v05_holt_fit <- s03_v05_holt %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v05_holt_fit %>%
autoplot() +
autolayer(s03_v05_holt_test_fit, series = 'Holt_Test') +
autolayer(s03_v05_holt_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_holt_train_ts, h = length(s03_v05_holt_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, lambda_sub=NULL) {
if(lambda_sub=='NULL'){
lambda_sub=NULL
}
tmp_model <- holt(y, h=h, lambda = lambda_sub)
tmp_fit <- tmp_model %>% forecast(h=h)
return(tmp_fit)
}
e_holt_s03_v05 <- tsCV_jg(y = s03_v05_holt_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
lambda_sub='NULL',
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_holt_s03_v05, file = 'Models/project1_jg_e_holt_s03_v05.RData')
holt_frequency <- 1
s03_v07_holt_ts <- ts(s03_v07_full_ts, frequency = holt_frequency, start = 1)
s03_v07_holt_train_ts <- window(s03_v07_holt_ts,
start = 1,
end = floor(test_ratio * 1622/holt_frequency))
s03_v07_holt_test_ts <- window(s03_v07_holt_ts,
start = floor(test_ratio * 1622/holt_frequency))
s03_v07_holt_test <- holt(s03_v07_holt_train_ts,
lambda = NULL,
level = c(85, 90),
h=length(s03_v07_holt_test_ts))
#checkresiduals(s03_v07_holt_test)
s03_v07_holt_test_fit <- s03_v07_holt_test %>%
forecast(h=length(s03_v07_holt_test_ts), lambda=NULL, level = c(85, 90))
#s03_v07_holt_test_fit %>%
# autoplot() +
# autolayer(s03_v07_holt_test_ts)
#full model
s03_v07_holt <- holt(s03_v07_holt_ts,
lambda = NULL,
level = c(85, 90),
h=prediction_length)
checkresiduals(s03_v07_holt)
##
## Ljung-Box test
##
## data: Residuals from Holt's method
## Q* = 7.5532, df = 6, p-value = 0.2727
##
## Model df: 4. Total lags used: 10
s03_v07_holt_fit <- s03_v07_holt %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v07_holt_fit %>%
autoplot() +
autolayer(s03_v07_holt_test_fit, series = 'Holt_Test') +
autolayer(s03_v07_holt_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_holt_train_ts, h = length(s03_v07_holt_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, lambda_sub=NULL) {
if(lambda_sub=='NULL'){
lambda_sub=NULL
}
tmp_model <- holt(y, h=h, lambda = lambda_sub)
tmp_fit <- tmp_model %>% forecast(h=h)
return(tmp_fit)
}
e_holt_s03_v07 <- tsCV_jg(y = s03_v07_holt_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
lambda_sub='NULL',
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_holt_s03_v07, file = 'Models/project1_jg_e_holt_s03_v07.RData')
#Holt Winter Model
hw_frequency <- 18
s03_v05_hw_ts <- ts(s03_v05_full_ts, frequency = hw_frequency, start = 1)
s03_v05_hw_train_ts <- window(s03_v05_hw_ts,
start = 1,
end = floor(test_ratio * 1622/hw_frequency))
s03_v05_hw_test_ts <- window(s03_v05_hw_ts,
start = floor(test_ratio * 1622/hw_frequency))
s03_v05_hw_test <- hw(s03_v05_hw_train_ts,
lambda = NULL,
seasonal='additive',
h=length(s03_v05_hw_test_ts),
level = c(85, 90))
#checkresiduals(s03_v05_hw_test)
s03_v05_hw_test_fit <- s03_v05_hw_test %>%
forecast(h=length(s03_v05_hw_test_ts), level = c(85, 90))
#s03_v05_hw_test_fit %>%
# autoplot() +
# autolayer(s03_v05_hw_test_ts)
#full model
s03_v05_hw <- holt(s03_v05_hw_ts,
lambda = NULL,
seasonal='additive',
h=prediction_length,
level = c(85, 90))
checkresiduals(s03_v05_hw)
##
## Ljung-Box test
##
## data: Residuals from Holt's method
## Q* = 59.657, df = 32, p-value = 0.002132
##
## Model df: 4. Total lags used: 36
s03_v05_hw_fit <- s03_v05_hw %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v05_hw_fit %>%
autoplot() +
autolayer(s03_v05_hw_test_fit, series = 'HoltWinter_Test') +
autolayer(s03_v05_hw_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_hw_train_ts, h = length(s03_v05_hw_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, lambda_sub=NULL, seasonal_sub=NULL) {
if(lambda_sub=='NULL'){
lambda_sub=NULL
}
tmp_model <- hw(y, h=h, lambda = lambda_sub, seasonal = seasonal_sub)
tmp_fit <- tmp_model %>% forecast(h=h)
return(tmp_fit)
}
e_hw_s03_v05 <- tsCV_jg(y = s03_v05_hw_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
lambda_sub='NULL',
seasonal_sub='additive',
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_hw_s03_v05, file = 'Models/project1_jg_e_hw_s03_v05.RData')
hw_frequency <- 6
s03_v07_hw_ts <- ts(s03_v07_full_ts, frequency = hw_frequency, start = 1)
s03_v07_hw_train_ts <- window(s03_v07_hw_ts,
start = 1,
end = floor(test_ratio * 1622/hw_frequency))
s03_v07_hw_test_ts <- window(s03_v07_hw_ts,
start = floor(test_ratio * 1622/hw_frequency))
s03_v07_hw_test <- hw(s03_v07_hw_train_ts,
lambda = NULL,
seasonal='additive',
h=length(s03_v07_hw_test_ts),
level = c(85, 90))
#checkresiduals(s03_v07_hw_test)
s03_v07_hw_test_fit <- s03_v07_hw_test %>%
forecast(h=length(s03_v07_hw_test_ts), level = c(85, 90))
#s03_v07_hw_test_fit %>%
# autoplot() +
# autolayer(s03_v07_hw_test_ts)
#full model
s03_v07_hw <- holt(s03_v07_hw_ts,
lambda = NULL,
seasonal='additive',
h=prediction_length,
level = c(85, 90))
checkresiduals(s03_v07_hw)
##
## Ljung-Box test
##
## data: Residuals from Holt's method
## Q* = 12.081, df = 8, p-value = 0.1476
##
## Model df: 4. Total lags used: 12
s03_v07_hw_fit <- s03_v07_hw %>%
forecast(h=prediction_length, level = c(85, 90))
s03_v07_hw_fit %>%
autoplot() +
autolayer(s03_v07_hw_test_fit, series = 'HoltWinter_Test') +
autolayer(s03_v07_hw_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_hw_train_ts, h = length(s03_v07_hw_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, lambda_sub=NULL, seasonal_sub=NULL) {
if(lambda_sub=='NULL'){
lambda_sub=NULL
}
tmp_model <- hw(y, h=h, lambda = lambda_sub, seasonal = seasonal_sub)
tmp_fit <- tmp_model %>% forecast(h=h)
return(tmp_fit)
}
e_hw_s03_v07 <- tsCV_jg(y = s03_v07_hw_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
lambda_sub='NULL',
seasonal_sub='additive',
extra_args = TRUE)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_hw_s03_v07, file = 'Models/project1_jg_e_hw_s03_v07.RData')
Arima Model
arima_frequency = 90
s03_v05_arima_ts <- ts(s03_v05_full_ts, frequency = arima_frequency)
s03_v05_arima_train_ts <- window(s03_v05_arima_ts, start = 1, end = floor(test_ratio * 1622/arima_frequency))
s03_v05_arima_test_ts <- window(s03_v05_arima_ts, start = floor(test_ratio * 1622/arima_frequency))
s03_v05_arima_test <- Arima(s03_v05_arima_train_ts,
order = c(1,0,2),
lambda = NULL,
include.drift = TRUE)
#checkresiduals(s03_v05_arima_test)
s03_v05_arima_test_fit <- s03_v05_arima_test %>%
forecast(h=length(s03_v05_arima_test_ts))
#s03_v05_arima_test_fit %>%
# autoplot() +
# autolayer(s03_v05_arima_test_ts)
#check full model
s03_v05_arima <- Arima(s03_v05_arima_ts,
order = c(1,0,2),
lambda = NULL,
include.drift = TRUE)
s03_v05_arima_fit <- s03_v05_arima %>%
forecast(h=prediction_length)
checkresiduals(s03_v05_arima_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with drift
## Q* = 262.86, df = 175, p-value = 1.924e-05
##
## Model df: 5. Total lags used: 180
s03_v05_arima_fit %>%
autoplot() +
autolayer(s03_v05_arima_test_fit, series = 'Arima_Test') +
autolayer(s03_v05_arima_test_ts, series = 'Actual') +
autolayer(naive(s03_v05_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v05_arima_train_ts, h = length(s03_v05_arima_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, xreg=NULL, lambda_sub=NULL, p_sub=NULL, d_sub=NULL, q_sub=NULL) {
if(lambda_sub=='NULL'){
lambda_sub=NULL
}
tmp_model <- Arima(y,
lambda = lambda_sub,
include.drift = TRUE,
order=c(p_sub, d_sub, q_sub))
tmp_fit <- tmp_model %>%
forecast(h=h)
return(tmp_fit)
}
e_arima_s03_v05 <- tsCV_jg(y = s03_v05_arima_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
extra_args = TRUE,
lambda_sub='NULL',
p_sub=1,
d_sub=0,
q_sub=2)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_arima_s03_v05, file = 'Models/project1_jg_e_arima_s03_v05.RData')
arima_frequency = 90
s03_v07_arima_ts <- ts(s03_v07_full_ts, frequency = arima_frequency)
s03_v07_arima_train_ts <- window(s03_v07_arima_ts, start = 1, end = floor(test_ratio * 1622/arima_frequency))
s03_v07_arima_test_ts <- window(s03_v07_arima_ts, start = floor(test_ratio * 1622/arima_frequency))
s03_v07_arima_test <- Arima(s03_v07_arima_train_ts,
order = c(2,0,0),
lambda = NULL,
include.drift = TRUE)
#checkresiduals(s03_v07_arima_test)
s03_v07_arima_test_fit <- s03_v07_arima_test %>%
forecast(h=length(s03_v07_arima_test_ts))
#s03_v07_arima_test_fit %>%
# autoplot() +
# autolayer(s03_v07_arima_test_ts)
#check full model
s03_v07_arima <- Arima(s03_v07_arima_ts,
order = c(2,0,0),
lambda = NULL,
include.drift = TRUE)
s03_v07_arima_fit <- s03_v07_arima %>%
forecast(h=prediction_length)
checkresiduals(s03_v07_arima_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0) with drift
## Q* = 243.34, df = 176, p-value = 0.000583
##
## Model df: 4. Total lags used: 180
s03_v07_arima_fit %>%
autoplot() +
autolayer(s03_v07_arima_test_fit, series = 'Arima_Test') +
autolayer(s03_v07_arima_test_ts, series = 'Actual') +
autolayer(naive(s03_v07_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
autolayer(naive(s03_v07_arima_train_ts, h = length(s03_v07_arima_test_ts)), series="Naive_Test", PI=FALSE) + scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green'))
forecast_fun_no_reg <- function(y, h = h, xreg=NULL, lambda_sub=NULL, p_sub=NULL, d_sub=NULL, q_sub=NULL) {
if(lambda_sub=='NULL'){
lambda_sub=NULL
}
tmp_model <- Arima(y,
lambda = lambda_sub,
include.drift = TRUE,
order=c(p_sub, d_sub, q_sub))
tmp_fit <- tmp_model %>%
forecast(h=h)
return(tmp_fit)
}
e_arima_s03_v07 <- tsCV_jg(y = s03_v07_arima_ts,
forecastfunction = forecast_fun_no_reg,
h=cv_h,
initial=cv_start,
extra_args = TRUE,
lambda_sub='NULL',
p_sub=2,
d_sub=0,
q_sub=0)
## ...................................................................................................1100
## ....................................................................................................1200
## ....................................................................................................1300
## ....................................................................................................1400
## ....................................................................................................1500
## ....................................................................................................1600
## ......................
save(e_arima_s03_v07, file = 'Models/project1_jg_e_arima_s03_v07.RData')
#Load CV tests
load('Models/project1_jg_e_naive_s03_v05.RData')
load('Models/project1_jg_e_stl_s03_v05.RData')
load('Models/project1_jg_e_holt_s03_v05.RData')
load('Models/project1_jg_e_hw_s03_v05.RData')
load('Models/project1_jg_e_arima_s03_v05.RData')
load('Models/project1_jg_e_naive_s03_v07.RData')
load('Models/project1_jg_e_stl_s03_v07.RData')
load('Models/project1_jg_e_holt_s03_v07.RData')
load('Models/project1_jg_e_hw_s03_v07.RData')
load('Models/project1_jg_e_arima_s03_v07.RData')
#compare models
model_comparison_s03_v05_df <- data.frame(
Name = c('naive', 'stl', 'holt', 'hw', 'arima'),
RMSE_10 = c(
sqrt(mean(e_naive_s03_v05$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_stl_s03_v05$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_holt_s03_v05$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_hw_s03_v05$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_arima_s03_v05$e[,10]^2, na.rm=TRUE))
),
RMSE_100 = c(
sqrt(mean(e_naive_s03_v05$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_stl_s03_v05$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_holt_s03_v05$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_hw_s03_v05$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_arima_s03_v05$e[,100]^2, na.rm=TRUE))
),
RMSE_140 = c(
sqrt(mean(e_naive_s03_v05$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_stl_s03_v05$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_holt_s03_v05$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_hw_s03_v05$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_arima_s03_v05$e[,140]^2, na.rm=TRUE))
),
MAPE_10 = c(
mean(abs(e_naive_s03_v05$pe[,10]), na.rm = TRUE),
mean(abs(e_stl_s03_v05$pe[,10]), na.rm = TRUE),
mean(abs(e_holt_s03_v05$pe[,10]), na.rm = TRUE),
mean(abs(e_hw_s03_v05$pe[,10]), na.rm = TRUE),
mean(abs(e_arima_s03_v05$pe[,10]), na.rm = TRUE)
),
MAPE_100 = c(
mean(abs(e_naive_s03_v05$pe[,100]), na.rm = TRUE),
mean(abs(e_stl_s03_v05$pe[,100]), na.rm = TRUE),
mean(abs(e_holt_s03_v05$pe[,100]), na.rm = TRUE),
mean(abs(e_hw_s03_v05$pe[,100]), na.rm = TRUE),
mean(abs(e_arima_s03_v05$pe[,100]), na.rm = TRUE)
),
MAPE_140 = c(
mean(abs(e_naive_s03_v05$pe[,140]), na.rm = TRUE),
mean(abs(e_stl_s03_v05$pe[,140]), na.rm = TRUE),
mean(abs(e_holt_s03_v05$pe[,140]), na.rm = TRUE),
mean(abs(e_hw_s03_v05$pe[,140]), na.rm = TRUE),
mean(abs(e_arima_s03_v05$pe[,140]), na.rm = TRUE)
)
)
#compare models
model_comparison_s03_v07_df <- data.frame(
Name = c('naive', 'stl', 'holt', 'hw', 'arima'),
RMSE_10 = c(
sqrt(mean(e_naive_s03_v07$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_stl_s03_v07$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_holt_s03_v07$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_hw_s03_v07$e[,10]^2, na.rm=TRUE)),
sqrt(mean(e_arima_s03_v07$e[,10]^2, na.rm=TRUE))
),
RMSE_100 = c(
sqrt(mean(e_naive_s03_v07$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_stl_s03_v07$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_holt_s03_v07$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_hw_s03_v07$e[,100]^2, na.rm=TRUE)),
sqrt(mean(e_arima_s03_v07$e[,100]^2, na.rm=TRUE))
),
RMSE_140 = c(
sqrt(mean(e_naive_s03_v07$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_stl_s03_v07$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_holt_s03_v07$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_hw_s03_v07$e[,140]^2, na.rm=TRUE)),
sqrt(mean(e_arima_s03_v07$e[,140]^2, na.rm=TRUE))
),
MAPE_10 = c(
mean(abs(e_naive_s03_v07$pe[,10]), na.rm = TRUE),
mean(abs(e_stl_s03_v07$pe[,10]), na.rm = TRUE),
mean(abs(e_holt_s03_v07$pe[,10]), na.rm = TRUE),
mean(abs(e_hw_s03_v07$pe[,10]), na.rm = TRUE),
mean(abs(e_arima_s03_v07$pe[,10]), na.rm = TRUE)
),
MAPE_100 = c(
mean(abs(e_naive_s03_v07$pe[,100]), na.rm = TRUE),
mean(abs(e_stl_s03_v07$pe[,100]), na.rm = TRUE),
mean(abs(e_holt_s03_v07$pe[,100]), na.rm = TRUE),
mean(abs(e_hw_s03_v07$pe[,100]), na.rm = TRUE),
mean(abs(e_arima_s03_v07$pe[,100]), na.rm = TRUE)
),
MAPE_140 = c(
mean(abs(e_naive_s03_v07$pe[,140]), na.rm = TRUE),
mean(abs(e_stl_s03_v07$pe[,140]), na.rm = TRUE),
mean(abs(e_holt_s03_v07$pe[,140]), na.rm = TRUE),
mean(abs(e_hw_s03_v07$pe[,140]), na.rm = TRUE),
mean(abs(e_arima_s03_v07$pe[,140]), na.rm = TRUE)
)
)
#Summary
The results of the hyperparameter training of the models indicate that STL and ARIMA models have outperformed the naive model while trading off as the best model in various steps of measurement. Upon visual inspection, it appears the STL model provides a more reliable trend tracking component, as the ARIMA model is greatly over-guessing on the presented train/test split. It appears the STL model is more reactive to the cylce component of the trend-cyle; therefore, it will be the model used. If the prediction length were to be much more than 140 periods, ti appears the ARIMA model would fit best due to its better capture of the trend component. It is noted that the STL model does not pass the traditional statistical tests; however, it appears to be the best attempted fit and remains the selected model for S03 V05. For S03 V07, the same model type, with a slightly different set of hyperparameters was chosen and for similar reasons which returned similar results.
model_comparison_s03_v05_df %>%
arrange(RMSE_140) %>%
kable('html') %>%
kable_styling()
| Name | RMSE_10 | RMSE_100 | RMSE_140 | MAPE_10 | MAPE_100 | MAPE_140 |
|---|---|---|---|---|---|---|
| stl | 5.394094 | 15.52126 | 19.89764 | 4.107568 | 12.99470 | 16.73123 |
| arima | 5.087217 | 15.87391 | 19.91363 | 3.778246 | 13.52621 | 16.77458 |
| naive | 5.063947 | 16.27119 | 21.32072 | 3.789333 | 13.77596 | 17.78273 |
| holt | 5.092060 | 16.31039 | 21.51643 | 3.804666 | 13.78571 | 17.98299 |
| hw | 5.132639 | 16.80882 | 22.43504 | 3.835250 | 14.10881 | 18.38663 |
model_comparison_s03_v07_df %>%
arrange(RMSE_140) %>%
kable('html') %>%
kable_styling()
| Name | RMSE_10 | RMSE_100 | RMSE_140 | MAPE_10 | MAPE_100 | MAPE_140 |
|---|---|---|---|---|---|---|
| stl | 5.004000 | 15.09197 | 19.61776 | 3.761986 | 12.39642 | 16.11634 |
| arima | 5.104626 | 15.76577 | 19.71367 | 3.785808 | 13.41122 | 16.63635 |
| naive | 5.084478 | 16.19112 | 21.23420 | 3.815446 | 13.69470 | 17.71861 |
| holt | 5.095034 | 16.24039 | 21.50517 | 3.815808 | 13.67977 | 17.87598 |
| hw | 5.219941 | 16.31255 | 22.15851 | 3.905713 | 13.21275 | 17.85656 |
#Output predictions and merge with excel output
s03_v05_output <- list(Var05_new = s03_v05_stl_fit$mean)
#update this
s03_v07_output <- list(Var07_new = s03_v07_stl_fit$mean)
#make output df
excel_output_df <- raw_excel_df %>% filter(
group=='S03'&
is.na(Var01)&
is.na(Var02)&
is.na(Var03)&
is.na(Var05)&
is.na(Var07)) %>%
select(SeriesInd, group) %>%
bind_cols(s03_v05_output, s03_v07_output)
#re-insert into original df to get the indices
s03_out_df <- raw_excel_df %>%
filter(group=='S03') %>%
left_join(excel_output_df, by = c('group'='group', 'SeriesInd'='SeriesInd')) %>%
mutate(Var05_tmp = ifelse(is.na(Var05),Var05_new, Var05)) %>%
mutate(Var07_tmp = ifelse(is.na(Var07),Var07_new, Var07)) %>%
select(-Var05, -Var05_new, -Var07, -Var07_new) %>%
rename(Var05=Var05_tmp, Var07=Var07_tmp) %>%
select(SeriesInd, group, Var05, Var07)
#save df to excel
file_name <- 'DATA624_project1_submission.xls'
wb <- loadWorkbook(file_name)
writeData(wb, x = s03_out_df, sheet = 'S03')
saveWorkbook(wb, file_name, overwrite = TRUE)
## Note: zip::zip() is deprecated, please use zip::zipr() instead
#combine to make better model 12.4
Old
forecast_fun_reg <- function(y, h = h, xreg=NULL) {
new_xreg <- fourier(y, K=9)
x <- auto.arima(y, lambda = 0, xreg=new_xreg) %>%
forecast(xreg=new_xreg, h=h)
x$mean <- x$mean[1:h]
return(x)
}
source('local_cv_test.R')
#try cv instead
e <- tsCV_jg(y = s03_v05_arima_ts,
forecastfunction = forecast_fun_no_reg,
h=2,
initial=1600)
sqrt(colMeans(e^2, na.rm = TRUE))
ets_frequency <- 18
s03_v05_ets_ts <- ts(s03_v05_full_ts, frequency = ets_frequency, start = 1)
s03_v05_ets_train_ts <- subset(s03_v05_ets_ts, start = 1, end = floor(test_ratio * 1622))
s03_v05_ets_test_ts <- subset(s03_v05_ets_ts, start = floor(test_ratio * 1622))
s03_v05_ets_test <- ets(s03_v05_ets_train_ts,
'AAN',
lambda = NULL)
s03_v05_ets_test
checkresiduals(s03_v05_ets_test)
autoplot(s03_v05_ets_test)
s03_v05_ets_test_fit <- s03_v05_ets_test %>% forecast(h=test_length)
s03_v05_ets_test_fit %>%
autoplot() +
autolayer(s03_v05_ets_test_ts)
s03_v05_tbats_ts <- ts(s03_v05_full_ts, frequency = 30, start = min(excel_df$SeriesInd))
s03_v05_tbats_train_ts <- subset(s03_v05_tbats_ts, start = 1, end = length(s03_v05_tbats_ts)-test_length)
s03_v05_tbats_test_ts <- subset(s03_v05_tbats_ts, start = length(s03_v05_tbats_ts)-test_length+1)
s03_v05_tbats_test <- tbats(s03_v05_tbats_train_ts)
s03_v05_tbats_test
checkresiduals(s03_v05_tbats_test)
s03_v05_tbats_test_fit <- s03_v05_tbats_test %>% forecast(h=test_length)
s03_v05_tbats_test_fit %>%
autoplot() +
autolayer(s03_v05_tbats_test_ts)