install_and_load_pkgs <- function(pkg){
  # Load a vector of packages and install them if needed.
  # CODE SOURCE: https://gist.github.com/stevenworthington/3178163
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) install.packages(new.pkg, dependencies = T)
  sapply(pkg, require, character.only = T, quietly = T, warn.conflicts = T)
}

# list the packages here
pkgs <- c("dplyr", "tidyr", "forecast", "ggplot2", "xts", "doMC", "kableExtra", "openxlsx")

install_and_load_pkgs(pkgs)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'forecast' was built under R version 3.4.4
## Warning: package 'xts' was built under R version 3.4.4
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Warning: package 'kableExtra' was built under R version 3.4.4
## Warning: package 'openxlsx' was built under R version 3.4.4
# if you are using windows, use this command:
# install.packages("doMC", repos="http://R-Forge.R-project.org")
source('Source/project1_jg.R')
source('local_cv_test.R')

Data Preparation

On initial inspection, the SeriesInd appear to have minor regularly occuring gaps for all data points. The consecutive runs of four and five index values do indicate that this time series might be based on weekday reporting; however, there did not appear to be a noticeable origin date to use in order to link the indices to actual datetimes. Therefore, it was decided that the indices were to be treated as if they were consecutive, ignoring the gaps, much like how weekday data would be processed anyways, but without an indication of frequency (e.g. week, month, year time periods). In fact, due to the unknown nature of the frequency of values, this feature will be varied in hyperparameter training, which is typically not the case.

As a first step, the data were checked for missing values, which were present in all of the variables. These values were filled in by interpolation.

#check nulls
sapply(excel_df, function(x){sum(is.na(x))})
## SeriesInd     group     Var01     Var02     Var03     Var05     Var07 
##         0         0        14         2        26        26        26
#Graph imputations
autoplot(s03_v01_full_ts, series='interpolated') +
  autolayer(s03_v01_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v02_full_ts, series='interpolated') +
  autolayer(s03_v02_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v03_full_ts, series='interpolated') +
  autolayer(s03_v03_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v05_full_ts, series='interpolated') +
  autolayer(s03_v05_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v07_full_ts, series='interpolated') +
  autolayer(s03_v07_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

Next, the data were analyzed for outliers. After a preliminary inspection, it appears that there was one value in S06 (Var01, Var03, Var05, Var07) and one value in S02 (Var01, Var03, Var05, Var07) that is much different that any other measurements, and were therefore removed.

ggplot_df <- excel_df %>% 
  gather('Var', 'Value', 3:7)
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) + 
  geom_line() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Plots With Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) + 
  geom_boxplot() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Box Plots With Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

ggplot_df <- excel_no_outlier_df %>% 
  gather('Var', 'Value', 3:7) #%>% 
  #use this to filter and graph subgroups
  #filter(group=='S03' & Var=='Var05' & (SeriesInd > as.Date('2015-01-01') & SeriesInd < as.Date('2017-01-01')))
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) + 
  geom_line() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Plots Without Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) + 
  geom_boxplot() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Box Plots Without Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

s03_v01_max_points_df <- s03_v01_df %>% filter(
  SeriesInd<41600 & Var01 == max(s03_v01_df %>% filter(SeriesInd<41600) %>% select(Var01)) |
  SeriesInd<42500 & Var01 == max(s03_v01_df %>% filter(SeriesInd<42500) %>% select(Var01))
)

s03_v01_freq <- (s03_v01_max_points_df[2,1] - s03_v01_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v01_df, aes(x=SeriesInd, y=Var01)) +
  geom_line() +
  geom_point(s03_v01_max_points_df, 
             mapping = aes(x=SeriesInd, Var01), 
             size=3, 
             color='blue')
#s03_v02_freq <- 125

#freq_points_df <- s03_v02_df[seq(1, nrow(s03_v02_df), s03_v02_freq), ]

ggplot(s03_v02_df, aes(x=SeriesInd, y=Var02)) +
  geom_line() #+
  #geom_point(freq_points_df, 
  #           mapping = aes(x=SeriesInd, Var02), 
  #           size=3, 
  #           color='blue')

S03 Analysis

s03_v03_max_points_df <- s03_v03_df %>% filter(
  SeriesInd<41600 & Var03 == max(s03_v03_df %>% filter(SeriesInd<41600) %>% select(Var03)) |
  SeriesInd<42500 & Var03 == max(s03_v03_df %>% filter(SeriesInd<42500) %>% select(Var03))
)

s03_v03_freq <- (s03_v03_max_points_df[2,1] - s03_v03_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v03_df, aes(x=SeriesInd, y=Var03)) +
  geom_line() +
  ggtitle('S03 Var03 Data Plot') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(s03_v03_max_points_df, 
             mapping = aes(x=SeriesInd, Var03), 
             size=3, 
             color='blue')

There is an obvious upward trend in the data for S03 Var05 and Var07 with possibly some cyclical nature. Both are highly correlated with each other so it appears the final selected method for one will work with the other. For time series data with no obvious trend-cycle, a simple exponential smooothing method would work the best; however, since there is an apparent upward trend, then it is possible that the holt method would be a good fit, which was specifically developed to address these types of time series. A holt-winter and STL decomposition model will also be studied in case there is any unseen seasonal action being performed. Additionally, given the short-term erratic nature of the values from point to point, an ARIMA model will be attempted. This model will specifically include a drift component given that the data show an upward trend. Finally, a simple naive model will be constructed in order to gauge the effectiveness of the other models.

All the models mentioned above will have various hyperparameters trained and optimized to have the lowest root mean square error on the 140th observation (maximum prediction length). In order to ensure that the model that best fits the series, and not the last few points, the 140th step-ahead error was calculated from the 1000th point to the end of the data set and the RMSE was calculated on these results. Additionally, the mean average percentage error (MAPE) was taken to provide further insight. Only valid candidate models will be produced below

s03_v05_max_points_df <- s03_v05_df %>% filter(
  SeriesInd<41600 & Var05 == max(s03_v05_df %>% filter(SeriesInd<41600) %>% select(Var05)) |
  SeriesInd<42500 & Var05 == max(s03_v05_df %>% filter(SeriesInd<42500) %>% select(Var05))
)

s03_v05_freq <- (s03_v05_max_points_df[2,1] - s03_v05_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v05_df, aes(x=SeriesInd, y=Var05)) +
  geom_line() +
  ggtitle('S03 Var05 Data Plot') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(s03_v05_max_points_df, 
             mapping = aes(x=SeriesInd, Var05), 
             size=3, 
             color='blue')

s03_v07_max_points_df <- s03_v07_df %>% filter(
  SeriesInd<41600 & Var07 == max(s03_v07_df %>% filter(SeriesInd<41600) %>% select(Var07)) |
  SeriesInd<42500 & Var07 == max(s03_v07_df %>% filter(SeriesInd<42500) %>% select(Var07))
)

s03_v07_freq <- (s03_v07_max_points_df[2,1] - s03_v07_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v07_df, aes(x=SeriesInd, y=Var07)) +
  geom_line() +
  ggtitle('S03 Var07 Data Plot') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(s03_v07_max_points_df, 
             mapping = aes(x=SeriesInd, Var07), 
             size=3, 
             color='blue')

#ggsubseriesplot(s03_v05_full_ts)
library(GGally)
ggpairs(base_df %>% select(-SeriesInd, -group))
gglagplot(s03_v05_ts) + 
  ggtitle('S03 Var05 Lag Plot') + 
  theme(plot.title = element_text(hjust = 0.5))
ggAcf(s03_v05_full_ts, lag = 1000) + 
  ggtitle('S03 Var05 ACF Plot') + 
  theme(plot.title = element_text(hjust = 0.5))
#train/test set
test_ratio <- 0.95
cv_h <- 140
cv_start <- 1000
prediction_length <- 140
checkresiduals(s03_v05_naive_fit)
autoplot(s03_v05_naive_fit) + 
  autolayer(s03_v05_naive_test_fit) +
  autolayer(s03_v05_naive_ts) + 
  ggtitle('S03 Var05 Naive Plot') + 
  theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_naive_fit)
autoplot(s03_v07_naive_fit) + 
  autolayer(s03_v07_naive_test_fit) +
  autolayer(s03_v07_naive_ts) + 
  ggtitle('S03 Var07 Naive Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

STL Model

checkresiduals(s03_v05_stl_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 409.12, df = 322.4, p-value = 0.0007483
## 
## Model df: 2.   Total lags used: 324.4
autoplot(s03_v05_stl_fit) + 
  autolayer(s03_v05_stl_test_fit, series = 'STL_Test') +
  autolayer(s03_v05_stl_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_stl_train_ts, h = length(s03_v05_stl_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 STL Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_stl_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 275.4, df = 178, p-value = 3.839e-06
## 
## Model df: 2.   Total lags used: 180
autoplot(s03_v07_stl_fit) + 
  autolayer(s03_v07_stl_test_fit, series = 'STL_Test') +
  autolayer(s03_v07_stl_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_stl_train_ts, h = length(s03_v07_stl_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 STL Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v05_holt)
s03_v05_holt_fit <- s03_v05_holt %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v05_holt_fit %>% 
  autoplot() + 
  autolayer(s03_v05_holt_test_fit, series = 'Holt_Test') +
  autolayer(s03_v05_holt_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_holt_train_ts, h = length(s03_v05_holt_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 Holt Plot') + 
  theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_holt)
s03_v07_holt_fit <- s03_v07_holt %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v07_holt_fit %>% 
  autoplot() + 
  autolayer(s03_v07_holt_test_fit, series = 'Holt_Test') +
  autolayer(s03_v07_holt_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_holt_train_ts, h = length(s03_v07_holt_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 Holt Plot') + 
  theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v05_hw)
s03_v05_hw_fit <- s03_v05_hw %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v05_hw_fit %>% 
  autoplot() + 
  autolayer(s03_v05_hw_test_fit, series = 'HoltWinter_Test') +
  autolayer(s03_v05_hw_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_hw_train_ts, h = length(s03_v05_hw_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 Holt Winter Plot') + 
  theme(plot.title = element_text(hjust = 0.5))
checkresiduals(s03_v07_hw)
s03_v07_hw_fit <- s03_v07_hw %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v07_hw_fit %>% 
  autoplot() + 
  autolayer(s03_v07_hw_test_fit, series = 'HoltWinter_Test') +
  autolayer(s03_v07_hw_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_hw_train_ts, h = length(s03_v07_hw_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 Holt Winter Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

Arima Model

checkresiduals(s03_v05_arima_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with drift
## Q* = 262.86, df = 175, p-value = 1.924e-05
## 
## Model df: 5.   Total lags used: 180
s03_v05_arima_fit %>% 
  autoplot() + 
  autolayer(s03_v05_arima_test_fit, series = 'Arima_Test') +
  autolayer(s03_v05_arima_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_arima_train_ts, h = length(s03_v05_arima_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 Arima Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_arima_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with drift
## Q* = 243.34, df = 176, p-value = 0.000583
## 
## Model df: 4.   Total lags used: 180
s03_v07_arima_fit %>% 
  autoplot() + 
  autolayer(s03_v07_arima_test_fit, series = 'Arima_Test') +
  autolayer(s03_v07_arima_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_arima_train_ts, h = length(s03_v07_arima_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 Arima Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

S03 Summary

The results of the hyperparameter training of the models indicate that STL and ARIMA models have outperformed the naive model while trading off as the best model in various steps of measurement. Upon visual inspection, it appears the STL model provides a more reliable trend tracking component, as the ARIMA model is greatly over-guessing on the presented train/test split. It appears the STL model is more reactive to the cylce component of the trend-cyle; therefore, it will be the model used. If the prediction length were to be much more than 140 periods, ti appears the ARIMA model would fit best due to its better capture of the trend component. It is noted that the STL model does not pass the traditional statistical tests; however, it appears to be the best attempted fit and remains the selected model for S03 V05. For S03 V07, the same model type, with a slightly different set of hyperparameters was chosen and for similar reasons which returned similar results.

model_comparison_s03_v05_df %>% 
  arrange(RMSE_140) %>% 
  kable('html', caption = 'S03 Var05 Model Comparison') %>% 
  kable_styling()
S03 Var05 Model Comparison
Name RMSE_10 RMSE_100 RMSE_140 MAPE_10 MAPE_100 MAPE_140
stl 5.394094 15.52126 19.89764 4.107568 12.99470 16.73123
arima 5.087217 15.87391 19.91363 3.778246 13.52621 16.77458
naive 5.063947 16.27119 21.32072 3.789333 13.77596 17.78273
holt 5.092060 16.31039 21.51643 3.804666 13.78571 17.98299
hw 5.132639 16.80882 22.43504 3.835250 14.10881 18.38663
model_comparison_s03_v07_df %>% 
  arrange(RMSE_140) %>% 
  kable('html', caption = 'S03 Var07 Model Comparison') %>% 
  kable_styling()
S03 Var07 Model Comparison
Name RMSE_10 RMSE_100 RMSE_140 MAPE_10 MAPE_100 MAPE_140
stl 5.004000 15.09197 19.61776 3.761986 12.39642 16.11634
arima 5.104626 15.76577 19.71367 3.785808 13.41122 16.63635
naive 5.084478 16.19112 21.23420 3.815446 13.69470 17.71861
holt 5.095034 16.24039 21.50517 3.815808 13.67977 17.87598
hw 5.219941 16.31255 22.15851 3.905713 13.21275 17.85656
forecast_fun_reg <- function(y, h = h, xreg=NULL) {
  new_xreg <- fourier(y, K=9)
  x <- auto.arima(y, lambda = 0, xreg=new_xreg) %>% 
    forecast(xreg=new_xreg, h=h) 
  x$mean <- x$mean[1:h]
  return(x)
}
source('local_cv_test.R')

#try cv instead
e <- tsCV_jg(y = s03_v05_arima_ts, 
          forecastfunction = forecast_fun_no_reg,
          h=2,
          initial=1600)
sqrt(colMeans(e^2, na.rm = TRUE))
ets_frequency <- 18
s03_v05_ets_ts <- ts(s03_v05_full_ts, frequency = ets_frequency, start = 1)
s03_v05_ets_train_ts <- subset(s03_v05_ets_ts, start = 1, end = floor(test_ratio * 1622))
s03_v05_ets_test_ts <- subset(s03_v05_ets_ts, start = floor(test_ratio * 1622))
s03_v05_ets_test <- ets(s03_v05_ets_train_ts, 
                  'AAN',
                  lambda = NULL)
s03_v05_ets_test
checkresiduals(s03_v05_ets_test)
autoplot(s03_v05_ets_test)
s03_v05_ets_test_fit <- s03_v05_ets_test %>% forecast(h=test_length)
s03_v05_ets_test_fit %>% 
  autoplot() + 
  autolayer(s03_v05_ets_test_ts)

s03_v05_tbats_ts <- ts(s03_v05_full_ts, frequency = 30, start = min(excel_df$SeriesInd))
s03_v05_tbats_train_ts <- subset(s03_v05_tbats_ts, start = 1, end = length(s03_v05_tbats_ts)-test_length)
s03_v05_tbats_test_ts <- subset(s03_v05_tbats_ts, start = length(s03_v05_tbats_ts)-test_length+1)
s03_v05_tbats_test <- tbats(s03_v05_tbats_train_ts)
s03_v05_tbats_test
checkresiduals(s03_v05_tbats_test)
s03_v05_tbats_test_fit <- s03_v05_tbats_test %>% forecast(h=test_length)
s03_v05_tbats_test_fit %>% 
  autoplot() + 
  autolayer(s03_v05_tbats_test_ts)