Project 1

install_and_load_pkgs <- function(pkg){
  # Load a vector of packages and install them if needed.
  # CODE SOURCE: https://gist.github.com/stevenworthington/3178163
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) install.packages(new.pkg, dependencies = T)
  sapply(pkg, require, character.only = T, quietly = T, warn.conflicts = T)
}

# list the packages here
pkgs <- c("dplyr", "tidyr", "forecast", "ggplot2", "xts", "doMC", "kableExtra", "openxlsx")

install_and_load_pkgs(pkgs)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: package 'forecast' was built under R version 3.4.4

## Warning: package 'xts' was built under R version 3.4.4

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Warning: package 'kableExtra' was built under R version 3.4.4

## Warning: package 'openxlsx' was built under R version 3.4.4

# if you are using windows, use this command:
# install.packages("doMC", repos="http://R-Forge.R-project.org")

source('Source/project1_jg.R')
source('local_cv_test.R')

Data Preparation

On initial inspection, the SeriesInd appear to have minor regularly occuring gaps for all data points. The consecutive runs of four and five index values do indicate that this time series might be based on weekday reporting; however, there did not appear to be a noticeable origin date to use in order to link the indices to actual datetimes. Therefore, it was decided that the indices were to be treated as if they were consecutive, ignoring the gaps, much like how weekday data would be processed anyways, but without an indication of frequency (e.g. week, month, year time periods). In fact, due to the unknown nature of the frequency of values, this feature will be varied in hyperparameter training, which is typically not the case.

As a first step, the data were checked for missing values, which were present in all of the variables. These values were filled in by interpolation.

#check nulls
sapply(excel_df, function(x){sum(is.na(x))})

## SeriesInd     group     Var01     Var02     Var03     Var05     Var07 
##         0         0        14         2        26        26        26

#Graph imputations
autoplot(s03_v01_full_ts, series='interpolated') +
  autolayer(s03_v01_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v02_full_ts, series='interpolated') +
  autolayer(s03_v02_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v03_full_ts, series='interpolated') +
  autolayer(s03_v03_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v05_full_ts, series='interpolated') +
  autolayer(s03_v05_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

autoplot(s03_v07_full_ts, series='interpolated') +
  autolayer(s03_v07_ts, series='original') +
  scale_color_manual(
    values = c(`interpolated`='red',`original`='gray')
  )

Next, the data were analyzed for outliers. After a preliminary inspection, it appears that there was one value in S06 (Var01, Var03, Var05, Var07) and one value in S02 (Var01, Var03, Var05, Var07) that is much different that any other measurements, and were therefore removed.

ggplot_df <- excel_df %>% 
  gather('Var', 'Value', 3:7)
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) + 
  geom_line() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Plots With Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) + 
  geom_boxplot() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Box Plots With Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

ggplot_df <- excel_no_outlier_df %>% 
  gather('Var', 'Value', 3:7) #%>% 
  #use this to filter and graph subgroups
  #filter(group=='S03' & Var=='Var05' & (SeriesInd > as.Date('2015-01-01') & SeriesInd < as.Date('2017-01-01')))
#check trends
ggplot(ggplot_df, aes(x=SeriesInd, y=Value, color=group)) + 
  geom_line() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Plots Without Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

#check outliers
ggplot(ggplot_df, aes(x=group, y=Value)) + 
  geom_boxplot() + 
  facet_wrap(~Var, scales = 'free', ncol = 2) +
  ggtitle('Series Box Plots Without Outliers') +
  theme(plot.title = element_text(hjust = 0.5))

s03_v01_max_points_df <- s03_v01_df %>% filter(
  SeriesInd<41600 & Var01 == max(s03_v01_df %>% filter(SeriesInd<41600) %>% select(Var01)) |
  SeriesInd<42500 & Var01 == max(s03_v01_df %>% filter(SeriesInd<42500) %>% select(Var01))
)

s03_v01_freq <- (s03_v01_max_points_df[2,1] - s03_v01_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v01_df, aes(x=SeriesInd, y=Var01)) +
  geom_line() +
  geom_point(s03_v01_max_points_df, 
             mapping = aes(x=SeriesInd, Var01), 
             size=3, 
             color='blue')

#s03_v02_freq <- 125

#freq_points_df <- s03_v02_df[seq(1, nrow(s03_v02_df), s03_v02_freq), ]

ggplot(s03_v02_df, aes(x=SeriesInd, y=Var02)) +
  geom_line() #+
  #geom_point(freq_points_df, 
  #           mapping = aes(x=SeriesInd, Var02), 
  #           size=3, 
  #           color='blue')

S03 Analysis

s03_v03_max_points_df <- s03_v03_df %>% filter(
  SeriesInd<41600 & Var03 == max(s03_v03_df %>% filter(SeriesInd<41600) %>% select(Var03)) |
  SeriesInd<42500 & Var03 == max(s03_v03_df %>% filter(SeriesInd<42500) %>% select(Var03))
)

s03_v03_freq <- (s03_v03_max_points_df[2,1] - s03_v03_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v03_df, aes(x=SeriesInd, y=Var03)) +
  geom_line() +
  ggtitle('S03 Var03 Data Plot') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(s03_v03_max_points_df, 
             mapping = aes(x=SeriesInd, Var03), 
             size=3, 
             color='blue')

There is an obvious upward trend in the data for S03 Var05 and Var07 with possibly some cyclical nature. Both are highly correlated with each other so it appears the final selected method for one will work with the other. For time series data with no obvious trend-cycle, a simple exponential smooothing method would work the best; however, since there is an apparent upward trend, then it is possible that the holt method would be a good fit, which was specifically developed to address these types of time series. A holt-winter and STL decomposition model will also be studied in case there is any unseen seasonal action being performed. Additionally, given the short-term erratic nature of the values from point to point, an ARIMA model will be attempted. This model will specifically include a drift component given that the data show an upward trend. Finally, a simple naive model will be constructed in order to gauge the effectiveness of the other models.

All the models mentioned above will have various hyperparameters trained and optimized to have the lowest root mean square error on the 140th observation (maximum prediction length). In order to ensure that the model that best fits the series, and not the last few points, the 140th step-ahead error was calculated from the 1000th point to the end of the data set and the RMSE was calculated on these results. Additionally, the mean average percentage error (MAPE) was taken to provide further insight. Only valid candidate models will be produced below

s03_v05_max_points_df <- s03_v05_df %>% filter(
  SeriesInd<41600 & Var05 == max(s03_v05_df %>% filter(SeriesInd<41600) %>% select(Var05)) |
  SeriesInd<42500 & Var05 == max(s03_v05_df %>% filter(SeriesInd<42500) %>% select(Var05))
)

s03_v05_freq <- (s03_v05_max_points_df[2,1] - s03_v05_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v05_df, aes(x=SeriesInd, y=Var05)) +
  geom_line() +
  ggtitle('S03 Var05 Data Plot') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(s03_v05_max_points_df, 
             mapping = aes(x=SeriesInd, Var05), 
             size=3, 
             color='blue')

s03_v07_max_points_df <- s03_v07_df %>% filter(
  SeriesInd<41600 & Var07 == max(s03_v07_df %>% filter(SeriesInd<41600) %>% select(Var07)) |
  SeriesInd<42500 & Var07 == max(s03_v07_df %>% filter(SeriesInd<42500) %>% select(Var07))
)

s03_v07_freq <- (s03_v07_max_points_df[2,1] - s03_v07_max_points_df[1,1])[[1]]

#graph and examine possible frequencies
ggplot(s03_v07_df, aes(x=SeriesInd, y=Var07)) +
  geom_line() +
  ggtitle('S03 Var07 Data Plot') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(s03_v07_max_points_df, 
             mapping = aes(x=SeriesInd, Var07), 
             size=3, 
             color='blue')

#ggsubseriesplot(s03_v05_full_ts)
library(GGally)
ggpairs(base_df %>% select(-SeriesInd, -group))

gglagplot(s03_v05_ts) + 
  ggtitle('S03 Var05 Lag Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

ggAcf(s03_v05_full_ts, lag = 1000) + 
  ggtitle('S03 Var05 ACF Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

#train/test set
test_ratio <- 0.95
cv_h <- 140
cv_start <- 1000
prediction_length <- 140

checkresiduals(s03_v05_naive_fit)
autoplot(s03_v05_naive_fit) + 
  autolayer(s03_v05_naive_test_fit) +
  autolayer(s03_v05_naive_ts) + 
  ggtitle('S03 Var05 Naive Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_naive_fit)
autoplot(s03_v07_naive_fit) + 
  autolayer(s03_v07_naive_test_fit) +
  autolayer(s03_v07_naive_ts) + 
  ggtitle('S03 Var07 Naive Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

STL Model

checkresiduals(s03_v05_stl_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 409.12, df = 322.4, p-value = 0.0007483
## 
## Model df: 2.   Total lags used: 324.4

autoplot(s03_v05_stl_fit) + 
  autolayer(s03_v05_stl_test_fit, series = 'STL_Test') +
  autolayer(s03_v05_stl_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_stl_train_ts, h = length(s03_v05_stl_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 STL Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_stl_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 275.4, df = 178, p-value = 3.839e-06
## 
## Model df: 2.   Total lags used: 180

autoplot(s03_v07_stl_fit) + 
  autolayer(s03_v07_stl_test_fit, series = 'STL_Test') +
  autolayer(s03_v07_stl_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_stl_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_stl_train_ts, h = length(s03_v07_stl_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 STL Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v05_holt)
s03_v05_holt_fit <- s03_v05_holt %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v05_holt_fit %>% 
  autoplot() + 
  autolayer(s03_v05_holt_test_fit, series = 'Holt_Test') +
  autolayer(s03_v05_holt_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_holt_train_ts, h = length(s03_v05_holt_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 Holt Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_holt)
s03_v07_holt_fit <- s03_v07_holt %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v07_holt_fit %>% 
  autoplot() + 
  autolayer(s03_v07_holt_test_fit, series = 'Holt_Test') +
  autolayer(s03_v07_holt_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_holt_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_holt_train_ts, h = length(s03_v07_holt_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 Holt Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v05_hw)
s03_v05_hw_fit <- s03_v05_hw %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v05_hw_fit %>% 
  autoplot() + 
  autolayer(s03_v05_hw_test_fit, series = 'HoltWinter_Test') +
  autolayer(s03_v05_hw_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_hw_train_ts, h = length(s03_v05_hw_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 Holt Winter Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_hw)
s03_v07_hw_fit <- s03_v07_hw %>% 
  forecast(h=prediction_length, level = c(85, 90))
s03_v07_hw_fit %>% 
  autoplot() + 
  autolayer(s03_v07_hw_test_fit, series = 'HoltWinter_Test') +
  autolayer(s03_v07_hw_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_hw_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_hw_train_ts, h = length(s03_v07_hw_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 Holt Winter Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

Arima Model

checkresiduals(s03_v05_arima_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with drift
## Q* = 262.86, df = 175, p-value = 1.924e-05
## 
## Model df: 5.   Total lags used: 180

s03_v05_arima_fit %>% 
  autoplot() + 
  autolayer(s03_v05_arima_test_fit, series = 'Arima_Test') +
  autolayer(s03_v05_arima_test_ts, series = 'Actual') +
  autolayer(naive(s03_v05_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v05_arima_train_ts, h = length(s03_v05_arima_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var05 Arima Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(s03_v07_arima_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with drift
## Q* = 243.34, df = 176, p-value = 0.000583
## 
## Model df: 4.   Total lags used: 180

s03_v07_arima_fit %>% 
  autoplot() + 
  autolayer(s03_v07_arima_test_fit, series = 'Arima_Test') +
  autolayer(s03_v07_arima_test_ts, series = 'Actual') +
  autolayer(naive(s03_v07_arima_ts, h = prediction_length), series="Naive", PI=FALSE) +
  autolayer(naive(s03_v07_arima_train_ts, h = length(s03_v07_arima_test_ts)), series="Naive_Test", PI=FALSE) + 
  scale_color_manual(values=c('springgreen', 'steelblue', 'yellow', 'green')) + 
  ggtitle('S03 Var07 Arima Plot') + 
  theme(plot.title = element_text(hjust = 0.5))

S03 Summary

The results of the hyperparameter training of the models indicate that STL and ARIMA models have outperformed the naive model while trading off as the best model in various steps of measurement. Upon visual inspection, it appears the STL model provides a more reliable trend tracking component, as the ARIMA model is greatly over-guessing on the presented train/test split. It appears the STL model is more reactive to the cylce component of the trend-cyle; therefore, it will be the model used. If the prediction length were to be much more than 140 periods, ti appears the ARIMA model would fit best due to its better capture of the trend component. It is noted that the STL model does not pass the traditional statistical tests; however, it appears to be the best attempted fit and remains the selected model for S03 V05. For S03 V07, the same model type, with a slightly different set of hyperparameters was chosen and for similar reasons which returned similar results.

model_comparison_s03_v05_df %>% 
  arrange(RMSE_140) %>% 
  kable('html', caption = 'S03 Var05 Model Comparison') %>% 
  kable_styling()

S03 Var05 Model Comparison
Name	RMSE_10	RMSE_100	RMSE_140	MAPE_10	MAPE_100	MAPE_140
stl	5.394094	15.52126	19.89764	4.107568	12.99470	16.73123
arima	5.087217	15.87391	19.91363	3.778246	13.52621	16.77458
naive	5.063947	16.27119	21.32072	3.789333	13.77596	17.78273
holt	5.092060	16.31039	21.51643	3.804666	13.78571	17.98299
hw	5.132639	16.80882	22.43504	3.835250	14.10881	18.38663

model_comparison_s03_v07_df %>% 
  arrange(RMSE_140) %>% 
  kable('html', caption = 'S03 Var07 Model Comparison') %>% 
  kable_styling()

S03 Var07 Model Comparison
Name	RMSE_10	RMSE_100	RMSE_140	MAPE_10	MAPE_100	MAPE_140
stl	5.004000	15.09197	19.61776	3.761986	12.39642	16.11634
arima	5.104626	15.76577	19.71367	3.785808	13.41122	16.63635
naive	5.084478	16.19112	21.23420	3.815446	13.69470	17.71861
holt	5.095034	16.24039	21.50517	3.815808	13.67977	17.87598
hw	5.219941	16.31255	22.15851	3.905713	13.21275	17.85656

forecast_fun_reg <- function(y, h = h, xreg=NULL) {
  new_xreg <- fourier(y, K=9)
  x <- auto.arima(y, lambda = 0, xreg=new_xreg) %>% 
    forecast(xreg=new_xreg, h=h) 
  x$mean <- x$mean[1:h]
  return(x)
}
source('local_cv_test.R')

#try cv instead
e <- tsCV_jg(y = s03_v05_arima_ts, 
          forecastfunction = forecast_fun_no_reg,
          h=2,
          initial=1600)
sqrt(colMeans(e^2, na.rm = TRUE))

ets_frequency <- 18
s03_v05_ets_ts <- ts(s03_v05_full_ts, frequency = ets_frequency, start = 1)
s03_v05_ets_train_ts <- subset(s03_v05_ets_ts, start = 1, end = floor(test_ratio * 1622))
s03_v05_ets_test_ts <- subset(s03_v05_ets_ts, start = floor(test_ratio * 1622))
s03_v05_ets_test <- ets(s03_v05_ets_train_ts, 
                  'AAN',
                  lambda = NULL)
s03_v05_ets_test
checkresiduals(s03_v05_ets_test)
autoplot(s03_v05_ets_test)
s03_v05_ets_test_fit <- s03_v05_ets_test %>% forecast(h=test_length)
s03_v05_ets_test_fit %>% 
  autoplot() + 
  autolayer(s03_v05_ets_test_ts)

s03_v05_tbats_ts <- ts(s03_v05_full_ts, frequency = 30, start = min(excel_df$SeriesInd))
s03_v05_tbats_train_ts <- subset(s03_v05_tbats_ts, start = 1, end = length(s03_v05_tbats_ts)-test_length)
s03_v05_tbats_test_ts <- subset(s03_v05_tbats_ts, start = length(s03_v05_tbats_ts)-test_length+1)
s03_v05_tbats_test <- tbats(s03_v05_tbats_train_ts)
s03_v05_tbats_test
checkresiduals(s03_v05_tbats_test)
s03_v05_tbats_test_fit <- s03_v05_tbats_test %>% forecast(h=test_length)
s03_v05_tbats_test_fit %>% 
  autoplot() + 
  autolayer(s03_v05_tbats_test_ts)

Project 1

DATA 624 Group 2 Homework

John Grando, Kyle Gilde, Jonathan Hernandez, Albert Gilharry, Neil Hwang

January 27, 2019

Data Preparation

S03 Analysis

STL Model

Arima Model

S03 Summary