Predictive Analytics Forecasting

# Clear the workspace
rm(list = ls()) # Clear environment
gc()            # Clear unused memory

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 458678 24.5     991037   53   638942 34.2
## Vcells 823618  6.3    8388608   64  1633064 12.5

cat("\f")       # Clear the console

library("feasts")
library("seasonal")
library("tsibble")
library("tsibbledata")
library("dplyr")
library("ggplot2")
library("forecast")
library("fable")
library("weathermetrics")
library("seasonalview")
library("tibbletime")
library("fpp3")
library("fma")
library("fpp")
library("fpp2")
library("grid")
library("gridExtra")
library("seasonal")

# Set working directory
setwd("/Users/spoll/OneDrive/Documents/Boston College/Predictive Analytics_Forecasting/Week 6/midterm")

# Load the datasets
train <- read.csv("train.csv"
                  , check.names = FALSE
                  , stringsAsFactors = FALSE
                  , na.strings = ""
)

test <- read.csv("test.csv"
                 , check.names = FALSE
                 , stringsAsFactors = FALSE
                 , na.strings = ""
)

confirmed <- train[which(train$Target == "ConfirmedCases"),]
fatalities <- train[which(train$Target == "Fatalities"),]

It is interesting here how the medians for cases and fatalities are very off. The median for cases is 116,928 while the median for fatalities 4451. This pattern is similar for the means as well. This makes sense looking at the time series plots as fatalities started to decline around day 80-90 and the trend between fatalities and cases decoupled.

# TIME SERIES---------------------------------
# Confirmed Cases Time Series
confirmed_agg <- aggregate(TargetValue ~ Date, data = confirmed, FUN = sum)

confirmed_ts <- ts(confirmed_agg)

summary(confirmed_ts[,"TargetValue"])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     194    4926  116928   82349  142319  197934

confirmed_plot <- autoplot(confirmed_ts[,"TargetValue"]) + 
  labs(title="Confirmed Cases Time Series") + 
  ylab("Cases") +
  theme(plot.title = element_text(hjust = 0.5))

# Fatalities Time Series
fatalities_agg <- aggregate(TargetValue ~ Date, data = fatalities, FUN = sum)

fatalities_ts <- ts(fatalities_agg)

summary(fatalities_ts[,"TargetValue"])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   206.5  4451.0  4666.2  8238.0 17355.0

fatalities_plot <- autoplot(fatalities_ts[,"TargetValue"]) + 
  labs(title="Fatalities Time Series") + 
  ylab("Fatalities") +
  theme(plot.title = element_text(hjust = 0.5))

# Combine Time Series Plots
grid.arrange(confirmed_plot, fatalities_plot, nrow = 2)

Univariate Time Series: Confirmed: The ACF plot suggest that there is a trend but not too much seasonality as the “scalloped” shaped described in the textbook is not as severe here. The PACF suggest that an AR1 model maybe be a good fit because the lags severely drop off after lag 1.

# UNIVARIATE TIME SERIES-----------------------------------------
# Confirmed Univariate Time Series
confirmed_ts2 <- ts(subset(confirmed_agg
                          , select = "TargetValue")
                   , frequency = 2)

ggtsdisplay(confirmed_ts2) #confirmed diagnostics

Fatalities: The ACF plot suggests there is seasonality and trend apparent. The PACF plot suggests an AR9 model.

# Fatalities Univariate Time Series
fatalities_ts2 <- ts(subset(fatalities_agg
                           , select = "TargetValue")
                    , frequency = 2)

ggtsdisplay(fatalities_ts2) #fatalities diagnostics

Linear Regression: Confirmed: The p-value is statistically significant as it is less than 0.05. The residuals are mostly distributed normally, maybe slightly left-skewed. The ACF plot suggests there is still some trend and seasonlity present.

# LINEAR REGRESSIONS ---------------------
# Confirmed Cases Linear Regression
confirmed_lm <- tslm(TargetValue ~ Date
                    , data = confirmed_ts)

confirmed_lm_plot <- autoplot(confirmed_ts[,"TargetValue"], 
                             main = "Confirmed Cases Linear Regression") + 
  autolayer(fitted(confirmed_lm)
            , series = "Fitted Linear Regression") +
  ylab("Confirmed Cases") +
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(confirmed_lm)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals from Linear regression model
## LM test = 128.78, df = 10, p-value < 2.2e-16

Fatalities: The p-value is statistically significant as it is less than 0.05. The residual plot and ACF plot show that there is still some seasonality and trend present. The residuals are not normally distributed as there is a spike in the residuals histogram around the positive 5000 mark.

# Fatalities Linear Regression
fatalities_lm <- tslm(TargetValue ~ Date
                     , data = fatalities_ts)

fatalities_lm_plot <- autoplot(fatalities_ts[,"TargetValue"], 
                              main = "Fatalities Linear Regression") + 
  autolayer(fitted(fatalities_lm)
            , series = "Fitted Linear Regression") +
  ylab("Fatalities") +
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(fatalities_lm)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals from Linear regression model
## LM test = 125.76, df = 10, p-value < 2.2e-16

# Combine Plots
grid.arrange(confirmed_lm_plot, fatalities_lm_plot, nrow = 2)

The linear regression fits both show an increasing trend. The fatalities trend has a flatter trend mostly because of the decreasing trend around day 80-90 as previously stated. Autocorrelation is very similar between the cases and fatalities as the lags dip below the bounds after lag 16.

# LINEAR REGRESSIONS FORECAST--------------------
newdata <- data.frame(Date = c(141:150))
forecast(confirmed_lm, newdata = newdata)

##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 141       187921.5 150017.3 225825.6 129719.2 246123.7
## 142       189419.0 151503.4 227334.5 131199.2 247638.8
## 143       190916.4 152989.3 228843.6 132678.8 249154.0
## 144       192413.9 154475.0 230352.8 134158.3 250669.6
## 145       193911.4 155960.6 231862.2 135637.5 252185.3
## 146       195408.9 157446.0 233371.8 137116.4 253701.4
## 147       196906.4 158931.3 234881.5 138595.1 255217.6
## 148       198403.9 160416.4 236391.4 140073.6 256734.1
## 149       199901.4 161901.3 237901.4 141551.9 258250.9
## 150       201398.8 163386.1 239411.6 143029.9 259767.8

forecast(fatalities_lm, newdata = newdata)

##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 141       10085.35 5993.529 14177.16 3802.313 16368.38
## 142       10162.21 6069.161 14255.26 3877.284 16447.14
## 143       10239.08 6144.776 14333.38 3952.229 16525.93
## 144       10315.95 6220.375 14411.52 4027.148 16604.75
## 145       10392.81 6295.956 14489.67 4102.040 16683.59
## 146       10469.68 6371.520 14567.84 4176.907 16762.45
## 147       10546.55 6447.068 14646.03 4251.748 16841.35
## 148       10623.41 6522.598 14724.23 4326.563 16920.27
## 149       10700.28 6598.112 14802.45 4401.352 16999.21
## 150       10777.15 6673.608 14880.69 4476.114 17078.18

Holt-Winters’ Methods: The book states that the additive method is preferred when the seasonal variations are roughly constant through the series, while the multiplicative method is preferred when the seasonal variations are changing proportional to the level of the series. Judging by the previous plots about the time series for confirmed cases and fatalities, the additive method would probably be the best method since the seasonal variation is generally constant. I decided to also perform the multiplicative method because I was interested to see its performance.

# HOLT-WINTERS' ---------------------------
# HOLT-WINTERS' CONFIRMED
holtwinters_confirmed <- hw(confirmed_ts2, seasonal = "additive")
holtwinters_confirmed2 <- hw(confirmed_ts2, seasonal = "multiplicative")

holtwinters_confirmed_plot <- autoplot(confirmed_ts2, 
                                      main = "Confirmed Cases Holt-Winters'") + 
  autolayer(fitted(holtwinters_confirmed), series = "Fitted Additive") +
  autolayer(fitted(holtwinters_confirmed2), series = "Fitted Multiplicative") +
  ylab("Confirmed Cases") +
  theme(plot.title = element_text(hjust = 0.5))

# HOLT-WINTERS' FATALITIES
holtwinters_fatalities <- hw(fatalities_ts2, seasonal = "additive")
holtwinters_fatalities2 <- hw(fatalities_ts2, seasonal = "multiplicative")

holtwinters_fatalities_plot <- autoplot(fatalities_ts2, 
             main = "Fatalities Holt-Winters'") + 
  autolayer(fitted(holtwinters_fatalities), series = "Fitted Additive") +
  autolayer(fitted(holtwinters_fatalities2), series = "Fitted Multiplicative") +
  ylab("Fatalities") + 
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(holtwinters_confirmed_plot, holtwinters_fatalities_plot, nrow = 2)

The Holt-Winters’ method for the confirmed cases did not perform that well as the RSME is very high, compared to the fatalities. The additive method outperformed the multiplicative method as predicted based on the seasonality variation. Though the Holt-Winters’ method perfromed relatively well, I feel like there is a better model fit out there.

# forecast holt-winter's
accuracy(holtwinters_confirmed)

##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 650.7878 10986.12 7752.638 -21.73239 44.64228 0.6507677 0.0617892

accuracy(holtwinters_confirmed2)

##                    ME    RMSE      MAE      MPE     MAPE      MASE      ACF1
## Training set 266.8821 11261.3 7969.761 -34.1713 50.47504 0.6689933 0.1001789

accuracy(holtwinters_fatalities)

##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 41.49784 1418.019 880.6689 -86.07229 185.7256 0.6510194 0.1062649

accuracy(holtwinters_fatalities2)

##                    ME    RMSE      MAE       MPE     MAPE     MASE      ACF1
## Training set 7.850963 1452.54 873.9613 -152.8496 177.3269 0.646061 0.1281022

Due to the disappointing performances of the original Holt-Winters’ methods, dampening the models may prove to provide better results. Below are the models run again but with a damped trend method. The damped method for the confirmed cases performed worse than the non-damped method.

# DAMPED MODELS-----------------------------
# Damped Confirmed Cases
damped_confirmed <- hw(confirmed_ts2, seasonal = "additive", damped = T)
damped_confirmed2 <- hw(confirmed_ts2, seasonal = "multiplicative", damped = T)

damped_confirmed_plot <- autoplot(confirmed_ts2, 
                                 main = "Confirmed Cases Damped Holt-Winters'") + 
  autolayer(fitted(damped_confirmed)
            , series = "Fitted Additive") + 
  autolayer(fitted(damped_confirmed2)
            , series = "Fitted Multiplicative") + 
  ylab("Confirmed Cases")

accuracy(damped_confirmed)

##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 1002.928 11011.26 7762.176 -17.46443 41.13679 0.6515683 0.0624929

accuracy(damped_confirmed2)

##                    ME     RMSE     MAE       MPE     MAPE      MASE      ACF1
## Training set 709.8901 11284.55 8003.31 -30.07141 48.43074 0.6718095 0.1014102

It is interesting here that the damped multiplicative method improved the RSME for fatalities. For the non-damped, the additive method was the better option but it flips once the data is damped.

# Damped Fatalities
damped_fatalities <- hw(fatalities_ts2, seasonal = "additive", damped = T)
damped_fatalities2 <- hw(fatalities_ts2, seasonal = "multiplicative", damped = T)

accuracy(damped_fatalities)

##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 45.72692 1418.129 880.6073 -83.65027 185.2103 0.6509739 0.1062452

accuracy(damped_fatalities2)

##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 33.55697 1389.253 851.1629 -101.9423 126.0482 0.6292076 0.1387812

damped_fatalities_plot <- autoplot(fatalities_ts2
                                  , main = "Fatalities Damped Holt-Winters'") + 
  autolayer(fitted(damped_fatalities)
            , series = "Fitted Additive") + 
  autolayer(fitted(damped_fatalities2)
            , series = "Fitted Multiplicative") + 
  ylab("Fatalities")

# Combine Plots
grid.arrange(damped_confirmed_plot, damped_fatalities_plot, nrow = 2)

ARIMA Models: For the confirmed cases, an ARIMA(0,1,0) model was used and the p-value shows that it is statistically significant. The ACF plot suggest there is still some seasonality present but the residuals histogram shows that the residuals are normally distributed.

# ARIMA MODELS --------------
# ARIMA Confirmed
confirmed_arima <- auto.arima(confirmed_ts2)

checkresiduals(confirmed_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 25.342, df = 4, p-value = 4.294e-05
## 
## Model df: 0.   Total lags used: 4

accuracy(confirmed_arima)

##                    ME     RMSE     MAE       MPE    MAPE     MASE         ACF1
## Training set 1144.244 11221.51 7746.53 -9.089343 29.7029 0.650255 -0.001010136

confirmed_forecast <- forecast(confirmed_arima, h = 12)

accuracy(confirmed_forecast)

##                    ME     RMSE     MAE       MPE    MAPE     MASE         ACF1
## Training set 1144.244 11221.51 7746.53 -9.089343 29.7029 0.650255 -0.001010136

confirmed_arime_fc_plot <- autoplot(confirmed_forecast) +
  labs(title = "Confirmed COVID Cases Forecast using ARIMA(0,1,0)") +
  ylab("Confirmed COVID Cases") +
  theme(plot.title = element_text(hjust = 0.5))

For the fatalities data, an ARIMA(0,1,1)(2,0,2)[2] model was chosen as the optimal model and the p-value shows it is statistically significant. The ACF plot shows that lags 7 and 8 were significant as shown in the ARIMA output that it used 8 lags. The residuals are normally distributed as well.

# ARIMA Fatalities
fatalities_arima <- auto.arima(fatalities_ts2)

checkresiduals(fatalities_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(2,0,2)[2]
## Q* = 23.503, df = 3, p-value = 3.173e-05
## 
## Model df: 5.   Total lags used: 8

accuracy(fatalities_arima)

##                    ME    RMSE      MAE       MPE     MAPE      MASE
## Training set 76.26603 1135.24 709.0354 -82.31197 110.9137 0.5241423
##                      ACF1
## Training set -0.004822486

fatalities_forecast <- forecast(fatalities_arima, h = 12)

accuracy(fatalities_forecast)

##                    ME    RMSE      MAE       MPE     MAPE      MASE
## Training set 76.26603 1135.24 709.0354 -82.31197 110.9137 0.5241423
##                      ACF1
## Training set -0.004822486

fatalities_arima_fc_plot <- autoplot(fatalities_forecast) +
  labs(title = "COVID Fatalities Forecast using ARIMA(0,1,1)(2,0,2)") +
  ylab("COVID Fatalities") +
  theme(plot.title = element_text(hjust = 0.5))
  

# Combine Plots
grid.arrange(confirmed_arime_fc_plot, fatalities_arima_fc_plot, nrow = 2)

ETS Models:

The ETS model for confirmed cases was optimized using ETS(A,N,A) and a total of 7 lags. The ETS model for fatalities was optimized using ETS(M,A,M). This makes sense as when we performed the damped Holt-Winters’ test, the multiplicative method outperformed the additive method.

# ETS MODELS -------------------------
# Confirmed ETS
confirmed_ets <- ets(confirmed_ts2)

checkresiduals(confirmed_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 71.237, df = 3, p-value = 2.331e-15
## 
## Model df: 4.   Total lags used: 7

# Fatalities ETS
fatalities_ets <- ets(fatalities_ts2)

checkresiduals(fatalities_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,M)
## Q* = 50.238, df = 3, p-value = 7.109e-11
## 
## Model df: 6.   Total lags used: 9

# ETS FORECASTS ------------------------------
# Confirmed ETS
confirmed_ets_forecast <- forecast(confirmed_ets, h = 12)

accuracy(confirmed_ets_forecast)

##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 1158.831 11021.44 7772.226 -20.22401 47.23515 0.6524119 0.06383282

confirmed_ets_fc_plot <- confirmed_ets_forecast %>% autoplot() +
  labs(title="Confirmed COVID Cases Forecast Using ETS(A,N,A)") +
  ylab("Cases") +
  theme(plot.title = element_text(hjust = 0.5))

# ETS Fatalities
fatalities_ets_forecast <- forecast(fatalities_ets, h = 12)

accuracy(fatalities_ets_forecast)

##                     ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set -28.73626 1606.458 1045.536 -97.71345 120.2126 0.7728947 0.5045869

fatalities_ets_fc_plot <- fatalities_ets_forecast %>% autoplot() +
  labs(title="COVID Fatalities Forecast Using ETS(M,A,M)") + 
  ylab("Fatalities") +
  theme(plot.title = element_text(hjust = 0.5))

# Merge ETS plots
grid.arrange(confirmed_ets_fc_plot
             , fatalities_ets_fc_plot
             , nrow = 2)

After performing multiple models, the results differed for the two data sets. The optimal model for the confirmed cases is the Holt-Winters’ Additive with the lowest RSME of 10,986. The optimal model for the fatalities is the ARIMA(0,1,1)(2,0,2)[2] with the lowest RSME of 1135. Below are the predictions for each model:

# Predictions
predict(holtwinters_confirmed, confirmed_test_ts2)

##       Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 71.00       165106.0 150714.9 179497.0 143096.8 187115.2
## 71.50       161439.3 141339.0 181539.6 130698.6 192180.1
## 72.00       166157.3 141434.7 190879.9 128347.4 203967.2
## 72.50       162490.6 134061.7 190919.6 119012.3 205969.0

fatalities_forecast

##       Point Forecast    Lo 80    Hi 80       Lo 95     Hi 95
## 71.00       7340.400 5853.317 8827.483 5066.102417  9614.697
## 71.50       6704.019 4919.689 8488.348 3975.122705  9432.915
## 72.00       5193.833 3279.823 7107.844 2266.607172  8121.059
## 72.50       4530.372 2459.188 6601.557 1362.768678  7697.976
## 73.00       4800.893 2641.270 6960.517 1498.034922  8103.752
## 73.50       5676.898 3415.066 7938.730 2217.724243  9136.071
## 74.00       6998.095 4515.060 9481.129 3200.621224 10795.568
## 74.50       7267.603 4624.503 9910.702 3225.330539 11309.875
## 75.00       6468.488 3665.999 9270.976 2182.451174 10754.524
## 75.50       5526.997 2576.442 8477.552 1014.512281 10039.481
## 76.00       4602.106 1595.419 7608.794    3.774553  9200.438
## 76.50       4733.122 1647.293 7818.951   13.754254  9452.490

Predictive Analytics Forecasting

Sal Pollastro

4/24/2022