# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 458678 24.5 991037 53 638942 34.2
## Vcells 823618 6.3 8388608 64 1633064 12.5
cat("\f") # Clear the console
library("feasts")
library("seasonal")
library("tsibble")
library("tsibbledata")
library("dplyr")
library("ggplot2")
library("forecast")
library("fable")
library("weathermetrics")
library("seasonalview")
library("tibbletime")
library("fpp3")
library("fma")
library("fpp")
library("fpp2")
library("grid")
library("gridExtra")
library("seasonal")
# Set working directory
setwd("/Users/spoll/OneDrive/Documents/Boston College/Predictive Analytics_Forecasting/Week 6/midterm")
# Load the datasets
train <- read.csv("train.csv"
, check.names = FALSE
, stringsAsFactors = FALSE
, na.strings = ""
)
test <- read.csv("test.csv"
, check.names = FALSE
, stringsAsFactors = FALSE
, na.strings = ""
)
confirmed <- train[which(train$Target == "ConfirmedCases"),]
fatalities <- train[which(train$Target == "Fatalities"),]
It is interesting here how the medians for cases and fatalities are very off. The median for cases is 116,928 while the median for fatalities 4451. This pattern is similar for the means as well. This makes sense looking at the time series plots as fatalities started to decline around day 80-90 and the trend between fatalities and cases decoupled.
# TIME SERIES---------------------------------
# Confirmed Cases Time Series
confirmed_agg <- aggregate(TargetValue ~ Date, data = confirmed, FUN = sum)
confirmed_ts <- ts(confirmed_agg)
summary(confirmed_ts[,"TargetValue"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 194 4926 116928 82349 142319 197934
confirmed_plot <- autoplot(confirmed_ts[,"TargetValue"]) +
labs(title="Confirmed Cases Time Series") +
ylab("Cases") +
theme(plot.title = element_text(hjust = 0.5))
# Fatalities Time Series
fatalities_agg <- aggregate(TargetValue ~ Date, data = fatalities, FUN = sum)
fatalities_ts <- ts(fatalities_agg)
summary(fatalities_ts[,"TargetValue"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 206.5 4451.0 4666.2 8238.0 17355.0
fatalities_plot <- autoplot(fatalities_ts[,"TargetValue"]) +
labs(title="Fatalities Time Series") +
ylab("Fatalities") +
theme(plot.title = element_text(hjust = 0.5))
# Combine Time Series Plots
grid.arrange(confirmed_plot, fatalities_plot, nrow = 2)
Univariate Time Series: Confirmed: The ACF plot suggest that there is a trend but not too much seasonality as the “scalloped” shaped described in the textbook is not as severe here. The PACF suggest that an AR1 model maybe be a good fit because the lags severely drop off after lag 1.
# UNIVARIATE TIME SERIES-----------------------------------------
# Confirmed Univariate Time Series
confirmed_ts2 <- ts(subset(confirmed_agg
, select = "TargetValue")
, frequency = 2)
ggtsdisplay(confirmed_ts2) #confirmed diagnostics
Fatalities: The ACF plot suggests there is seasonality and trend apparent. The PACF plot suggests an AR9 model.
# Fatalities Univariate Time Series
fatalities_ts2 <- ts(subset(fatalities_agg
, select = "TargetValue")
, frequency = 2)
ggtsdisplay(fatalities_ts2) #fatalities diagnostics
Linear Regression: Confirmed: The p-value is statistically significant as it is less than 0.05. The residuals are mostly distributed normally, maybe slightly left-skewed. The ACF plot suggests there is still some trend and seasonlity present.
# LINEAR REGRESSIONS ---------------------
# Confirmed Cases Linear Regression
confirmed_lm <- tslm(TargetValue ~ Date
, data = confirmed_ts)
confirmed_lm_plot <- autoplot(confirmed_ts[,"TargetValue"],
main = "Confirmed Cases Linear Regression") +
autolayer(fitted(confirmed_lm)
, series = "Fitted Linear Regression") +
ylab("Confirmed Cases") +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(confirmed_lm)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals from Linear regression model
## LM test = 128.78, df = 10, p-value < 2.2e-16
Fatalities: The p-value is statistically significant as it is less than 0.05. The residual plot and ACF plot show that there is still some seasonality and trend present. The residuals are not normally distributed as there is a spike in the residuals histogram around the positive 5000 mark.
# Fatalities Linear Regression
fatalities_lm <- tslm(TargetValue ~ Date
, data = fatalities_ts)
fatalities_lm_plot <- autoplot(fatalities_ts[,"TargetValue"],
main = "Fatalities Linear Regression") +
autolayer(fitted(fatalities_lm)
, series = "Fitted Linear Regression") +
ylab("Fatalities") +
theme(plot.title = element_text(hjust = 0.5))
checkresiduals(fatalities_lm)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals from Linear regression model
## LM test = 125.76, df = 10, p-value < 2.2e-16
# Combine Plots
grid.arrange(confirmed_lm_plot, fatalities_lm_plot, nrow = 2)
The linear regression fits both show an increasing trend. The fatalities trend has a flatter trend mostly because of the decreasing trend around day 80-90 as previously stated. Autocorrelation is very similar between the cases and fatalities as the lags dip below the bounds after lag 16.
# LINEAR REGRESSIONS FORECAST--------------------
newdata <- data.frame(Date = c(141:150))
forecast(confirmed_lm, newdata = newdata)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 141 187921.5 150017.3 225825.6 129719.2 246123.7
## 142 189419.0 151503.4 227334.5 131199.2 247638.8
## 143 190916.4 152989.3 228843.6 132678.8 249154.0
## 144 192413.9 154475.0 230352.8 134158.3 250669.6
## 145 193911.4 155960.6 231862.2 135637.5 252185.3
## 146 195408.9 157446.0 233371.8 137116.4 253701.4
## 147 196906.4 158931.3 234881.5 138595.1 255217.6
## 148 198403.9 160416.4 236391.4 140073.6 256734.1
## 149 199901.4 161901.3 237901.4 141551.9 258250.9
## 150 201398.8 163386.1 239411.6 143029.9 259767.8
forecast(fatalities_lm, newdata = newdata)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 141 10085.35 5993.529 14177.16 3802.313 16368.38
## 142 10162.21 6069.161 14255.26 3877.284 16447.14
## 143 10239.08 6144.776 14333.38 3952.229 16525.93
## 144 10315.95 6220.375 14411.52 4027.148 16604.75
## 145 10392.81 6295.956 14489.67 4102.040 16683.59
## 146 10469.68 6371.520 14567.84 4176.907 16762.45
## 147 10546.55 6447.068 14646.03 4251.748 16841.35
## 148 10623.41 6522.598 14724.23 4326.563 16920.27
## 149 10700.28 6598.112 14802.45 4401.352 16999.21
## 150 10777.15 6673.608 14880.69 4476.114 17078.18
Holt-Winters’ Methods: The book states that the additive method is preferred when the seasonal variations are roughly constant through the series, while the multiplicative method is preferred when the seasonal variations are changing proportional to the level of the series. Judging by the previous plots about the time series for confirmed cases and fatalities, the additive method would probably be the best method since the seasonal variation is generally constant. I decided to also perform the multiplicative method because I was interested to see its performance.
# HOLT-WINTERS' ---------------------------
# HOLT-WINTERS' CONFIRMED
holtwinters_confirmed <- hw(confirmed_ts2, seasonal = "additive")
holtwinters_confirmed2 <- hw(confirmed_ts2, seasonal = "multiplicative")
holtwinters_confirmed_plot <- autoplot(confirmed_ts2,
main = "Confirmed Cases Holt-Winters'") +
autolayer(fitted(holtwinters_confirmed), series = "Fitted Additive") +
autolayer(fitted(holtwinters_confirmed2), series = "Fitted Multiplicative") +
ylab("Confirmed Cases") +
theme(plot.title = element_text(hjust = 0.5))
# HOLT-WINTERS' FATALITIES
holtwinters_fatalities <- hw(fatalities_ts2, seasonal = "additive")
holtwinters_fatalities2 <- hw(fatalities_ts2, seasonal = "multiplicative")
holtwinters_fatalities_plot <- autoplot(fatalities_ts2,
main = "Fatalities Holt-Winters'") +
autolayer(fitted(holtwinters_fatalities), series = "Fitted Additive") +
autolayer(fitted(holtwinters_fatalities2), series = "Fitted Multiplicative") +
ylab("Fatalities") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(holtwinters_confirmed_plot, holtwinters_fatalities_plot, nrow = 2)
The Holt-Winters’ method for the confirmed cases did not perform that well as the RSME is very high, compared to the fatalities. The additive method outperformed the multiplicative method as predicted based on the seasonality variation. Though the Holt-Winters’ method perfromed relatively well, I feel like there is a better model fit out there.
# forecast holt-winter's
accuracy(holtwinters_confirmed)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 650.7878 10986.12 7752.638 -21.73239 44.64228 0.6507677 0.0617892
accuracy(holtwinters_confirmed2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 266.8821 11261.3 7969.761 -34.1713 50.47504 0.6689933 0.1001789
accuracy(holtwinters_fatalities)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 41.49784 1418.019 880.6689 -86.07229 185.7256 0.6510194 0.1062649
accuracy(holtwinters_fatalities2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 7.850963 1452.54 873.9613 -152.8496 177.3269 0.646061 0.1281022
Due to the disappointing performances of the original Holt-Winters’ methods, dampening the models may prove to provide better results. Below are the models run again but with a damped trend method. The damped method for the confirmed cases performed worse than the non-damped method.
# DAMPED MODELS-----------------------------
# Damped Confirmed Cases
damped_confirmed <- hw(confirmed_ts2, seasonal = "additive", damped = T)
damped_confirmed2 <- hw(confirmed_ts2, seasonal = "multiplicative", damped = T)
damped_confirmed_plot <- autoplot(confirmed_ts2,
main = "Confirmed Cases Damped Holt-Winters'") +
autolayer(fitted(damped_confirmed)
, series = "Fitted Additive") +
autolayer(fitted(damped_confirmed2)
, series = "Fitted Multiplicative") +
ylab("Confirmed Cases")
accuracy(damped_confirmed)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1002.928 11011.26 7762.176 -17.46443 41.13679 0.6515683 0.0624929
accuracy(damped_confirmed2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 709.8901 11284.55 8003.31 -30.07141 48.43074 0.6718095 0.1014102
It is interesting here that the damped multiplicative method improved the RSME for fatalities. For the non-damped, the additive method was the better option but it flips once the data is damped.
# Damped Fatalities
damped_fatalities <- hw(fatalities_ts2, seasonal = "additive", damped = T)
damped_fatalities2 <- hw(fatalities_ts2, seasonal = "multiplicative", damped = T)
accuracy(damped_fatalities)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 45.72692 1418.129 880.6073 -83.65027 185.2103 0.6509739 0.1062452
accuracy(damped_fatalities2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 33.55697 1389.253 851.1629 -101.9423 126.0482 0.6292076 0.1387812
damped_fatalities_plot <- autoplot(fatalities_ts2
, main = "Fatalities Damped Holt-Winters'") +
autolayer(fitted(damped_fatalities)
, series = "Fitted Additive") +
autolayer(fitted(damped_fatalities2)
, series = "Fitted Multiplicative") +
ylab("Fatalities")
# Combine Plots
grid.arrange(damped_confirmed_plot, damped_fatalities_plot, nrow = 2)
ARIMA Models: For the confirmed cases, an ARIMA(0,1,0) model was used and the p-value shows that it is statistically significant. The ACF plot suggest there is still some seasonality present but the residuals histogram shows that the residuals are normally distributed.
# ARIMA MODELS --------------
# ARIMA Confirmed
confirmed_arima <- auto.arima(confirmed_ts2)
checkresiduals(confirmed_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 25.342, df = 4, p-value = 4.294e-05
##
## Model df: 0. Total lags used: 4
accuracy(confirmed_arima)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1144.244 11221.51 7746.53 -9.089343 29.7029 0.650255 -0.001010136
confirmed_forecast <- forecast(confirmed_arima, h = 12)
accuracy(confirmed_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1144.244 11221.51 7746.53 -9.089343 29.7029 0.650255 -0.001010136
confirmed_arime_fc_plot <- autoplot(confirmed_forecast) +
labs(title = "Confirmed COVID Cases Forecast using ARIMA(0,1,0)") +
ylab("Confirmed COVID Cases") +
theme(plot.title = element_text(hjust = 0.5))
For the fatalities data, an ARIMA(0,1,1)(2,0,2)[2] model was chosen as the optimal model and the p-value shows it is statistically significant. The ACF plot shows that lags 7 and 8 were significant as shown in the ARIMA output that it used 8 lags. The residuals are normally distributed as well.
# ARIMA Fatalities
fatalities_arima <- auto.arima(fatalities_ts2)
checkresiduals(fatalities_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(2,0,2)[2]
## Q* = 23.503, df = 3, p-value = 3.173e-05
##
## Model df: 5. Total lags used: 8
accuracy(fatalities_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set 76.26603 1135.24 709.0354 -82.31197 110.9137 0.5241423
## ACF1
## Training set -0.004822486
fatalities_forecast <- forecast(fatalities_arima, h = 12)
accuracy(fatalities_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set 76.26603 1135.24 709.0354 -82.31197 110.9137 0.5241423
## ACF1
## Training set -0.004822486
fatalities_arima_fc_plot <- autoplot(fatalities_forecast) +
labs(title = "COVID Fatalities Forecast using ARIMA(0,1,1)(2,0,2)") +
ylab("COVID Fatalities") +
theme(plot.title = element_text(hjust = 0.5))
# Combine Plots
grid.arrange(confirmed_arime_fc_plot, fatalities_arima_fc_plot, nrow = 2)
ETS Models:
The ETS model for confirmed cases was optimized using ETS(A,N,A) and a total of 7 lags. The ETS model for fatalities was optimized using ETS(M,A,M). This makes sense as when we performed the damped Holt-Winters’ test, the multiplicative method outperformed the additive method.
# ETS MODELS -------------------------
# Confirmed ETS
confirmed_ets <- ets(confirmed_ts2)
checkresiduals(confirmed_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 71.237, df = 3, p-value = 2.331e-15
##
## Model df: 4. Total lags used: 7
# Fatalities ETS
fatalities_ets <- ets(fatalities_ts2)
checkresiduals(fatalities_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,M)
## Q* = 50.238, df = 3, p-value = 7.109e-11
##
## Model df: 6. Total lags used: 9
# ETS FORECASTS ------------------------------
# Confirmed ETS
confirmed_ets_forecast <- forecast(confirmed_ets, h = 12)
accuracy(confirmed_ets_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1158.831 11021.44 7772.226 -20.22401 47.23515 0.6524119 0.06383282
confirmed_ets_fc_plot <- confirmed_ets_forecast %>% autoplot() +
labs(title="Confirmed COVID Cases Forecast Using ETS(A,N,A)") +
ylab("Cases") +
theme(plot.title = element_text(hjust = 0.5))
# ETS Fatalities
fatalities_ets_forecast <- forecast(fatalities_ets, h = 12)
accuracy(fatalities_ets_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -28.73626 1606.458 1045.536 -97.71345 120.2126 0.7728947 0.5045869
fatalities_ets_fc_plot <- fatalities_ets_forecast %>% autoplot() +
labs(title="COVID Fatalities Forecast Using ETS(M,A,M)") +
ylab("Fatalities") +
theme(plot.title = element_text(hjust = 0.5))
# Merge ETS plots
grid.arrange(confirmed_ets_fc_plot
, fatalities_ets_fc_plot
, nrow = 2)
After performing multiple models, the results differed for the two data sets. The optimal model for the confirmed cases is the Holt-Winters’ Additive with the lowest RSME of 10,986. The optimal model for the fatalities is the ARIMA(0,1,1)(2,0,2)[2] with the lowest RSME of 1135. Below are the predictions for each model:
# Predictions
predict(holtwinters_confirmed, confirmed_test_ts2)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 71.00 165106.0 150714.9 179497.0 143096.8 187115.2
## 71.50 161439.3 141339.0 181539.6 130698.6 192180.1
## 72.00 166157.3 141434.7 190879.9 128347.4 203967.2
## 72.50 162490.6 134061.7 190919.6 119012.3 205969.0
fatalities_forecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 71.00 7340.400 5853.317 8827.483 5066.102417 9614.697
## 71.50 6704.019 4919.689 8488.348 3975.122705 9432.915
## 72.00 5193.833 3279.823 7107.844 2266.607172 8121.059
## 72.50 4530.372 2459.188 6601.557 1362.768678 7697.976
## 73.00 4800.893 2641.270 6960.517 1498.034922 8103.752
## 73.50 5676.898 3415.066 7938.730 2217.724243 9136.071
## 74.00 6998.095 4515.060 9481.129 3200.621224 10795.568
## 74.50 7267.603 4624.503 9910.702 3225.330539 11309.875
## 75.00 6468.488 3665.999 9270.976 2182.451174 10754.524
## 75.50 5526.997 2576.442 8477.552 1014.512281 10039.481
## 76.00 4602.106 1595.419 7608.794 3.774553 9200.438
## 76.50 4733.122 1647.293 7818.951 13.754254 9452.490