#Reading in Training data
covid_data_train <- read.csv('train.csv', stringsAsFactors = FALSE)
dim(covid_data_train)
## [1] 969640 9
head(covid_data_train)
## Id County Province_State Country_Region Population Weight Date
## 1 1 Afghanistan 27657145 0.05835874 2020-01-23
## 2 2 Afghanistan 27657145 0.58358737 2020-01-23
## 3 3 Afghanistan 27657145 0.05835874 2020-01-24
## 4 4 Afghanistan 27657145 0.58358737 2020-01-24
## 5 5 Afghanistan 27657145 0.05835874 2020-01-25
## 6 6 Afghanistan 27657145 0.58358737 2020-01-25
## Target TargetValue
## 1 ConfirmedCases 0
## 2 Fatalities 0
## 3 ConfirmedCases 0
## 4 Fatalities 0
## 5 ConfirmedCases 0
## 6 Fatalities 0
str(covid_data_train)
## 'data.frame': 969640 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ County : chr "" "" "" "" ...
## $ Province_State: chr "" "" "" "" ...
## $ Country_Region: chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Population : int 27657145 27657145 27657145 27657145 27657145 27657145 27657145 27657145 27657145 27657145 ...
## $ Weight : num 0.0584 0.5836 0.0584 0.5836 0.0584 ...
## $ Date : chr "2020-01-23" "2020-01-23" "2020-01-24" "2020-01-24" ...
## $ Target : chr "ConfirmedCases" "Fatalities" "ConfirmedCases" "Fatalities" ...
## $ TargetValue : int 0 0 0 0 0 0 0 0 0 0 ...
min(covid_data_train$TargetValue)
## [1] -10034
#It appears that there are some negative case and fatality totals. I am going to assume that these were entered in error and take the absolute value.
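#A quick check (sketch, not evaluated in this report) of how widespread the negative entries are
#before they are overwritten; 'neg_idx' is a hypothetical helper name
neg_idx <- covid_data_train$TargetValue < 0
sum(neg_idx)
summary(covid_data_train$TargetValue[neg_idx])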
covid_data_train$TargetValue <- abs(covid_data_train$TargetValue)
covid_data_train$Date <- as.Date(covid_data_train$Date)
#Create the test set by subsetting the full data to the April 27 - June 10 window
covid_data_test<-filter(covid_data_train, between(covid_data_train$Date, as.Date("2020-04-27"), as.Date("2020-06-10")))
#Confirm subset
head(covid_data_test)
## Id County Province_State Country_Region Population Weight Date
## 1 191 Afghanistan 27657145 0.05835874 2020-04-27
## 2 192 Afghanistan 27657145 0.58358737 2020-04-27
## 3 193 Afghanistan 27657145 0.05835874 2020-04-28
## 4 194 Afghanistan 27657145 0.58358737 2020-04-28
## 5 195 Afghanistan 27657145 0.05835874 2020-04-29
## 6 196 Afghanistan 27657145 0.58358737 2020-04-29
## Target TargetValue
## 1 ConfirmedCases 172
## 2 Fatalities 7
## 3 ConfirmedCases 125
## 4 Fatalities 1
## 5 ConfirmedCases 111
## 6 Fatalities 2
tail(covid_data_train)
## Id County Province_State Country_Region Population Weight
## 969635 969635 Zimbabwe 14240168 0.06071064
## 969636 969636 Zimbabwe 14240168 0.60710640
## 969637 969637 Zimbabwe 14240168 0.06071064
## 969638 969638 Zimbabwe 14240168 0.60710640
## 969639 969639 Zimbabwe 14240168 0.06071064
## 969640 969640 Zimbabwe 14240168 0.60710640
## Date Target TargetValue
## 969635 2020-06-08 ConfirmedCases 5
## 969636 2020-06-08 Fatalities 0
## 969637 2020-06-09 ConfirmedCases 27
## 969638 2020-06-09 Fatalities 0
## 969639 2020-06-10 ConfirmedCases 6
## 969640 2020-06-10 Fatalities 0
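#Sketch (not evaluated in this report): confirm the date coverage of the full data and of the
#test window carved out above
range(covid_data_train$Date)
range(covid_data_test$Date)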
#segment into cases and fatalities
test_cases<-subset(covid_data_test, covid_data_test$Target == "ConfirmedCases")
test_fatalities<- subset(covid_data_test, covid_data_test$Target == "Fatalities")
#Aggregate test data by date
test_cases_agg <- aggregate(x = test_cases[c("TargetValue")], FUN = sum, by = list(Group.date = test_cases$Date))
head(test_cases_agg)
## Group.date TargetValue
## 1 2020-04-27 115459
## 2 2020-04-28 125866
## 3 2020-04-29 136992
## 4 2020-04-30 145603
## 5 2020-05-01 157932
## 6 2020-05-02 141704
test_fatalities_agg <- aggregate(x = test_fatalities[c("TargetValue")], FUN = sum, by = list(Group.date = test_fatalities$Date))
head(test_fatalities_agg)
## Group.date TargetValue
## 1 2020-04-27 7686
## 2 2020-04-28 10723
## 3 2020-04-29 12282
## 4 2020-04-30 9857
## 5 2020-05-01 9578
## 6 2020-05-02 8333
#Create Test Time Series
test_cases_xts <- xts(test_cases_agg$TargetValue, test_cases_agg$Group.date)
test_fatalities_xts <- xts(test_fatalities_agg$TargetValue, test_fatalities_agg$Group.date)
#Start the test series at day 96 so it lines up with the end of the 95-day training series
test_cases_ts <- ts(test_cases_agg$TargetValue, start = 96)
test_fatalities_ts <- ts(test_fatalities_agg$TargetValue, start = 96)
#Full cases series (all dates), used for the final forecast plots
data_cases <- subset(covid_data_train, covid_data_train$Target == "ConfirmedCases")
data_cases_agg <- aggregate(x = data_cases[c("TargetValue")], FUN = sum, by = list(Group.date = data_cases$Date))
data_cases_ts <- ts(data_cases_agg$TargetValue, start = 1)
#Full fatalities series (all dates), used for the final forecast plots
data_fatalities <- subset(covid_data_train, covid_data_train$Target == "Fatalities")
data_fatalities_agg <- aggregate(x = data_fatalities[c("TargetValue")], FUN = sum, by = list(Group.date = data_fatalities$Date))
data_fatalities_ts <- ts(data_fatalities_agg$TargetValue, start = 1)
autoplot(test_cases_ts)
autoplot(test_fatalities_ts)
#Segment training data into confirmed cases and fatalities data sets
data_train <- filter(covid_data_train, between(covid_data_train$Date, as.Date("2020-01-23"), as.Date("2020-04-26")))
train_cases <- subset(data_train, data_train$Target == "ConfirmedCases")
train_fatalities <- subset(data_train, data_train$Target == "Fatalities")
#Aggregate Training Data
train_cases_agg <- aggregate(x = train_cases[c("TargetValue")], FUN = sum, by = list(Group.date = train_cases$Date))
head(train_cases_agg)
train_fatalities_agg <- aggregate(x = train_fatalities[c("TargetValue")], FUN = sum, by = list(Group.date = train_fatalities$Date))
head(train_fatalities_agg)
#Create Training Time Series
train_cases_xts <- xts(train_cases_agg$TargetValue, train_cases_agg$Group.date)
train_fatalities_xts <- xts(train_fatalities_agg$TargetValue, train_fatalities_agg$Group.date)
train_cases_ts <- ts(train_cases_agg$TargetValue)
train_fatalities_ts <- ts(train_fatalities_agg$TargetValue)
#EDA of Timeseries & Training Set
autoplot(train_cases_xts)
autoplot(train_cases_ts)
#Because this is daily data we do not expect to capture seasonality directly. To look for seasonality it might make sense to aggregate into weekly case/fatality loads (see the sketch below)
#There is a clear upward trend starting in mid-March before settling in April 2020
#The level of variability increases as the time series moves along, so I would expect multiplicative error terms with an additive trend
ggtsdisplay(train_cases_ts)
#Many ACF spikes exceed the critical value, and the ACF never fully decays within 20 lags
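#Sketch (not evaluated in this report): as suggested above, roll the daily training cases up into
#weekly totals to look for seasonality; 'weekly' and 'weekly_cases' are hypothetical helper names
weekly <- transform(train_cases_agg, week = cut(Group.date, breaks = "week"))
weekly_cases <- aggregate(TargetValue ~ week, data = weekly, FUN = sum)
head(weekly_cases)
#Alternatively, declare a day-of-week frequency and inspect within-week patterns directly
ggsubseriesplot(ts(train_cases_agg$TargetValue, frequency = 7))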
autoplot(train_fatalities_xts)
autoplot(train_fatalities_ts)
#The trend for fatalities mirrors the case load, as would be expected, just on a lag; the uptick in deaths hits later in March
#Fatalities do seem to decline more rapidly than cases did, but I would nonetheless expect a similar additive trend
ggtsdisplay(train_fatalities_ts)
#ACF does decay from a positive starting point close to the critical line by lag 20
#Build Cases Models and Forecast
ndiffs(train_cases_ts)
## [1] 1
ggtsdisplay(diff(train_cases_ts))
#Looks like it still needs some work; the variance of the differenced series is not stable
lambda_tcts <- BoxCox.lambda(train_cases_ts)
lambda_tcts
## [1] 0.4815143
tcts_boxcox <- box_cox(train_cases_ts, lambda = lambda_tcts)
ndiffs(tcts_boxcox)
## [1] 1
ggtsdisplay(diff(tcts_boxcox))
#Spike in variability around day 20, but the series is much more stable around a mean of 0
#Autocorrelation is mostly resolved, with some remaining PACF spikes
#Because the first lag has a negative ACF and there is a negative spike at lag 10, I anticipate a moving-average-heavy ARIMA model -> predict ARIMA(1,1,2) (see the sketch below)
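#Sketch (not evaluated in this report): fit the manually predicted ARIMA(1,1,2) with the same
#Box-Cox lambda so its AICc can be compared with the auto.arima() choice below;
#'arima_cases_manual' is a hypothetical name
arima_cases_manual <- Arima(train_cases_ts, order = c(1, 1, 2), lambda = lambda_tcts)
summary(arima_cases_manual)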
#ARIMA Models for cases
arima_cases_fitbc <- auto.arima(train_cases_ts, lambda = lambda_tcts)
summary(arima_cases_fitbc)
## Series: train_cases_ts
## ARIMA(2,1,2)
## Box Cox transformation: lambda= 0.4815143
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.2867 -0.3490 -1.7230 0.8252
## s.e. 0.1468 0.1437 0.0918 0.0866
##
## sigma^2 estimated as 1464: log likelihood=-474.57
## AIC=959.14 AICc=959.83 BIC=971.86
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 644.7848 9416.811 5760.299 -11.73825 39.93755 0.9015078 0.09295166
#ARIMA(2,1,2), pretty close to my prediction.
#AICc = 959.83, RMSE of 9416.811 on training set
arima_cases_fit <- auto.arima(train_cases_ts)
summary(arima_cases_fit)
## Series: train_cases_ts
## ARIMA(0,1,0)
##
## sigma^2 estimated as 110572055: log likelihood=-1003.88
## AIC=2009.75 AICc=2009.79 BIC=2012.29
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1373.002 10459.83 6322.37 -13.42249 40.20727 0.989474 -0.1234093
#ARIMA(0,1,0), so no AR or MA terms, just differencing
#AICc = 2009.79 and RMSE = 10459.83
#The Box-Cox transformation yields a much stronger ARIMA fit on the training data than the untransformed series (the RMSEs are on the original scale and so are comparable; the AICc values are not, because of the transformation)
#ETS Model
ets_cases_fit <- ets(train_cases_ts)
summary(ets_cases_fit)
## ETS(M,A,N)
##
## Call:
## ets(y = train_cases_ts)
##
## Smoothing parameters:
## alpha = 0.2407
## beta = 0.0111
##
## Initial states:
## l = -197.8657
## b = 451.3574
##
## sigma: 0.6536
##
## AIC AICc BIC
## 2193.365 2194.039 2206.134
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1281.785 13566.7 8956.531 -43.8529 70.03329 1.40173 0.6219405
#ETS(M,A,N) model, which is to be expected given increasing errors, a steady trend, and no seasonality
#Alpha is fairly low (0.2407), so the level updates slowly and older observations still carry weight; beta (0.0111) is tiny, so the trend changes little
#RMSE is better with the ARIMA models on the training data; the AICc values cannot be compared between ARIMA and ETS (see the cross-validation sketch below)
#The ETS training RMSE is 13566.7
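#Because AICc is not comparable between ARIMA and ETS fits, time-series cross-validation puts the
#two model classes on equal footing. Sketch only (not evaluated in this report, and slow on 95
#points); 'fets', 'farima', 'e_ets', and 'e_arima' are hypothetical helpers
fets <- function(y, h) forecast(ets(y), h = h)
farima <- function(y, h) forecast(auto.arima(y), h = h)
e_ets <- tsCV(train_cases_ts, fets, h = 1)
e_arima <- tsCV(train_cases_ts, farima, h = 1)
sqrt(c(ETS = mean(e_ets^2, na.rm = TRUE), ARIMA = mean(e_arima^2, na.rm = TRUE)))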
#Cases Forecasts
#The training series has 95 daily observations (Jan 23 - Apr 26) and the test window covers Apr 27 - Jun 10, roughly a 2/3 - 1/3 split; the forecast horizon is 44 steps (see the check below)
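#Quick sanity check of the split sizes (sketch, not evaluated in this report)
c(train = length(train_cases_ts), test = length(test_cases_ts))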
arima_cases_fc <- forecast(arima_cases_fitbc, h = 44, level = c(95))
#A 95% prediction interval is requested in the Kaggle assignment
arima_cases_fc
## Point Forecast Lo 95 Hi 95
## 96 142938.4 109885.043777 180513.3
## 97 148320.7 109978.887900 192628.7
## 98 150842.6 109179.020858 199513.7
## 99 152192.0 107108.859690 205516.4
## 100 153045.6 103895.444056 212102.8
## 101 153672.7 99742.808429 219732.5
## 102 154182.1 94896.419541 228438.6
## 103 154618.9 89596.297487 238095.1
## 104 155003.5 84046.124325 248538.0
## 105 155346.0 78404.705402 259613.8
## 106 155652.8 72789.824111 271192.2
## 107 155928.1 67286.049410 283167.1
## 108 156175.4 61952.494786 295452.6
## 109 156397.6 56829.245082 307979.5
## 110 156597.3 51942.315459 320691.4
## 111 156776.8 47307.369279 333542.2
## 112 156938.1 42932.478132 346493.8
## 113 157083.0 38820.168384 359514.8
## 114 157213.3 34968.942610 372579.0
## 115 157330.3 31374.414832 385664.5
## 116 157435.4 28030.160546 398753.1
## 117 157529.9 24928.354723 411829.8
## 118 157614.8 22060.251080 424881.8
## 119 157691.1 19416.541725 437898.7
## 120 157759.6 16987.626080 450871.8
## 121 157821.1 14763.810639 463794.1
## 122 157876.4 12735.455771 476659.8
## 123 157926.1 10893.081819 489464.4
## 124 157970.7 9227.443816 502204.0
## 125 158010.8 7729.581912 514876.1
## 126 158046.8 6390.852909 527478.3
## 127 158079.2 5202.946983 540009.3
## 128 158108.2 4157.892586 552468.1
## 129 158134.3 3248.051614 564854.1
## 130 158157.8 2466.106045 577167.2
## 131 158178.8 1805.036305 589407.4
## 132 158197.7 1258.090265 601575.3
## 133 158214.7 818.739577 613671.4
## 134 158230.0 480.615372 625696.5
## 135 158243.7 237.403815 637651.6
## 136 158256.0 82.645291 649537.8
## 137 158267.1 9.207699 661356.2
## 138 158277.0 -6.566049 673108.1
## 139 158285.9 -70.962550 684794.7
autoplot(arima_cases_fc)
ets_case_fc <- forecast(ets_cases_fit, 44, level = c(95))
ets_case_fc
## Point Forecast Lo 95 Hi 95
## 96 154978.5 -43546.98 353503.9
## 97 156783.1 -52748.31 366314.6
## 98 158587.8 -62562.07 379737.7
## 99 160392.5 -72992.63 393777.7
## 100 162197.2 -84045.41 408439.9
## 101 164001.9 -95727.02 423730.9
## 102 165806.6 -108045.33 439658.5
## 103 167611.3 -121009.50 456232.1
## 104 169416.0 -134630.01 473462.0
## 105 171220.7 -148918.68 491360.1
## 106 173025.4 -163888.67 509939.4
## 107 174830.1 -179554.47 529214.6
## 108 176634.8 -195931.95 549201.5
## 109 178439.5 -213038.29 569917.2
## 110 180244.1 -230892.03 591380.3
## 111 182048.8 -249513.05 613610.7
## 112 183853.5 -268922.60 636629.7
## 113 185658.2 -289143.29 660459.7
## 114 187462.9 -310199.08 685124.9
## 115 189267.6 -332115.36 710650.6
## 116 191072.3 -354918.89 737063.5
## 117 192877.0 -378637.89 764391.9
## 118 194681.7 -403302.02 792665.4
## 119 196486.4 -428942.43 821915.2
## 120 198291.1 -455591.78 852173.9
## 121 200095.8 -483284.28 883475.8
## 122 201900.5 -512055.73 915856.6
## 123 203705.1 -541943.57 949353.9
## 124 205509.8 -572986.89 984006.6
## 125 207314.5 -605226.51 1019855.6
## 126 209119.2 -638705.03 1056943.5
## 127 210923.9 -673466.84 1095314.7
## 128 212728.6 -709558.24 1135015.5
## 129 214533.3 -747027.44 1176094.0
## 130 216338.0 -785924.66 1218600.6
## 131 218142.7 -826302.19 1262587.6
## 132 219947.4 -868214.43 1308109.2
## 133 221752.1 -911717.99 1355222.1
## 134 223556.8 -956871.75 1403985.3
## 135 225361.5 -1003736.96 1454459.9
## 136 227166.1 -1052377.27 1506709.6
## 137 228970.8 -1102858.89 1560800.6
## 138 230775.5 -1155250.59 1616801.7
## 139 232580.2 -1209623.88 1674784.3
autoplot(ets_case_fc)
#Evaluate Cases Forecasts against Test Data
#Check ARIMA forecast residuals and error metrics for cases
checkresiduals(arima_cases_fitbc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)
## Q* = 7.9245, df = 6, p-value = 0.2437
##
## Model df: 4. Total lags used: 10
#ACF spike at lag 10, but the residuals pass the Ljung-Box test (p = 0.24) and can be considered white noise
#Check ETS residuals and error metrics for cases
checkresiduals(ets_case_fc)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 29.492, df = 6, p-value = 4.908e-05
##
## Model df: 4. Total lags used: 10
#The ETS model has autocorrelation concerns through lag 5 and does not pass the Ljung-Box test; the residuals are not white noise
#The residuals also have a right skew, with variability that is not symmetric around a mean of 0
#Evaluate error metrics
accuracy(ets_case_fc$fitted, test_cases_ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -16603.38 22403.7 18779.82 -13.26096 14.63905 0.3479993 1.601121
accuracy(arima_cases_fitbc$fitted, test_cases_ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -6355.205 16003.94 14203.49 -5.445906 10.6551 0.2420513 1.155024
#Neither model tracks the held-out period especially well
#The ARIMA model outperforms the ETS model for predicting cases
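#Sketch (not evaluated in this report): the same comparison can be made on the 44-step point
#forecasts themselves rather than the in-sample fitted values, matching each forecast to the
#corresponding day of the held-out series
accuracy(arima_cases_fc$mean, test_cases_ts[1:44])
accuracy(ets_case_fc$mean, test_cases_ts[1:44])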
#Build Fatalities Models and Forecast
ndiffs(train_fatalities_ts)
## [1] 1
ggtsdisplay(diff(train_fatalities_ts))
#Looks like it still needs some work
lambda_tfts <- BoxCox.lambda(train_fatalities_ts)
lambda_tfts
## [1] 0.2980666
tfts_boxcox <- box_cox(train_fatalities_ts, lambda = lambda_tfts)
ndiffs(tfts_boxcox)
## [1] 1
ggtsdisplay(diff(tfts_boxcox))
#Variability is not symmetrical, with a number of spikes
#Autocorrelation is not fully corrected, but the remainder is close to white noise
#Predict ARIMA(2,1,2), given the two ACF spikes (one positive, one negative) before lag 10 and the single differencing (see the sketch below)
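#Sketch (not evaluated in this report): fit the predicted ARIMA(2,1,2) by hand so it can be
#compared with the auto.arima() selection below; 'arima_fatalities_manual' is a hypothetical name
arima_fatalities_manual <- Arima(train_fatalities_ts, order = c(2, 1, 2), lambda = lambda_tfts)
summary(arima_fatalities_manual)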
#ARIMA models for fatalities
arima_fatalities_fitbc <- auto.arima(train_fatalities_ts, lambda = lambda_tfts)
summary(arima_fatalities_fitbc)
#auto.arima() with the Box-Cox transformation selects ARIMA(1,1,0) with drift for fatalities (see the residual check further below), not the ARIMA(2,1,2) I predicted
#The Box-Cox lambda for fatalities (0.298) is lower than the lambda for cases (0.482)
#ETS Model for Fatalities
ets_fatalities_fit <- ets(train_fatalities_ts)
summary(ets_fatalities_fit)
## ETS(M,A,N)
##
## Call:
## ets(y = train_fatalities_ts)
##
## Smoothing parameters:
## alpha = 0.3269
## beta = 0.0044
##
## Initial states:
## l = -12.5647
## b = 14.2549
##
## sigma: 0.4482
##
## AIC AICc BIC
## 1536.129 1536.803 1548.898
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 177.735 1457.534 772.6324 -142.8654 167.7226 1.337177 0.4457083
#ETS(M,A,N) -> Errors are multiplicative, trend is additive
#Alpha is 0.3269, a somewhat heavier weighting of recent observations than in the cases model
#AICc = 1536.80, RMSE = 1457.53
#The ETS model has the better error measures on the training data
#Fatalities Forecasts
#The training series again has 95 daily observations and the test window covers Apr 27 - Jun 10; the forecast horizon is 44 steps
arima_fatalities_fc <- forecast(arima_fatalities_fitbc, h = 44, level = c(95))
#A 95% prediction interval is requested in the Kaggle assignment
arima_fatalities_fc
## Point Forecast Lo 95 Hi 95
## 96 8745.074 5376.235 13373.80
## 97 7514.451 4278.284 12170.19
## 98 8583.507 4404.969 14970.02
## 99 8372.118 3988.057 15361.89
## 100 8901.248 3953.044 17103.26
## 101 9020.622 3757.547 18053.42
## 102 9381.970 3687.910 19461.26
## 103 9615.320 3577.423 20615.60
## 104 9930.534 3512.519 21932.06
## 105 10208.673 3441.390 23175.93
## 106 10517.277 3389.636 24485.51
## 107 10818.165 3340.773 25784.06
## 108 11133.091 3302.086 27117.77
## 109 11449.836 3268.073 28461.96
## 110 11775.437 3240.590 29832.29
## 111 12106.020 3217.626 31221.10
## 112 12443.891 3199.479 32633.91
## 113 12787.858 3185.300 34068.57
## 114 13138.711 3174.979 35527.20
## 115 13496.115 3168.084 37009.40
## 116 13860.373 3164.430 38516.14
## 117 14231.430 3163.755 40047.54
## 118 14609.432 3165.887 41604.17
## 119 14994.410 3170.646 43186.29
## 120 15386.463 3177.892 44794.30
## 121 15785.651 3187.491 46428.51
## 122 16192.055 3199.332 48089.24
## 123 16605.747 3213.312 49776.78
## 124 17026.802 3229.345 51491.42
## 125 17455.295 3247.352 53233.43
## 126 17891.301 3267.263 55003.10
## 127 18334.896 3289.016 56800.67
## 128 18786.154 3312.557 58626.40
## 129 19245.151 3337.838 60480.54
## 130 19711.963 3364.817 62363.33
## 131 20186.667 3393.454 64275.02
## 132 20669.338 3423.716 66215.82
## 133 21160.053 3455.576 68185.97
## 134 21658.888 3489.005 70185.70
## 135 22165.920 3523.982 72215.23
## 136 22681.226 3560.488 74274.78
## 137 23204.883 3598.506 76364.57
## 138 23736.969 3638.020 78484.81
## 139 24277.561 3679.019 80635.72
autoplot(arima_fatalities_fc)
#Showing a steady upward trajectory in the near future
ets_fatalities_fc <- forecast(ets_fatalities_fit, 44, level = c(95))
ets_fatalities_fc
## Point Forecast Lo 95 Hi 95
## 96 9794.093 1190.1125 18398.07
## 97 9882.712 655.9676 19109.46
## 98 9971.330 127.3973 19815.26
## 99 10059.949 -398.0185 20517.92
## 100 10148.568 -922.2139 21219.35
## 101 10237.187 -1446.7709 21921.14
## 102 10325.805 -1973.0100 22624.62
## 103 10414.424 -2502.0534 23330.90
## 104 10503.043 -3034.8705 24040.96
## 105 10591.661 -3572.3109 24755.63
## 106 10680.280 -4115.1295 25475.69
## 107 10768.899 -4664.0053 26201.80
## 108 10857.517 -5219.5557 26934.59
## 109 10946.136 -5782.3486 27674.62
## 110 11034.755 -6352.9107 28422.42
## 111 11123.373 -6931.7352 29178.48
## 112 11211.992 -7519.2878 29943.27
## 113 11300.611 -8116.0112 30717.23
## 114 11389.229 -8722.3293 31500.79
## 115 11477.848 -9338.6504 32294.35
## 116 11566.467 -9965.3700 33098.30
## 117 11655.085 -10602.8734 33913.04
## 118 11743.704 -11251.5375 34738.95
## 119 11832.323 -11911.7324 35576.38
## 120 11920.941 -12583.8232 36425.71
## 121 12009.560 -13268.1712 37287.29
## 122 12098.179 -13965.1348 38161.49
## 123 12186.797 -14675.0709 39048.67
## 124 12275.416 -15398.3354 39949.17
## 125 12364.035 -16135.2845 40863.35
## 126 12452.654 -16886.2747 41791.58
## 127 12541.272 -17651.6642 42734.21
## 128 12629.891 -18431.8129 43691.59
## 129 12718.510 -19227.0833 44664.10
## 130 12807.128 -20037.8407 45652.10
## 131 12895.747 -20864.4541 46655.95
## 132 12984.366 -21707.2959 47676.03
## 133 13072.984 -22566.7430 48712.71
## 134 13161.603 -23443.1770 49766.38
## 135 13250.222 -24336.9842 50837.43
## 136 13338.840 -25248.5563 51926.24
## 137 13427.459 -26178.2906 53033.21
## 138 13516.078 -27126.5903 54158.75
## 139 13604.696 -28093.8647 55303.26
autoplot(ets_fatalities_fc)
#The ETS point forecast holds roughly steady near the middle of the recent data
#It is concerning that the 95% interval includes negative fatality counts (see the sketch below)
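#Sketch (assumption, not evaluated in this report): refitting the ETS with the Box-Cox lambda found
#above should keep the back-transformed prediction intervals away from negative values;
#'ets_fatalities_bc' is a hypothetical name
ets_fatalities_bc <- ets(train_fatalities_ts, lambda = lambda_tfts)
forecast(ets_fatalities_bc, h = 44, level = 95)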
#Evaluate Fatalities Forecasts
#Check ARIMA forecast residuals and error metrics for fatalities
checkresiduals(arima_fatalities_fitbc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0) with drift
## Q* = 22.901, df = 8, p-value = 0.003493
##
## Model df: 2. Total lags used: 10
#ACF spikes at lags 6, 10, and 13; the residuals fail the Ljung-Box test (p = 0.003), so they cannot be considered white noise
#Check ETS residuals and error metrics for fatalities
checkresiduals(ets_fatalities_fc)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 45.002, df = 6, p-value = 4.676e-08
##
## Model df: 4. Total lags used: 10
#The ETS model has autocorrelation concerns through lag 12 and does not pass the Ljung-Box test; the residuals are not white noise
#The residuals also show an upward bias and increasing variability
#Evaluate error metrics
#Compare against the held-out fatalities series
accuracy(ets_fatalities_fc$fitted, test_fatalities_ts)
accuracy(arima_fatalities_fitbc$fitted, test_fatalities_ts)
#Neither model tracks the held-out fatalities especially well
#The ETS model slightly outperforms the ARIMA model for predicting fatalities
#Model selection and final visuals
#COVID Case Forecasting
#For forecasting COVID cases over the period of April 27 through June 10, the ARIMA(2,1,2) model produces the better forecasts
#The ARIMA(2,1,2) model minimizes RMSE, produces white-noise residuals, has a tighter prediction interval, and shows a visual trend that aligns with our retrospective understanding of the disease's spread
autoplot(data_cases_ts) + autolayer(arima_cases_fc) + ylab("Global covid cases") + xlab("Days (starting 01/23/2020)") + ggtitle("Covid Cases Forecast (06/10/20) - ARIMA(2,1,2)")
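#Sketch (not evaluated in this report): overlaying both candidate point forecasts on the full
#series makes the comparison over the held-out weeks easier to see
autoplot(data_cases_ts) +
  autolayer(arima_cases_fc, series = "ARIMA(2,1,2)", PI = FALSE) +
  autolayer(ets_case_fc, series = "ETS(M,A,N)", PI = FALSE) +
  ylab("Global covid cases") + xlab("Days (starting 01/23/2020)")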
#COVID Fatalities Forecasting
#Forecasting COVID fatalities was more volatile because the uptick in deaths lagged cases, so the training set contained less data showing the rise in deaths
#For the period of April 27 through June 10, the ARIMA(1,1,0)-with-drift model produces the better forecasts
#While the ETS(M,A,N) model reduces error on the training set and just edges the ARIMA model on the test data, the visual trend of the ARIMA model better fits the later uptick in deaths and it performs much better at reducing residual autocorrelation
autoplot(data_fatalities_ts) + autolayer(arima_fatalities_fc) + ylab("Global covid fatalities") + xlab("Days (starting 01/23/2020)") + ggtitle("Covid Fatalities Forecast (06/10/20) - ARIMA(1,1,0) with drift")
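#Sketch (not evaluated in this report): the same overlay for fatalities shows how each point
#forecast tracks the later rise in deaths
autoplot(data_fatalities_ts) +
  autolayer(arima_fatalities_fc, series = "ARIMA(1,1,0) with drift", PI = FALSE) +
  autolayer(ets_fatalities_fc, series = "ETS(M,A,N)", PI = FALSE) +
  ylab("Global covid fatalities") + xlab("Days (starting 01/23/2020)")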