#Reading in Training data
covid_data_train <- read.csv('train.csv', stringsAsFactors = FALSE)
dim(covid_data_train)
## [1] 969640 9
head(covid_data_train)
## Id County Province_State Country_Region Population Weight Date
## 1 1 Afghanistan 27657145 0.05835874 2020-01-23
## 2 2 Afghanistan 27657145 0.58358737 2020-01-23
## 3 3 Afghanistan 27657145 0.05835874 2020-01-24
## 4 4 Afghanistan 27657145 0.58358737 2020-01-24
## 5 5 Afghanistan 27657145 0.05835874 2020-01-25
## 6 6 Afghanistan 27657145 0.58358737 2020-01-25
## Target TargetValue
## 1 ConfirmedCases 0
## 2 Fatalities 0
## 3 ConfirmedCases 0
## 4 Fatalities 0
## 5 ConfirmedCases 0
## 6 Fatalities 0
str(covid_data_train)
## 'data.frame': 969640 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ County : chr "" "" "" "" ...
## $ Province_State: chr "" "" "" "" ...
## $ Country_Region: chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Population : int 27657145 27657145 27657145 27657145 27657145 27657145 27657145 27657145 27657145 27657145 ...
## $ Weight : num 0.0584 0.5836 0.0584 0.5836 0.0584 ...
## $ Date : chr "2020-01-23" "2020-01-23" "2020-01-24" "2020-01-24" ...
## $ Target : chr "ConfirmedCases" "Fatalities" "ConfirmedCases" "Fatalities" ...
## $ TargetValue : int 0 0 0 0 0 0 0 0 0 0 ...
min(covid_data_train$TargetValue)
## [1] -10034
#It appears that there are some negative case and fatality totals. I am going to assume that these were entered in error and take the absolute value.
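#A quick check (sketch, not evaluated in this report) of how widespread the negative entries are
#before they are overwritten; 'neg_idx' is a hypothetical helper name
neg_idx <- covid_data_train$TargetValue < 0
sum(neg_idx)
summary(covid_data_train$TargetValue[neg_idx])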
covid_data_train$TargetValue <- abs(covid_data_train$TargetValue)
covid_data_train$Date <- as.Date(covid_data_train$Date)
#Create the test set by subsetting the full data to the April 27 - June 10 window
covid_data_test<-filter(covid_data_train, between(covid_data_train$Date, as.Date("2020-04-27"), as.Date("2020-06-10")))
#Confirm subset
head(covid_data_test)
## Id County Province_State Country_Region Population Weight Date
## 1 191 Afghanistan 27657145 0.05835874 2020-04-27
## 2 192 Afghanistan 27657145 0.58358737 2020-04-27
## 3 193 Afghanistan 27657145 0.05835874 2020-04-28
## 4 194 Afghanistan 27657145 0.58358737 2020-04-28
## 5 195 Afghanistan 27657145 0.05835874 2020-04-29
## 6 196 Afghanistan 27657145 0.58358737 2020-04-29
## Target TargetValue
## 1 ConfirmedCases 172
## 2 Fatalities 7
## 3 ConfirmedCases 125
## 4 Fatalities 1
## 5 ConfirmedCases 111
## 6 Fatalities 2
tail(covid_data_train)
## Id County Province_State Country_Region Population Weight
## 969635 969635 Zimbabwe 14240168 0.06071064
## 969636 969636 Zimbabwe 14240168 0.60710640
## 969637 969637 Zimbabwe 14240168 0.06071064
## 969638 969638 Zimbabwe 14240168 0.60710640
## 969639 969639 Zimbabwe 14240168 0.06071064
## 969640 969640 Zimbabwe 14240168 0.60710640
## Date Target TargetValue
## 969635 2020-06-08 ConfirmedCases 5
## 969636 2020-06-08 Fatalities 0
## 969637 2020-06-09 ConfirmedCases 27
## 969638 2020-06-09 Fatalities 0
## 969639 2020-06-10 ConfirmedCases 6
## 969640 2020-06-10 Fatalities 0
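#Sketch (not evaluated in this report): confirm the date coverage of the full data and of the
#test window carved out above
range(covid_data_train$Date)
range(covid_data_test$Date)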
#segment into cases and fatalities
test_cases<-subset(covid_data_test, covid_data_test$Target == "ConfirmedCases")
test_fatalities<- subset(covid_data_test, covid_data_test$Target == "Fatalities")
#Aggregate test data by date
test_cases_agg <- aggregate(x = test_cases[c("TargetValue")], FUN = sum, by = list(Group.date = test_cases$Date))
head(test_cases_agg)
## Group.date TargetValue
## 1 2020-04-27 115459
## 2 2020-04-28 125866
## 3 2020-04-29 136992
## 4 2020-04-30 145603
## 5 2020-05-01 157932
## 6 2020-05-02 141704
test_fatalities_agg <- aggregate(x = test_fatalities[c("TargetValue")], FUN = sum, by = list(Group.date = test_fatalities$Date))
head(test_fatalities_agg)
## Group.date TargetValue
## 1 2020-04-27 7686
## 2 2020-04-28 10723
## 3 2020-04-29 12282
## 4 2020-04-30 9857
## 5 2020-05-01 9578
## 6 2020-05-02 8333
#Create Test Time Series
test_cases_xts <- xts(test_cases_agg$TargetValue, test_cases_agg$Group.date)
test_fatalities_xts <- xts(test_fatalities_agg$TargetValue, test_fatalities_agg$Group.date)
#Start the test series at day 96 so it lines up with the end of the 95-day training series
test_cases_ts <- ts(test_cases_agg$TargetValue, start = 96)
test_fatalities_ts <- ts(test_fatalities_agg$TargetValue, start = 96)
#Full cases series (all dates), used for the final forecast plots
data_cases <- subset(covid_data_train, covid_data_train$Target == "ConfirmedCases")
data_cases_agg <- aggregate(x = data_cases[c("TargetValue")], FUN = sum, by = list(Group.date = data_cases$Date))
data_cases_ts <- ts(data_cases_agg$TargetValue, start = 1)
#Full fatalities series (all dates), used for the final forecast plots
data_fatalities <- subset(covid_data_train, covid_data_train$Target == "Fatalities")
data_fatalities_agg <- aggregate(x = data_fatalities[c("TargetValue")], FUN = sum, by = list(Group.date = data_fatalities$Date))
data_fatalities_ts <- ts(data_fatalities_agg$TargetValue, start = 1)
autoplot(test_cases_ts)
autoplot(test_fatalities_ts)
#Segment training data into confirmed cases and fatalities data sets
data_train <- filter(covid_data_train, between(covid_data_train$Date, as.Date("2020-01-23"), as.Date("2020-04-26")))
train_cases <- subset(data_train, data_train$Target == "ConfirmedCases")
train_fatalities <- subset(data_train, data_train$Target == "Fatalities")
#Aggregate Training Data
train_cases_agg <- aggregate(x = train_cases[c("TargetValue")], FUN = sum, by = list(Group.date = train_cases$Date))
head(train_cases_agg)
train_fatalities_agg <- aggregate(x = train_fatalities[c("TargetValue")], FUN = sum, by = list(Group.date = train_fatalities$Date))
head(train_fatalities_agg)
#Create Training Time Series
train_cases_xts <- xts(train_cases_agg$TargetValue, train_cases_agg$Group.date)
train_fatalities_xts <- xts(train_fatalities_agg$TargetValue, train_fatalities_agg$Group.date)
train_cases_ts <- ts(train_cases_agg$TargetValue)
train_fatalities_ts <- ts(train_fatalities_agg$TargetValue)
#EDA of Timeseries & Training Set
autoplot(train_cases_xts)
autoplot(train_cases_ts)
#Because this is daily data we do not expect to capture seasonality directly. To look for seasonality it might make sense to aggregate into weekly case/fatality loads (see the sketch below)
#There is a clear upward trend starting in mid-March before settling in April 2020
#The level of variability increases as the time series moves along, so I would expect multiplicative error terms with an additive trend
ggtsdisplay(train_cases_ts)
#Many ACF spikes exceed the critical value, and the ACF never fully decays within 20 lags
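#Sketch (not evaluated in this report): as suggested above, roll the daily training cases up into
#weekly totals to look for seasonality; 'weekly' and 'weekly_cases' are hypothetical helper names
weekly <- transform(train_cases_agg, week = cut(Group.date, breaks = "week"))
weekly_cases <- aggregate(TargetValue ~ week, data = weekly, FUN = sum)
head(weekly_cases)
#Alternatively, declare a day-of-week frequency and inspect within-week patterns directly
ggsubseriesplot(ts(train_cases_agg$TargetValue, frequency = 7))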
autoplot(train_fatalities_xts)
autoplot(train_fatalities_ts)
#The trend for fatalities mirrors the case load, as would be expected, just on a lag; the uptick in deaths hits later in March
#Fatalities do seem to decline more rapidly than cases did, but I would nonetheless expect a similar additive trend
ggtsdisplay(train_fatalities_ts)
#ACF does decay from a positive starting point close to the critical line by lag 20
#Build Cases Models and Forecast
ndiffs(train_cases_ts)
## [1] 1
ggtsdisplay(diff(train_cases_ts))
#Looks like it still needs some work; the variance of the differenced series is not stable
lambda_tcts <- BoxCox.lambda(train_cases_ts)
lambda_tcts
## [1] 0.4815143
tcts_boxcox <- box_cox(train_cases_ts, lambda = lambda_tcts)
ndiffs(tcts_boxcox)
## [1] 1
ggtsdisplay(diff(tcts_boxcox))
#Spike in variability around day 20, but the series is much more stable around a mean of 0
#Autocorrelation is mostly resolved, with some remaining PACF spikes
#Because the first lag has a negative ACF and there is a negative spike at lag 10, I anticipate a moving-average-heavy ARIMA model -> predict ARIMA(1,1,2) (see the sketch below)
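#Sketch (not evaluated in this report): fit the manually predicted ARIMA(1,1,2) with the same
#Box-Cox lambda so its AICc can be compared with the auto.arima() choice below;
#'arima_cases_manual' is a hypothetical name
arima_cases_manual <- Arima(train_cases_ts, order = c(1, 1, 2), lambda = lambda_tcts)
summary(arima_cases_manual)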
#ARIMA Models for cases
arima_cases_fitbc <- auto.arima(train_cases_ts, lambda = lambda_tcts)
summary(arima_cases_fitbc)
## Series: train_cases_ts
## ARIMA(2,1,2)
## Box Cox transformation: lambda= 0.4815143
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.2867 -0.3490 -1.7230 0.8252
## s.e. 0.1468 0.1437 0.0918 0.0866
##
## sigma^2 estimated as 1464: log likelihood=-474.57
## AIC=959.14 AICc=959.83 BIC=971.86
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 644.7848 9416.811 5760.299 -11.73825 39.93755 0.9015078 0.09295166
#ARIMA(2,1,2), pretty close to my prediction.
#AICc = 959.83, RMSE of 9416.811 on training set
arima_cases_fit <- auto.arima(train_cases_ts)
summary(arima_cases_fit)
## Series: train_cases_ts
## ARIMA(0,1,0)
##
## sigma^2 estimated as 110572055: log likelihood=-1003.88
## AIC=2009.75 AICc=2009.79 BIC=2012.29
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1373.002 10459.83 6322.37 -13.42249 40.20727 0.989474 -0.1234093
#ARIMA(0,1,0), so no AR or MA terms, just differencing
#AICc = 2009.79 and RMSE = 10459.83
#The Box-Cox transformation yields a much stronger ARIMA fit on the training data than the untransformed series (the RMSEs are on the original scale and so are comparable; the AICc values are not, because of the transformation)
#ETS Model
ets_cases_fit <- ets(train_cases_ts)
summary(ets_cases_fit)
## ETS(M,A,N)
##
## Call:
## ets(y = train_cases_ts)
##
## Smoothing parameters:
## alpha = 0.2407
## beta = 0.0111
##
## Initial states:
## l = -197.8657
## b = 451.3574
##
## sigma: 0.6536
##
## AIC AICc BIC
## 2193.365 2194.039 2206.134
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1281.785 13566.7 8956.531 -43.8529 70.03329 1.40173 0.6219405
#ETS(M,A,N) model, which is to be expected given increasing errors, a steady trend, and no seasonality
#Alpha is fairly low (0.2407), so the level updates slowly and older observations still carry weight; beta (0.0111) is tiny, so the trend changes little
#RMSE is better with the ARIMA models on the training data; the AICc values cannot be compared between ARIMA and ETS (see the cross-validation sketch below)
#The ETS training RMSE is 13566.7
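#Because AICc is not comparable between ARIMA and ETS fits, time-series cross-validation puts the
#two model classes on equal footing. Sketch only (not evaluated in this report, and slow on 95
#points); 'fets', 'farima', 'e_ets', and 'e_arima' are hypothetical helpers
fets <- function(y, h) forecast(ets(y), h = h)
farima <- function(y, h) forecast(auto.arima(y), h = h)
e_ets <- tsCV(train_cases_ts, fets, h = 1)
e_arima <- tsCV(train_cases_ts, farima, h = 1)
sqrt(c(ETS = mean(e_ets^2, na.rm = TRUE), ARIMA = mean(e_arima^2, na.rm = TRUE)))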
#Cases Forecasts
#The training series has 95 daily observations (Jan 23 - Apr 26) and the test window covers Apr 27 - Jun 10, roughly a 2/3 - 1/3 split; the forecast horizon is 44 steps (see the check below)
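#Quick sanity check of the split sizes (sketch, not evaluated in this report)
c(train = length(train_cases_ts), test = length(test_cases_ts))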
arima_cases_fc <- forecast(arima_cases_fitbc, h = 44, level = c(95))
#A 95% prediction interval is requested in the Kaggle assignment
arima_cases_fc
## Point Forecast Lo 95 Hi 95
## 96 142938.4 109885.043777 180513.3
## 97 148320.7 109978.887900 192628.7
## 98 150842.6 109179.020858 199513.7
## 99 152192.0 107108.859690 205516.4
## 100 153045.6 103895.444056 212102.8
## 101 153672.7 99742.808429 219732.5
## 102 154182.1 94896.419541 228438.6
## 103 154618.9 89596.297487 238095.1
## 104 155003.5 84046.124325 248538.0
## 105 155346.0 78404.705402 259613.8
## 106 155652.8 72789.824111 271192.2
## 107 155928.1 67286.049410 283167.1
## 108 156175.4 61952.494786 295452.6
## 109 156397.6 56829.245082 307979.5
## 110 156597.3 51942.315459 320691.4
## 111 156776.8 47307.369279 333542.2
## 112 156938.1 42932.478132 346493.8
## 113 157083.0 38820.168384 359514.8
## 114 157213.3 34968.942610 372579.0
## 115 157330.3 31374.414832 385664.5
## 116 157435.4 28030.160546 398753.1
## 117 157529.9 24928.354723 411829.8
## 118 157614.8 22060.251080 424881.8
## 119 157691.1 19416.541725 437898.7
## 120 157759.6 16987.626080 450871.8
## 121 157821.1 14763.810639 463794.1
## 122 157876.4 12735.455771 476659.8
## 123 157926.1 10893.081819 489464.4
## 124 157970.7 9227.443816 502204.0
## 125 158010.8 7729.581912 514876.1
## 126 158046.8 6390.852909 527478.3
## 127 158079.2 5202.946983 540009.3
## 128 158108.2 4157.892586 552468.1
## 129 158134.3 3248.051614 564854.1
## 130 158157.8 2466.106045 577167.2
## 131 158178.8 1805.036305 589407.4
## 132 158197.7 1258.090265 601575.3
## 133 158214.7 818.739577 613671.4
## 134 158230.0 480.615372 625696.5
## 135 158243.7 237.403815 637651.6
## 136 158256.0 82.645291 649537.8
## 137 158267.1 9.207699 661356.2
## 138 158277.0 -6.566049 673108.1
## 139 158285.9 -70.962550 684794.7
autoplot(arima_cases_fc)
ets_case_fc <- forecast(ets_cases_fit, 44, level = c(95))
ets_case_fc
## Point Forecast Lo 95 Hi 95
## 96 154978.5 -43546.98 353503.9
## 97 156783.1 -52748.31 366314.6
## 98 158587.8 -62562.07 379737.7
## 99 160392.5 -72992.63 393777.7
## 100 162197.2 -84045.41 408439.9
## 101 164001.9 -95727.02 423730.9
## 102 165806.6 -108045.33 439658.5
## 103 167611.3 -121009.50 456232.1
## 104 169416.0 -134630.01 473462.0
## 105 171220.7 -148918.68 491360.1
## 106 173025.4 -163888.67 509939.4
## 107 174830.1 -179554.47 529214.6
## 108 176634.8 -195931.95 549201.5
## 109 178439.5 -213038.29 569917.2
## 110 180244.1 -230892.03 591380.3
## 111 182048.8 -249513.05 613610.7
## 112 183853.5 -268922.60 636629.7
## 113 185658.2 -289143.29 660459.7
## 114 187462.9 -310199.08 685124.9
## 115 189267.6 -332115.36 710650.6
## 116 191072.3 -354918.89 737063.5
## 117 192877.0 -378637.89 764391.9
## 118 194681.7 -403302.02 792665.4
## 119 196486.4 -428942.43 821915.2
## 120 198291.1 -455591.78 852173.9
## 121 200095.8 -483284.28 883475.8
## 122 201900.5 -512055.73 915856.6
## 123 203705.1 -541943.57 949353.9
## 124 205509.8 -572986.89 984006.6
## 125 207314.5 -605226.51 1019855.6
## 126 209119.2 -638705.03 1056943.5
## 127 210923.9 -673466.84 1095314.7
## 128 212728.6 -709558.24 1135015.5
## 129 214533.3 -747027.44 1176094.0
## 130 216338.0 -785924.66 1218600.6
## 131 218142.7 -826302.19 1262587.6
## 132 219947.4 -868214.43 1308109.2
## 133 221752.1 -911717.99 1355222.1
## 134 223556.8 -956871.75 1403985.3
## 135 225361.5 -1003736.96 1454459.9
## 136 227166.1 -1052377.27 1506709.6
## 137 228970.8 -1102858.89 1560800.6
## 138 230775.5 -1155250.59 1616801.7
## 139 232580.2 -1209623.88 1674784.3
autoplot(ets_case_fc)
#Evaluate Cases Forecasts against Test Data
#Check ARIMA forecast residuals and error metrics for cases
checkresiduals(arima_cases_fitbc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)
## Q* = 7.9245, df = 6, p-value = 0.2437
##
## Model df: 4. Total lags used: 10
#ACF spike at lag 10, but the residuals pass the Ljung-Box test (p = 0.24) and can be considered white noise
#Check ETS residuals and error metrics for cases
checkresiduals(ets_case_fc)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 29.492, df = 6, p-value = 4.908e-05
##
## Model df: 4. Total lags used: 10
#The ETS model has autocorrelation concerns through lag 5 and does not pass the Ljung-Box test; the residuals are not white noise
#The residuals also have a right skew, with variability that is not symmetric around a mean of 0
#Evaluate error metrics
accuracy(ets_case_fc$fitted, test_cases_ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -16603.38 22403.7 18779.82 -13.26096 14.63905 0.3479993 1.601121
accuracy(arima_cases_fitbc$fitted, test_cases_ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -6355.205 16003.94 14203.49 -5.445906 10.6551 0.2420513 1.155024
#Neither model tracks the held-out period especially well
#The ARIMA model outperforms the ETS model for predicting cases
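#Sketch (not evaluated in this report): the same comparison can be made on the 44-step point
#forecasts themselves rather than the in-sample fitted values, matching each forecast to the
#corresponding day of the held-out series
accuracy(arima_cases_fc$mean, test_cases_ts[1:44])
accuracy(ets_case_fc$mean, test_cases_ts[1:44])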
#Build Fatalities Models and Forecast
ndiffs(train_fatalities_ts)
## [1] 1
ggtsdisplay(diff(train_fatalities_ts))
#Looks like it still needs some work
lambda_tfts <- BoxCox.lambda(train_fatalities_ts)
lambda_tfts
## [1] 0.2980666
tfts_boxcox <- box_cox(train_fatalities_ts, lambda = lambda_tfts)
ndiffs(tfts_boxcox)
## [1] 1
ggtsdisplay(diff(tfts_boxcox))
#Variability is not symmetrical, with a number of spikes
#Autocorrelation is not fully corrected, but the remainder is close to white noise
#Predict ARIMA(2,1,2), given the two ACF spikes (one positive, one negative) before lag 10 and the single differencing (see the sketch below)
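#Sketch (not evaluated in this report): fit the predicted ARIMA(2,1,2) by hand so it can be
#compared with the auto.arima() selection below; 'arima_fatalities_manual' is a hypothetical name
arima_fatalities_manual <- Arima(train_fatalities_ts, order = c(2, 1, 2), lambda = lambda_tfts)
summary(arima_fatalities_manual)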
#ARIMA models for fatalities
arima_fatalities_fitbc <- auto.arima(train_fatalities_ts, lambda = lambda_tfts)
summary(arima_fatalities_fitbc)
#auto.arima() with the Box-Cox transformation selects ARIMA(1,1,0) with drift for fatalities (see the residual check further below), not the ARIMA(2,1,2) I predicted
#The Box-Cox lambda for fatalities (0.298) is lower than the lambda for cases (0.482)
#ETS Model for Fatalities
ets_fatalities_fit <- ets(train_fatalities_ts)
summary(ets_fatalities_fit)
## ETS(M,A,N)
##
## Call:
## ets(y = train_fatalities_ts)
##
## Smoothing parameters:
## alpha = 0.3269
## beta = 0.0044
##
## Initial states:
## l = -12.5647
## b = 14.2549
##
## sigma: 0.4482
##
## AIC AICc BIC
## 1536.129 1536.803 1548.898
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 177.735 1457.534 772.6324 -142.8654 167.7226 1.337177 0.4457083
#ETS(M,A,N) -> Errors are multiplicative, trend is additive
#Alpha is 0.3269, a somewhat heavier weighting of recent observations than in the cases model
#AICc = 1536.80, RMSE = 1457.53
#The ETS model has the better error measures on the training data
#Fatalities Forecasts
#The training series again has 95 daily observations and the test window covers Apr 27 - Jun 10; the forecast horizon is 44 steps
arima_fatalities_fc <- forecast(arima_fatalities_fitbc, h = 44, level = c(95))
#A 95% prediction interval is requested in the Kaggle assignment
arima_fatalities_fc
## Point Forecast Lo 95 Hi 95
## 96 8745.074 5376.235 13373.80
## 97 7514.451 4278.284 12170.19
## 98 8583.507 4404.969 14970.02
## 99 8372.118 3988.057 15361.89
## 100 8901.248 3953.044 17103.26
## 101 9020.622 3757.547 18053.42
## 102 9381.970 3687.910 19461.26
## 103 9615.320 3577.423 20615.60
## 104 9930.534 3512.519 21932.06
## 105 10208.673 3441.390 23175.93
## 106 10517.277 3389.636 24485.51
## 107 10818.165 3340.773 25784.06
## 108 11133.091 3302.086 27117.77
## 109 11449.836 3268.073 28461.96
## 110 11775.437 3240.590 29832.29
## 111 12106.020 3217.626 31221.10
## 112 12443.891 3199.479 32633.91
## 113 12787.858 3185.300 34068.57
## 114 13138.711 3174.979 35527.20
## 115 13496.115 3168.084 37009.40
## 116 13860.373 3164.430 38516.14
## 117 14231.430 3163.755 40047.54
## 118 14609.432 3165.887 41604.17
## 119 14994.410 3170.646 43186.29
## 120 15386.463 3177.892 44794.30
## 121 15785.651 3187.491 46428.51
## 122 16192.055 3199.332 48089.24
## 123 16605.747 3213.312 49776.78
## 124 17026.802 3229.345 51491.42
## 125 17455.295 3247.352 53233.43
## 126 17891.301 3267.263 55003.10
## 127 18334.896 3289.016 56800.67
## 128 18786.154 3312.557 58626.40
## 129 19245.151 3337.838 60480.54
## 130 19711.963 3364.817 62363.33
## 131 20186.667 3393.454 64275.02
## 132 20669.338 3423.716 66215.82
## 133 21160.053 3455.576 68185.97
## 134 21658.888 3489.005 70185.70
## 135 22165.920 3523.982 72215.23
## 136 22681.226 3560.488 74274.78
## 137 23204.883 3598.506 76364.57
## 138 23736.969 3638.020 78484.81
## 139 24277.561 3679.019 80635.72
autoplot(arima_fatalities_fc)
#Showing a steady upward trajectory in the near future
ets_fatalities_fc <- forecast(ets_fatalities_fit, 44, level = c(95))
ets_fatalities_fc
## Point Forecast Lo 95 Hi 95
## 96 9794.093 1190.1125 18398.07
## 97 9882.712 655.9676 19109.46
## 98 9971.330 127.3973 19815.26
## 99 10059.949 -398.0185 20517.92
## 100 10148.568 -922.2139 21219.35
## 101 10237.187 -1446.7709 21921.14
## 102 10325.805 -1973.0100 22624.62
## 103 10414.424 -2502.0534 23330.90
## 104 10503.043 -3034.8705 24040.96
## 105 10591.661 -3572.3109 24755.63
## 106 10680.280 -4115.1295 25475.69
## 107 10768.899 -4664.0053 26201.80
## 108 10857.517 -5219.5557 26934.59
## 109 10946.136 -5782.3486 27674.62
## 110 11034.755 -6352.9107 28422.42
## 111 11123.373 -6931.7352 29178.48
## 112 11211.992 -7519.2878 29943.27
## 113 11300.611 -8116.0112 30717.23
## 114 11389.229 -8722.3293 31500.79
## 115 11477.848 -9338.6504 32294.35
## 116 11566.467 -9965.3700 33098.30
## 117 11655.085 -10602.8734 33913.04
## 118 11743.704 -11251.5375 34738.95
## 119 11832.323 -11911.7324 35576.38
## 120 11920.941 -12583.8232 36425.71
## 121 12009.560 -13268.1712 37287.29
## 122 12098.179 -13965.1348 38161.49
## 123 12186.797 -14675.0709 39048.67
## 124 12275.416 -15398.3354 39949.17
## 125 12364.035 -16135.2845 40863.35
## 126 12452.654 -16886.2747 41791.58
## 127 12541.272 -17651.6642 42734.21
## 128 12629.891 -18431.8129 43691.59
## 129 12718.510 -19227.0833 44664.10
## 130 12807.128 -20037.8407 45652.10
## 131 12895.747 -20864.4541 46655.95
## 132 12984.366 -21707.2959 47676.03
## 133 13072.984 -22566.7430 48712.71
## 134 13161.603 -23443.1770 49766.38
## 135 13250.222 -24336.9842 50837.43
## 136 13338.840 -25248.5563 51926.24
## 137 13427.459 -26178.2906 53033.21
## 138 13516.078 -27126.5903 54158.75
## 139 13604.696 -28093.8647 55303.26
autoplot(ets_fatalities_fc)
#The ETS point forecast holds roughly steady near the middle of the recent data
#It is concerning that the 95% interval includes negative fatality counts (see the sketch below)
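#Sketch (assumption, not evaluated in this report): refitting the ETS with the Box-Cox lambda found
#above should keep the back-transformed prediction intervals away from negative values;
#'ets_fatalities_bc' is a hypothetical name
ets_fatalities_bc <- ets(train_fatalities_ts, lambda = lambda_tfts)
forecast(ets_fatalities_bc, h = 44, level = 95)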
#Evaluate Fatalities Forecasts
#Check ARIMA forecast residuals and error metrics for fatalities
checkresiduals(arima_fatalities_fitbc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0) with drift
## Q* = 22.901, df = 8, p-value = 0.003493
##
## Model df: 2. Total lags used: 10
#ACF spikes at lags 6, 10, and 13; the residuals fail the Ljung-Box test (p = 0.003), so they cannot be considered white noise
#Check ETS residuals and error metrics for fatalities
checkresiduals(ets_fatalities_fc)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 45.002, df = 6, p-value = 4.676e-08
##
## Model df: 4. Total lags used: 10
#The ETS model has autocorrelation concerns through lag 12 and does not pass the Ljung-Box test; the residuals are not white noise
#The residuals also show an upward bias and increasing variability
#Evaluate error metrics
#Compare against the held-out fatalities series
accuracy(ets_fatalities_fc$fitted, test_fatalities_ts)
accuracy(arima_fatalities_fitbc$fitted, test_fatalities_ts)
#Neither model tracks the held-out fatalities especially well
#The ETS model slightly outperforms the ARIMA model for predicting fatalities
#Model selection and final visuals
#COVID Case Forecasting
#For forecasting COVID cases over the period of April 27 through June 10, the ARIMA(2,1,2) model produces the better forecasts
#The ARIMA(2,1,2) model minimizes RMSE, produces white-noise residuals, has a tighter prediction interval, and shows a visual trend that aligns with our retrospective understanding of the disease's spread
autoplot(data_cases_ts) + autolayer(arima_cases_fc) + ylab("Global covid cases") + xlab("Days (starting 01/23/2020)") + ggtitle("Covid Cases Forecast (06/10/20) - ARIMA(2,1,2)")
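#Sketch (not evaluated in this report): overlaying both candidate point forecasts on the full
#series makes the comparison over the held-out weeks easier to see
autoplot(data_cases_ts) +
  autolayer(arima_cases_fc, series = "ARIMA(2,1,2)", PI = FALSE) +
  autolayer(ets_case_fc, series = "ETS(M,A,N)", PI = FALSE) +
  ylab("Global covid cases") + xlab("Days (starting 01/23/2020)")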
#COVID Fatalities Forecasting
#Forecasting COVID fatalities was more volatile because the uptick in deaths lagged cases, so the training set contained less data showing the rise in deaths
#For the period of April 27 through June 10, the ARIMA(1,1,0)-with-drift model produces the better forecasts
#While the ETS(M,A,N) model reduces error on the training set and just edges the ARIMA model on the test data, the visual trend of the ARIMA model better fits the later uptick in deaths and it performs much better at reducing residual autocorrelation
autoplot(data_fatalities_ts) + autolayer(arima_fatalities_fc) + ylab("Global covid fatalities") + xlab("Days (starting 01/23/2020)") + ggtitle("Covid Fatalities Forecast (06/10/20) - ARIMA(1,1,0) with drift")
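#Sketch (not evaluated in this report): the same overlay for fatalities shows how each point
#forecast tracks the later rise in deaths
autoplot(data_fatalities_ts) +
  autolayer(arima_fatalities_fc, series = "ARIMA(1,1,0) with drift", PI = FALSE) +
  autolayer(ets_fatalities_fc, series = "ETS(M,A,N)", PI = FALSE) +
  ylab("Global covid fatalities") + xlab("Days (starting 01/23/2020)")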