Libraries

library(tsibbledata)
library(dplyr)
library(ggplot2)
library(forecast)
library(tsibble)
library(feasts)
library(readr)
library(tidyverse)
library(patchwork)
library(fable)
library(hts)

 
 
 
 

Problem

Few people on Earth still need to be told what kind of problems COVID-19 has caused. The least we can aim for, collectively, is to reduce the harm we inflict on ourselves, much of which can arguably be attributed to fear. The problem this paper addresses is simply to build awareness of the nature of COVID-19: the more we know about it, the better we can live with it.

 
 

Significance

With a better understanding of COVID-19, people can make better decisions about how to live their lives, policy-makers can implement better policy, and people everywhere can reclaim some normalcy. By making information accurate and easy to understand, the fear that has gripped so many for so long can begin to subside, and COVID-19 can simply be a part of daily life rather than something that consumes it.

 
 
 
 
 

Data

train <- read_csv("data/f_train.csv")
test <- read_csv("data/f_test.csv")
submit <- read_csv("data/f_submission.csv")
usData <- train %>%
  dplyr::filter(Country_Region == "US")

colSums(is.na(usData))
            Id         County Province_State Country_Region     Population 
             0          15400            280              0              0 
        Weight           Date         Target    TargetValue 
             0              0              0              0 

 

At first glance, it's clear that the missing values in the 'County' and 'Province_State' columns are meaningful. When only 'County' is missing, the infection and fatality numbers are totals for the whole state. When both are missing, the row is a total for the whole of the United States. These aggregate totals will be separated and explored below.
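As a quick sanity check on that reading, row counts at each aggregation level can be tallied directly (a small sketch using only the columns above; the "level" labels are my own):

usData %>%
  mutate(level = dplyr::case_when(
    is.na(Province_State) ~ "national",
    is.na(County)         ~ "state",
    TRUE                  ~ "county"
  )) %>%
  count(level)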

 
 
 

length(unique(usData$Date)) # number of days
[1] 140
dates <- sort(unique(usData$Date))
dates[1]
[1] "2020-01-23"
dates[length(dates)]
[1] "2020-06-10"
length(unique(usData$Province_State)) # number of states
[1] 55
unique(usData$Province_State)
 [1] "Alabama"              "Alaska"               "Arizona"             
 [4] "Arkansas"             "California"           "Colorado"            
 [7] "Connecticut"          "Delaware"             "District of Columbia"
[10] "Florida"              "Georgia"              "Guam"                
[13] "Hawaii"               "Idaho"                "Illinois"            
[16] "Indiana"              "Iowa"                 "Kansas"              
[19] "Kentucky"             "Louisiana"            "Maine"               
[22] "Maryland"             "Massachusetts"        "Michigan"            
[25] "Minnesota"            "Mississippi"          "Missouri"            
[28] "Montana"              "Nebraska"             "Nevada"              
[31] "New Hampshire"        "New Jersey"           "New Mexico"          
[34] "New York"             "North Carolina"       "North Dakota"        
[37] "Ohio"                 "Oklahoma"             "Oregon"              
[40] "Pennsylvania"         "Puerto Rico"          "Rhode Island"        
[43] "South Carolina"       "South Dakota"         "Tennessee"           
[46] "Texas"                "Utah"                 "Vermont"             
[49] "Virgin Islands"       "Virginia"             "Washington"          
[52] "West Virginia"        "Wisconsin"            "Wyoming"             
[55] NA                    

This data for the US spans 140 days, from Jan. 23, 2020 to June 10, 2020. It contains 55 state-level classifications: the 50 states plus the five additional entries listed below.

  • Guam
  • Virgin Islands
  • Puerto Rico
  • District of Columbia
  • NA

 
 
 

- US Aggregate

# US Aggregate
usAggregate <- usData %>%
  dplyr::filter(is.na(Province_State)) 

ggplot(data = usAggregate, 
       mapping = aes(x = Date, 
                     y = TargetValue,
                     fill=Target,
                     color = Target)) + 
  geom_line() +
  scale_color_manual(values=c("darkgreen", "red")) + 
  theme(legend.position = c(0.175, 0.835)) + 
  labs(title="US COVID: Confirmed Cases v Fatalities",
       x = "2020",
       y = "Amount")

usA <- usAggregate %>%
  select(-County, -Province_State) %>%
  pivot_wider(id_cols = Date, names_from = Target, values_from = TargetValue)

usProp <- usA %>%
  group_by(Date) %>%
  summarize(Prop = Fatalities / ConfirmedCases) %>%
  as_tsibble(index = Date)
usProp[is.na(usProp)] <- 0  # days with 0/0 produce NaN; treat them as 0

ggplot(data = usProp) +
  geom_line(mapping = aes(x = Date, y = Prop)) + 
  labs(title = "Fatalities as a Proportion of Confirmed Cases")

 
 

- State Aggregates

# State Aggregates - Raw
stateAggregate <- usData %>%
  dplyr::filter(is.na(County)) %>%
  dplyr::filter(!is.na(Province_State))


# State Aggregates - Totals (after the pivot, columns 8 and 9 are ConfirmedCases and Fatalities)
tally <- stateAggregate %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0) %>%
  group_by(Province_State)
tally <- aggregate(tally[, c(8, 9)], FUN = "sum", by = list(tally$Province_State))
head(tally)
     Group.1 ConfirmedCases Fatalities
1    Alabama          21989        744
2     Alaska            592         11
3    Arizona          29852       1101
4   Arkansas          10368        165
5 California         139715       4854
6   Colorado          28484       1572
caseT <- tally$ConfirmedCases %in% head(sort(tally$ConfirmedCases, decreasing = TRUE), 9)
caseT <- tally[caseT, 1:2]   # states in the top 9 by total confirmed cases
faTe <- tally$Fatalities %in% head(sort(tally$Fatalities, decreasing = TRUE), 9)
faTe <- tally[faTe, c(1, 3)] # states in the top 9 by total fatalities


# Top 9 - Cases
topC <- stateAggregate$Province_State %in% caseT[1:9, 1]
t9c <- stateAggregate %>%
  dplyr::filter(topC)

# Top 9 - Fatalities
topF <- stateAggregate$Province_State %in% faTe[1:9, 1]
t9f <- stateAggregate %>%
  dplyr::filter(topF)


# Cases
ggplot(data = t9c, 
       mapping = aes(x = Date, 
                     y = TargetValue,
                     fill=Target,
                     color = Target)) + 
  geom_line() + 
  facet_wrap(~ Province_State) + 
  scale_color_manual(values=c("darkgreen", "red")) + 
  theme(legend.position = c(0.75, 0.93)) +
  labs(title = "US States with highest Confirmed Cases - Top 9",
       y = "Amount")

# Fatalities
ggplot(data = t9f, 
       mapping = aes(x = Date, 
                     y = TargetValue,
                     fill=Target,
                     color = Target)) + 
  geom_line() + 
  facet_wrap(~ Province_State) + 
  scale_color_manual(values=c("darkgreen", "red")) + 
  theme(legend.position = c(0.75, 0.93)) +
  labs(title = "US States with highest Fatalities - Top 9",
       y = "Amount")

 
 

- Testing

ts.tot <- usAggregate %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)

ts.tot <- aggregate(ts.tot[, c(6, 7)], FUN = "sum", by = list(ts.tot$Date))  # daily national totals (cols 6 and 7 are the two targets)
ts.tot <- ts.tot %>% as_tsibble(index = Group.1)


Stationarity Tests

 
 

Cases

ts.tot %>%
  gg_tsdisplay(ConfirmedCases)

ts.tot %>%
  gg_tsdisplay(difference(ConfirmedCases), plot_type = 'partial')

# For cases, the ACF is sinusoidal and there is a significant spike at lag 7 in
# the PACF, with none beyond it. This suggests the data may follow an
# ARIMA(p,d,0) model with p = 7; the unit-root tests below point to one order
# of differencing, making ARIMA(7,1,0) a reasonable candidate.
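As a sketch, a candidate like this can be pinned down in fable by fixing the order with pdq(); the order below follows the comment above and is illustrative, not a claim about the optimal model:

# Fit the hand-specified candidate; PDQ(0, 0, 0) suppresses the seasonal search
fit_candidate <- ts.tot %>%
  model(candidate = ARIMA(ConfirmedCases ~ pdq(7, 1, 0) + PDQ(0, 0, 0)))
report(fit_candidate)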



# Ljung Box test
ts.tot %>%
  mutate(diff_cases = difference(ConfirmedCases)) %>%
  features(diff_cases, ljung_box, lag = 10)
# A tibble: 1 x 2
  lb_stat lb_pvalue
    <dbl>     <dbl>
1    88.7  9.55e-15
# Unitroot test
ts.tot %>%
  features(ConfirmedCases, unitroot_kpss) # need to difference
# A tibble: 1 x 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1      2.13        0.01
ndiffs(ts.tot$ConfirmedCases)
[1] 1
ts.tot$ConfirmedCases %>% diff(1) %>% unitroot_kpss()
  kpss_stat kpss_pvalue 
    0.16293     0.10000 
ts.tot$ConfirmedCases %>% unitroot_nsdiffs()
nsdiffs 
      0 

 
 

Fatalities

gg_tsdisplay(ts.tot, y=Fatalities)

ts.tot %>%
  gg_tsdisplay(difference(Fatalities), plot_type = 'partial')

# The PACF decays exponentially, but there is more than one significant spike
# in the ACF. With a single spike at lag q, the data might follow an
# ARIMA(0,d,q) model; the unit-root tests below point to one order of
# differencing, so ARIMA(0,1,q) with a small q is a reasonable candidate.
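A quick way to vet such a candidate is to fit it next to fable's automatic selection and compare information criteria (a sketch; the q below is illustrative):

# Compare a hand-specified MA candidate against the automatic selection by AICc
fit_fatal <- ts.tot %>%
  model(
    manual = ARIMA(Fatalities ~ pdq(0, 1, 2) + PDQ(0, 0, 0)),
    auto   = ARIMA(Fatalities)
  )
glance(fit_fatal) %>% select(.model, AICc, BIC)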

# Ljung Box test
ts.tot %>%
  mutate(diff_fatal = difference(Fatalities)) %>%
  features(diff_fatal, ljung_box, lag = 10)
# A tibble: 1 x 2
  lb_stat lb_pvalue
    <dbl>     <dbl>
1    75.5  3.78e-12
# Unitroot test
ts.tot %>%
  features(Fatalities, unitroot_kpss) # needs differencing
# A tibble: 1 x 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1      1.65        0.01
ndiffs(ts.tot$Fatalities)
[1] 1
ts.tot$Fatalities %>% diff(1) %>% unitroot_kpss()
  kpss_stat kpss_pvalue 
 0.07146237  0.10000000 
ts.tot$Fatalities %>% unitroot_nsdiffs()
nsdiffs 
      0 
# Neither Confirmed Cases nor Fatalities is stationary in levels. As the KPSS
# tests show, one order of differencing is required for each series.

 
 
 

Types of Models

The primary models used in this analysis are ARIMA and ETS. Given the clues from the ACF and PACF plots above, the best model for Confirmed Cases may follow an ARIMA(p,d,0) form: the ACF is sinusoidal and the PACF shows a significant spike at lag 7 with no significant lags beyond it. For Fatalities, the decaying PACF and the multiple significant ACF spikes instead suggest a moving-average, ARIMA(0,d,q), form.

In the final section, Future Work, we briefly explore a hierarchical/grouped time series. Since only US aggregates are modeled in this paper, there is plenty of room for improvement in providing information about individual states.

 
 
 

Literature Review

While a majority of researchers have used more advanced ML techniques, there is no shortage of ARIMA or linear regression models being put to use. Andres Hernandez-Matamoros, Hamido Fujita, Toshitaka Hayashi, and Hector Perez-Meana used various ARIMA models and polynomial functions to forecast COVID-19 per region (2020). In their paper, 145 countries distributed among six regions were used to create predictions achieving an average RMSE of 144.81, significantly better than what was achieved in this paper.

To build their model, they performed a 90/10 split, with the first portion used to train the ARIMA models and the second used to calculate the RMSE. To find the best ARIMA parameters, the group created an algorithm that tests all (p,d,q) and (P,D,Q) combinations for every country under study. This far exceeds this paper's reliance on automatic ARIMA order selection.
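fable can approximate such an exhaustive search by disabling its default stepwise shortcut, as in the sketch below (slower than the default, and an analogy to the cited algorithm rather than a reproduction of it):

# Evaluate a much larger set of (p,d,q)(P,D,Q) combinations than the stepwise default
exhaustive <- ts.tot %>%
  model(full_search = ARIMA(ConfirmedCases, stepwise = FALSE, approximation = FALSE))
report(exhaustive)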

 

Compared with the work of Sudhir Bhandari, Amit Tak, Jitendra Gupta, Bhoopendra Patel, Jyotsna Shukla, and colleagues in "Prediction using autoregressive integrated moving average modeling," the Hernandez-Matamoros paper performs much better. Bhandari et al. use ARIMA(1,3,2) and ARIMA(3,3,1) models to achieve RMSE scores of 21,137 (Cases) and 668.7 (Fatalities). These scores are similar to the performance seen in this paper, though they do not compare directly, since Cases and Fatalities are modeled separately.

 

In all, this work underperforms when compared with the higher-level analysis seen in the first paper. However, its results are comparable to those of other works that use the same techniques.

 
 
 

Building Models

holts <- ts.tot
holts[, c(2, 3)] <- holts[, c(2, 3)] + 1  # shift counts by +1, presumably so multiplicative ETS components can handle zero days


# finding best model: train through 2020-04-27 (96 days); the remainder is held out
fix <- holts %>% filter_index(. ~ "2020-04-27")


fit <- fix %>%
  model(
    `ETS_c` = ETS(ConfirmedCases), 
    `ARIMA_c` = ARIMA(ConfirmedCases)
  )

fit2 <- fix %>%
  model(
    `ETS_f` = ETS(Fatalities),
    `ARIMA_f` = ARIMA(Fatalities)
  )

fc <- fit %>% forecast(h = 43)
fc2 <- fit2 %>% forecast(h = 43)

bind_rows(
  fit %>% accuracy(),
  fit2 %>% accuracy(),
  fc %>% accuracy(ts.tot),
  fc2 %>% accuracy(ts.tot)
)
# A tibble: 8 x 10
  .model  .type         ME   RMSE   MAE     MPE   MAPE   MASE RMSSE     ACF1
  <chr>   <chr>      <dbl>  <dbl> <dbl>   <dbl>  <dbl>  <dbl> <dbl>    <dbl>
1 ETS_c   Training  6803.  11113. 6811. -351.   426.    2.36  2.12   0.948  
2 ARIMA_c Training   -22.5  1607.  844.    1.37   8.50  0.293 0.306 -0.0740 
3 ETS_f   Training  -120.   1126.  342.  -33.7   82.1   1.31  2.07  -0.0581 
4 ARIMA_f Training    22.6   348.  131.    3.65  16.0   0.501 0.639  0.00627
5 ARIMA_c Test      -400.   2474. 2018.   -1.85   8.82  0.701 0.472  0.207  
6 ETS_c   Test      3073.   5929. 4737.   11.8   20.3   1.64  1.13  -0.0541 
7 ARIMA_f Test      -301.    545.  447.  -46.4   53.0   1.71  1.00   0.604  
8 ETS_f   Test     -2807.   3538. 2807. -231.   231.   10.8   6.50  -0.00464
# ARIMA models perform better than ETS for both training and testing data.

 
 

- Cases

myarima <- fix %>%
  model(
    ARIMA(ConfirmedCases)
  )
myfc <- myarima %>% forecast(h = 43)
myfc %>% autoplot(ts.tot, level=30)+ 
  labs(title = "Confirmed Cases  --  ARIMA(1,1,1)(0,1,1)[7]")

report(myarima)
Series: ConfirmedCases 
Model: ARIMA(1,1,1)(0,1,1)[7] 

Coefficients:
         ar1     ma1     sma1
      -0.974  0.8241  -0.2879
s.e.   0.034  0.0890   0.1316

sigma^2 estimated as 2918282:  log likelihood=-779.22
AIC=1566.44   AICc=1566.93   BIC=1576.35

 
 

- Fatalities

myarima2 <- fix %>%
  model(
    ARIMA(Fatalities)
  )
myfc2 <- myarima2 %>% forecast(h = 43)
myfc2 %>% autoplot(ts.tot, level=30) + 
  labs(title = "Fatalities  --  ARIMA(1,1,1)(1,0,0)[7]")

report(myarima2)
Series: Fatalities 
Model: ARIMA(1,1,1)(1,0,0)[7] 

Coefficients:
         ar1      ma1    sar1
      0.4587  -0.7936  0.4604
s.e.  0.1658   0.1120  0.1007

sigma^2 estimated as 126417:  log likelihood=-692.18
AIC=1392.35   AICc=1392.8   BIC=1402.57

 
 

- Evaluation

test.usData <- test %>%
  dplyr::filter(Country_Region == "US") 

colSums(is.na(test.usData))
    ForecastId         County Province_State Country_Region     Population 
             0              0              0              0              0 
        Weight           Date         Target 
             0              0              0 
test.usData["TargetValue"] = 0
test.ts.tot <- test.usData %>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)


test.ts.tot <- aggregate(test.ts.tot[,c(6,7)], FUN="sum", by=list(test.ts.tot$Date))
test.ts.tot$Group.1 = as.Date(test.ts.tot$Group.1)
test.ts.tot <- test.ts.tot %>% as_tsibble(index = Group.1)
test.ts.tot = test.ts.tot[c(2:nrow(test.ts.tot)),]

arima.test <- myarima %>% forecast(h = 44)
arima2.test <- myarima2 %>% forecast(h = 44)

test.ts.tot$ConfirmedCases <- arima.test$.mean
test.ts.tot$Fatalities <- arima2.test$.mean

# attach the actual values for the held-out period (rows 97 onward of ts.tot)
test.ts.tot[4] <- ts.tot[c(97:nrow(ts.tot)), 2]
test.ts.tot[5] <- ts.tot[c(97:nrow(ts.tot)), 3]
names(test.ts.tot)[c(1, 4, 5)] <- c("Date", "ActualC", "ActualF")


ggplot(data = test.ts.tot, mapping = aes(x = Date)) + 
  geom_line(aes(y = ActualC), color = "red") + 
  geom_point(aes(y = ActualC), color = "red") + 
  geom_line(aes(y = ConfirmedCases), color = "blue4") + 
  geom_point(aes(y = ConfirmedCases), color = "blue4") +
  ggtitle("Actual (red)   v   ARIMA (blue)") + 
  ylab("Confirmed Cases")

ggplot(data = test.ts.tot, mapping = aes(x = Date)) + 
  geom_line(aes(y = Fatalities), color = "blue4") + 
  geom_point(aes(y = Fatalities), color = "blue4") + 
  geom_line(aes(y = ActualF), color = "red") + 
  geom_point(aes(y = ActualF), color = "red") +
  ggtitle("Actual (red)   v   ARIMA (blue)") + 
  ylab("Fatalities")

 
 
 
 

Limitations

While the models used to predict Confirmed Cases and Fatalities give accurate short-term predictions, a limitation of these types of models is that their accuracy diminishes greatly the further out in time predictions are made. This is especially apparent in the predictions for Fatalities, which trend toward the mean. Many factors contribute to both Cases and Fatalities, and an unforeseen spike in either is something the models would be unable to predict. While that is true for this paper as it stands now, with its limited amount of data, this risk diminishes as time goes on and more data is collected and fed to the models being used.

 
 
 
 

Future Work

#
# Hierarchical Time-Series
#

# recode targets and abbreviate state names to build hts-style series labels
stateAggregate$Target[stateAggregate$Target == "ConfirmedCases"] <- "c"
stateAggregate$Target[stateAggregate$Target == "Fatalities"] <- "f"
stateAggregate$Province_State <- state.abb[match(stateAggregate$Province_State, state.name)]  # DC and territories become NA and are dropped below

testing <- stateAggregate %>%
  select(Province_State, Date, Target, TargetValue) %>%
  filter(!is.na(Province_State)) %>%
  pivot_wider(
    id_cols = Date, names_from = c(Province_State, Target), values_from = TargetValue
  ) %>%
  as_tsibble(index = Date)

USts <- ts(testing[,2:length(testing)], frequency = 175, start=c(2020))
UShts <- hts(USts, characters = c(2, 2))  # split series names into state code ("AL") and target suffix ("_c"/"_f")

# forecasts <- forecast(UShts, h=8, method="tdfp", fmethod = "arima")
# plot(forecasts, 
#     xlim=c(2020, 2021))
#
# This works but will not output through RMD
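An alternative sketch that stays within the fable ecosystem loaded above; the bottom-up reconciliation and two-week horizon are illustrative choices, not results from this paper:

# Grouped state series with fabletools::aggregate_key, reconciled bottom-up
state_ts <- usData %>%
  dplyr::filter(is.na(County), !is.na(Province_State), Target == "ConfirmedCases") %>%
  select(Province_State, Date, TargetValue) %>%
  as_tsibble(index = Date, key = Province_State) %>%
  aggregate_key(Province_State, TargetValue = sum(TargetValue))

state_fit <- state_ts %>%
  model(arima = ARIMA(TargetValue)) %>%
  reconcile(bu = bottom_up(arima))

state_fc <- state_fit %>% forecast(h = "2 weeks")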

 
 
 
 

References

Hernandez-Matamoros, A., Fujita, H., Hayashi, T., & Perez-Meana, H. (2020). Forecasting of COVID19 per regions using ARIMA models and polynomial functions. Applied Soft Computing, 96, 106610. https://doi.org/10.1016/j.asoc.2020.106610

 

Bhandari, S., Tak, A., Gupta, J., et al. (2020, July 7). Evolving trajectories of COVID-19 curves in India: Prediction using autoregressive integrated moving average modeling. PREPRINT (Version 1), Research Square. https://doi.org/10.21203/rs.3.rs-40385/v1