Packages and Import

Here the packages are imported

Read the data. The data that will be used is the Country Deaths data set that contain that amount of deaths per week by country. This data is interesting because is affected by an event that is not going to be forever, that is the Coronavirus.

data <- readr::read_csv("https://www.mortality.org/Public/STMF/Outputs/stmf.csv", skip=1)
## Rows: 112044 Columns: 19
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): CountryCode, Sex
## dbl (17): Year, Week, D0_14, D15_64, D65_74, D75_84, D85p, DTotal, R0_14, R1...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Description

The purpose of this section is to describe the data. The data selected wasa time serie object that contain deaths by country.

Here is a first view of the data.

data
## # A tibble: 112,044 x 19
##    CountryCode  Year  Week Sex   D0_14 D15_64 D65_74 D75_84  D85p DTotal   R0_14
##    <chr>       <dbl> <dbl> <chr> <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>   <dbl>
##  1 AUS2         2015     1 m      5.06   211.    204    398   394   1212 1.14e-4
##  2 AUS2         2015     1 f      6.76   141.    154    323   676   1301 1.60e-4
##  3 AUS2         2015     1 b     11.8    352.    358    721  1070   2513 1.36e-4
##  4 AUS2         2015     2 m      5.67   166.    216    343   399   1130 1.28e-4
##  5 AUS2         2015     2 f      6.98   149.    147    290   646   1239 1.66e-4
##  6 AUS2         2015     2 b     12.7    315.    363    633  1045   2369 1.46e-4
##  7 AUS2         2015     3 m      3.53   169.    220    354   426   1173 7.93e-5
##  8 AUS2         2015     3 f      5.18   136.    159    286   609   1195 1.23e-4
##  9 AUS2         2015     3 b      8.71   305.    379    640  1035   2368 1.01e-4
## 10 AUS2         2015     4 m      3.37   165.    215    368   373   1124 7.58e-5
## # ... with 112,034 more rows, and 8 more variables: R15_64 <dbl>, R65_74 <dbl>,
## #   R75_84 <dbl>, R85p <dbl>, RTotal <dbl>, Split <dbl>, SplitSex <dbl>,
## #   Forecast <dbl>

Data visualization

deaths <- data %>%
  filter(CountryCode %in% c("AUT", "DEUTNP", "USA"))%>%
  janitor::clean_names() %>%
  dplyr::select(country_code:d_total) %>%
  pivot_longer(5:10,
    names_to = "age", values_to = "deaths",
    names_pattern = "[d_]*([a-z0-9_p]*)"
  ) %>%
  filter(age == "total", sex == "b") %>%
  mutate(
    country = recode(country_code,
      AUT = "Austria",
      DEUTNP = "Germany",
      ESP = "Spain",
      USA = "United States")
  ) %>%
  dplyr::select(year, week, country, deaths)

Here is a data set of just 4 countries previously selected. Austria, Germany, Spain and USA.

deaths
## # A tibble: 2,699 x 4
##     year  week country deaths
##    <dbl> <dbl> <chr>    <dbl>
##  1  2000     1 Austria   1867
##  2  2000     2 Austria   1902
##  3  2000     3 Austria   2027
##  4  2000     4 Austria   1940
##  5  2000     5 Austria   1928
##  6  2000     6 Austria   1760
##  7  2000     7 Austria   1666
##  8  2000     8 Austria   1628
##  9  2000     9 Austria   1566
## 10  2000    10 Austria   1524
## # ... with 2,689 more rows

First we see a visualization of some countries.

deaths %>%
  mutate(thisyear = (year == 2020)) %>%
  ggplot(aes(x=week, y=deaths, group=year)) +
    geom_line(aes(col=thisyear)) +
    facet_wrap(~ country, scales='free_y') +
    scale_color_manual(values=c("FALSE"='gray',"TRUE"='red')) +
    guides(col=FALSE) +
    ggtitle("Weekly deaths")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Here the data is filtered by country, so the data that will be used is the US deaths. This first steps are a preprocessing of the Dates.

us_deaths <- deaths %>% filter( country=="United States")
library(lubridate)

#date
dates <- make_datetime(year = us_deaths$year) + weeks(us_deaths$week)
head(dates)
## [1] "2015-01-15 UTC" "2015-01-22 UTC" "2015-01-29 UTC" "2015-02-05 UTC"
## [5] "2015-02-12 UTC" "2015-02-19 UTC"

Here is a view of the US data.

us_deaths$dates<-dates
head(us_deaths)
## # A tibble: 6 x 5
##    year  week country       deaths dates              
##   <dbl> <dbl> <chr>          <dbl> <dttm>             
## 1  2015     2 United States  61882 2015-01-15 00:00:00
## 2  2015     3 United States  61261 2015-01-22 00:00:00
## 3  2015     4 United States  58738 2015-01-29 00:00:00
## 4  2015     5 United States  57380 2015-02-05 00:00:00
## 5  2015     6 United States  57393 2015-02-12 00:00:00
## 6  2015     7 United States  56535 2015-02-19 00:00:00
us_deaths_aj <- us_deaths %>% dplyr::select(deaths,dates)
head(us_deaths_aj)
## # A tibble: 6 x 2
##   deaths dates              
##    <dbl> <dttm>             
## 1  61882 2015-01-15 00:00:00
## 2  61261 2015-01-22 00:00:00
## 3  58738 2015-01-29 00:00:00
## 4  57380 2015-02-05 00:00:00
## 5  57393 2015-02-12 00:00:00
## 6  56535 2015-02-19 00:00:00

Here the data frame object is converted to a tibble.

us_deaths_aj <- as_tibble(us_deaths_aj)
str(us_deaths_aj)
## tibble [375 x 2] (S3: tbl_df/tbl/data.frame)
##  $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
##  $ dates : POSIXct[1:375], format: "2015-01-15" "2015-01-22" ...

The data format is changed.

us_deaths_aj$Date <- as.Date (us_deaths_aj$dates, formal = "%d/%m/%y")
str(us_deaths_aj)
## tibble [375 x 3] (S3: tbl_df/tbl/data.frame)
##  $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
##  $ dates : POSIXct[1:375], format: "2015-01-15" "2015-01-22" ...
##  $ Date  : Date[1:375], format: "2015-01-15" "2015-01-22" ...
us_deaths_aj <- us_deaths_aj %>% dplyr::select(deaths,Date)
str(us_deaths_aj)
## tibble [375 x 2] (S3: tbl_df/tbl/data.frame)
##  $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
##  $ Date  : Date[1:375], format: "2015-01-15" "2015-01-22" ...
Year <- format(us_deaths_aj$Date, "%y")
Day <- format(us_deaths_aj$Date, "%d")
DT <- us_deaths_aj$deaths
Date <- data.frame(Day, Year, DT)

Here is a plot of the weekly US deaths data.

ggplot(data = us_deaths_aj, aes(x = Date, y = deaths)) + geom_line(colour = "blue", size = 0.5) + labs(title = "Weekly deaths in US", x = "Date", y = "Deaths")

As we expect, the amount of deaths increased since 2020, this because the Coronavirus, that is why this is interesting to analyze this data because we have a event that will no be forever.

Date %>%
ggplot(aes(x = frequency(DT),y = DT)) + geom_boxplot(fill = "Blue") + 
    facet_wrap( ~ Year) + 
    labs(y = "Weekly deaths", x = "Year") + 
    theme_bw(base_size = 10) +
    scale_x_discrete(breaks = 0)

Here we have box plot data of each year, and as we expect 20, 21, 22 have more variability.

Date %>%
ggplot(aes(x = Day, y = DT)) + geom_point(color = "blue") + 
    facet_wrap( ~ Year) + 
    labs(y = "Weekly deaths", x = "Year") + 
    theme_bw(base_size = 10) +
    scale_x_discrete(breaks = c(0,50,100,150,200,250,300))

Date %>%
ggplot(aes(x = DT), y = frequency(DT)) + geom_density(fill = "lightblue") + 
    facet_wrap( ~ Year) + 
    labs(x = "Weekly deaths", y = "Density") + 
    theme_bw(base_size = 10)

In terms of the density per year, we see that 21 and 22 has more values in the right tail.

Test Stationarity

Dicky-Fuller Test

library(aTSA)
## 
## Attaching package: 'aTSA'
## The following objects are masked from 'package:fabletools':
## 
##     estimate, forecast
## The following object is masked from 'package:graphics':
## 
##     identify
adf.test(log(us_deaths_aj$deaths))
## Augmented Dickey-Fuller Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##      lag    ADF p.value
## [1,]   0 -0.266   0.567
## [2,]   1 -0.252   0.572
## [3,]   2 -0.142   0.603
## [4,]   3 -0.108   0.613
## [5,]   4 -0.148   0.601
## [6,]   5 -0.119   0.610
## Type 2: with drift no trend 
##      lag   ADF p.value
## [1,]   0 -1.90  0.3680
## [2,]   1 -2.94  0.0433
## [3,]   2 -3.72  0.0100
## [4,]   3 -3.51  0.0100
## [5,]   4 -3.54  0.0100
## [6,]   5 -3.61  0.0100
## Type 3: with drift and trend 
##      lag   ADF p.value
## [1,]   0 -2.79   0.244
## [2,]   1 -4.03   0.010
## [3,]   2 -5.01   0.010
## [4,]   3 -4.80   0.010
## [5,]   4 -5.01   0.010
## [6,]   5 -5.11   0.010
## ---- 
## Note: in fact, p.value = 0.01 means p.value <= 0.01

Kwiatkowski–Phillips–Schmidt–Shin Test

kpss.test(log(us_deaths_aj$deaths))
## KPSS Unit Root Test 
## alternative: nonstationary 
##  
## Type 1: no drift no trend 
##  lag   stat p.value
##    4 0.0484     0.1
## ----- 
##  Type 2: with drift no trend 
##  lag  stat p.value
##    4 0.138     0.1
## ----- 
##  Type 1: with drift and trend 
##  lag  stat p.value
##    4 0.028     0.1
## ----------- 
## Note: p.value = 0.01 means p.value <= 0.01 
##     : p.value = 0.10 means p.value >= 0.10

The time serie seems to be stationary.

Time Series Decomposition

us_deaths_aj_ts <- ts(us_deaths_aj$deaths, start=c(2015,2),frequency = 52)
us_deaths_aj_ts %>% decompose(type="multiplicative") %>%
  autoplot() + xlab("Year") +
  ggtitle("Classical multiplicative decomposition
    of US total deaths")
## Warning: Removed 104 row(s) containing missing values (geom_path).

us_deaths_aj_ts <- ts(us_deaths_aj$deaths, start=c(2015,2),frequency = 52)
us_deaths_aj_ts %>% decompose(type="additive") %>%
  autoplot() + xlab("Year") +
  ggtitle("Classical additive decomposition
    of US total deaths")
## Warning: Removed 104 row(s) containing missing values (geom_path).

In both decomposition we have an increasing trend that is expected by the coronavirus, and also a seasonal component that shows that deaths are higher at the beginning of the year. ### ACF and PACF

library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:aTSA':
## 
##     forecast
ggtsdisplay(us_deaths_aj_ts)

We see high correlation at the beginning of the serie what is expectedby the coronavirus effect.

Casuality and Invertibility

Stationary Time Series Model

model_1<- Arima(us_deaths_aj$deaths, order = c(3,0,3), method = "ML")
autoplot(model_1)

Non Stationary Time Series Model

model_2 <- Arima(us_deaths_aj$deaths, order = c(3,2,3), method = "ML")
autoplot(model_2)

Stationary Time Series Model First Order Differenced Data

diff <- diff(log(us_deaths_aj$deaths))
model_3 <- Arima(diff, order = c(3,0,3), method = "ML")
autoplot(model_3)

Non Stationary Time Series Model First Order Differenced Data

model_4 <- Arima(diff, order = c(3,2,3), method = "ML")
autoplot(model_4)

These models seems to be no casual and no invertible. Now the next part is the modeling part.

Modeling

ETS and STL models

Basic ETS, STD and NAIVE models

#has_gaps(us_deaths_aj_dt, .full = TRUE)
set.seed(569973)

#c <- pedestrian %>% 
#  fill_gaps(.full = TRUE)

#us_deaths_aj_dt <- us_deaths_aj %>%
#  mutate(Date=yearweek(Date)) %>%
#  as_tsibble(index = Date, interval="week")

#us_deaths_aj_dt_2 <-  as_tsibble(us_deaths_aj_ts,index=Date)

a <- us_deaths_aj_ts %>% as_tsibble()
## Warning: Expected frequency of weekly data: 365.25 / 7 (approx 52.18), not 52.
b <- a %>% mutate(index = yearweek(index))

c <- b %>% 
    fill_gaps(value = mean(value), .full = TRUE)

fit <- c %>%
  model(
    "ETS"=ETS(value),
    "ETS DAMPED" =ETS(value ~ trend("Ad")),
    "NAIVE" = SNAIVE(value),
    "STL" =STL(value),
  )
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
accuracy(fit)
## # A tibble: 4 x 10
##   .model     .type        ME  RMSE   MAE     MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr>     <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS        Training  -14.3 1397.  929. -0.0136  1.54 0.226 0.202 0.115
## 2 ETS DAMPED Training  -14.3 1397.  929. -0.0136  1.54 0.226 0.202 0.115
## 3 NAIVE      Training 2220.  6921. 4106.  3.19    6.27 1     1     0.952
## 4 STL        Training  -16.6 3428. 2145. -0.330   3.43 0.522 0.495 0.920
lambda_d <- c %>% features(value, features = guerrero) %>%
                  pull(lambda_guerrero)
c_trans<- c %>%
  mutate(bc=box_cox(value,lambda_d))

fit2 <- c_trans%>%
  model(
    "STL (BoxCox)" = STL(bc ~ season(window="periodic"),
                         robust=T),
    "ETS (BoxCox)" = ETS(bc)
  )
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
rbind(accuracy(fit),accuracy(fit2))
## # A tibble: 6 x 10
##   .model     .type       ME    RMSE     MAE      MPE    MAPE  MASE RMSSE    ACF1
##   <chr>      <chr>    <dbl>   <dbl>   <dbl>    <dbl>   <dbl> <dbl> <dbl>   <dbl>
## 1 ETS        Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 0.115  
## 2 ETS DAMPED Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 0.115  
## 3 NAIVE      Trai~  2.22e+3 6.92e+3 4.11e+3  3.19e+0 6.27e+0 1     1     0.952  
## 4 STL        Trai~ -1.66e+1 3.43e+3 2.14e+3 -3.30e-1 3.43e+0 0.522 0.495 0.920  
## 5 STL (BoxC~ Trai~  7.18e-7 2.96e-6 1.57e-6  6.46e-5 1.41e-4 0.484 0.592 0.884  
## 6 ETS (BoxC~ Trai~ -1.32e-8 1.11e-6 8.09e-7 -1.19e-6 7.28e-5 0.249 0.223 0.00947

ARIMA models

fit3 <- c %>%
  model("ARIMA (1,1,1)" = ARIMA(value ~ pdq(1,1,1)),
        "ARIMA (2,1,1)" = ARIMA(value ~ pdq(2,1,0)),
        "ARIMA (BEST)"  = ARIMA(value))
fit4 <- c_trans %>%
  model("ARIMA (BOX COX)"  = ARIMA(bc))
## Warning in sqrt(diag(best$var.coef)): Se han producido NaNs
rbind(accuracy(fit3),accuracy(fit4))
## # A tibble: 4 x 10
##   .model    .type       ME    RMSE     MAE     MPE   MAPE   MASE  RMSSE     ACF1
##   <chr>     <chr>    <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>    <dbl>
## 1 ARIMA (1~ Trai~ -1.41e+1 1.37e+3 9.21e+2 -0.0234 1.54    0.224  0.197 -1.54e-2
## 2 ARIMA (2~ Trai~ -1.30e+1 1.36e+3 9.11e+2 -0.0208 1.52    0.222  0.197  1.47e-2
## 3 ARIMA (B~ Trai~ -1.12e+1 1.35e+3 9.01e+2 -0.0218 1.51    0.219  0.194 -7.39e-4
## 4 ARIMA (B~ Trai~  1.54e-4 4.13e-4 1.55e-4  0.0138 0.0139 47.6   82.6    9.80e-1

GARCH models

library(fGarch)
## Warning: package 'fGarch' was built under R version 4.0.5
## Loading required package: timeDate
## Warning: package 'timeDate' was built under R version 4.0.5
## Loading required package: timeSeries
## Warning: package 'timeSeries' was built under R version 4.0.5
## Loading required package: fBasics
## Warning: package 'fBasics' was built under R version 4.0.5
garch11 = garchFit(~ garch(1,1), data = us_deaths_aj_ts, trace = FALSE)
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
##   Consider formula(paste(x, collapse = " ")) instead.
garch11_predict = predict(garch11, n.ahead = 375)
library(forecast)
forecast <- garch11_predict$meanForecast
forecast <- as.data.frame(forecast)
forecast$Date <- us_deaths_aj$Date
forecast_ts <- ts(forecast$forecast, start=c(2015,2),frequency = 52)

accuracy(forecast_ts,us_deaths_aj_ts)
##                ME     RMSE      MAE      MPE     MAPE      ACF1 Theil's U
## Test set 5786.434 9787.081 6461.931 8.620804 9.980362 0.9795714  5.709553
g<-accuracy(forecast_ts,us_deaths_aj_ts)
g<-as.data.frame(g)
g$.model<-"GARCH (1,1)"
g$.type<-"Training"
g$RMSSE <- g$`Theil's U`
g <- subset(g, select=-c(`Theil's U`))
g$MASE <- 0
#tibble
g<-as_tibble(g)

Comparation

Finally the accuracy of the different models is presented, so in the table you can see that the best model based in the metrics is the ETS model with a Box Cox transformation. I think that the transformation is softening the effect of Coronavirus.

rbind(accuracy(fit),accuracy(fit2),accuracy(fit3),accuracy(fit4),g)
## # A tibble: 11 x 10
##    .model .type       ME    RMSE     MAE      MPE    MAPE   MASE  RMSSE     ACF1
##    <chr>  <chr>    <dbl>   <dbl>   <dbl>    <dbl>   <dbl>  <dbl>  <dbl>    <dbl>
##  1 ETS    Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0  0.226  0.202  1.15e-1
##  2 ETS D~ Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0  0.226  0.202  1.15e-1
##  3 NAIVE  Trai~  2.22e+3 6.92e+3 4.11e+3  3.19e+0 6.27e+0  1      1      9.52e-1
##  4 STL    Trai~ -1.66e+1 3.43e+3 2.14e+3 -3.30e-1 3.43e+0  0.522  0.495  9.20e-1
##  5 STL (~ Trai~  7.18e-7 2.96e-6 1.57e-6  6.46e-5 1.41e-4  0.484  0.592  8.84e-1
##  6 ETS (~ Trai~ -1.32e-8 1.11e-6 8.09e-7 -1.19e-6 7.28e-5  0.249  0.223  9.47e-3
##  7 ARIMA~ Trai~ -1.41e+1 1.37e+3 9.21e+2 -2.34e-2 1.54e+0  0.224  0.197 -1.54e-2
##  8 ARIMA~ Trai~ -1.30e+1 1.36e+3 9.11e+2 -2.08e-2 1.52e+0  0.222  0.197  1.47e-2
##  9 ARIMA~ Trai~ -1.12e+1 1.35e+3 9.01e+2 -2.18e-2 1.51e+0  0.219  0.194 -7.39e-4
## 10 ARIMA~ Trai~  1.54e-4 4.13e-4 1.55e-4  1.38e-2 1.39e-2 47.6   82.6    9.80e-1
## 11 GARCH~ Trai~  5.79e+3 9.79e+3 6.46e+3  8.62e+0 9.98e+0  0      5.71   9.80e-1