Assignment: Homework Time Series

Data Description

The purpose of this section is to describe the data. The data selected wasa time serie object that contain deaths by country.

Here is a first view of the data.

data

## # A tibble: 112,044 x 19
##    CountryCode  Year  Week Sex   D0_14 D15_64 D65_74 D75_84  D85p DTotal   R0_14
##    <chr>       <dbl> <dbl> <chr> <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>   <dbl>
##  1 AUS2         2015     1 m      5.06   211.    204    398   394   1212 1.14e-4
##  2 AUS2         2015     1 f      6.76   141.    154    323   676   1301 1.60e-4
##  3 AUS2         2015     1 b     11.8    352.    358    721  1070   2513 1.36e-4
##  4 AUS2         2015     2 m      5.67   166.    216    343   399   1130 1.28e-4
##  5 AUS2         2015     2 f      6.98   149.    147    290   646   1239 1.66e-4
##  6 AUS2         2015     2 b     12.7    315.    363    633  1045   2369 1.46e-4
##  7 AUS2         2015     3 m      3.53   169.    220    354   426   1173 7.93e-5
##  8 AUS2         2015     3 f      5.18   136.    159    286   609   1195 1.23e-4
##  9 AUS2         2015     3 b      8.71   305.    379    640  1035   2368 1.01e-4
## 10 AUS2         2015     4 m      3.37   165.    215    368   373   1124 7.58e-5
## # ... with 112,034 more rows, and 8 more variables: R15_64 <dbl>, R65_74 <dbl>,
## #   R75_84 <dbl>, R85p <dbl>, RTotal <dbl>, Split <dbl>, SplitSex <dbl>,
## #   Forecast <dbl>

Data visualization

deaths <- data %>%
  filter(CountryCode %in% c("AUT", "DEUTNP", "USA"))%>%
  janitor::clean_names() %>%
  dplyr::select(country_code:d_total) %>%
  pivot_longer(5:10,
    names_to = "age", values_to = "deaths",
    names_pattern = "[d_]*([a-z0-9_p]*)"
  ) %>%
  filter(age == "total", sex == "b") %>%
  mutate(
    country = recode(country_code,
      AUT = "Austria",
      DEUTNP = "Germany",
      ESP = "Spain",
      USA = "United States")
  ) %>%
  dplyr::select(year, week, country, deaths)

Here is a data set of just 4 countries previously selected. Austria, Germany, Spain and USA.

deaths

## # A tibble: 2,699 x 4
##     year  week country deaths
##    <dbl> <dbl> <chr>    <dbl>
##  1  2000     1 Austria   1867
##  2  2000     2 Austria   1902
##  3  2000     3 Austria   2027
##  4  2000     4 Austria   1940
##  5  2000     5 Austria   1928
##  6  2000     6 Austria   1760
##  7  2000     7 Austria   1666
##  8  2000     8 Austria   1628
##  9  2000     9 Austria   1566
## 10  2000    10 Austria   1524
## # ... with 2,689 more rows

First we see a visualization of some countries.

deaths %>%
  mutate(thisyear = (year == 2020)) %>%
  ggplot(aes(x=week, y=deaths, group=year)) +
    geom_line(aes(col=thisyear)) +
    facet_wrap(~ country, scales='free_y') +
    scale_color_manual(values=c("FALSE"='gray',"TRUE"='red')) +
    guides(col=FALSE) +
    ggtitle("Weekly deaths")

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Here the data is filtered by country, so the data that will be used is the US deaths. This first steps are a preprocessing of the Dates.

us_deaths <- deaths %>% filter( country=="United States")
library(lubridate)

#date
dates <- make_datetime(year = us_deaths$year) + weeks(us_deaths$week)
head(dates)

## [1] "2015-01-15 UTC" "2015-01-22 UTC" "2015-01-29 UTC" "2015-02-05 UTC"
## [5] "2015-02-12 UTC" "2015-02-19 UTC"

Here is a view of the US data.

us_deaths$dates<-dates
head(us_deaths)

## # A tibble: 6 x 5
##    year  week country       deaths dates              
##   <dbl> <dbl> <chr>          <dbl> <dttm>             
## 1  2015     2 United States  61882 2015-01-15 00:00:00
## 2  2015     3 United States  61261 2015-01-22 00:00:00
## 3  2015     4 United States  58738 2015-01-29 00:00:00
## 4  2015     5 United States  57380 2015-02-05 00:00:00
## 5  2015     6 United States  57393 2015-02-12 00:00:00
## 6  2015     7 United States  56535 2015-02-19 00:00:00

us_deaths_aj <- us_deaths %>% dplyr::select(deaths,dates)
head(us_deaths_aj)

## # A tibble: 6 x 2
##   deaths dates              
##    <dbl> <dttm>             
## 1  61882 2015-01-15 00:00:00
## 2  61261 2015-01-22 00:00:00
## 3  58738 2015-01-29 00:00:00
## 4  57380 2015-02-05 00:00:00
## 5  57393 2015-02-12 00:00:00
## 6  56535 2015-02-19 00:00:00

Here the data frame object is converted to a tibble.

us_deaths_aj <- as_tibble(us_deaths_aj)
str(us_deaths_aj)

## tibble [375 x 2] (S3: tbl_df/tbl/data.frame)
##  $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
##  $ dates : POSIXct[1:375], format: "2015-01-15" "2015-01-22" ...

The data format is changed.

us_deaths_aj$Date <- as.Date (us_deaths_aj$dates, formal = "%d/%m/%y")
str(us_deaths_aj)

## tibble [375 x 3] (S3: tbl_df/tbl/data.frame)
##  $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
##  $ dates : POSIXct[1:375], format: "2015-01-15" "2015-01-22" ...
##  $ Date  : Date[1:375], format: "2015-01-15" "2015-01-22" ...

us_deaths_aj <- us_deaths_aj %>% dplyr::select(deaths,Date)
str(us_deaths_aj)

## tibble [375 x 2] (S3: tbl_df/tbl/data.frame)
##  $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
##  $ Date  : Date[1:375], format: "2015-01-15" "2015-01-22" ...

Year <- format(us_deaths_aj$Date, "%y")
Day <- format(us_deaths_aj$Date, "%d")
DT <- us_deaths_aj$deaths
Date <- data.frame(Day, Year, DT)

Here is a plot of the weekly US deaths data.

ggplot(data = us_deaths_aj, aes(x = Date, y = deaths)) + geom_line(colour = "blue", size = 0.5) + labs(title = "Weekly deaths in US", x = "Date", y = "Deaths")

As we expect, the amount of deaths increased since 2020, this because the Coronavirus, that is why this is interesting to analyze this data because we have a event that will no be forever.

Date %>%
ggplot(aes(x = frequency(DT),y = DT)) + geom_boxplot(fill = "Blue") + 
    facet_wrap( ~ Year) + 
    labs(y = "Weekly deaths", x = "Year") + 
    theme_bw(base_size = 10) +
    scale_x_discrete(breaks = 0)

Here we have box plot data of each year, and as we expect 20, 21, 22 have more variability.

Date %>%
ggplot(aes(x = Day, y = DT)) + geom_point(color = "blue") + 
    facet_wrap( ~ Year) + 
    labs(y = "Weekly deaths", x = "Year") + 
    theme_bw(base_size = 10) +
    scale_x_discrete(breaks = c(0,50,100,150,200,250,300))

Date %>%
ggplot(aes(x = DT), y = frequency(DT)) + geom_density(fill = "lightblue") + 
    facet_wrap( ~ Year) + 
    labs(x = "Weekly deaths", y = "Density") + 
    theme_bw(base_size = 10)

In terms of the density per year, we see that 21 and 22 has more values in the right tail.

Test Stationarity

Dicky-Fuller Test

library(aTSA)

## 
## Attaching package: 'aTSA'

## The following objects are masked from 'package:fabletools':
## 
##     estimate, forecast

## The following object is masked from 'package:graphics':
## 
##     identify

adf.test(log(us_deaths_aj$deaths))

## Augmented Dickey-Fuller Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##      lag    ADF p.value
## [1,]   0 -0.266   0.567
## [2,]   1 -0.252   0.572
## [3,]   2 -0.142   0.603
## [4,]   3 -0.108   0.613
## [5,]   4 -0.148   0.601
## [6,]   5 -0.119   0.610
## Type 2: with drift no trend 
##      lag   ADF p.value
## [1,]   0 -1.90  0.3680
## [2,]   1 -2.94  0.0433
## [3,]   2 -3.72  0.0100
## [4,]   3 -3.51  0.0100
## [5,]   4 -3.54  0.0100
## [6,]   5 -3.61  0.0100
## Type 3: with drift and trend 
##      lag   ADF p.value
## [1,]   0 -2.79   0.244
## [2,]   1 -4.03   0.010
## [3,]   2 -5.01   0.010
## [4,]   3 -4.80   0.010
## [5,]   4 -5.01   0.010
## [6,]   5 -5.11   0.010
## ---- 
## Note: in fact, p.value = 0.01 means p.value <= 0.01

Kwiatkowski–Phillips–Schmidt–Shin Test

kpss.test(log(us_deaths_aj$deaths))

## KPSS Unit Root Test 
## alternative: nonstationary 
##  
## Type 1: no drift no trend 
##  lag   stat p.value
##    4 0.0484     0.1
## ----- 
##  Type 2: with drift no trend 
##  lag  stat p.value
##    4 0.138     0.1
## ----- 
##  Type 1: with drift and trend 
##  lag  stat p.value
##    4 0.028     0.1
## ----------- 
## Note: p.value = 0.01 means p.value <= 0.01 
##     : p.value = 0.10 means p.value >= 0.10

The time serie seems to be stationary.

Time Series Decomposition

us_deaths_aj_ts <- ts(us_deaths_aj$deaths, start=c(2015,2),frequency = 52)
us_deaths_aj_ts %>% decompose(type="multiplicative") %>%
  autoplot() + xlab("Year") +
  ggtitle("Classical multiplicative decomposition
    of US total deaths")

## Warning: Removed 104 row(s) containing missing values (geom_path).

us_deaths_aj_ts <- ts(us_deaths_aj$deaths, start=c(2015,2),frequency = 52)
us_deaths_aj_ts %>% decompose(type="additive") %>%
  autoplot() + xlab("Year") +
  ggtitle("Classical additive decomposition
    of US total deaths")

## Warning: Removed 104 row(s) containing missing values (geom_path).

In both decomposition we have an increasing trend that is expected by the coronavirus, and also a seasonal component that shows that deaths are higher at the beginning of the year. ### ACF and PACF

library(forecast)

## Warning: package 'forecast' was built under R version 4.0.5

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify

## 
## Attaching package: 'forecast'

## The following object is masked from 'package:aTSA':
## 
##     forecast

ggtsdisplay(us_deaths_aj_ts)

We see high correlation at the beginning of the serie what is expectedby the coronavirus effect.

Casuality and Invertibility

Stationary Time Series Model

model_1<- Arima(us_deaths_aj$deaths, order = c(3,0,3), method = "ML")
autoplot(model_1)

Non Stationary Time Series Model

model_2 <- Arima(us_deaths_aj$deaths, order = c(3,2,3), method = "ML")
autoplot(model_2)

Stationary Time Series Model First Order Differenced Data

diff <- diff(log(us_deaths_aj$deaths))
model_3 <- Arima(diff, order = c(3,0,3), method = "ML")
autoplot(model_3)

Non Stationary Time Series Model First Order Differenced Data

model_4 <- Arima(diff, order = c(3,2,3), method = "ML")
autoplot(model_4)

These models seems to be no casual and no invertible. Now the next part is the modeling part.

Modeling

ETS and STL models

Basic ETS, STD and NAIVE models

#has_gaps(us_deaths_aj_dt, .full = TRUE)
set.seed(569973)

#c <- pedestrian %>% 
#  fill_gaps(.full = TRUE)

#us_deaths_aj_dt <- us_deaths_aj %>%
#  mutate(Date=yearweek(Date)) %>%
#  as_tsibble(index = Date, interval="week")

#us_deaths_aj_dt_2 <-  as_tsibble(us_deaths_aj_ts,index=Date)

a <- us_deaths_aj_ts %>% as_tsibble()

## Warning: Expected frequency of weekly data: 365.25 / 7 (approx 52.18), not 52.

b <- a %>% mutate(index = yearweek(index))

c <- b %>% 
    fill_gaps(value = mean(value), .full = TRUE)

fit <- c %>%
  model(
    "ETS"=ETS(value),
    "ETS DAMPED" =ETS(value ~ trend("Ad")),
    "NAIVE" = SNAIVE(value),
    "STL" =STL(value),
  )

## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.

## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.

accuracy(fit)

## # A tibble: 4 x 10
##   .model     .type        ME  RMSE   MAE     MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr>     <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS        Training  -14.3 1397.  929. -0.0136  1.54 0.226 0.202 0.115
## 2 ETS DAMPED Training  -14.3 1397.  929. -0.0136  1.54 0.226 0.202 0.115
## 3 NAIVE      Training 2220.  6921. 4106.  3.19    6.27 1     1     0.952
## 4 STL        Training  -16.6 3428. 2145. -0.330   3.43 0.522 0.495 0.920

lambda_d <- c %>% features(value, features = guerrero) %>%
                  pull(lambda_guerrero)
c_trans<- c %>%
  mutate(bc=box_cox(value,lambda_d))

fit2 <- c_trans%>%
  model(
    "STL (BoxCox)" = STL(bc ~ season(window="periodic"),
                         robust=T),
    "ETS (BoxCox)" = ETS(bc)
  )

## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.

rbind(accuracy(fit),accuracy(fit2))

## # A tibble: 6 x 10
##   .model     .type       ME    RMSE     MAE      MPE    MAPE  MASE RMSSE    ACF1
##   <chr>      <chr>    <dbl>   <dbl>   <dbl>    <dbl>   <dbl> <dbl> <dbl>   <dbl>
## 1 ETS        Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 0.115  
## 2 ETS DAMPED Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 0.115  
## 3 NAIVE      Trai~  2.22e+3 6.92e+3 4.11e+3  3.19e+0 6.27e+0 1     1     0.952  
## 4 STL        Trai~ -1.66e+1 3.43e+3 2.14e+3 -3.30e-1 3.43e+0 0.522 0.495 0.920  
## 5 STL (BoxC~ Trai~  7.18e-7 2.96e-6 1.57e-6  6.46e-5 1.41e-4 0.484 0.592 0.884  
## 6 ETS (BoxC~ Trai~ -1.32e-8 1.11e-6 8.09e-7 -1.19e-6 7.28e-5 0.249 0.223 0.00947

ARIMA models

fit3 <- c %>%
  model("ARIMA (1,1,1)" = ARIMA(value ~ pdq(1,1,1)),
        "ARIMA (2,1,1)" = ARIMA(value ~ pdq(2,1,0)),
        "ARIMA (BEST)"  = ARIMA(value))

fit4 <- c_trans %>%
  model("ARIMA (BOX COX)"  = ARIMA(bc))

## Warning in sqrt(diag(best$var.coef)): Se han producido NaNs

rbind(accuracy(fit3),accuracy(fit4))

## # A tibble: 4 x 10
##   .model    .type       ME    RMSE     MAE     MPE   MAPE   MASE  RMSSE     ACF1
##   <chr>     <chr>    <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>    <dbl>
## 1 ARIMA (1~ Trai~ -1.41e+1 1.37e+3 9.21e+2 -0.0234 1.54    0.224  0.197 -1.54e-2
## 2 ARIMA (2~ Trai~ -1.30e+1 1.36e+3 9.11e+2 -0.0208 1.52    0.222  0.197  1.47e-2
## 3 ARIMA (B~ Trai~ -1.12e+1 1.35e+3 9.01e+2 -0.0218 1.51    0.219  0.194 -7.39e-4
## 4 ARIMA (B~ Trai~  1.54e-4 4.13e-4 1.55e-4  0.0138 0.0139 47.6   82.6    9.80e-1

GARCH models

library(fGarch)

## Warning: package 'fGarch' was built under R version 4.0.5

## Loading required package: timeDate

## Warning: package 'timeDate' was built under R version 4.0.5

## Loading required package: timeSeries

## Warning: package 'timeSeries' was built under R version 4.0.5

## Loading required package: fBasics

## Warning: package 'fBasics' was built under R version 4.0.5

garch11 = garchFit(~ garch(1,1), data = us_deaths_aj_ts, trace = FALSE)

## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
##   Consider formula(paste(x, collapse = " ")) instead.

garch11_predict = predict(garch11, n.ahead = 375)

library(forecast)
forecast <- garch11_predict$meanForecast
forecast <- as.data.frame(forecast)
forecast$Date <- us_deaths_aj$Date
forecast_ts <- ts(forecast$forecast, start=c(2015,2),frequency = 52)

accuracy(forecast_ts,us_deaths_aj_ts)

##                ME     RMSE      MAE      MPE     MAPE      ACF1 Theil's U
## Test set 5786.434 9787.081 6461.931 8.620804 9.980362 0.9795714  5.709553

g<-accuracy(forecast_ts,us_deaths_aj_ts)
g<-as.data.frame(g)
g$.model<-"GARCH (1,1)"
g$.type<-"Training"
g$RMSSE <- g$`Theil's U`
g <- subset(g, select=-c(`Theil's U`))
g$MASE <- 0
#tibble
g<-as_tibble(g)

Comparation

Finally the accuracy of the different models is presented, so in the table you can see that the best model based in the metrics is the ETS model with a Box Cox transformation. I think that the transformation is softening the effect of Coronavirus.

rbind(accuracy(fit),accuracy(fit2),accuracy(fit3),accuracy(fit4),g)

## # A tibble: 11 x 10
##    .model .type       ME    RMSE     MAE      MPE    MAPE   MASE  RMSSE     ACF1
##    <chr>  <chr>    <dbl>   <dbl>   <dbl>    <dbl>   <dbl>  <dbl>  <dbl>    <dbl>
##  1 ETS    Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0  0.226  0.202  1.15e-1
##  2 ETS D~ Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0  0.226  0.202  1.15e-1
##  3 NAIVE  Trai~  2.22e+3 6.92e+3 4.11e+3  3.19e+0 6.27e+0  1      1      9.52e-1
##  4 STL    Trai~ -1.66e+1 3.43e+3 2.14e+3 -3.30e-1 3.43e+0  0.522  0.495  9.20e-1
##  5 STL (~ Trai~  7.18e-7 2.96e-6 1.57e-6  6.46e-5 1.41e-4  0.484  0.592  8.84e-1
##  6 ETS (~ Trai~ -1.32e-8 1.11e-6 8.09e-7 -1.19e-6 7.28e-5  0.249  0.223  9.47e-3
##  7 ARIMA~ Trai~ -1.41e+1 1.37e+3 9.21e+2 -2.34e-2 1.54e+0  0.224  0.197 -1.54e-2
##  8 ARIMA~ Trai~ -1.30e+1 1.36e+3 9.11e+2 -2.08e-2 1.52e+0  0.222  0.197  1.47e-2
##  9 ARIMA~ Trai~ -1.12e+1 1.35e+3 9.01e+2 -2.18e-2 1.51e+0  0.219  0.194 -7.39e-4
## 10 ARIMA~ Trai~  1.54e-4 4.13e-4 1.55e-4  1.38e-2 1.39e-2 47.6   82.6    9.80e-1
## 11 GARCH~ Trai~  5.79e+3 9.79e+3 6.46e+3  8.62e+0 9.98e+0  0      5.71   9.80e-1