Here the packages are imported
Read the data. The data that will be used is the Country Deaths data set that contain that amount of deaths per week by country. This data is interesting because is affected by an event that is not going to be forever, that is the Coronavirus.
data <- readr::read_csv("https://www.mortality.org/Public/STMF/Outputs/stmf.csv", skip=1)
## Rows: 112044 Columns: 19
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): CountryCode, Sex
## dbl (17): Year, Week, D0_14, D15_64, D65_74, D75_84, D85p, DTotal, R0_14, R1...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
The purpose of this section is to describe the data. The data selected wasa time serie object that contain deaths by country.
Here is a first view of the data.
data
## # A tibble: 112,044 x 19
## CountryCode Year Week Sex D0_14 D15_64 D65_74 D75_84 D85p DTotal R0_14
## <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AUS2 2015 1 m 5.06 211. 204 398 394 1212 1.14e-4
## 2 AUS2 2015 1 f 6.76 141. 154 323 676 1301 1.60e-4
## 3 AUS2 2015 1 b 11.8 352. 358 721 1070 2513 1.36e-4
## 4 AUS2 2015 2 m 5.67 166. 216 343 399 1130 1.28e-4
## 5 AUS2 2015 2 f 6.98 149. 147 290 646 1239 1.66e-4
## 6 AUS2 2015 2 b 12.7 315. 363 633 1045 2369 1.46e-4
## 7 AUS2 2015 3 m 3.53 169. 220 354 426 1173 7.93e-5
## 8 AUS2 2015 3 f 5.18 136. 159 286 609 1195 1.23e-4
## 9 AUS2 2015 3 b 8.71 305. 379 640 1035 2368 1.01e-4
## 10 AUS2 2015 4 m 3.37 165. 215 368 373 1124 7.58e-5
## # ... with 112,034 more rows, and 8 more variables: R15_64 <dbl>, R65_74 <dbl>,
## # R75_84 <dbl>, R85p <dbl>, RTotal <dbl>, Split <dbl>, SplitSex <dbl>,
## # Forecast <dbl>
deaths <- data %>%
filter(CountryCode %in% c("AUT", "DEUTNP", "USA"))%>%
janitor::clean_names() %>%
dplyr::select(country_code:d_total) %>%
pivot_longer(5:10,
names_to = "age", values_to = "deaths",
names_pattern = "[d_]*([a-z0-9_p]*)"
) %>%
filter(age == "total", sex == "b") %>%
mutate(
country = recode(country_code,
AUT = "Austria",
DEUTNP = "Germany",
ESP = "Spain",
USA = "United States")
) %>%
dplyr::select(year, week, country, deaths)
Here is a data set of just 4 countries previously selected. Austria, Germany, Spain and USA.
deaths
## # A tibble: 2,699 x 4
## year week country deaths
## <dbl> <dbl> <chr> <dbl>
## 1 2000 1 Austria 1867
## 2 2000 2 Austria 1902
## 3 2000 3 Austria 2027
## 4 2000 4 Austria 1940
## 5 2000 5 Austria 1928
## 6 2000 6 Austria 1760
## 7 2000 7 Austria 1666
## 8 2000 8 Austria 1628
## 9 2000 9 Austria 1566
## 10 2000 10 Austria 1524
## # ... with 2,689 more rows
First we see a visualization of some countries.
deaths %>%
mutate(thisyear = (year == 2020)) %>%
ggplot(aes(x=week, y=deaths, group=year)) +
geom_line(aes(col=thisyear)) +
facet_wrap(~ country, scales='free_y') +
scale_color_manual(values=c("FALSE"='gray',"TRUE"='red')) +
guides(col=FALSE) +
ggtitle("Weekly deaths")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Here the data is filtered by country, so the data that will be used is the US deaths. This first steps are a preprocessing of the Dates.
us_deaths <- deaths %>% filter( country=="United States")
library(lubridate)
#date
dates <- make_datetime(year = us_deaths$year) + weeks(us_deaths$week)
head(dates)
## [1] "2015-01-15 UTC" "2015-01-22 UTC" "2015-01-29 UTC" "2015-02-05 UTC"
## [5] "2015-02-12 UTC" "2015-02-19 UTC"
Here is a view of the US data.
us_deaths$dates<-dates
head(us_deaths)
## # A tibble: 6 x 5
## year week country deaths dates
## <dbl> <dbl> <chr> <dbl> <dttm>
## 1 2015 2 United States 61882 2015-01-15 00:00:00
## 2 2015 3 United States 61261 2015-01-22 00:00:00
## 3 2015 4 United States 58738 2015-01-29 00:00:00
## 4 2015 5 United States 57380 2015-02-05 00:00:00
## 5 2015 6 United States 57393 2015-02-12 00:00:00
## 6 2015 7 United States 56535 2015-02-19 00:00:00
us_deaths_aj <- us_deaths %>% dplyr::select(deaths,dates)
head(us_deaths_aj)
## # A tibble: 6 x 2
## deaths dates
## <dbl> <dttm>
## 1 61882 2015-01-15 00:00:00
## 2 61261 2015-01-22 00:00:00
## 3 58738 2015-01-29 00:00:00
## 4 57380 2015-02-05 00:00:00
## 5 57393 2015-02-12 00:00:00
## 6 56535 2015-02-19 00:00:00
Here the data frame object is converted to a tibble.
us_deaths_aj <- as_tibble(us_deaths_aj)
str(us_deaths_aj)
## tibble [375 x 2] (S3: tbl_df/tbl/data.frame)
## $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
## $ dates : POSIXct[1:375], format: "2015-01-15" "2015-01-22" ...
The data format is changed.
us_deaths_aj$Date <- as.Date (us_deaths_aj$dates, formal = "%d/%m/%y")
str(us_deaths_aj)
## tibble [375 x 3] (S3: tbl_df/tbl/data.frame)
## $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
## $ dates : POSIXct[1:375], format: "2015-01-15" "2015-01-22" ...
## $ Date : Date[1:375], format: "2015-01-15" "2015-01-22" ...
us_deaths_aj <- us_deaths_aj %>% dplyr::select(deaths,Date)
str(us_deaths_aj)
## tibble [375 x 2] (S3: tbl_df/tbl/data.frame)
## $ deaths: num [1:375] 61882 61261 58738 57380 57393 ...
## $ Date : Date[1:375], format: "2015-01-15" "2015-01-22" ...
Year <- format(us_deaths_aj$Date, "%y")
Day <- format(us_deaths_aj$Date, "%d")
DT <- us_deaths_aj$deaths
Date <- data.frame(Day, Year, DT)
Here is a plot of the weekly US deaths data.
ggplot(data = us_deaths_aj, aes(x = Date, y = deaths)) + geom_line(colour = "blue", size = 0.5) + labs(title = "Weekly deaths in US", x = "Date", y = "Deaths")
As we expect, the amount of deaths increased since 2020, this because the Coronavirus, that is why this is interesting to analyze this data because we have a event that will no be forever.
Date %>%
ggplot(aes(x = frequency(DT),y = DT)) + geom_boxplot(fill = "Blue") +
facet_wrap( ~ Year) +
labs(y = "Weekly deaths", x = "Year") +
theme_bw(base_size = 10) +
scale_x_discrete(breaks = 0)
Here we have box plot data of each year, and as we expect 20, 21, 22 have more variability.
Date %>%
ggplot(aes(x = Day, y = DT)) + geom_point(color = "blue") +
facet_wrap( ~ Year) +
labs(y = "Weekly deaths", x = "Year") +
theme_bw(base_size = 10) +
scale_x_discrete(breaks = c(0,50,100,150,200,250,300))
Date %>%
ggplot(aes(x = DT), y = frequency(DT)) + geom_density(fill = "lightblue") +
facet_wrap( ~ Year) +
labs(x = "Weekly deaths", y = "Density") +
theme_bw(base_size = 10)
In terms of the density per year, we see that 21 and 22 has more values in the right tail.
Dicky-Fuller Test
library(aTSA)
##
## Attaching package: 'aTSA'
## The following objects are masked from 'package:fabletools':
##
## estimate, forecast
## The following object is masked from 'package:graphics':
##
## identify
adf.test(log(us_deaths_aj$deaths))
## Augmented Dickey-Fuller Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag ADF p.value
## [1,] 0 -0.266 0.567
## [2,] 1 -0.252 0.572
## [3,] 2 -0.142 0.603
## [4,] 3 -0.108 0.613
## [5,] 4 -0.148 0.601
## [6,] 5 -0.119 0.610
## Type 2: with drift no trend
## lag ADF p.value
## [1,] 0 -1.90 0.3680
## [2,] 1 -2.94 0.0433
## [3,] 2 -3.72 0.0100
## [4,] 3 -3.51 0.0100
## [5,] 4 -3.54 0.0100
## [6,] 5 -3.61 0.0100
## Type 3: with drift and trend
## lag ADF p.value
## [1,] 0 -2.79 0.244
## [2,] 1 -4.03 0.010
## [3,] 2 -5.01 0.010
## [4,] 3 -4.80 0.010
## [5,] 4 -5.01 0.010
## [6,] 5 -5.11 0.010
## ----
## Note: in fact, p.value = 0.01 means p.value <= 0.01
Kwiatkowski–Phillips–Schmidt–Shin Test
kpss.test(log(us_deaths_aj$deaths))
## KPSS Unit Root Test
## alternative: nonstationary
##
## Type 1: no drift no trend
## lag stat p.value
## 4 0.0484 0.1
## -----
## Type 2: with drift no trend
## lag stat p.value
## 4 0.138 0.1
## -----
## Type 1: with drift and trend
## lag stat p.value
## 4 0.028 0.1
## -----------
## Note: p.value = 0.01 means p.value <= 0.01
## : p.value = 0.10 means p.value >= 0.10
The time serie seems to be stationary.
us_deaths_aj_ts <- ts(us_deaths_aj$deaths, start=c(2015,2),frequency = 52)
us_deaths_aj_ts %>% decompose(type="multiplicative") %>%
autoplot() + xlab("Year") +
ggtitle("Classical multiplicative decomposition
of US total deaths")
## Warning: Removed 104 row(s) containing missing values (geom_path).
us_deaths_aj_ts <- ts(us_deaths_aj$deaths, start=c(2015,2),frequency = 52)
us_deaths_aj_ts %>% decompose(type="additive") %>%
autoplot() + xlab("Year") +
ggtitle("Classical additive decomposition
of US total deaths")
## Warning: Removed 104 row(s) containing missing values (geom_path).
In both decomposition we have an increasing trend that is expected by the coronavirus, and also a seasonal component that shows that deaths are higher at the beginning of the year. ### ACF and PACF
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
##
## Attaching package: 'forecast'
## The following object is masked from 'package:aTSA':
##
## forecast
ggtsdisplay(us_deaths_aj_ts)
We see high correlation at the beginning of the serie what is expectedby the coronavirus effect.
Stationary Time Series Model
model_1<- Arima(us_deaths_aj$deaths, order = c(3,0,3), method = "ML")
autoplot(model_1)
Non Stationary Time Series Model
model_2 <- Arima(us_deaths_aj$deaths, order = c(3,2,3), method = "ML")
autoplot(model_2)
Stationary Time Series Model First Order Differenced Data
diff <- diff(log(us_deaths_aj$deaths))
model_3 <- Arima(diff, order = c(3,0,3), method = "ML")
autoplot(model_3)
Non Stationary Time Series Model First Order Differenced Data
model_4 <- Arima(diff, order = c(3,2,3), method = "ML")
autoplot(model_4)
These models seems to be no casual and no invertible. Now the next part is the modeling part.
Basic ETS, STD and NAIVE models
#has_gaps(us_deaths_aj_dt, .full = TRUE)
set.seed(569973)
#c <- pedestrian %>%
# fill_gaps(.full = TRUE)
#us_deaths_aj_dt <- us_deaths_aj %>%
# mutate(Date=yearweek(Date)) %>%
# as_tsibble(index = Date, interval="week")
#us_deaths_aj_dt_2 <- as_tsibble(us_deaths_aj_ts,index=Date)
a <- us_deaths_aj_ts %>% as_tsibble()
## Warning: Expected frequency of weekly data: 365.25 / 7 (approx 52.18), not 52.
b <- a %>% mutate(index = yearweek(index))
c <- b %>%
fill_gaps(value = mean(value), .full = TRUE)
fit <- c %>%
model(
"ETS"=ETS(value),
"ETS DAMPED" =ETS(value ~ trend("Ad")),
"NAIVE" = SNAIVE(value),
"STL" =STL(value),
)
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
accuracy(fit)
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS Training -14.3 1397. 929. -0.0136 1.54 0.226 0.202 0.115
## 2 ETS DAMPED Training -14.3 1397. 929. -0.0136 1.54 0.226 0.202 0.115
## 3 NAIVE Training 2220. 6921. 4106. 3.19 6.27 1 1 0.952
## 4 STL Training -16.6 3428. 2145. -0.330 3.43 0.522 0.495 0.920
lambda_d <- c %>% features(value, features = guerrero) %>%
pull(lambda_guerrero)
c_trans<- c %>%
mutate(bc=box_cox(value,lambda_d))
fit2 <- c_trans%>%
model(
"STL (BoxCox)" = STL(bc ~ season(window="periodic"),
robust=T),
"ETS (BoxCox)" = ETS(bc)
)
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
rbind(accuracy(fit),accuracy(fit2))
## # A tibble: 6 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 0.115
## 2 ETS DAMPED Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 0.115
## 3 NAIVE Trai~ 2.22e+3 6.92e+3 4.11e+3 3.19e+0 6.27e+0 1 1 0.952
## 4 STL Trai~ -1.66e+1 3.43e+3 2.14e+3 -3.30e-1 3.43e+0 0.522 0.495 0.920
## 5 STL (BoxC~ Trai~ 7.18e-7 2.96e-6 1.57e-6 6.46e-5 1.41e-4 0.484 0.592 0.884
## 6 ETS (BoxC~ Trai~ -1.32e-8 1.11e-6 8.09e-7 -1.19e-6 7.28e-5 0.249 0.223 0.00947
fit3 <- c %>%
model("ARIMA (1,1,1)" = ARIMA(value ~ pdq(1,1,1)),
"ARIMA (2,1,1)" = ARIMA(value ~ pdq(2,1,0)),
"ARIMA (BEST)" = ARIMA(value))
fit4 <- c_trans %>%
model("ARIMA (BOX COX)" = ARIMA(bc))
## Warning in sqrt(diag(best$var.coef)): Se han producido NaNs
rbind(accuracy(fit3),accuracy(fit4))
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA (1~ Trai~ -1.41e+1 1.37e+3 9.21e+2 -0.0234 1.54 0.224 0.197 -1.54e-2
## 2 ARIMA (2~ Trai~ -1.30e+1 1.36e+3 9.11e+2 -0.0208 1.52 0.222 0.197 1.47e-2
## 3 ARIMA (B~ Trai~ -1.12e+1 1.35e+3 9.01e+2 -0.0218 1.51 0.219 0.194 -7.39e-4
## 4 ARIMA (B~ Trai~ 1.54e-4 4.13e-4 1.55e-4 0.0138 0.0139 47.6 82.6 9.80e-1
library(fGarch)
## Warning: package 'fGarch' was built under R version 4.0.5
## Loading required package: timeDate
## Warning: package 'timeDate' was built under R version 4.0.5
## Loading required package: timeSeries
## Warning: package 'timeSeries' was built under R version 4.0.5
## Loading required package: fBasics
## Warning: package 'fBasics' was built under R version 4.0.5
garch11 = garchFit(~ garch(1,1), data = us_deaths_aj_ts, trace = FALSE)
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
## Consider formula(paste(x, collapse = " ")) instead.
garch11_predict = predict(garch11, n.ahead = 375)
library(forecast)
forecast <- garch11_predict$meanForecast
forecast <- as.data.frame(forecast)
forecast$Date <- us_deaths_aj$Date
forecast_ts <- ts(forecast$forecast, start=c(2015,2),frequency = 52)
accuracy(forecast_ts,us_deaths_aj_ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 5786.434 9787.081 6461.931 8.620804 9.980362 0.9795714 5.709553
g<-accuracy(forecast_ts,us_deaths_aj_ts)
g<-as.data.frame(g)
g$.model<-"GARCH (1,1)"
g$.type<-"Training"
g$RMSSE <- g$`Theil's U`
g <- subset(g, select=-c(`Theil's U`))
g$MASE <- 0
#tibble
g<-as_tibble(g)
Finally the accuracy of the different models is presented, so in the table you can see that the best model based in the metrics is the ETS model with a Box Cox transformation. I think that the transformation is softening the effect of Coronavirus.
rbind(accuracy(fit),accuracy(fit2),accuracy(fit3),accuracy(fit4),g)
## # A tibble: 11 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 1.15e-1
## 2 ETS D~ Trai~ -1.43e+1 1.40e+3 9.29e+2 -1.36e-2 1.54e+0 0.226 0.202 1.15e-1
## 3 NAIVE Trai~ 2.22e+3 6.92e+3 4.11e+3 3.19e+0 6.27e+0 1 1 9.52e-1
## 4 STL Trai~ -1.66e+1 3.43e+3 2.14e+3 -3.30e-1 3.43e+0 0.522 0.495 9.20e-1
## 5 STL (~ Trai~ 7.18e-7 2.96e-6 1.57e-6 6.46e-5 1.41e-4 0.484 0.592 8.84e-1
## 6 ETS (~ Trai~ -1.32e-8 1.11e-6 8.09e-7 -1.19e-6 7.28e-5 0.249 0.223 9.47e-3
## 7 ARIMA~ Trai~ -1.41e+1 1.37e+3 9.21e+2 -2.34e-2 1.54e+0 0.224 0.197 -1.54e-2
## 8 ARIMA~ Trai~ -1.30e+1 1.36e+3 9.11e+2 -2.08e-2 1.52e+0 0.222 0.197 1.47e-2
## 9 ARIMA~ Trai~ -1.12e+1 1.35e+3 9.01e+2 -2.18e-2 1.51e+0 0.219 0.194 -7.39e-4
## 10 ARIMA~ Trai~ 1.54e-4 4.13e-4 1.55e-4 1.38e-2 1.39e-2 47.6 82.6 9.80e-1
## 11 GARCH~ Trai~ 5.79e+3 9.79e+3 6.46e+3 8.62e+0 9.98e+0 0 5.71 9.80e-1