Forecasting Deaths By Covid19 In US (Apr. 9-Apr. 15)

packages load

library(tidyverse)

## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggthemes)
library("readxl")
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

##Chuẩn bị dữ liệu:

dat19 <- read_excel("C:/Users/THIEU/Downloads/nCoV/USACoV19.xlsx")
head(dat19)

## # A tibble: 6 x 4
##      ID Date                Cases Deaths
##   <dbl> <dttm>              <dbl>  <dbl>
## 1     1 2020-03-09 00:00:00   211      4
## 2     2 2020-03-10 00:00:00   290      4
## 3     3 2020-03-11 00:00:00   307      8
## 4     4 2020-03-12 00:00:00   329      3
## 5     5 2020-03-13 00:00:00   553      7
## 6     6 2020-03-14 00:00:00   587      9

dat19 %>%
  select("Date","Deaths") -> datD1
setDT(datD1, keep.rownames = TRUE)[]

##     rn       Date Deaths
##  1:  1 2020-03-09      4
##  2:  2 2020-03-10      4
##  3:  3 2020-03-11      8
##  4:  4 2020-03-12      3
##  5:  5 2020-03-13      7
##  6:  6 2020-03-14      9
##  7:  7 2020-03-15     12
##  8:  8 2020-03-16     18
##  9:  9 2020-03-17     23
## 10: 10 2020-03-18     40
## 11: 11 2020-03-19     56
## 12: 12 2020-03-20     49
## 13: 13 2020-03-21     46
## 14: 14 2020-03-22    113
## 15: 15 2020-03-23    141
## 16: 16 2020-03-24    225
## 17: 17 2020-03-25    247
## 18: 18 2020-03-26    268
## 19: 19 2020-03-27    400
## 20: 20 2020-03-28    525
## 21: 21 2020-03-29    363
## 22: 22 2020-03-30    558
## 23: 23 2020-03-31    912
## 24: 24 2020-04-01   1049
## 25: 25 2020-04-02    968
## 26: 26 2020-04-03   1321
## 27: 27 2020-04-04   1330
## 28: 28 2020-04-05   1165
## 29: 29 2020-04-06   1255
## 30: 30 2020-04-07   1970
##     rn       Date Deaths

as.numeric(datD1$Date)

##  [1] 1583712000 1583798400 1583884800 1583971200 1584057600 1584144000
##  [7] 1584230400 1584316800 1584403200 1584489600 1584576000 1584662400
## [13] 1584748800 1584835200 1584921600 1585008000 1585094400 1585180800
## [19] 1585267200 1585353600 1585440000 1585526400 1585612800 1585699200
## [25] 1585785600 1585872000 1585958400 1586044800 1586131200 1586217600

##Vẽ biểu đồ Deaths bằng ggplot2:

p <- ggplot(datD1, aes(x= Date, y = Deaths))
p + geom_line(group=1,colour = "blue") +
  theme(axis.text.x = element_text(color="#993333", size=7, angle=0)) +
   scale_y_continuous("Deaths") +
  ggtitle("USA Daily Deaths By Covid19 - (Mar.,09 - Apr.,8) ")

##Sử dụng hàm auto.arima() của package forecast để tìm thông số cho time series:

df <- (datD1[,3])
colnames(df) <- NULL
df <- unlist(c(df))
covseries <- ts(df, start = c(1))
auto.arima(covseries)

## Series: covseries 
## ARIMA(2,2,1) 
## 
## Coefficients:
##           ar1      ar2      ma1
##       -0.2149  -0.7772  -0.7175
## s.e.   0.2117   0.1571   0.1283
## 
## sigma^2 estimated as 19873:  log likelihood=-178.44
## AIC=364.88   AICc=366.62   BIC=370.21

##Tạo forecast data khi biết thông số ARIMA(2,2,1):

covseriesarima <- arima(covseries, order=c(2,2,1))
covseriesforecasts <- forecast(covseriesarima, h=7)
covseriesforecasts

##    Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 31       2024.027 1853.315 2194.739 1762.945 2285.108
## 32       1734.299 1484.573 1984.024 1352.376 2116.221
## 33       2032.177 1764.874 2299.480 1623.372 2440.981
## 34       2470.976 2162.851 2779.101 1999.739 2942.213
## 35       2422.778 2021.785 2823.771 1809.512 3036.044
## 36       2369.693 1903.585 2835.801 1656.842 3082.544
## 37       2696.176 2185.109 3207.244 1914.566 3477.786

##Lấy data từ forecast() để kết hợp với dữ liệu đã có để vẽ biểu đồ trong ggplot2:

df1 <- as.data.frame(covseriesforecasts)
colnames(df1) <- c("Deaths","Lo80", "Hi80", "Lo95","Hi95")
Date <- seq(as.Date("2020/4/8"), by="day", length.out = 7)
df1$Date <- Date
df2 <- rbind.fill(datD1, df1)
tail(df2,15)

##      rn       Date   Deaths     Lo80     Hi80     Lo95     Hi95
## 23   23 2020-03-31  912.000       NA       NA       NA       NA
## 24   24 2020-04-01 1049.000       NA       NA       NA       NA
## 25   25 2020-04-02  968.000       NA       NA       NA       NA
## 26   26 2020-04-03 1321.000       NA       NA       NA       NA
## 27   27 2020-04-04 1330.000       NA       NA       NA       NA
## 28   28 2020-04-05 1165.000       NA       NA       NA       NA
## 29   29 2020-04-06 1255.000       NA       NA       NA       NA
## 30   30 2020-04-07 1970.000       NA       NA       NA       NA
## 31 <NA> 2020-04-08 2024.027 1853.315 2194.739 1762.945 2285.108
## 32 <NA> 2020-04-09 1734.299 1484.573 1984.024 1352.376 2116.221
## 33 <NA> 2020-04-10 2032.177 1764.874 2299.480 1623.372 2440.981
## 34 <NA> 2020-04-11 2470.976 2162.851 2779.101 1999.739 2942.213
## 35 <NA> 2020-04-12 2422.778 2021.785 2823.771 1809.512 3036.044
## 36 <NA> 2020-04-13 2369.693 1903.585 2835.801 1656.842 3082.544
## 37 <NA> 2020-04-14 2696.176 2185.109 3207.244 1914.566 3477.786

##Vẽ biểu đồ forecast bằng ggplot2:

p1a <-ggplot(data=df2,aes(x=Date,y=Deaths))+  
  labs(x ="Date", y = "Deaths", title="Deaths By Covid19 Forecast \n from April 8th - April 14th") +
  geom_line(aes(y=Deaths))+
  geom_ribbon(aes(ymin=Lo95,ymax=Hi95),fill='green',alpha=.25)+
  geom_ribbon(aes(ymin=Lo80,ymax=Hi80),fill='red',alpha=.25)
p1a

##Kết luận

Tuy biểu đồ forecast chưa thật đẹp mắt nhưng đã diễn tả được những thông tin cần thiết. Cám ơn các bạn đã theo dõi.

Forecasting Deaths By Covid19 In US (Apr. 9-Apr. 15)

Nguyen Ngoc Thieu

4/9/2020

packages load