packages load

library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library("readxl")
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact

##Chuẩn bị dữ liệu:

dat19 <- read_excel("C:/Users/THIEU/Downloads/nCoV/USACoV19.xlsx")
head(dat19)
## # A tibble: 6 x 4
##      ID Date                Cases Deaths
##   <dbl> <dttm>              <dbl>  <dbl>
## 1     1 2020-03-09 00:00:00   211      4
## 2     2 2020-03-10 00:00:00   290      4
## 3     3 2020-03-11 00:00:00   307      8
## 4     4 2020-03-12 00:00:00   329      3
## 5     5 2020-03-13 00:00:00   553      7
## 6     6 2020-03-14 00:00:00   587      9
dat19 %>%
  select("Date","Deaths") -> datD1
setDT(datD1, keep.rownames = TRUE)[]
##     rn       Date Deaths
##  1:  1 2020-03-09      4
##  2:  2 2020-03-10      4
##  3:  3 2020-03-11      8
##  4:  4 2020-03-12      3
##  5:  5 2020-03-13      7
##  6:  6 2020-03-14      9
##  7:  7 2020-03-15     12
##  8:  8 2020-03-16     18
##  9:  9 2020-03-17     23
## 10: 10 2020-03-18     40
## 11: 11 2020-03-19     56
## 12: 12 2020-03-20     49
## 13: 13 2020-03-21     46
## 14: 14 2020-03-22    113
## 15: 15 2020-03-23    141
## 16: 16 2020-03-24    225
## 17: 17 2020-03-25    247
## 18: 18 2020-03-26    268
## 19: 19 2020-03-27    400
## 20: 20 2020-03-28    525
## 21: 21 2020-03-29    363
## 22: 22 2020-03-30    558
## 23: 23 2020-03-31    912
## 24: 24 2020-04-01   1049
## 25: 25 2020-04-02    968
## 26: 26 2020-04-03   1321
## 27: 27 2020-04-04   1330
## 28: 28 2020-04-05   1165
## 29: 29 2020-04-06   1255
## 30: 30 2020-04-07   1970
##     rn       Date Deaths
as.numeric(datD1$Date)
##  [1] 1583712000 1583798400 1583884800 1583971200 1584057600 1584144000
##  [7] 1584230400 1584316800 1584403200 1584489600 1584576000 1584662400
## [13] 1584748800 1584835200 1584921600 1585008000 1585094400 1585180800
## [19] 1585267200 1585353600 1585440000 1585526400 1585612800 1585699200
## [25] 1585785600 1585872000 1585958400 1586044800 1586131200 1586217600

##Vẽ biểu đồ Deaths bằng ggplot2:

p <- ggplot(datD1, aes(x= Date, y = Deaths))
p + geom_line(group=1,colour = "blue") +
  theme(axis.text.x = element_text(color="#993333", size=7, angle=0)) +
   scale_y_continuous("Deaths") +
  ggtitle("USA Daily Deaths By Covid19 - (Mar.,09 - Apr.,8) ") 

##Sử dụng hàm auto.arima() của package forecast để tìm thông số cho time series:

df <- (datD1[,3])
colnames(df) <- NULL
df <- unlist(c(df))
covseries <- ts(df, start = c(1))
auto.arima(covseries)
## Series: covseries 
## ARIMA(2,2,1) 
## 
## Coefficients:
##           ar1      ar2      ma1
##       -0.2149  -0.7772  -0.7175
## s.e.   0.2117   0.1571   0.1283
## 
## sigma^2 estimated as 19873:  log likelihood=-178.44
## AIC=364.88   AICc=366.62   BIC=370.21

##Tạo forecast data khi biết thông số ARIMA(2,2,1):

covseriesarima <- arima(covseries, order=c(2,2,1))
covseriesforecasts <- forecast(covseriesarima, h=7)
covseriesforecasts
##    Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 31       2024.027 1853.315 2194.739 1762.945 2285.108
## 32       1734.299 1484.573 1984.024 1352.376 2116.221
## 33       2032.177 1764.874 2299.480 1623.372 2440.981
## 34       2470.976 2162.851 2779.101 1999.739 2942.213
## 35       2422.778 2021.785 2823.771 1809.512 3036.044
## 36       2369.693 1903.585 2835.801 1656.842 3082.544
## 37       2696.176 2185.109 3207.244 1914.566 3477.786

##Lấy data từ forecast() để kết hợp với dữ liệu đã có để vẽ biểu đồ trong ggplot2:

df1 <- as.data.frame(covseriesforecasts)
colnames(df1) <- c("Deaths","Lo80", "Hi80", "Lo95","Hi95")
Date <- seq(as.Date("2020/4/8"), by="day", length.out = 7)
df1$Date <- Date
df2 <- rbind.fill(datD1, df1)
tail(df2,15)
##      rn       Date   Deaths     Lo80     Hi80     Lo95     Hi95
## 23   23 2020-03-31  912.000       NA       NA       NA       NA
## 24   24 2020-04-01 1049.000       NA       NA       NA       NA
## 25   25 2020-04-02  968.000       NA       NA       NA       NA
## 26   26 2020-04-03 1321.000       NA       NA       NA       NA
## 27   27 2020-04-04 1330.000       NA       NA       NA       NA
## 28   28 2020-04-05 1165.000       NA       NA       NA       NA
## 29   29 2020-04-06 1255.000       NA       NA       NA       NA
## 30   30 2020-04-07 1970.000       NA       NA       NA       NA
## 31 <NA> 2020-04-08 2024.027 1853.315 2194.739 1762.945 2285.108
## 32 <NA> 2020-04-09 1734.299 1484.573 1984.024 1352.376 2116.221
## 33 <NA> 2020-04-10 2032.177 1764.874 2299.480 1623.372 2440.981
## 34 <NA> 2020-04-11 2470.976 2162.851 2779.101 1999.739 2942.213
## 35 <NA> 2020-04-12 2422.778 2021.785 2823.771 1809.512 3036.044
## 36 <NA> 2020-04-13 2369.693 1903.585 2835.801 1656.842 3082.544
## 37 <NA> 2020-04-14 2696.176 2185.109 3207.244 1914.566 3477.786

##Vẽ biểu đồ forecast bằng ggplot2:

p1a <-ggplot(data=df2,aes(x=Date,y=Deaths))+  
  labs(x ="Date", y = "Deaths", title="Deaths By Covid19 Forecast \n from April 8th - April 14th") +
  geom_line(aes(y=Deaths))+
  geom_ribbon(aes(ymin=Lo95,ymax=Hi95),fill='green',alpha=.25)+
  geom_ribbon(aes(ymin=Lo80,ymax=Hi80),fill='red',alpha=.25)
p1a

##Kết luận

Tuy biểu đồ forecast chưa thật đẹp mắt nhưng đã diễn tả được những thông tin cần thiết. Cám ơn các bạn đã theo dõi.