library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library("readxl")
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
##Chuẩn bị dữ liệu:
dat19 <- read_excel("C:/Users/THIEU/Downloads/nCoV/USACoV19.xlsx")
head(dat19)
## # A tibble: 6 x 4
## ID Date Cases Deaths
## <dbl> <dttm> <dbl> <dbl>
## 1 1 2020-03-09 00:00:00 211 4
## 2 2 2020-03-10 00:00:00 290 4
## 3 3 2020-03-11 00:00:00 307 8
## 4 4 2020-03-12 00:00:00 329 3
## 5 5 2020-03-13 00:00:00 553 7
## 6 6 2020-03-14 00:00:00 587 9
dat19 %>%
select("Date","Deaths") -> datD1
setDT(datD1, keep.rownames = TRUE)[]
## rn Date Deaths
## 1: 1 2020-03-09 4
## 2: 2 2020-03-10 4
## 3: 3 2020-03-11 8
## 4: 4 2020-03-12 3
## 5: 5 2020-03-13 7
## 6: 6 2020-03-14 9
## 7: 7 2020-03-15 12
## 8: 8 2020-03-16 18
## 9: 9 2020-03-17 23
## 10: 10 2020-03-18 40
## 11: 11 2020-03-19 56
## 12: 12 2020-03-20 49
## 13: 13 2020-03-21 46
## 14: 14 2020-03-22 113
## 15: 15 2020-03-23 141
## 16: 16 2020-03-24 225
## 17: 17 2020-03-25 247
## 18: 18 2020-03-26 268
## 19: 19 2020-03-27 400
## 20: 20 2020-03-28 525
## 21: 21 2020-03-29 363
## 22: 22 2020-03-30 558
## 23: 23 2020-03-31 912
## 24: 24 2020-04-01 1049
## 25: 25 2020-04-02 968
## 26: 26 2020-04-03 1321
## 27: 27 2020-04-04 1330
## 28: 28 2020-04-05 1165
## 29: 29 2020-04-06 1255
## 30: 30 2020-04-07 1970
## rn Date Deaths
as.numeric(datD1$Date)
## [1] 1583712000 1583798400 1583884800 1583971200 1584057600 1584144000
## [7] 1584230400 1584316800 1584403200 1584489600 1584576000 1584662400
## [13] 1584748800 1584835200 1584921600 1585008000 1585094400 1585180800
## [19] 1585267200 1585353600 1585440000 1585526400 1585612800 1585699200
## [25] 1585785600 1585872000 1585958400 1586044800 1586131200 1586217600
##Vẽ biểu đồ Deaths bằng ggplot2:
p <- ggplot(datD1, aes(x= Date, y = Deaths))
p + geom_line(group=1,colour = "blue") +
theme(axis.text.x = element_text(color="#993333", size=7, angle=0)) +
scale_y_continuous("Deaths") +
ggtitle("USA Daily Deaths By Covid19 - (Mar.,09 - Apr.,8) ")
##Sử dụng hàm auto.arima() của package forecast để tìm thông số cho time series:
df <- (datD1[,3])
colnames(df) <- NULL
df <- unlist(c(df))
covseries <- ts(df, start = c(1))
auto.arima(covseries)
## Series: covseries
## ARIMA(2,2,1)
##
## Coefficients:
## ar1 ar2 ma1
## -0.2149 -0.7772 -0.7175
## s.e. 0.2117 0.1571 0.1283
##
## sigma^2 estimated as 19873: log likelihood=-178.44
## AIC=364.88 AICc=366.62 BIC=370.21
##Tạo forecast data khi biết thông số ARIMA(2,2,1):
covseriesarima <- arima(covseries, order=c(2,2,1))
covseriesforecasts <- forecast(covseriesarima, h=7)
covseriesforecasts
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 31 2024.027 1853.315 2194.739 1762.945 2285.108
## 32 1734.299 1484.573 1984.024 1352.376 2116.221
## 33 2032.177 1764.874 2299.480 1623.372 2440.981
## 34 2470.976 2162.851 2779.101 1999.739 2942.213
## 35 2422.778 2021.785 2823.771 1809.512 3036.044
## 36 2369.693 1903.585 2835.801 1656.842 3082.544
## 37 2696.176 2185.109 3207.244 1914.566 3477.786
##Lấy data từ forecast() để kết hợp với dữ liệu đã có để vẽ biểu đồ trong ggplot2:
df1 <- as.data.frame(covseriesforecasts)
colnames(df1) <- c("Deaths","Lo80", "Hi80", "Lo95","Hi95")
Date <- seq(as.Date("2020/4/8"), by="day", length.out = 7)
df1$Date <- Date
df2 <- rbind.fill(datD1, df1)
tail(df2,15)
## rn Date Deaths Lo80 Hi80 Lo95 Hi95
## 23 23 2020-03-31 912.000 NA NA NA NA
## 24 24 2020-04-01 1049.000 NA NA NA NA
## 25 25 2020-04-02 968.000 NA NA NA NA
## 26 26 2020-04-03 1321.000 NA NA NA NA
## 27 27 2020-04-04 1330.000 NA NA NA NA
## 28 28 2020-04-05 1165.000 NA NA NA NA
## 29 29 2020-04-06 1255.000 NA NA NA NA
## 30 30 2020-04-07 1970.000 NA NA NA NA
## 31 <NA> 2020-04-08 2024.027 1853.315 2194.739 1762.945 2285.108
## 32 <NA> 2020-04-09 1734.299 1484.573 1984.024 1352.376 2116.221
## 33 <NA> 2020-04-10 2032.177 1764.874 2299.480 1623.372 2440.981
## 34 <NA> 2020-04-11 2470.976 2162.851 2779.101 1999.739 2942.213
## 35 <NA> 2020-04-12 2422.778 2021.785 2823.771 1809.512 3036.044
## 36 <NA> 2020-04-13 2369.693 1903.585 2835.801 1656.842 3082.544
## 37 <NA> 2020-04-14 2696.176 2185.109 3207.244 1914.566 3477.786
##Vẽ biểu đồ forecast bằng ggplot2:
p1a <-ggplot(data=df2,aes(x=Date,y=Deaths))+
labs(x ="Date", y = "Deaths", title="Deaths By Covid19 Forecast \n from April 8th - April 14th") +
geom_line(aes(y=Deaths))+
geom_ribbon(aes(ymin=Lo95,ymax=Hi95),fill='green',alpha=.25)+
geom_ribbon(aes(ymin=Lo80,ymax=Hi80),fill='red',alpha=.25)
p1a
##Kết luận
Tuy biểu đồ forecast chưa thật đẹp mắt nhưng đã diễn tả được những thông tin cần thiết. Cám ơn các bạn đã theo dõi.