시간에 따른 우한-코로나 바이러스 감염자 및 사망자 관계 분석 (누적 감염자에 대해서)

Data Source: https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_Wuhan_coronavirus_outbreak

rm(list=ls())
library(readxl); 
setwd("/Users/wooddekk/Desktop/project_R/for_fun/Wuhan")

Read Data

wuhan_df = read_xlsx("data/20200207.xlsx")
## New names:
## * Released -> Released...12
## * Released -> Released...13
head(wuhan_df,15)
## # A tibble: 15 x 14
##    Date                Suspected Confirmed `Daily Incread … Serious Deaths
##    <dttm>                  <dbl>     <dbl>            <dbl>   <dbl>  <dbl>
##  1 2019-12-31 00:00:00        27        NA           NA          NA     NA
##  2 2020-01-03 00:00:00        44        NA           NA          NA     NA
##  3 2020-01-05 00:00:00        59        NA           NA          NA     NA
##  4 2020-01-10 00:00:00        NA        41           NA          NA      1
##  5 2020-01-11 00:00:00        NA        41           NA          NA      1
##  6 2020-01-12 00:00:00        NA        41           NA          NA      1
##  7 2020-01-13 00:00:00        NA        41           NA          NA      1
##  8 2020-01-15 00:00:00        NA        NA           NA          NA      2
##  9 2020-01-16 00:00:00        NA        45           NA           5      2
## 10 2020-01-17 00:00:00        NA        62            0.378       8      2
## 11 2020-01-18 00:00:00        NA       121            0.952      NA     NA
## 12 2020-01-19 00:00:00        NA       198            0.636      44      3
## 13 2020-01-20 00:00:00        54       291            0.47       NA      6
## 14 2020-01-21 00:00:00        37       440            0.512     102      9
## 15 2020-01-22 00:00:00       257       571            0.298      95     17
## # … with 8 more variables: Recovered <dbl>, `Deaths+Recovered` <dbl>,
## #   `D/(D+R)` <dbl>, `D/C` <dbl>, Quarantined <dbl>, Released...12 <dbl>,
## #   Released...13 <dbl>, Total <dbl>

2020-01-16 데이테부터 사용

wuhan_df = wuhan_df[9:nrow(wuhan_df),]

Tranform Date as an Index

wuhan_df$Index = seq(1,nrow(wuhan_df))

Edited Data Frame

wuhan_df=data.frame(Index=wuhan_df$Index, Date=wuhan_df$Date,Confirmed=wuhan_df$`Confirmed`, Deaths=wuhan_df$Deaths)
head(wuhan_df,10)
##    Index       Date Confirmed Deaths
## 1      1 2020-01-16        45      2
## 2      2 2020-01-17        62      2
## 3      3 2020-01-18       121     NA
## 4      4 2020-01-19       198      3
## 5      5 2020-01-20       291      6
## 6      6 2020-01-21       440      9
## 7      7 2020-01-22       571     17
## 8      8 2020-01-23       830     25
## 9      9 2020-01-24      1287     41
## 10    10 2020-01-25      1975     56

Plotting

plot(wuhan_df$Date, wuhan_df$Confirmed,
     main="Date - Confirmed")

plot(wuhan_df$Date, wuhan_df$Deaths,
     main="Date - Deaths")

#### => 증가하는 이차 곡선 형태를 보인다

\(Confirmed = \beta_{0} + \beta_{1}Date^2 + \beta_{2}Date\)

꼴 일 것으로 추측

fit1 (선형회귀)

fit1 = lm(Confirmed ~ Index, data=wuhan_df )
summary(fit1)
## 
## Call:
## lm(formula = Confirmed ~ Index, data = wuhan_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5237  -4039  -1353   3510   8434 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8638.5     1977.6  -4.368 0.000269 ***
## Index         1510.9      144.2  10.476 8.54e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4588 on 21 degrees of freedom
## Multiple R-squared:  0.8394, Adjusted R-squared:  0.8317 
## F-statistic: 109.7 on 1 and 21 DF,  p-value: 8.536e-10
plot(wuhan_df$Index, wuhan_df$Confirmed)
abline(coef(fit1),col='red')

fit2 (다항회귀)

fit2 = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_df)
summary(fit2)
## 
## Call:
## lm(formula = Confirmed ~ I(Index^2) + Index, data = wuhan_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1356.50  -461.23   -95.09   506.07   899.26 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2438.261    419.238   5.816 1.09e-05 ***
## I(Index^2)    110.767      3.256  34.021  < 2e-16 ***
## Index       -1147.524     80.480 -14.258 6.11e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 612.8 on 20 degrees of freedom
## Multiple R-squared:  0.9973, Adjusted R-squared:  0.997 
## F-statistic:  3655 on 2 and 20 DF,  p-value: < 2.2e-16
pred_fit2 = predict(fit2)
plot(wuhan_df$Index, wuhan_df$Confirmed)
lines(pred_fit2, col='red')

=> \(R^2 0.9973\), fitting 완벽해 보인다

Train - Test Split

2020-01-16 ~ 2020-02-02 : Train (18ea) 2020-02-03 ~ 2020-02-07 : Test (5ea)

wuhan_tr = wuhan_df[1:18,]
wuhan_ts = wuhan_df[19:dim(wuhan_df)[1],]

Fitting Train Set

fit_tr = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_tr)

Fitting Test set

fit_ts = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_ts)
pred_tr = predict(fit_tr)
pred_ts = predict(fit_ts)

Prediction Plot

plot(wuhan_df$Index, wuhan_df$Confirmed, 
     main="Index - Confirmed")
lines(wuhan_tr$Index, pred_tr, col="blue", lwd=3)
lines(wuhan_ts$Index, pred_ts, col="red", lwd=3)
legend("topleft", legend=c("Train", "Test"),
       col=c("blue", "red"), lty=1:1, cex=0.8)

향후 예측

2020-02-08(Index:24) ~ 2020-02-29(Index:45)

fut.vec = seq(24,45)
fut_confirm.vec = rep(NA,22)
fut_df = data.frame(Index=fut.vec)

Train on whole data set

fit2 = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_df)
pred_present = predict(fit2)
pred_fut = predict(fit2, fut_df)

Prediction Plot

plot(c(wuhan_df$Index,fut.vec), c(wuhan_df$Confirmed,fut_confirm.vec), 
     main="Index - Confirmed",
     ylim=c(1,200000))
lines(wuhan_df$Index, pred_present, col="blue", lwd=3)
lines(fut.vec, pred_fut, col="red", lwd=3)
legend("topleft", legend=c("Train", "Test"),
       col=c("blue", "red"), lty=1:1, cex=0.8)