시간에 따른 우한-코로나 바이러스 감염자 및 사망자 관계 분석 (누적 감염자에 대해서)
wuhan_df = read_xlsx("data/20200207.xlsx")
## New names:
## * Released -> Released...12
## * Released -> Released...13
head(wuhan_df,15)
## # A tibble: 15 x 14
## Date Suspected Confirmed `Daily Incread … Serious Deaths
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-12-31 00:00:00 27 NA NA NA NA
## 2 2020-01-03 00:00:00 44 NA NA NA NA
## 3 2020-01-05 00:00:00 59 NA NA NA NA
## 4 2020-01-10 00:00:00 NA 41 NA NA 1
## 5 2020-01-11 00:00:00 NA 41 NA NA 1
## 6 2020-01-12 00:00:00 NA 41 NA NA 1
## 7 2020-01-13 00:00:00 NA 41 NA NA 1
## 8 2020-01-15 00:00:00 NA NA NA NA 2
## 9 2020-01-16 00:00:00 NA 45 NA 5 2
## 10 2020-01-17 00:00:00 NA 62 0.378 8 2
## 11 2020-01-18 00:00:00 NA 121 0.952 NA NA
## 12 2020-01-19 00:00:00 NA 198 0.636 44 3
## 13 2020-01-20 00:00:00 54 291 0.47 NA 6
## 14 2020-01-21 00:00:00 37 440 0.512 102 9
## 15 2020-01-22 00:00:00 257 571 0.298 95 17
## # … with 8 more variables: Recovered <dbl>, `Deaths+Recovered` <dbl>,
## # `D/(D+R)` <dbl>, `D/C` <dbl>, Quarantined <dbl>, Released...12 <dbl>,
## # Released...13 <dbl>, Total <dbl>
wuhan_df = wuhan_df[9:nrow(wuhan_df),]
wuhan_df$Index = seq(1,nrow(wuhan_df))
wuhan_df=data.frame(Index=wuhan_df$Index, Date=wuhan_df$Date,Confirmed=wuhan_df$`Confirmed`, Deaths=wuhan_df$Deaths)
head(wuhan_df,10)
## Index Date Confirmed Deaths
## 1 1 2020-01-16 45 2
## 2 2 2020-01-17 62 2
## 3 3 2020-01-18 121 NA
## 4 4 2020-01-19 198 3
## 5 5 2020-01-20 291 6
## 6 6 2020-01-21 440 9
## 7 7 2020-01-22 571 17
## 8 8 2020-01-23 830 25
## 9 9 2020-01-24 1287 41
## 10 10 2020-01-25 1975 56
plot(wuhan_df$Date, wuhan_df$Confirmed,
main="Date - Confirmed")
plot(wuhan_df$Date, wuhan_df$Deaths,
main="Date - Deaths")
#### => 증가하는 이차 곡선 형태를 보인다
\(Confirmed = \beta_{0} + \beta_{1}Date^2 + \beta_{2}Date\)
꼴 일 것으로 추측
fit1 = lm(Confirmed ~ Index, data=wuhan_df )
summary(fit1)
##
## Call:
## lm(formula = Confirmed ~ Index, data = wuhan_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5237 -4039 -1353 3510 8434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8638.5 1977.6 -4.368 0.000269 ***
## Index 1510.9 144.2 10.476 8.54e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4588 on 21 degrees of freedom
## Multiple R-squared: 0.8394, Adjusted R-squared: 0.8317
## F-statistic: 109.7 on 1 and 21 DF, p-value: 8.536e-10
plot(wuhan_df$Index, wuhan_df$Confirmed)
abline(coef(fit1),col='red')
fit2 = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_df)
summary(fit2)
##
## Call:
## lm(formula = Confirmed ~ I(Index^2) + Index, data = wuhan_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1356.50 -461.23 -95.09 506.07 899.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2438.261 419.238 5.816 1.09e-05 ***
## I(Index^2) 110.767 3.256 34.021 < 2e-16 ***
## Index -1147.524 80.480 -14.258 6.11e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 612.8 on 20 degrees of freedom
## Multiple R-squared: 0.9973, Adjusted R-squared: 0.997
## F-statistic: 3655 on 2 and 20 DF, p-value: < 2.2e-16
pred_fit2 = predict(fit2)
plot(wuhan_df$Index, wuhan_df$Confirmed)
lines(pred_fit2, col='red')
=> \(R^2 0.9973\), fitting 완벽해 보인다
2020-01-16 ~ 2020-02-02 : Train (18ea) 2020-02-03 ~ 2020-02-07 : Test (5ea)
wuhan_tr = wuhan_df[1:18,]
wuhan_ts = wuhan_df[19:dim(wuhan_df)[1],]
fit_tr = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_tr)
fit_ts = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_ts)
pred_tr = predict(fit_tr)
pred_ts = predict(fit_ts)
plot(wuhan_df$Index, wuhan_df$Confirmed,
main="Index - Confirmed")
lines(wuhan_tr$Index, pred_tr, col="blue", lwd=3)
lines(wuhan_ts$Index, pred_ts, col="red", lwd=3)
legend("topleft", legend=c("Train", "Test"),
col=c("blue", "red"), lty=1:1, cex=0.8)
2020-02-08(Index:24) ~ 2020-02-29(Index:45)
fut.vec = seq(24,45)
fut_confirm.vec = rep(NA,22)
fut_df = data.frame(Index=fut.vec)
fit2 = lm(Confirmed ~ I(Index^2)+Index, data=wuhan_df)
pred_present = predict(fit2)
pred_fut = predict(fit2, fut_df)
plot(c(wuhan_df$Index,fut.vec), c(wuhan_df$Confirmed,fut_confirm.vec),
main="Index - Confirmed",
ylim=c(1,200000))
lines(wuhan_df$Index, pred_present, col="blue", lwd=3)
lines(fut.vec, pred_fut, col="red", lwd=3)
legend("topleft", legend=c("Train", "Test"),
col=c("blue", "red"), lty=1:1, cex=0.8)