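# 우한-코로나 바이러스 감염자 시간 경과에 따른 추세 분석 (Not Cumulative, Count)
# (Trend analysis of Wuhan coronavirus confirmed cases over time: per-step counts, not cumulative)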

# Remove every object from the current workspace so that nothing left over
# from an earlier session can influence the analysis below.
rm(list = ls())

library(readxl)

# The relative data path used below is resolved against this directory.
setwd("/Users/wooddekk/Desktop/project_R/for_fun/Wuhan")

# Import the case spreadsheet (the file name suggests a 2020-02-07 snapshot;
# confirm against the data source).
wuhan_df <- read_xlsx("data/20200207.xlsx")
## New names:
## * Released -> Released...12
## * Released -> Released...13
# Print the first rows of the imported sheet to check column names and types.
head(wuhan_df)
## # A tibble: 6 x 14
##   Date                Suspected Confirmed `Daily Incread … Serious Deaths
##   <dttm>                  <dbl>     <dbl>            <dbl>   <dbl>  <dbl>
## 1 2019-12-31 00:00:00        27        NA               NA      NA     NA
## 2 2020-01-03 00:00:00        44        NA               NA      NA     NA
## 3 2020-01-05 00:00:00        59        NA               NA      NA     NA
## 4 2020-01-10 00:00:00        NA        41               NA      NA      1
## 5 2020-01-11 00:00:00        NA        41               NA      NA      1
## 6 2020-01-12 00:00:00        NA        41               NA      NA      1
## # … with 8 more variables: Recovered <dbl>, `Deaths+Recovered` <dbl>,
## #   `D/(D+R)` <dbl>, `D/C` <dbl>, Quarantined <dbl>, Released...12 <dbl>,
## #   Released...13 <dbl>, Total <dbl>
# Reduce the imported sheet to the columns used by the analysis and attach a
# running row number (1, 2, ...) as `Index`; the regression further down uses
# `Index` as its explanatory variable.
#
# seq_len() is used instead of seq(1, n) so that an empty sheet produces an
# empty index rather than the descending c(1, 0), and the frame is built in a
# single step instead of first adding `Index` to the tibble and then copying
# it out again.
wuhan_df <- data.frame(
  Index = seq_len(nrow(wuhan_df)),
  Date = wuhan_df$Date,
  Confirmed = wuhan_df$Confirmed,
  Deaths = wuhan_df$Deaths
)

# Change in the confirmed count between consecutive rows of wuhan_df.
# Note that these are row-to-row differences; the rows are not necessarily
# consecutive calendar days (the first dates in the sheet are several days
# apart).
#
# head()/tail() with a negative count drop the last/first element. They
# replace the index expressions `2:n` and `1:n-1`: the latter parses as
# `(1:n) - 1`, i.e. `0:(n - 1)`, and only worked because R ignores index 0;
# the former runs backwards when there are fewer than two rows.
confirm_2 <- tail(wuhan_df$Confirmed, -1)  # observations 2..n
confirm_1 <- head(wuhan_df$Confirmed, -1)  # observations 1..(n - 1)
confirm_diff <- confirm_2 - confirm_1

# Each remaining row carries the change relative to the row before it, so the
# first row of wuhan_df (which has no predecessor) is dropped.
wuhan_diff <- wuhan_df[-1, ]
wuhan_diff$diff_confirm <- confirm_diff

# Fit a straight line to the row-to-row increase in confirmed cases, using
# only row 9 onward of the differenced data.
# NOTE(review): the cut-off at row 9 presumably skips the early rows where
# the confirmed count is missing or unchanged - confirm with the author.
wuhan_diff_2 <- wuhan_diff[seq(9, nrow(wuhan_diff)), ]
fit_diff <- lm(diff_confirm ~ Index, data = wuhan_diff_2)

# Print coefficients, residual summary and goodness of fit.
summary(fit_diff)
## 
## Call:
## lm(formula = diff_confirm ~ Index, data = wuhan_diff_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -495.20 -319.70  -97.86  285.98  795.65 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2586.77     267.18  -9.682 5.43e-09 ***
## Index         202.68      12.45  16.279 5.27e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 370.5 on 20 degrees of freedom
## Multiple R-squared:  0.9298, Adjusted R-squared:  0.9263 
## F-statistic:   265 on 1 and 20 DF,  p-value: 5.274e-13
# Scatter plot of the observed increases against the row index, with the
# fitted regression line drawn on top.
plot(x = wuhan_diff_2$Index, y = wuhan_diff_2$diff_confirm)
abline(reg = fit_diff)

#################################################
# Future Prediction
#################################################
# Number of index steps to extrapolate beyond the last observed row.
n_future <- 22L

# The forecast starts immediately after the last index used in the fit. It is
# derived from the data rather than hard-coded, so the block stays correct
# when the spreadsheet gains rows.
last_idx <- max(wuhan_diff_2$Index)
fut.idx.vec <- last_idx + seq_len(n_future)

# No observations exist for the forecast range; the NA placeholders keep the
# future indices on the plot's x axis without drawing any points.
fut.confirm.vec <- rep(NA, n_future)
fut_df <- data.frame(Index = fut.idx.vec, Confirmed = fut.confirm.vec)

# In-sample fitted values. Predicting on wuhan_diff_2 explicitly (instead of
# calling predict() without new data) guarantees one value per row even if
# lm() dropped rows with missing values, so the vector always lines up with
# wuhan_diff_2$Index in lines() below.
pred_present <- predict(fit_diff, newdata = wuhan_diff_2)

# Linear extrapolation of the fitted trend over the forecast indices.
pred_fut <- predict(fit_diff, newdata = fut_df)

# Observed increases (points), in-sample fit (blue) and extrapolation (red).
plot(c(wuhan_diff_2$Index, fut.idx.vec),
     c(wuhan_diff_2$diff_confirm, fut.confirm.vec),
     main = "Index - Confirmed",
     ylim = c(0, 10000))
lines(wuhan_diff_2$Index, pred_present, col = "blue", lwd = 3)
lines(fut.idx.vec, pred_fut, col = "red", lwd = 3)

# "Train" marks the fit over the observed range, "Test" the extrapolated part.
legend("topleft", legend = c("Train", "Test"),
       col = c("blue", "red"), lty = 1, cex = 0.8)