Data were gained from NHK homepage, https://www3.nhk.or.jp/news/special/coronavirus/data-widget/#mokuji0 .
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.6.3
library(readr)
url = "https://www3.nhk.or.jp/n-data/opendata/coronavirus/nhk_news_covid19_domestic_daily_data.csv"
myfile <- getURL(url=url,.encoding='UTF-8', ssl.verifypeer=FALSE)
nhk_covid19 <- read_csv(myfile,col_types = cols('日付' = col_date(format = "%Y/%m/%d")))
Daily infection and death number and cumurative number of both from 2020/1/16 were pairwisely ploted.
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
For convenience names of column changed as follow.
colnames(nhk_covid19)<-c('date','infected_daily','infected_accum','death_daily','death_accum')
Daily correlation between infected and dead was poor (0.696) but cumulative number of both was well correlated (0.995).
It may supposed that daily_death correlated with daily_infected with some date interval. daily_death(t) ~ daily_infected(t-delta) + b When delta changes regression r.square may be changed. 2021-07-12 is the day when the last A State o Emergency was declared.
lastnum <- which(nhk_covid19$date =="2021-07-12")
d <-c()
r <-c()
s <-c()
i <-c()
for (delta in seq(from=10,to=30,by=1)){
DeadT <- nhk_covid19$death_daily[delta:lastnum]
InfectedT <-nhk_covid19$infected_daily[1:(lastnum-delta+1)]
covidT <- data.frame(Infected=InfectedT,Dead=DeadT)
covid19lm <- lm(Dead~Infected ,data = covidT)
covidsumr <-summary(covid19lm)$r.squared
d<-c(d,delta)
r<-c(r,covidsumr)
s<-c(s,covid19lm$coefficients[2])
i<-c(i,covid19lm$coefficients[1])
}
rsqr=data.frame(interval=d,rsq=r,slope=s,intercept=i)
rsqr
## interval rsq slope intercept
## 1 10 0.6391063 0.01473347 5.788079
## 2 11 0.6692939 0.01506737 5.348471
## 3 12 0.6862238 0.01524620 5.139771
## 4 13 0.7343568 0.01576083 4.425999
## 5 14 0.7862312 0.01629674 3.682999
## 6 15 0.7574897 0.01598458 4.201565
## 7 16 0.7123905 0.01549134 4.981555
## 8 17 0.7106161 0.01546094 5.074339
## 9 18 0.7219184 0.01557201 4.964725
## 10 19 0.7387675 0.01574123 4.770299
## 11 20 0.7944860 0.01631205 3.971397
## 12 21 0.8215312 0.01657534 3.638648
## 13 22 0.7656188 0.01598935 4.571852
## 14 23 0.6942217 0.01521595 5.770572
## 15 24 0.6872026 0.01512751 5.952892
## 16 25 0.6973095 0.01522670 5.859285
## 17 26 0.7056580 0.01530587 5.799026
## 18 27 0.7368883 0.01562883 5.370201
## 19 28 0.7492397 0.01574723 5.254161
## 20 29 0.6805348 0.01499618 6.437204
## 21 30 0.6013476 0.01408783 7.842182
p <- ggplot(data=rsqr,aes(x=interval,y=rsq))
p <- p+geom_line()
p <- p+ggtitle("Date interval and r.squared values")
p
Interval 21 days may be most probable that the infected patients will die.
#lastnum <- which(nhk_covid19$date =="2021-07-12")
delta=rsqr[rsqr$rsq==max(rsqr$rsq),]$interval
DeadT <- nhk_covid19$death_daily[delta:lastnum]
InfectedT <-nhk_covid19$infected_daily[1:(lastnum-delta+1)]
Deadhat <- InfectedT*rsqr[rsqr$interval==delta,]$slope+rsqr[rsqr$interval==delta,]$intercept
Deadhat_df <- data.frame(obs=DeadT,calc=Deadhat)
p <- ggplot(data=Deadhat_df,aes(x=obs,y=calc,alpha=0.1))+geom_point()
p <- p+ ggtitle("daily_death observed vs. calculated by the number of daily_infected 21 days before")
p
cor(DeadT,Deadhat)
## [1] 0.9063836