Previoussly 21 days interval made the correlation high between daily infected and dead numbers. There were also slight increase of correlation 14 and 28 days interval. These may make by the difference of the amount of work depend on the day of the week. To solve this 7 days syntropy, 7 day moving average may be useful.
Data were gained from NHK homepage, https://www3.nhk.or.jp/news/special/coronavirus/data-widget/#mokuji0 .
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.6.3
library(readr)
## Warning: package 'readr' was built under R version 3.6.3
## Registered S3 methods overwritten by 'tibble':
## method from
## format.tbl pillar
## print.tbl pillar
url = "https://www3.nhk.or.jp/n-data/opendata/coronavirus/nhk_news_covid19_domestic_daily_data.csv"
myfile <- getURL(url=url,.encoding='UTF-8', ssl.verifypeer=FALSE)
nhk_covid19 <- read_csv(myfile,col_types = cols('日付' = col_date(format = "%Y/%m/%d")))
For convenient column names chenged.
colnames(nhk_covid19)<-c('date','infected_daily','infected_accum','death_daily','death_accum')
##Package RcppRoll roll_mean
RcppRoll package implements a number of ‘roll’-ing functions for R vectors and matrices.
library(RcppRoll)
## Warning: package 'RcppRoll' was built under R version 3.6.3
nhk_covid19$infectedr7ds<-roll_mean(nhk_covid19$infected_daily,n=7,fill=NA)
nhk_covid19$deathr7ds<-roll_mean(nhk_covid19$death_daily,n=7,fill=NA)
Slice rows with NA, upper 3 and lowe 3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ls <- length(nhk_covid19$date)
nhk_covid197ds<-slice(nhk_covid19,4:(ls-3))
nhk_covid197ds
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 600 x 7
## date infected_daily infected_accum death_daily death_accum infectedr7ds
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-01-19 0 1 0 0 0.143
## 2 2020-01-20 0 1 0 0 0
## 3 2020-01-21 0 1 0 0 0.143
## 4 2020-01-22 0 1 0 0 0.286
## 5 2020-01-23 0 1 0 0 0.429
## 6 2020-01-24 1 2 0 0 0.429
## 7 2020-01-25 1 3 0 0 0.857
## 8 2020-01-26 1 4 0 0 1
## 9 2020-01-27 0 4 0 0 1.86
## 10 2020-01-28 3 7 0 0 2.14
## # ... with 590 more rows, and 1 more variable: deathr7ds <dbl>
It may supposed that daily_death correlated with daily_infected with some date interval. daily_death(t) ~ daily_infected(t-delta) + b When delta changes regression r.square may be changed. 2021-07-12 is the day when the last A State o Emergency was declared.
lastnum <- which(nhk_covid197ds$date =="2021-07-12")
d <-c()
r <-c()
s <-c()
i <-c()
for (delta in seq(from=10,to=30,by=1)){
DeadT <- nhk_covid197ds$deathr7ds[delta:lastnum]
InfectedT <-nhk_covid197ds$infectedr7ds[1:(lastnum-delta+1)]
covidT <- data.frame(Infected=InfectedT,Dead=DeadT)
covid19lm <- lm(Dead~Infected ,data = covidT)
covidsumr <-summary(covid19lm)$r.squared
d<-c(d,delta)
r<-c(r,covidsumr)
s<-c(s,covid19lm$coefficients[2])
i<-c(i,covid19lm$coefficients[1])
}
rsqr=data.frame(interval=d,rsq=r,slope=s,intercept=i)
rsqr
## interval rsq slope intercept
## 1 10 0.8095496 0.01600038 3.938484
## 2 11 0.8280837 0.01616922 3.739647
## 3 12 0.8443465 0.01631370 3.576660
## 4 13 0.8589689 0.01644066 3.440310
## 5 14 0.8725388 0.01655612 3.320670
## 6 15 0.8844239 0.01665443 3.226062
## 7 16 0.8939454 0.01672964 3.166307
## 8 17 0.9006965 0.01677837 3.146431
## 9 18 0.9050095 0.01680407 3.161394
## 10 19 0.9073615 0.01681138 3.204620
## 11 20 0.9075223 0.01679829 3.278287
## 12 21 0.9047921 0.01675838 3.392846
## 13 22 0.8994575 0.01669426 3.543716
## 14 23 0.8917966 0.01660837 3.727324
## 15 24 0.8821665 0.01650398 3.938305
## 16 25 0.8704509 0.01637959 4.179462
## 17 26 0.8564083 0.01623254 4.455059
## 18 27 0.8399992 0.01606195 4.766829
## 19 28 0.8215425 0.01587027 5.111118
## 20 29 0.8012503 0.01565888 5.487318
## 21 30 0.7794860 0.01543073 5.890484
library(ggplot2)
p <- ggplot(data=rsqr,aes(x=interval,y=rsq))
p <- p+geom_line()
p <- p+ggtitle("Date interval and r.squared values")
p
Interval 20 days may be most probable that the infected patients will die.
#lastnum <- which(nhk_covid19$date =="2021-07-12")
delta=rsqr[rsqr$rsq==max(rsqr$rsq),]$interval
DeadT <- nhk_covid197ds$deathr7ds[delta:lastnum]
InfectedT <-nhk_covid197ds$infectedr7ds[1:(lastnum-delta+1)]
Deadhat <- InfectedT*rsqr[rsqr$interval==delta,]$slope+rsqr[rsqr$interval==delta,]$intercept
Deadhat_df <- data.frame(obs=DeadT,calc=Deadhat)
p <- ggplot(data=Deadhat_df,aes(x=obs,y=calc,alpha=0.1))+geom_point()
p <- p+ ggtitle("daily_death observed vs. calculated by the number of daily_infected 20 days before")
p
cor(DeadT,Deadhat)
## [1] 0.9526397