Why 7 day moving average ?

Previoussly 21 days interval made the correlation high between daily infected and dead numbers. There were also slight increase of correlation 14 and 28 days interval. These may make by the difference of the amount of work depend on the day of the week. To solve this 7 days syntropy, 7 day moving average may be useful.

Covid19 Data

Data were gained from NHK homepage, https://www3.nhk.or.jp/news/special/coronavirus/data-widget/#mokuji0 .

library(RCurl)
## Warning: package 'RCurl' was built under R version 3.6.3
library(readr)
## Warning: package 'readr' was built under R version 3.6.3
## Registered S3 methods overwritten by 'tibble':
##   method     from  
##   format.tbl pillar
##   print.tbl  pillar
url = "https://www3.nhk.or.jp/n-data/opendata/coronavirus/nhk_news_covid19_domestic_daily_data.csv"
myfile <- getURL(url=url,.encoding='UTF-8', ssl.verifypeer=FALSE)

nhk_covid19 <- read_csv(myfile,col_types = cols('日付' = col_date(format = "%Y/%m/%d")))

For convenient column names chenged.

colnames(nhk_covid19)<-c('date','infected_daily','infected_accum','death_daily','death_accum')

##Package RcppRoll roll_mean

RcppRoll package implements a number of ‘roll’-ing functions for R vectors and matrices.

library(RcppRoll)
## Warning: package 'RcppRoll' was built under R version 3.6.3
nhk_covid19$infectedr7ds<-roll_mean(nhk_covid19$infected_daily,n=7,fill=NA)
nhk_covid19$deathr7ds<-roll_mean(nhk_covid19$death_daily,n=7,fill=NA)

Slice rows with NA, upper 3 and lowe 3

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ls <- length(nhk_covid19$date)
nhk_covid197ds<-slice(nhk_covid19,4:(ls-3))
nhk_covid197ds
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 600 x 7
##    date       infected_daily infected_accum death_daily death_accum infectedr7ds
##    <date>              <dbl>          <dbl>       <dbl>       <dbl>        <dbl>
##  1 2020-01-19              0              1           0           0        0.143
##  2 2020-01-20              0              1           0           0        0    
##  3 2020-01-21              0              1           0           0        0.143
##  4 2020-01-22              0              1           0           0        0.286
##  5 2020-01-23              0              1           0           0        0.429
##  6 2020-01-24              1              2           0           0        0.429
##  7 2020-01-25              1              3           0           0        0.857
##  8 2020-01-26              1              4           0           0        1    
##  9 2020-01-27              0              4           0           0        1.86 
## 10 2020-01-28              3              7           0           0        2.14 
## # ... with 590 more rows, and 1 more variable: deathr7ds <dbl>

Regression death from infected with some date interval.

It may supposed that daily_death correlated with daily_infected with some date interval. daily_death(t) ~ daily_infected(t-delta) + b When delta changes regression r.square may be changed. 2021-07-12 is the day when the last A State o Emergency was declared.

lastnum <- which(nhk_covid197ds$date =="2021-07-12")
d <-c()
r <-c()
s <-c()
i <-c()
for (delta in seq(from=10,to=30,by=1)){
  DeadT <- nhk_covid197ds$deathr7ds[delta:lastnum]
  InfectedT <-nhk_covid197ds$infectedr7ds[1:(lastnum-delta+1)]
  covidT <- data.frame(Infected=InfectedT,Dead=DeadT)
  covid19lm <- lm(Dead~Infected ,data = covidT)
  covidsumr <-summary(covid19lm)$r.squared
  d<-c(d,delta)
  r<-c(r,covidsumr)
  s<-c(s,covid19lm$coefficients[2])
  i<-c(i,covid19lm$coefficients[1])
}
rsqr=data.frame(interval=d,rsq=r,slope=s,intercept=i)
rsqr
##    interval       rsq      slope intercept
## 1        10 0.8095496 0.01600038  3.938484
## 2        11 0.8280837 0.01616922  3.739647
## 3        12 0.8443465 0.01631370  3.576660
## 4        13 0.8589689 0.01644066  3.440310
## 5        14 0.8725388 0.01655612  3.320670
## 6        15 0.8844239 0.01665443  3.226062
## 7        16 0.8939454 0.01672964  3.166307
## 8        17 0.9006965 0.01677837  3.146431
## 9        18 0.9050095 0.01680407  3.161394
## 10       19 0.9073615 0.01681138  3.204620
## 11       20 0.9075223 0.01679829  3.278287
## 12       21 0.9047921 0.01675838  3.392846
## 13       22 0.8994575 0.01669426  3.543716
## 14       23 0.8917966 0.01660837  3.727324
## 15       24 0.8821665 0.01650398  3.938305
## 16       25 0.8704509 0.01637959  4.179462
## 17       26 0.8564083 0.01623254  4.455059
## 18       27 0.8399992 0.01606195  4.766829
## 19       28 0.8215425 0.01587027  5.111118
## 20       29 0.8012503 0.01565888  5.487318
## 21       30 0.7794860 0.01543073  5.890484
library(ggplot2)
p <- ggplot(data=rsqr,aes(x=interval,y=rsq))
p <- p+geom_line()
p <- p+ggtitle("Date interval and r.squared values")
p

Interval 20 days may be most probable that the infected patients will die.

Commpair dead(observed) and dead-hat(calculated)

#lastnum <- which(nhk_covid19$date =="2021-07-12")
delta=rsqr[rsqr$rsq==max(rsqr$rsq),]$interval
DeadT <- nhk_covid197ds$deathr7ds[delta:lastnum]
InfectedT <-nhk_covid197ds$infectedr7ds[1:(lastnum-delta+1)]
Deadhat <- InfectedT*rsqr[rsqr$interval==delta,]$slope+rsqr[rsqr$interval==delta,]$intercept
Deadhat_df <- data.frame(obs=DeadT,calc=Deadhat)

p <- ggplot(data=Deadhat_df,aes(x=obs,y=calc,alpha=0.1))+geom_point()
p <- p+ ggtitle("daily_death observed vs. calculated by the number of daily_infected 20 days before")
p

cor(DeadT,Deadhat)
## [1] 0.9526397