dataECDC <- read.csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv", na.strings = "", fileEncoding = "UTF-8-BOM")
names(dataECDC)
## [1] "dateRep"
## [2] "day"
## [3] "month"
## [4] "year"
## [5] "cases"
## [6] "deaths"
## [7] "countriesAndTerritories"
## [8] "geoId"
## [9] "countryterritoryCode"
## [10] "popData2019"
## [11] "continentExp"
## [12] "Cumulative_number_for_14_days_of_COVID.19_cases_per_100000"
Table of the last 6 days of COVID19 UK data:
| Date | Days | cases | survivors | deaths | Cum_14days | incidence | CFR_100 | CMR_100 |
|---|---|---|---|---|---|---|---|---|
| 07/10/2020 | 282 | 14542 | 14466 | 76 | 189.899 | 0.99 | 0.52 | 11.40 |
| 06/10/2020 | 281 | 12593 | 12574 | 19 | 175.470 | 1.00 | 0.15 | 2.85 |
| 05/10/2020 | 280 | 22961 | 22928 | 33 | 163.129 | 1.00 | 0.14 | 4.95 |
| 04/10/2020 | 279 | 12871 | 12822 | 49 | 134.528 | 1.00 | 0.38 | 7.35 |
| 03/10/2020 | 278 | 6968 | 6902 | 66 | 121.851 | 0.99 | 0.95 | 9.90 |
| 02/10/2020 | 277 | 6914 | 6855 | 59 | 117.881 | 0.99 | 0.85 | 8.85 |
require(zoo)
require(ggplot2)
ggplot(Covid19_UK, aes(Days, cases)) +
geom_point(position=position_jitter(1,3), pch=21, fill="#FF0000AA") +
geom_line(aes(y=rollmean(cases, 7, na.pad=TRUE))) +
theme_bw()+
labs(x="Time(days)",y="Number of Cases", title="UK Covid19 Cases distribution in time"
,caption="Data Source: ECDC")+
scale_x_continuous(breaks = seq(from = 0, to = length(Covid19_UK$Days), by = 30))
ggplot(Covid19_UK, aes(Days, incidence)) +
geom_point(position=position_jitter(1,3), pch=21, fill="#FF0000AA") +
geom_line(aes(y=rollmean(incidence, 7, na.pad=TRUE))) +
labs(x="Time(days)",y="Incidence proportion",title="Incidence proportion distribution",subtitle="on going infections on total cases")+
scale_x_continuous(breaks = seq(from = 0, to = length(Covid19_UK$Days), by = 30))+
theme_bw()
Data are sparse and it is necessary a further analysis to see if the behaviour is Normal. To do that:
Covid19_UK$incidence[is.nan(Covid19_UK$incidence)]<- 0 #elimination of NaN values
incidence<- Covid19_UK$incidence #creating a incidence variable
n<- length(incidence)
inc_avg<- mean(incidence)#0.7679004
inc_sd<- sd(incidence)# 0.363562
inc_se<- inc_sd/sqrt(n)#0.02168829
inc_med<- median(incidence)#0.93
CI_95<-qnorm(0.975)
error<- CI_95*inc_se #margin of error 0.04250827
CI_interv_1<-inc_avg+CI_95*inc_se
CI_interv_2<-inc_avg-CI_95*inc_se
round(CI_interv_2,2);round(CI_interv_1,2);round(error,2)
## [1] 0.73
## [1] 0.81
## [1] 0.04
A Normal distribution of the incicence:
inc_norm<-rnorm(incidence)
qqnorm(inc_norm)
qqline(inc_norm,col="blue")
The following graph show the difference in shape distribution for the right identification of the shape of Incidence proportion distribution:
qplot(inc_norm, geom="histogram",bins = 30,
main = "Histogram for Norm Incidence",
xlab = "Norm Inc",
fill=I("blue"),
col=I("red"),alpha=I(.2))+
theme_bw()
While if the moving average of the observed incidence data is taken, the value are very difficult to distinguish, as well as the behavioural and so is the shape for further forecasting:
prev_Norm_inc<- Covid19_UK %>%
mutate(rolling_avg = slide_dbl(incidence, mean, .before = 3, .complete = F))
ggplot()+
geom_line(data=Covid19_UK,aes(x=Days,y=incidence),col="white")+
geom_line(data=prev_Norm_inc,aes(x=Days,y=rolling_avg),col="red")+
xlim(60,290)+ylim(0.75,1)+
labs(title="Rolling Average for UK Incidence proportion of Infection",
x="Time(days)",y="proportion",caption="Data Source: ECDC")+
theme_bw()
The density shows a clear increase and prominence on the right tail of the observed distribution:
inc_norm_s<- pnorm((incidence-inc_avg)/inc_sd)
plot(density(inc_norm_s),main= "UK Incidence Density")
To make it clearer and to hazard an estimation, four samples are replicated to obtain sample distribution of the incidence proportion at 100,200,300 and 400 time-frame:
Ns<-seq(100,400,100)
B <- 1000
par(mfrow=c(2,2))
LIM <- c(-4,4)
for(incidence_s in Ns){
ts <- replicate(B, {
X <- rnorm(incidence_s)
sqrt(incidence_s)*mean(X)/sd(X)
})
ps <- seq(1/(B+1),1-1/(B+1),len=B)
qqplot(qt(ps,df=incidence_s-1),ts,main=incidence_s,
xlab="Theoretical",ylab="Observed",
xlim=LIM, ylim=LIM)
abline(0,1)
}
A further 10 000 replications of the observed incidence distribution is taken and matched with the normal sample:
set.seed(711)#710
inc_samp<- sample(incidence,1000,replace=TRUE)
mean(inc_samp);sd(inc_samp);sd(inc_samp)/sqrt(10000)
## [1] 0.76299
## [1] 0.3671282
## [1] 0.003671282
samp_inc_norm<- rnorm(inc_samp)
par(mfrow=c(1,2))
qqnorm(samp_inc_norm)
qqline(samp_inc_norm,col="blue")
qqnorm(inc_norm)
qqline(inc_norm,col="red")
The updated histogram of the cases in UK shows some outliners:
ggplot()+
geom_histogram(data=Covid19_UK, aes(x=cases),bins=35, fill = "darkgreen",
col = "black")+
labs(x="Cases", y="Frequency",title="UK Cases")#+
# scale_x_continuous(breaks = seq(from = 0, to = 7000, by = 500)) + theme_bw()
Table of the last 6 days of COVID19 Italy data:
| Date | Days | cases | survivors | deaths | Cum_14days | incidence | CFR_100 | CMR_100 |
|---|---|---|---|---|---|---|---|---|
| 07/10/2020 | 282 | 2677 | 2649 | 28 | 49 | 0.99 | 1.05 | 4.64 |
| 06/10/2020 | 281 | 2257 | 2241 | 16 | 47 | 0.99 | 0.71 | 2.65 |
| 05/10/2020 | 280 | 2578 | 2560 | 18 | 45 | 0.99 | 0.70 | 2.98 |
| 04/10/2020 | 279 | 2843 | 2816 | 27 | 43 | 0.99 | 0.95 | 4.47 |
| 03/10/2020 | 278 | 2499 | 2476 | 23 | 41 | 0.99 | 0.92 | 3.81 |
| 02/10/2020 | 277 | 2548 | 2524 | 24 | 40 | 0.99 | 0.94 | 3.98 |
Plot of the two curves of cumulated cases for UK and Italy for the whole period:
ggplot()+
geom_line(data=Covid19_UK,aes(x=Days,y=cases),col="blue",size = 0.3)+
geom_line(data=Covid19_IT,aes(x=Days,y=cases),col="red",size = 0.3)+
annotate(geom="text", x=50, y=6000, label="Italy",
color="red")+
annotate(geom="text", x=150, y=4000, label="UK",
color="blue")+
theme_bw(base_size = 12) + theme(legend.position = "bottom")+
xlab("Time(days)")+ylab("Cases")+
labs("UK and Italy cumulated cases trend",color = "Legend")
UK Cases, Deaths and Survivors trends:
ggplot()+
geom_line(data=Covid19_UK,aes(x=Days,y=cases),col="green",size=0.3)+
geom_line(data=Covid19_UK,aes(x=Days,y=deaths),col="red",size=0.3)+
geom_line(data=Covid19_UK,aes(x=Days,y=survivors),col="blue",size=0.3)+
theme_bw(base_size = 12) + theme(legend.position = "bottom")+
labs(x="Time(days)", y="Cumulate numbers", title="UK Cases, deaths and CFR trends")
UK Crude Mortality rates (CMR) and Case Fatality Rates (CFR):
plot(density(Covid19_UK$CMR_100),col="green",xlim=c(-20,50),ylim=c(0,0.1),main="UK CMR and CFR")
lines(density(Covid19_UK$CFR_100,na.rm = T),col="red")
axis(side = 1, at = round(seq(0, 50, by = 10)))
text(x=median(Covid19_UK$CMR_100), y=0.04, labels="CMR")
text(x=mean(Covid19_UK$CFR_100,na.rm=T), y=0.07, labels="CFR")
The CFR curve shows a right(positive) skewed distribution of fatalities along the time:
ggplot()+
geom_histogram(data=Covid19_UK,aes(x=CFR_100, main="UK CFR"),bins=30,
fill = "darkgreen",
col = "black")+
labs(x="CFR_100", y="Frequency",title="UK CFR_100")
In this section is represented the basic of the SIR model with given beta and gamma, it is expected a further calibration of the model considering other factors, such as: - effect of treatment - effect of social distancing - age impact - co-morbidity - symptomatic and asymptomatic behavior - vaccination strategies
To begin with a theoretical value of beta to calibrate the model using a simple linear regression model function: the sum of squares values (SSQ). The line that best predict the lowest distance between theoretical predictors and real data, to assess how far is the model from real data.
The prevalence of the theoretical model infection is represented in the graph below:
N<- max(Covid19_UK$cases)
initial_state_values<- c(S=N-1,I=1,R=0)
parameters<- c(beta= 1/3,gamma=1/22)#beta (infection rate) is hypothesized to be 3 days^-1,
#gamma (recovery rate) is 22 days^-1
times<- seq(0,length(Covid19_UK$Date),by=1)
SIR_model<- function(time,state,parameters){
with(as.list(c(state,parameters)),{
N<- S+I+R
lambda<- beta*I/N
dS<- -lambda*S
dI<- lambda*S-gamma*I
dR<- gamma*I
return(list(c(dS,dI,dR)))
})
}
output<- as.data.frame(ode(y=initial_state_values,
times=times,
func=SIR_model,
parms=parameters))
ggplot()+
geom_line(data=output,aes(x=time,y=I))+
labs(x="Time(days)",y="Infected", title="UK Prevalence of infection")
output_long<- melt(as.data.frame(output), id="time")
ggplot(data=output_long, aes(x=time, y=value,colour=variable,group=variable))+
geom_line()+
labs( x="Time(days)",y="proportions", title="UK Covid19 theoretical Prevalence proportion: basic SIR model")
Simple Linear Regression:
SIR_fun<- function(time,state,parameters){
with(as.list(c(state,parameters)),{
N<- S+I+R
dS<- -beta*S*(I/N)
dI<- beta*S*(I/N)
dR<- gamma*I
return(list(c(dS,dI,dR)))
})}
SIR_SSQ <- function(parameters, dat) {
result <- as.data.frame(ode(y = initial_state_values
, times = times
, func = SIR_fun
, parms = parameters)
)
dat <- na.omit(dat)
deltas2 <- (result$I[result$time %in% dat$time] - dat$I)^2
SSQ <- sum(deltas2)
return(SSQ)
}
require(reshape2)
dat_UK<- Covid19_UK%>%
mutate(number_infected = (cases-deaths),time=Days)%>%
dplyr::select(time,number_infected)
head(formattable(dat_UK))
| time | number_infected |
|---|---|
| 282 | 14466 |
| 281 | 12574 |
| 280 | 22928 |
| 279 | 12822 |
| 278 | 6902 |
| 277 | 6855 |
initial_state_values <- c(S = 10500-1, I = 1, R = 0)
beta_start <- 0.15
gamma_start <- 0.04
times <- seq(from = 0, to = 14, by = 0.1)
optimised <- optim(par = c(beta = beta_start
, gamma = gamma_start)
, fn = SIR_SSQ
, dat = dat_UK
)
round(optimised$par,2)#0.15 0.04
## beta gamma
## 0.15 0.04
times <- seq(from = 0, to = length(Covid19_UK$cases), by = 1)
opt_mod <- as.data.frame(ode(y = initial_state_values,
times = times,
func = SIR_model,
parms = optimised$par))
require(ggplot2)
opt_plot <- ggplot()
opt_plot <- opt_plot + geom_point(aes(x = time, y = number_infected)
, colour = "red"
, shape = "x"
, data = dat_UK)
opt_plot <- opt_plot + geom_line(aes(x = time, y = I)
, colour = "blue"
, data = opt_mod)
opt_plot