library (dplyr)
library(deSolve)
library(reshape2)
library(ggplot2)
library(rafalib)
library(forecast)
library(rmarkdown)
library(markdown)
Covid19 Data:
Sys.time()
## [1] "2020-11-10 22:59:13 CET"
## [1] "2020-10-01 20:51:40 CEST"
CV19_backup<- read.csv("https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv")
#head(CV19_backup);tail(CV19_backup)
names(CV19_backup)
## [1] "data" "stato"
## [3] "ricoverati_con_sintomi" "terapia_intensiva"
## [5] "totale_ospedalizzati" "isolamento_domiciliare"
## [7] "totale_positivi" "variazione_totale_positivi"
## [9] "nuovi_positivi" "dimessi_guariti"
## [11] "deceduti" "casi_da_sospetto_diagnostico"
## [13] "casi_da_screening" "totale_casi"
## [15] "tamponi" "casi_testati"
## [17] "note"
require(dplyr)
CV19_IT<- CV19_backup%>%
mutate(time=seq(1,length(data),by=1),cases=totale_casi,infected= totale_casi-deceduti,recovered=dimessi_guariti, deceased=deceduti,CFR=round(deceduti/totale_casi*100,2))%>%
select(time, cases, infected, deceased,recovered,CFR)%>%
arrange(desc(cases))
head(CV19_IT)
## time cases infected deceased recovered CFR
## 1 261 995463 953133 42330 363023 4.25
## 2 260 960373 918623 41750 345289 4.35
## 3 259 935104 893710 41394 335074 4.43
## 4 258 902490 861427 41063 328891 4.55
## 5 257 862681 822043 40638 322925 4.71
## 6 256 824879 784687 40192 312339 4.87
CV19_IT_box<-CV19_IT%>%arrange(time)
CV19_IT_box<-CV19_IT_box%>%
mutate(new_cases=c(0,diff(cases)),
new_infected=c(0,diff(infected)),
new_deceased=c(0,diff(deceased)),
new_recovered=c(0,diff(recovered)))%>%
select(new_cases,new_infected,new_deceased,new_recovered)
#head(CV19_IT_box)
boxplot(CV19_IT_box,
main="Covid19 Italy- Distributions of proportions variability",
col=(c("gold","red","green","blue")))
summary(CV19_IT_box)
## new_cases new_infected new_deceased new_recovered
## Min. : -148 Min. : -195 Min. :-31.0 Min. : 0
## 1st Qu.: 328 1st Qu.: 285 1st Qu.: 12.0 1st Qu.: 312
## Median : 1326 Median : 1211 Median : 44.0 Median : 929
## Mean : 3813 Mean : 3651 Mean :162.2 Mean : 1391
## 3rd Qu.: 3491 3rd Qu.: 2933 3rd Qu.:233.0 3rd Qu.: 1908
## Max. :39809 Max. :39384 Max. :969.0 Max. :17734
CV19_IT_long <- melt(as.data.frame(CV19_IT), id = "time")
ggplot(data = CV19_IT_long,
aes(x = time, y = value, colour = variable, group = variable)) +
geom_line() +
xlab("Time (days)")+
ylab("Number") +
labs(colour = "Compartment",
title = "Novel Coronavirus Outbreak - Italy")+theme_bw()
Creating a new dataset with the number of new infected:
data<- data.frame(time= 1:length(CV19_backup$data),
new_infected = CV19_backup$nuovi_positivi[1:length(CV19_backup$data)] ) #real distribution of new infected
head(data);tail(data)
## time new_infected
## 1 1 221
## 2 2 93
## 3 3 78
## 4 4 250
## 5 5 238
## 6 6 240
## time new_infected
## 256 256 34505
## 257 257 37809
## 258 258 39811
## 259 259 32616
## 260 260 25271
## 261 261 35098
Time-length of the epidemic:
length(data$time)
## [1] 261
And then considering the shape of the distribution and the density distribution; the difference can be seen on the right tail as it is going down to zero.
At today even if the cases are still in high number the effect of the lockdown can already be seen, in fact the density graph shows the overcoming of the latest peak. It is not said that if the countermeasures interrupt the infection cannot go back up again. For this reason it is very important to keep going with strick distancing measures.
y_end<-max(data$new_infected)
plot(data$new_infected,type="l", main="New infected curve",
xlab="Time(days)",ylab="New Infected",xlim=c(0,300))
abline(22000,0,col="green")
abline(y_end,0,col="red")
arrows(x0=200,y0=y_end,x1=200,y1=22000, lwd=1,col="red",length = 0.15,code = 1)
arrows(x0=280,y0=30000,x1=280,y1=22000, lwd=1,col="green",length = 0.15,code = 2)
hist(data$new_infected, # histogram
breaks =30,
col="blue1", # column color
border="grey",
prob = TRUE, # show densities instead of frequencies
xlab = "frequency",ylab="Density (probability)",
main = "New Infected density distrib")
lines(density(data$new_infected), # density plot
lwd = 2, # thickness of line
col = "red")
This graph below shows the distribution of the new infected and its moving average:
require(zoo)
ggplot(data, aes(time, new_infected)) +
geom_point(position=position_jitter(1,3), pch=21, fill="#FF0000AA",size=0.9) +
geom_line(aes(y=rollmean(new_infected, 7, na.pad=TRUE))) +
theme_bw()
The prevalence is intended as the number of infected cases at a given piont in time.
The peak prevalence of the duration of the pandemic in Italy:
## [1] "Peak prevalence of the pandemic in Italy:"
## [1] 39811
## [1] "Timing of the peak (days):"
## [1] 258
## [1] "Duration of the pandemic (days):"
## [1] 260
To build a workable SIR model it is important to consider the proportion of individuals who are at risk of infection, setting up a percentage of the population who will more likely be infected by the virus, also considering the latent phase of the infection when the susceptible are infected but still not infectious. To do this the variable “E” has been included in the model as well as “A” for the asymptomatics.
This first part of the model is to identify the second wave and to do thar a SEAIR model has been used: (the model parameters have been hided for quality purposes)
The SEAIR model doesn't take consideration of the possiblilty of herd immunity but considers 6 month of temporary immunity, it considers 30% of asymptomatics and Italian population mortality and birth rates.
SEAIR_model <- function(time, state, parameters) {
with(as.list(c(state, parameters)), {
N <- S+E+A+I+R
lambda <- beta * I/N # force of infection
# The differential equations
dS <- -lambda * S - mu * S + b * N
dE <- lambda * S - r * E
dA <- (1-p_a) * lambda * S - gamma * A
dI <- p_a * lambda * S + r * E - gamma * I - mu * I
dR <- gamma * A + gamma * I - mu * R + b * N
# Output
return(list(c(dS, dE, dA, dI, dR)))
})
}
# MODEL OUTPUT
output <- as.data.frame(ode(y = initial_state_values,
times = times,
func = SEAIR_model,
parms = parameters))
# PLOT OF THE MODEL FIT
ggplot() +
geom_line(data = output, aes(x = time, y = I)) + # plot the model prediction of
# the number infected as a line
geom_point(data = data, aes(x = time, y = new_infected),
colour = "red",size=0.8) + # overlay the data as red dots
xlab("Time (days)")+
ylab("Number of new infected people") +
labs(title = paste("Model fit to the second wave epidemic curve with beta =",
parameters["beta"],"and gamma =", parameters["gamma"]),
caption= "Data source: GitHub_pcm-dpc Civil Protection")+
xlim(100,300)+
ylim(0,50000)+
theme_bw()
#length(data$time)
CV19_IT_dy<-CV19_IT%>%arrange(time)
CV19_IT_dy<-CV19_IT_dy%>%mutate(infections=round((c(0,diff(infected))/infected)*100,2),
infec_rt=round((infections/cases)*100,2),
rec_rt=round((c(0,diff(recovered))/c(0,diff(infected)))*100,2))%>%
select(infections,infec_rt,rec_rt)
CV19_IT_dy$rec_rt[is.nan(CV19_IT_dy$rec_rt)] <- 0
#head(CV19_IT_dy)
fit<-lm(infections~infec_rt+rec_rt,data=CV19_IT_dy)
summary(fit)
##
## Call:
## lm(formula = infections ~ infec_rt + rec_rt, data = CV19_IT_dy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0861 -2.3602 -1.2728 0.5462 21.1226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.157176 0.317971 9.929 < 2e-16 ***
## infec_rt 4.774435 0.344027 13.878 < 2e-16 ***
## rec_rt -0.004877 0.001056 -4.616 6.16e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.262 on 258 degrees of freedom
## Multiple R-squared: 0.4694, Adjusted R-squared: 0.4653
## F-statistic: 114.1 on 2 and 258 DF, p-value: < 2.2e-16
summary(fit)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.15717557 0.31797084 9.929135 6.919755e-20
## infec_rt 4.77443549 0.34402726 13.878073 4.343420e-33
## rec_rt -0.00487658 0.00105637 -4.616357 6.163027e-06
Here is a representation of the forecast provided with Arima forecasting model:
the first is the infections rate, calculated as the proportion of new infected on cumulated number of infections, the plot shows the potential decrease in rate of infection as long as the application of the containment measure are active.
fit_inf_rt <- auto.arima(CV19_IT_dy$infections,1,
trace = TRUE,
approximation = FALSE)
##
## ARIMA(2,1,2) with drift : 1192.319
## ARIMA(0,1,0) with drift : 1303.362
## ARIMA(1,1,0) with drift : 1191.256
## ARIMA(0,1,1) with drift : 1214.176
## ARIMA(0,1,0) : 1301.336
## ARIMA(2,1,0) with drift : 1191.904
## ARIMA(1,1,1) with drift : 1192.045
## ARIMA(2,1,1) with drift : 1190.293
## ARIMA(3,1,1) with drift : 1164.59
## ARIMA(3,1,0) with drift : 1193.044
## ARIMA(4,1,1) with drift : 1181.126
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) with drift : 1186.171
## ARIMA(4,1,2) with drift : 1160.079
## ARIMA(5,1,2) with drift : Inf
## ARIMA(4,1,3) with drift : Inf
## ARIMA(3,1,3) with drift : 1160.131
## ARIMA(5,1,1) with drift : 1183.07
## ARIMA(5,1,3) with drift : Inf
## ARIMA(4,1,2) : 1173.536
##
## Best model: ARIMA(4,1,2) with drift
fit_inf_short<- arima(CV19_IT_dy$infections,
c(4,1,2),fixed = NULL,
transform.pars = TRUE,
include.mean = TRUE,
optim.method = "BFGS")
mypar(2)
plot( forecast(fit_inf_rt,36),main = paste("Covid19 IT second wave of infections rt - ARIMA "))
plot(forecast(fit_inf_short),150,
main = paste("Best model ARIMA(4,1,2) with drift"),
xlim=c(100,270))
The second is the fitting of the recovered rate distribution shows a fair stability since the application of the treatments.
fit_rec<-auto.arima(CV19_IT_dy$rec_rt,trace=FALSE)
fit_rec
## Series: CV19_IT_dy$rec_rt
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.8055
## s.e. 0.0332
##
## sigma^2 estimated as 28341: log likelihood=-1701.71
## AIC=3407.43 AICc=3407.48 BIC=3414.55
plot(forecast(fit_rec))
This second part of the modeling represent the basic of a SIR model applyed to Italian’s first wave of Covid19 outbreak prevalence of infection.
require(deSolve)
N<- max(CV19_IT$cases)
initial_state_values<- c(S=N-1,I=1,R=0)
parameters<- c(beta= 1/3,gamma=1/22)#beta (infection rate) is hypothesized to be 3 days^-1,
#gamma (recovery rate) is 22 days^-1
times<- seq(0,length(CV19_IT$time),by=1)
SIR_model<- function(time,state,parameters){
with(as.list(c(state,parameters)),{
N<- S+I+R
lambda<- beta*I/N
dS<- -lambda*S
dI<- lambda*S-gamma*I
dR<- gamma*I
return(list(c(dS,dI,dR)))
})
}
output_sir<- as.data.frame(ode(y=initial_state_values,
times=times,
func=SIR_model,
parms=parameters))
ggplot()+
geom_line(data=output_sir,aes(x=time,y=I),col="red")+
labs(x="Time(days)",y="Infected", title="IT Prevalence of infection")+
theme_bw()
The peak prevalence is clearly at 50 days the start of the epidemic
output_sir_long<- melt(as.data.frame(output_sir), id="time")
ggplot(data=output_sir_long, aes(x=time, y=value,colour=variable,group=variable))+
geom_line()+
labs( x="Time(days)",y="proportions", title="IT Covid19 theoretical Prevalence proportion: basic SIR model",subtitle=paste("with beta =",round(parameters["beta"],2),"and gamma =", round(parameters["gamma"],2)))+
theme_bw()
To calibrate the theoretical model on the observed data, a simple linear regression has been used:
#data is the obeserved new_infected dataframe
#times <- seq(from = 0, to = 14, by = 0.1)
SIR_SSQ <- function(parameters, dat){
beta<- parameters[1]
gamma<- parameters[2]
output1 <- as.data.frame(ode(y = initial_state_values,
times = times,
func = SIR_model,
parms = c(beta=beta,
gamma=gamma))
)
SSQ<- sum((output1$I[output1$time %in% dat$time] - dat$new_infected)^2)
return(SSQ)
}
optimised_IT<- optim( par = c(0.33,0.05),
fn = SIR_SSQ,
dat =data)
round(optimised_IT$par,2)# 0.45 0.08
## [1] 0.00 -0.04
## [1] 0.21 0.17
# 0.21 0.17
With these two new parameters
initial_state_values <- c(S = 10000-1, I = 1, R = 0)
times <- seq(from = 0, to = 100, by = 1)
parameters<- c(beta=0.45,gamma=0.047)#♠ 0.55 are 18 days for recovery
opt_mod_IT <- as.data.frame(ode(y = initial_state_values,
times = times,
func = SIR_model,
parms = parameters))
require(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
opt_plot <- ggplot()+
geom_point(aes(x = time, y = new_infected),
colour = "red",
shape = "x",
data = data)+
geom_line(aes(x = time, y = I),
colour = "blue",
data = opt_mod_IT,size=0.2)+
labs(x="Time(days)",y="Prevalence of new infections",title="First wave prevalence ", subtitle=paste("beta=",round(parameters[1],2),"and gamma=",round(parameters[2],2)),caption="Data Source: Civil Protection data")+
xlim(0,100)+ylim(0,10000)+
theme_bw()
opt_plot_2<- ggplot()+geom_point(aes(x = time, y = new_infected)
, colour = "red"
, shape = "x"
, data = data)+
geom_line(data = output, aes(x = time, y = I),col="blue",size=0.2)+
labs(x="Time(days)",title="Second wave",subtitle="beta = 0.457 and gamma = 1/4",caption="Data Source: Civil Protection data")+
xlim(100,length(data$time))+
theme_bw()
grid.arrange(opt_plot,opt_plot_2,ncol=2)