library (dplyr)
library(deSolve)
library(reshape2)
library(ggplot2)
library(rafalib)
library(forecast)
library(rmarkdown)
library(markdown)

Covid19 Data:

Sys.time()

## [1] "2020-11-10 22:59:13 CET"

## [1] "2020-10-01 20:51:40 CEST"
CV19_backup<- read.csv("https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv")
#head(CV19_backup);tail(CV19_backup)
names(CV19_backup)

##  [1] "data"                         "stato"                       
##  [3] "ricoverati_con_sintomi"       "terapia_intensiva"           
##  [5] "totale_ospedalizzati"         "isolamento_domiciliare"      
##  [7] "totale_positivi"              "variazione_totale_positivi"  
##  [9] "nuovi_positivi"               "dimessi_guariti"             
## [11] "deceduti"                     "casi_da_sospetto_diagnostico"
## [13] "casi_da_screening"            "totale_casi"                 
## [15] "tamponi"                      "casi_testati"                
## [17] "note"

require(dplyr)
CV19_IT<- CV19_backup%>%
  mutate(time=seq(1,length(data),by=1),cases=totale_casi,infected= totale_casi-deceduti,recovered=dimessi_guariti, deceased=deceduti,CFR=round(deceduti/totale_casi*100,2))%>%
  select(time, cases, infected, deceased,recovered,CFR)%>%
  arrange(desc(cases))

head(CV19_IT)

##   time  cases infected deceased recovered  CFR
## 1  261 995463   953133    42330    363023 4.25
## 2  260 960373   918623    41750    345289 4.35
## 3  259 935104   893710    41394    335074 4.43
## 4  258 902490   861427    41063    328891 4.55
## 5  257 862681   822043    40638    322925 4.71
## 6  256 824879   784687    40192    312339 4.87

CV19_IT_box<-CV19_IT%>%arrange(time)
  
CV19_IT_box<-CV19_IT_box%>%
  mutate(new_cases=c(0,diff(cases)),
         new_infected=c(0,diff(infected)),
         new_deceased=c(0,diff(deceased)),
         new_recovered=c(0,diff(recovered)))%>%
  select(new_cases,new_infected,new_deceased,new_recovered)

#head(CV19_IT_box)
boxplot(CV19_IT_box,
        main="Covid19 Italy- Distributions of proportions variability",
        col=(c("gold","red","green","blue")))

summary(CV19_IT_box)

##    new_cases      new_infected    new_deceased   new_recovered  
##  Min.   : -148   Min.   : -195   Min.   :-31.0   Min.   :    0  
##  1st Qu.:  328   1st Qu.:  285   1st Qu.: 12.0   1st Qu.:  312  
##  Median : 1326   Median : 1211   Median : 44.0   Median :  929  
##  Mean   : 3813   Mean   : 3651   Mean   :162.2   Mean   : 1391  
##  3rd Qu.: 3491   3rd Qu.: 2933   3rd Qu.:233.0   3rd Qu.: 1908  
##  Max.   :39809   Max.   :39384   Max.   :969.0   Max.   :17734

CV19_IT_long <- melt(as.data.frame(CV19_IT), id = "time")

ggplot(data = CV19_IT_long,                                              
       aes(x = time, y = value, colour = variable, group = variable)) + 
  geom_line() +                                                          
  xlab("Time (days)")+                                                  
  ylab("Number") +                                                      
  labs(colour = "Compartment",                                           
       title = "Novel Coronavirus Outbreak - Italy")+theme_bw()

Creating a new dataset with the number of new infected:

data<- data.frame(time= 1:length(CV19_backup$data), 
                  new_infected = CV19_backup$nuovi_positivi[1:length(CV19_backup$data)] )  #real distribution of new infected

head(data);tail(data)

##   time new_infected
## 1    1          221
## 2    2           93
## 3    3           78
## 4    4          250
## 5    5          238
## 6    6          240

##     time new_infected
## 256  256        34505
## 257  257        37809
## 258  258        39811
## 259  259        32616
## 260  260        25271
## 261  261        35098

Time-length of the epidemic:

length(data$time)

## [1] 261

And then considering the shape of the distribution and the density distribution; the difference can be seen on the right tail as it is going down to zero.

At today even if the cases are still in high number the effect of the lockdown can already be seen, in fact the density graph shows the overcoming of the latest peak. It is not said that if the countermeasures interrupt the infection cannot go back up again. For this reason it is very important to keep going with strick distancing measures.

y_end<-max(data$new_infected)

plot(data$new_infected,type="l", main="New infected curve",
     xlab="Time(days)",ylab="New Infected",xlim=c(0,300))
abline(22000,0,col="green")
abline(y_end,0,col="red")
arrows(x0=200,y0=y_end,x1=200,y1=22000, lwd=1,col="red",length = 0.15,code = 1)
arrows(x0=280,y0=30000,x1=280,y1=22000, lwd=1,col="green",length = 0.15,code = 2)

hist(data$new_infected, # histogram
 breaks =30,
 col="blue1", # column color
 border="grey",
 prob = TRUE, # show densities instead of frequencies
 xlab = "frequency",ylab="Density (probability)",
 main = "New Infected density distrib")
 lines(density(data$new_infected), # density plot
 lwd = 2, # thickness of line
 col = "red")

This graph below shows the distribution of the new infected and its moving average:

require(zoo)
ggplot(data, aes(time, new_infected)) + 
  geom_point(position=position_jitter(1,3), pch=21, fill="#FF0000AA",size=0.9) +
  geom_line(aes(y=rollmean(new_infected, 7, na.pad=TRUE))) +
  theme_bw()

The prevalence is intended as the number of infected cases at a given piont in time.

The peak prevalence of the duration of the pandemic in Italy:

## [1] "Peak prevalence of the pandemic in Italy:"

## [1] 39811

## [1] "Timing of the peak (days):"

## [1] 258

## [1] "Duration of the pandemic (days):"

## [1] 260

To build a workable SIR model it is important to consider the proportion of individuals who are at risk of infection, setting up a percentage of the population who will more likely be infected by the virus, also considering the latent phase of the infection when the susceptible are infected but still not infectious. To do this the variable “E” has been included in the model as well as “A” for the asymptomatics.

This first part of the model is to identify the second wave and to do thar a SEAIR model has been used: (the model parameters have been hided for quality purposes)

MODEL FUNCTION

The SEAIR model doesn't take consideration of the possiblilty of herd immunity but considers 6 month of temporary immunity, it considers 30% of asymptomatics and Italian population mortality and birth rates.

SEAIR_model <- function(time, state, parameters) {  
  
  with(as.list(c(state, parameters)), {
    
    N <- S+E+A+I+R
    
    lambda <- beta * I/N    # force of infection
    
    # The differential equations
    dS <- -lambda * S - mu * S + b * N 
    
    dE <- lambda * S - r * E
    dA <- (1-p_a) * lambda * S - gamma * A
      
    dI <-  p_a * lambda * S + r * E - gamma * I - mu * I
    dR <- gamma * A + gamma * I - mu * R + b * N   
  
    # Output
    return(list(c(dS, dE, dA, dI, dR))) 
  })
}


# MODEL OUTPUT

output <- as.data.frame(ode(y = initial_state_values, 
                            times = times, 
                            func = SEAIR_model,
                            parms = parameters))

# PLOT OF THE MODEL FIT

ggplot() +
  geom_line(data = output, aes(x = time, y = I)) +        # plot the model prediction of
                                                          # the number infected as a line
  geom_point(data = data, aes(x = time, y = new_infected), 
             colour = "red",size=0.8) +  # overlay the data as red dots
  xlab("Time (days)")+                                              
  ylab("Number of new infected people") +                                 
  labs(title = paste("Model fit to the second wave epidemic curve with beta =", 
                     parameters["beta"],"and gamma =", parameters["gamma"]),
       caption= "Data source: GitHub_pcm-dpc Civil Protection")+
  xlim(100,300)+
  ylim(0,50000)+
  theme_bw()

#length(data$time)

CV19_IT_dy<-CV19_IT%>%arrange(time)

CV19_IT_dy<-CV19_IT_dy%>%mutate(infections=round((c(0,diff(infected))/infected)*100,2),
                             infec_rt=round((infections/cases)*100,2),
                             rec_rt=round((c(0,diff(recovered))/c(0,diff(infected)))*100,2))%>%
  select(infections,infec_rt,rec_rt)

CV19_IT_dy$rec_rt[is.nan(CV19_IT_dy$rec_rt)] <- 0

#head(CV19_IT_dy)

fit<-lm(infections~infec_rt+rec_rt,data=CV19_IT_dy)
summary(fit)

## 
## Call:
## lm(formula = infections ~ infec_rt + rec_rt, data = CV19_IT_dy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.0861  -2.3602  -1.2728   0.5462  21.1226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.157176   0.317971   9.929  < 2e-16 ***
## infec_rt     4.774435   0.344027  13.878  < 2e-16 ***
## rec_rt      -0.004877   0.001056  -4.616 6.16e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.262 on 258 degrees of freedom
## Multiple R-squared:  0.4694, Adjusted R-squared:  0.4653 
## F-statistic: 114.1 on 2 and 258 DF,  p-value: < 2.2e-16

summary(fit)$coef

##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  3.15717557 0.31797084  9.929135 6.919755e-20
## infec_rt     4.77443549 0.34402726 13.878073 4.343420e-33
## rec_rt      -0.00487658 0.00105637 -4.616357 6.163027e-06

Here is a representation of the forecast provided with Arima forecasting model:

the first is the infections rate, calculated as the proportion of new infected on cumulated number of infections, the plot shows the potential decrease in rate of infection as long as the application of the containment measure are active.

fit_inf_rt <- auto.arima(CV19_IT_dy$infections,1, 
                             trace = TRUE, 
                             approximation = FALSE)

## 
##  ARIMA(2,1,2) with drift         : 1192.319
##  ARIMA(0,1,0) with drift         : 1303.362
##  ARIMA(1,1,0) with drift         : 1191.256
##  ARIMA(0,1,1) with drift         : 1214.176
##  ARIMA(0,1,0)                    : 1301.336
##  ARIMA(2,1,0) with drift         : 1191.904
##  ARIMA(1,1,1) with drift         : 1192.045
##  ARIMA(2,1,1) with drift         : 1190.293
##  ARIMA(3,1,1) with drift         : 1164.59
##  ARIMA(3,1,0) with drift         : 1193.044
##  ARIMA(4,1,1) with drift         : 1181.126
##  ARIMA(3,1,2) with drift         : Inf
##  ARIMA(4,1,0) with drift         : 1186.171
##  ARIMA(4,1,2) with drift         : 1160.079
##  ARIMA(5,1,2) with drift         : Inf
##  ARIMA(4,1,3) with drift         : Inf
##  ARIMA(3,1,3) with drift         : 1160.131
##  ARIMA(5,1,1) with drift         : 1183.07
##  ARIMA(5,1,3) with drift         : Inf
##  ARIMA(4,1,2)                    : 1173.536
## 
##  Best model: ARIMA(4,1,2) with drift

fit_inf_short<- arima(CV19_IT_dy$infections,
                  c(4,1,2),fixed = NULL,
                  transform.pars = TRUE,
                  include.mean = TRUE,
                  optim.method = "BFGS")

mypar(2)
plot( forecast(fit_inf_rt,36),main = paste("Covid19 IT second wave of infections rt - ARIMA "))

plot(forecast(fit_inf_short),150,
     main = paste("Best model ARIMA(4,1,2) with drift"),
     xlim=c(100,270))

The second is the fitting of the recovered rate distribution shows a fair stability since the application of the treatments.

fit_rec<-auto.arima(CV19_IT_dy$rec_rt,trace=FALSE)
fit_rec

## Series: CV19_IT_dy$rec_rt 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.8055
## s.e.   0.0332
## 
## sigma^2 estimated as 28341:  log likelihood=-1701.71
## AIC=3407.43   AICc=3407.48   BIC=3414.55

plot(forecast(fit_rec))

This second part of the modeling represent the basic of a SIR model applyed to Italian’s first wave of Covid19 outbreak prevalence of infection.

require(deSolve)


N<- max(CV19_IT$cases)

initial_state_values<- c(S=N-1,I=1,R=0)

parameters<- c(beta= 1/3,gamma=1/22)#beta (infection rate) is hypothesized to be 3 days^-1, 
                                    #gamma (recovery rate) is 22 days^-1

times<- seq(0,length(CV19_IT$time),by=1)

SIR_model<- function(time,state,parameters){
  with(as.list(c(state,parameters)),{
       
    N<- S+I+R
    lambda<- beta*I/N
       
       dS<- -lambda*S
       dI<- lambda*S-gamma*I
       dR<- gamma*I
       
       return(list(c(dS,dI,dR)))
       })
}

output_sir<- as.data.frame(ode(y=initial_state_values,
                           times=times,
                           func=SIR_model,
                           parms=parameters))

ggplot()+
  geom_line(data=output_sir,aes(x=time,y=I),col="red")+
  labs(x="Time(days)",y="Infected", title="IT Prevalence of infection")+
  theme_bw()

The peak prevalence is clearly at 50 days the start of the epidemic

output_sir_long<- melt(as.data.frame(output_sir), id="time")
ggplot(data=output_sir_long, aes(x=time, y=value,colour=variable,group=variable))+
  geom_line()+
 labs( x="Time(days)",y="proportions", title="IT Covid19 theoretical Prevalence proportion: basic SIR model",subtitle=paste("with beta =",round(parameters["beta"],2),"and gamma =", round(parameters["gamma"],2)))+
  theme_bw()

To calibrate the theoretical model on the observed data, a simple linear regression has been used:

#data is the obeserved new_infected dataframe 

#times <- seq(from = 0, to = 14, by = 0.1) 

SIR_SSQ <- function(parameters, dat){
  beta<- parameters[1]
  gamma<- parameters[2]
            
  output1 <- as.data.frame(ode(y = initial_state_values,
                                 times = times,
                                 func = SIR_model,
                                 parms = c(beta=beta,
                                           gamma=gamma))
                             ) 
    
    SSQ<- sum((output1$I[output1$time %in% dat$time] - dat$new_infected)^2)
                          
    return(SSQ)
}

optimised_IT<- optim( par = c(0.33,0.05),
                      fn = SIR_SSQ,
                      dat =data)

round(optimised_IT$par,2)# 0.45 0.08

## [1]  0.00 -0.04

## [1] 0.21 0.17
# 0.21 0.17

With these two new parameters

♥

initial_state_values <- c(S = 10000-1, I = 1, R = 0)

times <- seq(from = 0, to = 100, by = 1) 

parameters<- c(beta=0.45,gamma=0.047)#♠ 0.55 are 18 days for recovery

opt_mod_IT <- as.data.frame(ode(y = initial_state_values,
                             times = times,
                             func = SIR_model,
                             parms = parameters))
require(ggplot2)


library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
opt_plot <- ggplot()+
  
  geom_point(aes(x = time, y = new_infected),
             colour = "red",
             shape  = "x",
             data = data)+
  
  geom_line(aes(x = time, y = I),
            colour = "blue",
            data   = opt_mod_IT,size=0.2)+
  
  labs(x="Time(days)",y="Prevalence of new infections",title="First wave prevalence ", subtitle=paste("beta=",round(parameters[1],2),"and gamma=",round(parameters[2],2)),caption="Data Source: Civil Protection data")+
  xlim(0,100)+ylim(0,10000)+
  theme_bw()

opt_plot_2<- ggplot()+geom_point(aes(x = time, y = new_infected)
                                , colour = "red"
                                , shape  = "x" 
                                , data = data)+
  geom_line(data = output, aes(x = time, y = I),col="blue",size=0.2)+ 
  
  labs(x="Time(days)",title="Second wave",subtitle="beta = 0.457 and gamma = 1/4",caption="Data Source: Civil Protection data")+
  xlim(100,length(data$time))+
  theme_bw()
  
  grid.arrange(opt_plot,opt_plot_2,ncol=2)

Covid19_IT_model.R - update: November 2020

Federica Gazzelloni

10/25/2020

MODEL FUNCTION

♥