dataECDC <- read.csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv", na.strings = "", fileEncoding = "UTF-8-BOM")
names(dataECDC)
##  [1] "dateRep"                                                   
##  [2] "day"                                                       
##  [3] "month"                                                     
##  [4] "year"                                                      
##  [5] "cases"                                                     
##  [6] "deaths"                                                    
##  [7] "countriesAndTerritories"                                   
##  [8] "geoId"                                                     
##  [9] "countryterritoryCode"                                      
## [10] "popData2019"                                               
## [11] "continentExp"                                              
## [12] "Cumulative_number_for_14_days_of_COVID.19_cases_per_100000"

Table of the last 6 days of COVID19 UK data:

Date Days cases survivors deaths Cum_14days incidence CFR_100 CMR_100
07/10/2020 282 14542 14466 76 189.899 0.99 0.52 11.40
06/10/2020 281 12593 12574 19 175.470 1.00 0.15 2.85
05/10/2020 280 22961 22928 33 163.129 1.00 0.14 4.95
04/10/2020 279 12871 12822 49 134.528 1.00 0.38 7.35
03/10/2020 278 6968 6902 66 121.851 0.99 0.95 9.90
02/10/2020 277 6914 6855 59 117.881 0.99 0.85 8.85
require(zoo)
require(ggplot2)

ggplot(Covid19_UK, aes(Days, cases)) + 
  geom_point(position=position_jitter(1,3), pch=21, fill="#FF0000AA") +
  geom_line(aes(y=rollmean(cases, 7, na.pad=TRUE))) +
  theme_bw()+
  labs(x="Time(days)",y="Number of Cases", title="UK Covid19 Cases distribution in time"
                  ,caption="Data Source: ECDC")+
  scale_x_continuous(breaks = seq(from = 0, to = length(Covid19_UK$Days), by = 30))

ggplot(Covid19_UK, aes(Days, incidence)) + 
  geom_point(position=position_jitter(1,3), pch=21, fill="#FF0000AA") +
  geom_line(aes(y=rollmean(incidence, 7, na.pad=TRUE))) +
  labs(x="Time(days)",y="Incidence proportion",title="Incidence proportion distribution",subtitle="on going infections on total cases")+
  scale_x_continuous(breaks = seq(from = 0, to = length(Covid19_UK$Days), by = 30))+
  theme_bw()

Data are sparse and it is necessary a further analysis to see if the behaviour is Normal. To do that:

Covid19_UK$incidence[is.nan(Covid19_UK$incidence)]<- 0   #elimination of NaN values

incidence<- Covid19_UK$incidence #creating a incidence variable

n<- length(incidence)
inc_avg<- mean(incidence)#0.7679004
inc_sd<- sd(incidence)# 0.363562
inc_se<- inc_sd/sqrt(n)#0.02168829
inc_med<- median(incidence)#0.93

CI_95<-qnorm(0.975)

error<- CI_95*inc_se #margin of error 0.04250827

CI_interv_1<-inc_avg+CI_95*inc_se
CI_interv_2<-inc_avg-CI_95*inc_se

round(CI_interv_2,2);round(CI_interv_1,2);round(error,2)
## [1] 0.73
## [1] 0.81
## [1] 0.04

A Normal distribution of the incicence:

inc_norm<-rnorm(incidence)

qqnorm(inc_norm)
qqline(inc_norm,col="blue")

The following graph show the difference in shape distribution for the right identification of the shape of Incidence proportion distribution:

qplot(inc_norm, geom="histogram",bins = 30,  
      main = "Histogram for Norm Incidence", 
      xlab = "Norm Inc",  
      fill=I("blue"), 
      col=I("red"),alpha=I(.2))+
    theme_bw()

While if the moving average of the observed incidence data is taken, the value are very difficult to distinguish, as well as the behavioural and so is the shape for further forecasting:

prev_Norm_inc<- Covid19_UK %>% 
  mutate(rolling_avg = slide_dbl(incidence, mean, .before = 3, .complete = F))


  ggplot()+
  geom_line(data=Covid19_UK,aes(x=Days,y=incidence),col="white")+
  geom_line(data=prev_Norm_inc,aes(x=Days,y=rolling_avg),col="red")+
    xlim(60,290)+ylim(0.75,1)+
    labs(title="Rolling Average for UK Incidence proportion of Infection",
         x="Time(days)",y="proportion",caption="Data Source: ECDC")+
    theme_bw()

The density shows a clear increase and prominence on the right tail of the observed distribution:

inc_norm_s<- pnorm((incidence-inc_avg)/inc_sd)
plot(density(inc_norm_s),main= "UK Incidence Density")

To make it clearer and to hazard an estimation, four samples are replicated to obtain sample distribution of the incidence proportion at 100,200,300 and 400 time-frame:

Ns<-seq(100,400,100)
B <- 1000
par(mfrow=c(2,2))
LIM <- c(-4,4)

for(incidence_s in Ns){
    ts <- replicate(B, {
      
    X <- rnorm(incidence_s)
    
    sqrt(incidence_s)*mean(X)/sd(X)
    })
    ps <- seq(1/(B+1),1-1/(B+1),len=B)
    
    qqplot(qt(ps,df=incidence_s-1),ts,main=incidence_s,
           
           xlab="Theoretical",ylab="Observed",
           xlim=LIM, ylim=LIM)
    abline(0,1)
}

A further 10 000 replications of the observed incidence distribution is taken and matched with the normal sample:

set.seed(711)#710
inc_samp<- sample(incidence,1000,replace=TRUE)

mean(inc_samp);sd(inc_samp);sd(inc_samp)/sqrt(10000)
## [1] 0.76299
## [1] 0.3671282
## [1] 0.003671282
samp_inc_norm<- rnorm(inc_samp)

par(mfrow=c(1,2))
qqnorm(samp_inc_norm)
qqline(samp_inc_norm,col="blue")

qqnorm(inc_norm)
qqline(inc_norm,col="red")

The updated histogram of the cases in UK shows some outliners:

ggplot()+
  geom_histogram(data=Covid19_UK, aes(x=cases),bins=35, fill = "darkgreen", 
                 col = "black")+
  labs(x="Cases", y="Frequency",title="UK Cases")#+

 # scale_x_continuous(breaks = seq(from = 0, to = 7000, by = 500)) + theme_bw()

Table of the last 6 days of COVID19 Italy data:

Date Days cases survivors deaths Cum_14days incidence CFR_100 CMR_100
07/10/2020 282 2677 2649 28 49 0.99 1.05 4.64
06/10/2020 281 2257 2241 16 47 0.99 0.71 2.65
05/10/2020 280 2578 2560 18 45 0.99 0.70 2.98
04/10/2020 279 2843 2816 27 43 0.99 0.95 4.47
03/10/2020 278 2499 2476 23 41 0.99 0.92 3.81
02/10/2020 277 2548 2524 24 40 0.99 0.94 3.98

Plot of the two curves of cumulated cases for UK and Italy for the whole period:

ggplot()+
  geom_line(data=Covid19_UK,aes(x=Days,y=cases),col="blue",size = 0.3)+
  geom_line(data=Covid19_IT,aes(x=Days,y=cases),col="red",size = 0.3)+
  annotate(geom="text", x=50, y=6000, label="Italy",
              color="red")+
  annotate(geom="text", x=150, y=4000, label="UK",
              color="blue")+
    theme_bw(base_size = 12) + theme(legend.position = "bottom")+
  xlab("Time(days)")+ylab("Cases")+
  labs("UK and Italy cumulated cases trend",color = "Legend")

UK Cases, Deaths and Survivors trends:

ggplot()+
  geom_line(data=Covid19_UK,aes(x=Days,y=cases),col="green",size=0.3)+
  geom_line(data=Covid19_UK,aes(x=Days,y=deaths),col="red",size=0.3)+
  geom_line(data=Covid19_UK,aes(x=Days,y=survivors),col="blue",size=0.3)+
  theme_bw(base_size = 12) + theme(legend.position = "bottom")+
  labs(x="Time(days)", y="Cumulate numbers", title="UK Cases, deaths and CFR trends")

UK Crude Mortality rates (CMR) and Case Fatality Rates (CFR):

plot(density(Covid19_UK$CMR_100),col="green",xlim=c(-20,50),ylim=c(0,0.1),main="UK CMR and CFR")
lines(density(Covid19_UK$CFR_100,na.rm = T),col="red")
axis(side = 1, at = round(seq(0, 50, by = 10)))
text(x=median(Covid19_UK$CMR_100), y=0.04, labels="CMR")
text(x=mean(Covid19_UK$CFR_100,na.rm=T), y=0.07, labels="CFR")

The CFR curve shows a right(positive) skewed distribution of fatalities along the time:

ggplot()+
  geom_histogram(data=Covid19_UK,aes(x=CFR_100, main="UK CFR"),bins=30,
                 fill = "darkgreen", 
                 col = "black")+
  labs(x="CFR_100", y="Frequency",title="UK CFR_100")

In this section is represented the basic of the SIR model with given beta and gamma, it is expected a further calibration of the model considering other factors, such as: - effect of treatment - effect of social distancing - age impact - co-morbidity - symptomatic and asymptomatic behavior - vaccination strategies

To begin with a theoretical value of beta to calibrate the model using a simple linear regression model function: the sum of squares values (SSQ). The line that best predict the lowest distance between theoretical predictors and real data, to assess how far is the model from real data.

The prevalence of the theoretical model infection is represented in the graph below:

N<- max(Covid19_UK$cases)

initial_state_values<- c(S=N-1,I=1,R=0)

parameters<- c(beta= 1/3,gamma=1/22)#beta (infection rate) is hypothesized to be 3 days^-1, 
                                    #gamma (recovery rate) is 22 days^-1

times<- seq(0,length(Covid19_UK$Date),by=1)

SIR_model<- function(time,state,parameters){
  with(as.list(c(state,parameters)),{
       N<- S+I+R
       lambda<- beta*I/N
       dS<- -lambda*S
       dI<- lambda*S-gamma*I
       dR<- gamma*I
       return(list(c(dS,dI,dR)))
       })
}

output<- as.data.frame(ode(y=initial_state_values,
                           times=times,
                           func=SIR_model,
                           parms=parameters))

ggplot()+
  geom_line(data=output,aes(x=time,y=I))+
  labs(x="Time(days)",y="Infected", title="UK Prevalence of infection")

output_long<- melt(as.data.frame(output), id="time")
ggplot(data=output_long, aes(x=time, y=value,colour=variable,group=variable))+
  geom_line()+
 labs( x="Time(days)",y="proportions", title="UK Covid19 theoretical Prevalence proportion: basic SIR model")

Simple Linear Regression:

SIR_fun<- function(time,state,parameters){
  with(as.list(c(state,parameters)),{
       N<- S+I+R
      
       dS<- -beta*S*(I/N)
       dI<- beta*S*(I/N)
       dR<- gamma*I
       return(list(c(dS,dI,dR)))
       })}
  
SIR_SSQ <- function(parameters, dat) {  
            
    result <- as.data.frame(ode(y = initial_state_values  
                              , times = times            
                              , func = SIR_fun           
                              , parms = parameters)      
                            )
    dat <- na.omit(dat)  
    
    deltas2 <- (result$I[result$time %in% dat$time] - dat$I)^2 
    
    SSQ   <- sum(deltas2)
    
    return(SSQ)
  }
require(reshape2)

dat_UK<- Covid19_UK%>%
  mutate(number_infected = (cases-deaths),time=Days)%>%
  dplyr::select(time,number_infected)

head(formattable(dat_UK))
time number_infected
282 14466
281 12574
280 22928
279 12822
278 6902
277 6855
initial_state_values <- c(S = 10500-1, I = 1, R = 0)

beta_start  <- 0.15
gamma_start <- 0.04

times <- seq(from = 0, to = 14, by = 0.1) 

optimised <- optim(par = c(beta = beta_start
                        , gamma = gamma_start)     
                        , fn = SIR_SSQ
                        , dat = dat_UK  
      )
round(optimised$par,2)#0.15  0.04
##  beta gamma 
##  0.15  0.04
times <- seq(from = 0, to = length(Covid19_UK$cases), by = 1) 

opt_mod <- as.data.frame(ode(y = initial_state_values,
                             times = times,
                             func = SIR_model,
                             parms = optimised$par))
require(ggplot2)


opt_plot <- ggplot()
opt_plot <- opt_plot + geom_point(aes(x = time, y = number_infected)
                                , colour = "red"
                                , shape  = "x" 
                                , data = dat_UK)
opt_plot <- opt_plot + geom_line(aes(x = time, y = I)
                                 , colour = "blue"
                                 , data   = opt_mod)
opt_plot