# required libraries

library(deSolve)
library(shape)  # for plotting arrows
library(progress)  # for drawing the progress bar


####################################
##
## The basic model (which does not result in the exact solution)
## Adapted from the previous blogpost but with small adaptations
##
####################################

# the data in China
#https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_Wuhan_coronavirus_outbreak#Cases_Chronology_in_Mainland_China
#https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/China_medical_cases

IpRpD <- c(45,62,121,198,291,440,571,830,1287,1975,2744,4515,5974,7711,9692,11791,14380,17205,20440, 24324,28018,31161,34546,37198,40171,42638,44653,59804,63851,66492,68500,70548,72436,74185,74576,75465,76288,76936,77150,77658,78064,78497,78824,79251,79824,80026,80151,80270,80409,80552,80651,80695,80735,80754,80778,80793,80813,80824,80844,80860,80881,80894,80928,80967,81008)

R <-     c(15,19, 24, 25, 25, 25, 25, 34,  38,  49,  51,  60, 103, 124, 171,  243,  328,  475,  632,   892, 1153, 1540, 2050, 2649, 3281, 3996, 4749, 5911, 6723, 8096, 9419,10844,12552,14376,16155,18264,20659,22888,24734,27323,29745,32495,36117,39002,41625,44462,47204,49856,52045,53726,55404,57065,58600,59897,61475,62793,64111,65541,66911,67749,68679,69601,70420,71150,71740)
D <-     c( 2, 2, 2,   3,  6,  9, 17, 25,  41,  56,  80, 106, 132, 170, 213,  259,  304,  361,  425,   490,  563,  637,  722,  811,  908, 1016, 1113, 1259, 1380, 1523, 1665, 1770, 1868, 2004, 2118, 2236, 2345, 2442, 2592, 2663, 2715, 2744, 2788, 2835, 2870, 2912, 2943, 2981, 3012, 3042, 3070, 3097, 3119, 3136, 3158, 3169, 3176, 3189, 3199, 3213, 3226, 3237, 3245, 3248, 3255)



# we take the first 19 to make the analysis relate to the previous blog post
Infected <- (IpRpD)[1:19] 

Day <- 1:(length(Infected))
N <- 1400000000 # population of mainland china

# ODE equation used for fitting
#
# I have removed the R(t) in comparison 
# to the function used in the odler blogpost
# because we are not gonna use that value
# also we have anyway: R(t) = N(0) - N(t) - I(t)
SIR <- function(time, state, parameters) {
  par <- as.list(c(state, parameters))
  with(par, {
    dS <- -beta/N * I * S
    dI <- beta/N * I * S - gamma * I
    list(c(dS, dI))
  })
}

#
# cost function to be optimized in the fitting
#
RSS <- function(parameters) {
  names(parameters) <- c("beta", "gamma")
  out <- ode(y = init, times = Day, func = SIR, parms = parameters)
  fitInfected <- out[,3]
  # fitInfected <- N-out[,2] # this would be a better comparison since the data is not the number of Infectious people
  sum((Infected - fitInfected)^2)
}

# starting condition
init <- c(S = N-Infected[1], I = Infected[1])
# init <- c(S = N-Infected[1], I = Infected[1]-R[1]-D[1])  use this starting condition when applying the different line in the RSS function

# performing the fit
Opt <- optim(c(0.5, 0.5), RSS, method = "L-BFGS-B", lower = c(0, 0), upper = c(1, 1)) # optimize with some sensible conditions
Opt$message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

Opt_par <- setNames(Opt$par, c("beta", "gamma"))
Opt_par
##      beta     gamma 
## 0.6746088 0.3253912
##     beta     gamma 
## 0.693018  0.306982

# plotting the result
t <- 1:70 # time in days
fit <- data.frame(ode(y = init, times = t, func = SIR, parms = Opt_par))

plot(Day,Infected, xlim = c(0,30), ylim = c(0,3*10^4))
lines(t,fit[,3])

###########################
##
## Alternative model which provides a better fit
##
############################

# We transform the equations and instead of 
# parameters beta and gamma
# we use parameters 
#
#    K = beta-gamma
#    R0 = beta/gamma
#
#    or    
#
#    beta =   K * R0/(R0-1)  
#    gamma =  K *  1/(R0-1)
#  
# then the equations become
#
# dS  = I * K * (-S/N *  R0)/(R0-1)
# dI  = I * K * ( S/N *  R0 - 1)/(R0-1)  
# note in the beginning, S/N = 1
# then in the start you get this approximate exponential growth
# dI = I * K * (1)


SIR2 <- function(time, state, parameters) {
  par <- as.list(c(state, parameters))
  with(par, {
    dS <- I * K * (-S/N *  R0/(R0-1))
    dI <- I * K * ( S/N *  R0/(R0-1) - 1/(R0-1))  
    list(c(dS, dI))
  })
}

RSS2 <- function(parameters) {
  names(parameters) <- c("K", "R0")
  out <- ode(y = init, times = Day, func = SIR2, parms = parameters)
  fitInfected <- out[,3]
  #fitInfected <- N-out[,2]
  sum((Infected - fitInfected)^2)
}

### Two functions RSS to do the optimization in a nested way
###
### This nesting requires a lot more computational power
### However, it makes that we have to worry less about the different scale 
### of the parameters

Infected_MC <- Infected
SIRMC2 <- function(R0,K) {
  parameters <- c(K=K, R0=R0)
  out <- ode(y = init, times = Day, func = SIR2, parms = parameters)
  fitInfected <- out[,3]
  #fitInfected <- N-out[,2]
  RSS <- sum((Infected_MC - fitInfected)^2)
  return(RSS)  
}
SIRMC <- function(K) {
  optimize(SIRMC2, lower=1,upper=10^5,K=K, tol = .Machine$double.eps)$objective
}

# wrapper to optimize and return estimated values
getOptim <- function() {
  opt1 <- optimize(SIRMC,lower=0,upper=1, tol = .Machine$double.eps)
  opt2 <- optimize(SIRMC2, lower=1,upper=10^5,K=opt1$minimum, tol = .Machine$double.eps)
  return(list(RSS=opt2$objective,K=opt1$minimum,R0=opt2$minimum))
}

# starting condition
#init <- c(S = N-Infected[1], I = Infected[1]-R[1]-D[1])
init <- c(S = N-Infected[1], I = Infected[1])

# performing the fit 
# starting K=0.3, R0 = 2
Opt2 <- optim(c(0.3, 3), RSS2, method = "L-BFGS-B", 
              hessian = TRUE, control = list(parscale = c(10^0,10^0), factr = 1)) 
Opt2
## $par
## [1] 0.3987135 1.0054626
## 
## $value
## [1] 6510695
## 
## $counts
## function gradient 
##      129      129 
## 
## $convergence
## [1] 52
## 
## $message
## [1] "ERROR: ABNORMAL_TERMINATION_IN_LNSRCH"
## 
## $hessian
##              [,1]         [,2]
## [1,] 1.214301e+11 3.058955e+12
## [2,] 3.058955e+12 1.275578e+14
Opt3 <- getOptim()
Opt3
## $RSS
## [1] 5926880
## 
## $K
## [1] 0.4020269
## 
## $R0
## [1] 1.005332
Opt_par2 <- setNames(Opt2$par, c("K", "R0"))
Opt_par3 <- setNames(Opt3[2:3], c("K", "R0"))


# plotting the result
t <- seq(1,70,0.1) # time in days
fit1 <- data.frame(ode(y = init, times = t, func = SIR , parms = Opt_par))
fit2 <- data.frame(ode(y = init, times = t, func = SIR2, parms = Opt_par2))
fit3 <- data.frame(ode(y = init, times = t, func = SIR2, parms = Opt_par3))

plot(Day,Infected, xlim = c(0,30), ylim = c(0,3*10^4), 
     log = "", xaxt = "n",
     main = "Infected in China (including Recovered and Death)", xlab = "", ylab = "number infected")
lines(t, fit3[,3], col = 1)
lines(t, fit2[,3], col = 4, lty = 2)
lines(t, fit1[,3], col = 2, lty = 3)
axis(1, at = 1:30, labels = rep("",30), tck = -0.01)
axis(1, at = c(1,8,15,22), labels = c("Jan 16", "Jan 23", "Jan 30", "Feb 6"))

text(t[183]+2,fit1[183,3]+1800,"old optim fit",pos=4, col=2)
text(t[183]+2,fit1[183,3],expression(R[0] == 2.07),pos=4, col=2)
text(t[183]+2,fit1[183,3]-1400,expression(RSS == 74.3 %*% 10^6),pos=4, col=2)

text(t[220]+3,fit2[220,3]+3200,"new optim fit",pos=3, col=4)
text(t[220]+3,fit2[220,3]+1400,expression(R[0] == 1.0054626),pos=3, col=4)
text(t[220]+3,fit2[220,3],expression(RSS == 6.5 %*% 10^6),pos=3, col=4)

text(t[240]-3,fit3[240,3],"nested algorithm",pos=1, col=1)
text(t[240]-3,fit3[240,3]+700-2500,expression(R[0] == 1.005332),pos=1, col=1)
text(t[240]-3,fit3[240,3]-700-2500,expression(RSS == 5.9 %*% 10^6),pos=1, col=1)

x1 <- t[240]-3;    x2 <- t[225];
y1 <- fit3[240,3]; y2 <- fit3[225,3]
Arrows(x1,y1,x1+(x2-x1)*0.65,y1+(y2-y1)*0.65, col = 1)

x1 <- t[220]+2;    x2 <- t[227];
y1 <- fit2[220,3]; y2 <- fit2[227,3]
Arrows(x1,y1,x1+(x2-x1)*0.6,y1+(y2-y1)*0.6, col = 4)

x1 <- t[183]+2;    x2 <- t[183];
y1 <- fit1[183,3]; y2 <- fit1[183,3]
Arrows(x1,y1,x1+(x2-x1)*0.6,y1+(y2-y1)*0.6, col = 2)

#################
##
## Plotting the cost function and how 
## the optimize approaches the minimum
##
################

# use this value to select between the different figures of the blogpost
figure = 2

# compute loss function on a grid
n=101
K <- seq(0.25,0.45,length.out=n)
R0 <- seq(1.001,1.03,length.out=n) #be sure not to use R0= 1 this gives a devision by 0
if (figure == 2) {
  K <- seq(0.34920,0.34924,length.out=n)
  R0 <- seq(1.8,2.01,length.out=n)
}
z <- matrix(rep(0,n^2),n)
for (i in 1:n) {
  for(j in 1:n) {
    z[i,j] <- log(RSS2(c(K[i],R0[j])))
  }
}

# levels for plotting
levels <- log(10^seq(6,12,0.5))
key <- head(c(t(matrix(c(10^c(6:12),rep("",7)),7))),-1)
if (figure == 2) {
  levels <- seq(18.12349,18.12354,0.000005)
  key <- head(c(t(matrix(c(seq(18.12349,18.12354,0.00001),rep("",5)),5))),-1)
}
key
## [1] "18.12349" ""         "18.1235"  ""         "18.12351" ""         "18.12352"
## [8] ""         "18.12353"
# colours for plotting
colours <- function(n) {hsv(c(seq(0.15,0.7,length.out=n),0),
                            c(seq(0.2,0.4,length.out=n),0),
                            c(seq(1,1,length.out=n),0.9))}
# empty plot
ylims = c(1,1.03)
if (figure == 2)  {
  ylims = c(1.8,2.01)
}
plot(-1000,-1000,
     xlab="K",ylab=expression(R[0]), 
     xlim=range(K), 
     ylim = ylims
     #ylim=range(R0)
)
title("RSS as function of parameters \n and path of optimization algorithm",cex.main=1)

# add contours
.filled.contour(K,(R0),z,
                col=colours(length(levels)),
                levels=levels)

contour(K,(R0),z,add=1, levels=levels, labels = key)

# compute path
# start value
new=c(0.3, 1.025)
if (figure == 2) {
  new = c(0.34920,2) # with this starting condition convergence will be more difficult
}

# make lots of small steps
for (i in 1:20) {
  ### safe old value
  old <- new
  ### compute 1 step
  
  OptTemp <- optim(new, RSS2, method = "L-BFGS-B", lower = c(0,1.00001),
                   hessian = TRUE, 
                   control = list(parscale = c(10^0,10^0), 
                                  factr = 1,
                                  maxit=1))
  OptTemp
  new <- OptTemp$par
  
  ### plot path
  dist <- (old[1]-new[1])^2+(old[2]-new[2])^2
  lines(c(old[1],new[1]),(c(old[2],new[2])), col=2)
  points(new[1],new[2], col=2, bg = 2, pch = 21, cex = 0.7)
  if (dist>0.01^2) {
    step <- new-old
    Arrows(old[1], old[2], old[1]+step[1]*0.85,old[2]+step[2]*0.85,
           col=2,
           arr.type="curved",
           arr.length=0.25,
           lty=1)
  } 
}

OptTemp
## $par
## [1] 0.3492195 2.0000000
## 
## $value
## [1] 74292937
## 
## $counts
## function gradient 
##       21       21 
## 
## $convergence
## [1] 52
## 
## $message
## [1] "ERROR: ABNORMAL_TERMINATION_IN_LNSRCH"
## 
## $hessian
##              [,1]       [,2]
## [1,] 688553069717 2440483.40
## [2,]      2440483  -18721.76
# make lots of small steps again (but different parscale)
for (i in 1:20) {
  ### safe old value
  old <- new
  ### compute 1 step
  
  
  OptTemp <- optim(new, RSS2, method = "L-BFGS-B", lower = c(0,1.00001),
                   hessian = TRUE, control = list(parscale = c(10^-4,10^-4),
                                                  factr = 1,
                                                  maxit=1)) 
  # alternatively use: ndeps = c(10^-7,10^-7), parscale = c(1,1)
  OptTemp
  new <- OptTemp$par
  
  ### plot path
  dist <- (old[1]-new[1])^2+(old[2]-new[2])^2
  lines(c(old[1],new[1]),(c(old[2],new[2])), col=1)
  points(new[1],new[2], col=1, bg = 1, pch = 21, cex = 0.7)
  if (dist>0.01^2) {
    step <- new-old
    Arrows(old[1], old[2], old[1]+step[1]*0.85,old[2]+step[2]*0.85,
           col=1,
           arr.type="curved",
           arr.length=0.25,
           lty=1)
  } 
}

OptTemp
## $par
## [1] 0.3492171 1.7865381
## 
## $value
## [1] 74290781
## 
## $counts
## function gradient 
##        4        4 
## 
## $convergence
## [1] 1
## 
## $message
## [1] "NEW_X"
## 
## $hessian
##              [,1]       [,2]
## [1,] 688175746612 4475843.16
## [2,]      4475843  -43287.87
if (figure == 1) {
  legend(0.255,1.003, c("path with parscale 1","path with parscale 0.0001"),
         col = c(2,1), pch = 21, lty=1, cex = 0.9, pt.bg = c(2,1) , yjust = 0)
}

if (figure == 2) {
  legend(0.349201,1.81, c("path with parscale 1","path with parscale 0.0001"),
         col = c(2,1), pch = 21, lty=1, cex = 0.9, pt.bg = c(2,1) , yjust = 0)
}

####################
##
## Graph with various values of R0
##
#######################

# starting condition
#init <- c(S = N-Infected[1], I = Infected[1]-R[1]-D[1])
init <- c(S = N-Infected[1], I = Infected[1])

Infected_MC <- Infected
SIRMC3 <- function(R0,K) {
  parameters <- c(K=K, R0=R0)
  out <- ode(y = init, times = Day, func = SIR2, parms = parameters)
  fitInfected <- out[,3]
  #fitInfected <- N-out[,2]
  RSS <- sum((Infected_MC - fitInfected)^2)
  return(RSS)  
}

plot(Day,Infected, xlim = c(0,80), ylim = c(1,10^9), 
     log = "y", xaxt = "n",
     main = "scenario's for different R0", xlab = "", ylab = "number infected")

axis(1, at = 1:30, labels = rep("",30), tck = -0.01)
axis(1, at = c(1,8,15,22), labels = c("Jan 16", "Jan 23", "Jan 30", "Feb 6"))


for (i in 1:10) {
  R0 <- c(1.005,1.01,1.05,1.1,1.2,1.5,2,2.5,4,20)[i]
  K <- optimize(SIRMC3, lower=0,upper=1,R0=R0, tol = .Machine$double.eps)$minimum
  parameters <- c(K,R0)
  xd <- seq(1,60,0.01)
  if (i == 1) {
    xd <- seq(1,40,0.01)
  }
  if (i == 2) {
    xd <- seq(1,50,0.01)
  }
  out <- ode(y = init, times = xd, func = SIR2, parms = parameters)
  lines(xd,out[,3])
  text(tail(xd,1),tail(out[,3],1),bquote(R[0] == .(R0)), pos =4)
}

#####################
##
## computing variance with Hessian
##
###################

### The output of optim will store valures for RSS and the hessian
mod <- optim(c(0.3, 1.04), RSS2, method = "L-BFGS-B", hessian = TRUE,
             control = list(parscale = c(10^-4,10^-4), factr = 1)) 

# unbiased estimate of standard deviation
# we divide by n-p 
# where n is the number of data points
# and p is the number of estimated parameters
sigma_estimate <- sqrt(mod$value/(length(Infected)-2))

# compute the inverse of the hessian 
# The hessian = the second order partial derivative of the objective function
# in our case this is the RSS
# we multiply by 1/(2 * sigma^2)
covpar <- solve(1/(2*sigma_estimate^2)*mod$hessian)
covpar
##               [,1]          [,2]
## [1,]  1.236666e-05 -2.349611e-07
## [2,] -2.349611e-07  9.175736e-09
#              [,1]          [,2]
#[1,]  1.236666e-05 -2.349611e-07
#[2,] -2.349611e-07  9.175736e-09

## the marginal variance of R0 is then approximately at least
##

covpar[2,2]^0.5
## [1] 9.579006e-05
#####################
##
## computing variance with Monte Carlo method
##
###################

set.seed(1)

# modeled data that we use to repeatedly generate data with noise
Infected_MC <- Infected
Day <- c(1:length(Infected))
modMC <- getOptim()
beginpar <- as.numeric(modMC[2:3]) #getOptim is the nested fitting model defined earlier
names(beginpar) <- c("K", "R0")
beginpar
##         K        R0 
## 0.4020269 1.0053320
modInfected <- data.frame(ode(y = init, times = Day, func = SIR2, parms = beginpar))$I

# use RSS to make a relative noise level
errrate <- modMC$RSS/sum(Infected) 

### generate random data with 
### the mean of modInfected and 
### Noise based on errrate
###               Note: this assumes heterogeneous noise 
###                    the Fisher Matrix assumed homogeneous noise (which is probabily less correct)

par <- c(0,0) # this will store the parameter outcome and grow using rbind
for (i in 1:100) {
  Infected_MC <- rnorm(length(modInfected),modInfected,(modInfected*errrate)^0.5)
  OptMC <- getOptim()
  par <- rbind(par,c(OptMC$K,OptMC$R0))
}
par <- par[-1,]  # strip the first c(0,0) entry

which(par[,2]>2)  # some fits went wrong
## named integer(0)
par <- par[-c(36,55),]

layout(matrix(1:2,1))

plot(par, xlab = "K",ylab=expression(R[0]))
title("Monte Carlo simulation")
cov(par)
##               [,1]         [,2]
## [1,]  1.534513e-05 -3.27662e-07
## [2,] -3.276620e-07  2.37102e-08
#              [,1]          [,2]
#[1,]  4.461422e-04 -1.709287e-05
#[2,] -1.709287e-05  1.599932e-06

beta <- par[,1]*par[,2]/(par[,2]-1)   
gamma <- par[,1]/(par[,2]-1)   
plot(beta,gamma,
     xlab = expression(beta), ylab = expression(gamma))

###########################
##
## The alternative model 2
##
############################

##
##  1 Now we make the starting number of infected I(0) and population N(0)
##    fitting parameters 
##      (the population N(0) is fitted indirectly by scaling the outcome by a factor
##       note that the equations allow scaling all variables by a fixed factor)
##       dS  = I * K * (-S/N *  R0)/(R0-1)
##       dI  = I * K * ( S/N *  R0 - 1)/(R0-1)
##       d(f*S)  = (f*I) * K * (-(f*S)/(f*N) *  R0)/(R0-1)
##       d(f*I)  = (f*I) * K * ( (f*S)/(f*N) *  R0 - 1)/(R0-1)
##  2 Also we consider the 'Infected' from the data cumulatively
##    The data infected is not the infected from the model but 
##    the number of discovered people
##  3 We introduce a factor to scale the number of computed infected with the observed infected
##    This scaling corrects for the idea that not all infectious/infected people are reported
##    A 'problem' with this scaling is that it also relates to a different population size N(0)
##    So this fitting will not allow to determine the effective population size and the over reporting independently

Infected <- IpRpD
Day <- 1:(length(Infected))

N <- 1400000000 # population of mainland china
Ir <- Infected[1]

Days <- 1:length(Infected)


#### Some exploratory stuff to estimate starting values:
#### This is based on the differential equation in
#### Exact analytical solutions of the Susceptible-Infected-Recovered (SIR) epidemic
#    model and of the SIR model with equal death and birth rates
#    from Harko, Lobo and Mak

#x0 = N exp(beta/gamma N3)
#ln(x0) = ln(N) + beta/gamma N3
#S/x0 = u 
#ln(u) = ln(S/N) - beta/gamma N3
#dS/dt = x0/phi = S(beta (N3-N) -gamma ln(S/N)  +  beta S)


# function to estimate parameters based on linear least squares fit 
# and the *linear* relation between dS and S, S ln(S) and S^2
RSSdiff <- function(N) {
  corrInfected <- Infected * c(rep(1.22,27),rep(1,38))
  dS <- -diff(corrInfected)
  S <- (N-corrInfected[-1])
  moddif <- lm(dS ~ 0+I(S)+I(S*log(S/N))+I(S*S))
  sum(moddif$residuals^2)                           # RSS
  #moddif$coefficients[3]/moddif$coefficients[2]*N  # beta
}
RSSdiff <- Vectorize(RSSdiff)

N <- 10:200*10^4
plot(N,RSSdiff(N),log="")

corrInfected <- Infected * c(rep(1.22,27),rep(1,38))

N <- 320000
dS <- diff(corrInfected)
S <- (N-corrInfected[-1])

Sp <- S
Slog <- S*log(S/N)
S2 <- S*S
moddif <- lm(dS ~ 0+Sp+Slog+S2)
moddif
## 
## Call:
## lm(formula = dS ~ 0 + Sp + Slog + S2)
## 
## Coefficients:
##         Sp        Slog          S2  
##  1.632e+00   1.417e+00  -5.098e-06
sum(moddif$residuals)^2
## [1] 0.00231477
beta <- moddif$coefficients[3]
gamma <- -moddif$coefficients[2]
Rstart  <- moddif$coefficients[1]/beta+N  # this is an unrealistic Rstart = -130
beta <- beta*N  #scaling due to different use of beta in Harko et al

beta/gamma
##       S2 
## 1.151174
-(beta - gamma)
##        S2 
## 0.2142512
Rstart
##        Sp 
## -130.3256
plot(diff(corrInfected))
lines(predict(moddif))

# matrix used for computing the scaling
# we use a different sacaling after the 27th day when reporting changed
X <- cbind(c(rep(1,27),rep(0,max(Days)-27)),
           c(rep(0,27),rep(1,max(Days)-27)))

SIR4 <- function(time, state, parameters) {
  par <- as.list(c(state, parameters))
  with(par, {
    dS <- I * K * (-S/Nm *  R0/(R0-1))
    dI <- I * K * ( S/Nm *  R0/(R0-1) - 1/(R0-1))  
    list(c(dS, dI))
  })
}

parameters <- c(0.4,4)
Nm <- 10^5  #using a smaller population than entire China
Istart = 30
# inner optimization loop
# here we optimize K and R0 for a given Nm and Istart
RSS4 <- function(parameters,Nm,Istart) {
  parametersx <- c(parameters, Nm, Istart)
  names(parametersx) <- c("K", "R0", "Nm", "Istart")
  start <- c(parametersx[3]-Infected[1],parametersx[4])
  names(start) <- c("S","I")
  out <- ode(y = start, times = Day, func = SIR4, parms = parametersx[c(1,2,3)])
  fitInfected <- parametersx[3]-out[,2]
  ## perform additional scaling (you could see this as yet another nested optimization)
  Xm <- X*fitInfected
  # the limit for a and b is because we should not scale such that the infected become more than obeserved
  # we introduced I(0) because of uncertainty in R(0)
  modscaled <- nls(Infected ~ a*Xm[,1]+ b* Xm[,2], start = list(a=0.5*Infected[1]/Istart,b=0.5*Infected[1]/Istart), 
                   upper = c(1,2)*Infected[1]/Istart,algorithm= "port")
  sum((Infected - predict(modscaled))^2)
}



# outer optimization loop
# here we optimize Istart
# for each Istart we optimize K and R0
# this is very slow but much more robust
RSS4b <- function(parameters) {
  Istart <- parameters[1]
  RSS4start <- c(0.3,2.5)
  OptInner <- optim(RSS4start, RSS4, method = "L-BFGS-B", 
                    Istart = Istart, Nm = Nm,
                    lower = c(0.1,1.001), 
                    upper = c(1,20),
                    hessian = TRUE, 
                    control = list(parscale = c(10^-6,10^-6),
                                   factr = 1, maxit= 10^4)) 
  OptInner$value
}


### Doing the outer optimization loop by Optimize
OptOuter <- optimize(RSS4b,interval = c(5,45) )
OptOuter
## $minimum
## [1] 10.83284
## 
## $objective
## [1] 31754389
### Doing the outer optimization loop by manual grid search
n=30
Nm <- 10^5  #using a smaller population than entire China
rangeIstart <- 10.83284 + seq(-5,+30,length.out=n)

zRSS <- c(NA,length(rangeIstart)) # empty container for values to be computed

pb <- progress_bar$new(total = n)
for (i in 1:n) {
  pb$tick()
  zRSS[i] <- RSS4b(rangeIstart[i])
}


# plotting RSS as function of I(0)
plot(rangeIstart,zRSS, log="y", yaxt = "n",
     xlab = "I(0)", ylab = "RSS")
title("optimal RSS as function of parameters I(0)",cex.main=1)



### plot a fit with the data for some selected parameters

# Doing model with Nm = 10^5 and Istart = 12
# But multiplied by a factor
# In this way we end up with a scaling in the end of 1
# This is a bit exchangable, correcting for underreporting and for population size is similar
Nm <- 10^5*4.155
Istart <- 10.83284*4.155
RSS4start <- c(0.24,2.5)
Opt4 <- optim(RSS4start, RSS4, method = "L-BFGS-B", 
              Nm = Nm, Istart = Istart,
              lower = c(0.1,1.001), 
              upper = c(1,20),
              hessian = TRUE, 
              control = list(parscale = c(10^-5,10^-4), ndeps = rep(10^-6,2),
                             factr = 1000, maxit= 10^4, trace = 2 )) 
## N = 2, M = 5 machine precision = 2.22045e-16
## 
## iterations 25
## function evaluations 53
## segments explored during Cauchy searches 26
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 0
## norm of the final projected gradient 0.879169
## final function value 2.94611e+07
## 
## final  value 29461131.019445 
## converged
Opt4
## $par
## [1] 0.217721 1.087240
## 
## $value
## [1] 29461131
## 
## $counts
## function gradient 
##       53       53 
## 
## $convergence
## [1] 0
## 
## $message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
## 
## $hessian
##               [,1]          [,2]
## [1,] 3065820783377 -985437072814
## [2,] -985437072814  939285382628
# this may give an abnormal termination
# but that could be due to the small factr parameter 

# Retrieve optimal parameters and compute curve
parameters <- c(Opt4$par,Nm,Istart)
names(parameters) <- c("K", "R0", "Nm","Istart")
start <- c(parameters[3]-Infected[1],parameters[4])
names(start) <- c("S","I")
result <- ode(y = start, times = Day, func = SIR4, parms = parameters[c(1,2,3)])

parameters
##            K           R0           Nm       Istart 
## 2.177210e-01 1.087240e+00 4.155000e+05 4.501045e+01
# scaling
fitInfected <- Nm-result[,2]
Xm <- X*fitInfected
modscaled <- lm(Infected ~ 0 + Xm)
modscaled
## 
## Call:
## lm(formula = Infected ~ 0 + Xm)
## 
## Coefficients:
##   Xm1    Xm2  
## 1.001  1.233
modscaled
## 
## Call:
## lm(formula = Infected ~ 0 + Xm)
## 
## Coefficients:
##   Xm1    Xm2  
## 1.001  1.233
plot(Infected/Nm*100, log = "", xlim = c(0,70), ylim = c(0,100),
     xlab = "day", ylab = "Total people that got sick [%]", xaxt="n")
axis(1, at = c(1,8,15,22,29,36,43), labels = c("Jan 16", "Jan 23", "Jan 30", "Feb 6", "Feb 13", "Feb20", "Feb 27"), cex.axis=0.8)
axis(1, at = c(1:50), labels = rep("",50), cex.axis=0.8, tck=-0.01)

lines((predict(modscaled))/Nm*100)

title("modeling with Ntot = 4.155*10^5, I(0) = 45, K = 0.217721, R0 = 1.087", cex.main = 1)