Although it can often be misleading to try to predict the trend of an epidemic (from limited and flawed data) during its early-to-mid stages, it is often important to try to predict some features during the outbreak, rather than to wait until retrospective data is available. When is the right time? For relatively closed systems, the cumulative case curve will always be S-shaped and well-described by a logistic-type model - with the essential factors being the inflection time (\(t_m\)) and estimated maximum caseload. As long as the available data include this inflection point and a time interval shortly after, the curve fitting (and estimated future case number) will be reasonably accurate. [1]
Here we present a simple method using R to derive the three-parameter logistic model from publicly-available COVID data provided by Johns Hopkins CSSE. Some data formatting steps use the \(tidyverse\) package and model estimation uses the built-in \(SSlogis\) function.
First, load the tidyverse library (install if needed):
#install.packages("tidyverse")
library(tidyverse)
library(ggplot2)
library(ggpubr)
theme_set(theme_pubr())
library(zoo)
library(ggthemes)
mydata <- read.csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv"), na.strings = c("", "NA"))
#deaths <- read.csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"), na.strings = c("", "NA"))
State <- "Utah"
k = 20 #future days
CASES <- mydata %>%
filter(Province_State ==State) %>%
select(starts_with("X"))
For example, let’s use Utah. And for any forecasts, we’ll set 20 days. It’s easy to change state, but note in the final figure the spacing of text might be off, so you’ll need to customize to not smoosh text.
Now some data wrangling to get dates, restrict to where positive cases, and take a peek at the temp data.
Y<- colSums(CASES)
n <- as.numeric(length(Y))
temp1 <- data.frame(y = Y, x = seq(1:n))
temp1$date<-row.names(temp1)
temp <- temp1 %>% filter(y > 0)
(date1<-substr(temp[1,]$date, 2, 8))
## [1] "3.7.20"
(date<-substr(temp[temp$x==max(temp$x),]$date, 2, 8))
## [1] "3.25.21"
(Max <- temp[temp$x==max(temp$x),]$y)
## [1] 383260
head(temp)
## y x date
## X3.7.20 1 46 X3.7.20
## X3.8.20 1 47 X3.8.20
## X3.9.20 1 48 X3.9.20
## X3.10.20 2 49 X3.10.20
## X3.11.20 3 50 X3.11.20
## X3.12.20 3 51 X3.12.20
Some more data wrangling and calculations to get percent changes.
last_obs <- max(temp$x)
next_last_obs <- last_obs - 1
seven_days_ago <- last_obs - 7
new_cases <- temp[ temp$x ==last_obs , 1] - temp[ temp$x ==next_last_obs , 1]
one_day_percent <- (temp[ temp$x ==last_obs , 1] - temp[ temp$x ==next_last_obs , 1])/temp[ temp$x ==next_last_obs, 1] *100
seven_days_percent <- (temp[ temp$x ==last_obs , 1] - temp[ temp$x ==seven_days_ago , 1])/temp[ temp$x ==seven_days_ago, 1] *100
Now calculate the logistic curve.
#Z <- nls(y ~ SSlogis(x, Asym, xmid, scal), temp); ZZ<-summary(Z)
# State <- "Utah"
# CASES <- mydata %>%
# filter(Province_State ==State) %>%
# select(starts_with("X"))
#
#
# Y<- colSums(CASES)
# n <- as.numeric(length(Y))
#
# temp1 <- data.frame(y = Y, x = seq(1:n))
# temp1$date<-row.names(temp1)
#
# temp <- temp1 %>% filter(y > 0)
y <- temp$y
x <- temp$x
Z <- nls(y ~ (( k1 / (1 + exp( (-log(81)/a1) * (x - b1)))) +
( k2 / (1 + exp( (-log(81)/a2) * (x - b2))))),
start=c(a1=87,k1=38000,b1=150,
a2=104,k2=400000,b2=300))
ZZ<-summary(Z)
peak <- ZZ$parameters[1]
midpoint <- temp %>%
filter(x == round(ZZ$parameters[2])) %>%
mutate(date = str_sub(date, 2, 8)) %>%
select(date)
Estimated asymptote
round(peak) # estimated asymptote
## [1] 101
Inflection date
midpoint # date of inflection point
## [1] date
## <0 rows> (or 0-length row.names)
AIC (smaller is better)
AIC(Z) # AIC = -2LL + 2k
## [1] 6961.936
BIC(Z)
## [1] 6989.591
# pdf("curve.pdf", width = 12, height = 10)
# png(file = "curve.png", width = 1080, height = 900, units = "px", pointsize = 14)
par(family="serif")
plot(temp$x, temp$y, ylab = "COVID SARS-2 Confirmed Cases", xlab="", xlim =c(min(temp$x),n+k),
ylim=c(0, max(temp$y) + 0.25*max(temp$y)), type="l", col="navy", lwd=3,
axes=F,
main = paste("COVID-19 cases in", State, " (updated", date , ")"),
sub = "Using 2019 Novel Coronavirus COVID-19 (2019-nCoV)
Data Repository by Johns Hopkins CSSE")
axis(2)
axis(1, at=c(temp[1,2], max(temp$x) ), lab=c(date1, date))
lines(seq(1:n), predict(Z, list(x = seq(1:n))), lwd=2, col='blue', lty=3)
abline( h = max(temp$y), lty=3)
#abline( h = ZZ$parameters[1], lty=3)
#abline(v=temp[temp$date == "X8.18.20",]$x, lty=2, col="red")
text(max(temp$x)-120, max(temp$y) -24000, paste("Daily new cases = ", new_cases ))
text(max(temp$x)-120, max(temp$y)-16000, paste("Daily percent increase = ", round(one_day_percent,1), "%" ))
text(max(temp$x)-120, max(temp$y)-8000, paste("Seven-day percent increase = ", round(seven_days_percent,1), "%" ))
#text(max(temp$x)-80, ZZ$parameters[1]-2000, paste("Estimated case asymptote = ", round(ZZ$parameters[1])), col="grey")
#text(n, 1000, paste("model AIC = ", round(AIC(Z),2), "\n (smaller is better)"),col="grey")
Z10 <- round(predict(Z, list(x = seq(n+1,n+k))))
#k-day forecast
Z10[1:k]
## [1] 383179 383457 383726 383987 384239 384483 384720 384948 385170 385384
## [11] 385591 385791 385985 386173 386354 386530 386700 386864 387023 387177
lines(seq(n+1,n+k), Z10[1:k], col="lightblue", lwd=2, lty=2)
# dev.off()
[1] Hsieh YH, Lee JY, Chang HL. SARS epidemiology modeling. Emerg Infect Dis. 2004;10(6):1165-1168. doi:10.3201/eid1006.031023
mydata0 <-read.csv(url("https://raw.githubusercontent.com/CivilServiceUSA/us-governors/master/us-governors/data/us-governors.csv"), na.strings = c("", "NA")) %>%
dplyr::select(state_code, state_name, party) %>%
rename(state = state_code)
mydata2 <-read.csv(url("https://covidtracking.com/api/v1/states/daily.csv"), na.strings = c("", "NA"))
mydf <- merge(mydata0, mydata2 , by="state")
mydf$Date <- as.Date(as.character(mydf$date), "%Y %m %d")
State = "UT"
St <- mydata2 %>%
filter(state==State) %>%
filter(positiveIncrease > 0)
St$Date <- as.Date(as.character(St$date), "%Y %m %d")
theme_set(theme_few(base_size = 14, base_family = "serif"))
min <- as.Date("2020-4-1")
max <- NA
r<- ggplot(
St,
aes(x = Date , y = positiveIncrease)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
#facet_wrap(mydf$state_name, scales = "free") +
geom_line(aes(y=rollmean(positiveIncrease, 7, na.pad=TRUE), size=1.5))+
geom_smooth() + #ylim(0, 1500) +
#geom_point(aes(x = as.Date("2020-09-25"), y=1411), colour="red", size=1.5) +
scale_color_manual(values = c("#00AFBB", "#FC4E07")) +
labs(x = "", y = "Daily Positive Increase", title = paste("COVID Daily positive increase in", State, "
(with 7-day moving average)", "
Update: ", date)) + scale_x_date(limits = c(min, max)) +
theme(legend.position = "none")
# r <- r + geom_vline(xintercept=St[St$Date=="2020-08-05",55],
# linetype=4, colour="red")
#+ annotate("text", x = as.Date("2020-08-5"), y=5, label = "Inflection (2)")
# pdf("UT.pdf", width = 12, height = 10)
# png(file = "UT.png", width = 1080, height = 900, units = "px", pointsize = 14)
r
# dev.off()
library(dygraphs)
library(xts)
cases.ts <- xts(St$positiveIncrease, order.by=as.POSIXct(St$Date))
dygraph(cases.ts, main = paste("Daily increase COVID cases in", State)) %>%
dyRoller(rollPeriod = 5) %>%
dySeries("V1", strokeWidth = 3, label = "5-day rolling daily case increase") %>%
dyShading(from = "2021-01-01", to = Sys.Date(), color = "#FFE6E6") %>%
dyRangeSelector()
\[ N(t) = \sum_{i=1}^{m} \frac{k_i}{1+ e^{(-ln(81)/\Delta t_{mi})(t-t_{mi})}} \]
In the two-wave or two-pulse logistic case, this simplifies to:
\[ N(t) = \frac{k_1}{1+ e^{(-ln(81)/\Delta t_{m1})(t-t_{m1})}}+\frac{k_2}{1+ e^{(-ln(81)/\Delta t_{m2})(t-t_{m2})}} \]
Resetting to Utah as model needs good starting values and not obvious how to pull them out:
State <- "Utah"
CASES <- mydata %>%
filter(Province_State ==State) %>%
select(starts_with("X"))
Y<- colSums(CASES)
n <- as.numeric(length(Y))
temp1 <- data.frame(y = Y, x = seq(1:n))
temp1$date<-row.names(temp1)
temp <- temp1 %>% filter(y > 0)
y <- temp$y
x <- temp$x
bilogistic <- nls(y ~ (( k1 / (1 + exp( (-log(81)/a1) * (x - b1)))) +
( k2 / (1 + exp( (-log(81)/a2) * (x - b2))))),
start=c(a1=87,k1=38000,b1=150,
a2=104,k2=400000,b2=300))
bilogistic
## Nonlinear regression model
## model: y ~ ((k1/(1 + exp((-log(81)/a1) * (x - b1)))) + (k2/(1 + exp((-log(81)/a2) * (x - b2)))))
## data: parent.frame()
## a1 k1 b1 a2 k2 b2
## 101.2 43652.0 160.6 128.4 347998.5 322.1
## residual sum-of-squares: 1.621e+09
##
## Number of iterations to convergence: 9
## Achieved convergence tolerance: 3.107e-06
plot(x,y, ylim=c(0,400000), xlim=c(50,400), ylab = "Cumulative case count", xlab = "days since first case", axes=F); axis(1); axis(2)
newx <- seq(50,400)
lines(newx,predict(bilogistic,newdata=data.frame(x=newx)),lwd=2, lty=2)
AIC(bilogistic)
## [1] 6961.936
BIC(bilogistic)
## [1] 6989.591