library(data.table)
library(ggplot2)
library(lubridate)
# install.packages("kableExtra")
library(knitr)
library(kableExtra)

Assumed constants

# https://github.com/midas-network/COVID-19/blob/master/parameter_estimates/2019_novel_coronavirus/estimates.csv

ICU_LOS = 9
ACUTE_LOS = 7
DAYS_FROM_SYMPTOMS_TO_HOSP = 6
PREV_DT = 6

Load COVID Tracking Time Series Data by State on Cases and Hospitalizations

For states with any hospitalizedCumulative data that is identical to hospitalizedCurrently, only use the hospitalizedCurrently data. For states with only hospitalizedCumulative data that is never identical to hospitalizedCurrently, use hospitalizedCumulative data. For states with no hospitalizedCumulative, use only hospitalizedCurrently if available.

raw_dt = fread('data/daily-covidtracker-0404.csv')[,.(date, state, positive, hospitalizedCurrently, hospitalizedCumulative)]
raw_dt = raw_dt[!is.na(hospitalizedCurrently) | !is.na(hospitalizedCumulative)]
raw_dt[, date := as.Date(as.character(date), format = '%Y %m %d')]

state_data_types = raw_dt[,.(
  useHospitalizedCurrently = pmax(
    min(ifelse(!is.na(hospitalizedCumulative), 0, 1)),
    max(
    c(ifelse(hospitalizedCumulative == hospitalizedCurrently, 1, 0),0), na.rm = TRUE))
  ), by = 'state']

raw_dt = merge(raw_dt, state_data_types)
raw_dt[useHospitalizedCurrently == 1, hospitalizedCumulative := NA]
raw_dt[,useHospitalizedCurrently := NULL]

raw_dt[state == "CA"] %>% kable() %>% kable_styling()
state date positive hospitalizedCurrently hospitalizedCumulative
CA 2020-04-04 12026 2300 NA
CA 2020-04-03 10701 2188 NA
CA 2020-04-02 9191 1922 NA
CA 2020-04-01 8155 1855 NA
CA 2020-03-31 7482 1617 NA
CA 2020-03-30 6447 1432 NA
CA 2020-03-29 5708 1034 NA
CA 2020-03-28 4643 1034 NA
CA 2020-03-27 3879 746 NA

Load Our Data on (Expected % of Cases Hospitalized, and Expected % of Hospitalizations in ICU) by County

Aggregate to State-level and combine with Covid Tracking time series

county_estimates = fread('data/county_age_severity_rates_v6.csv')

state_agg = county_estimates[,.(
  state_population = sum(population_in_age_group),
  wtd_critical_case_rate =  weighted.mean(critical_case_rate, population_in_age_group),
  wtd_acute_case_rate =  weighted.mean(severe_cases_rate, population_in_age_group)
  ), by = 'State']

state_abbrev = fread('data/state_abbrev.csv')[,.(State, Code)]

state_agg = merge(state_abbrev, state_agg, by = 'State')
state_agg = state_agg[,.(
  state = Code,
  state_population,
  exp_prop_hosp_in_icu = ICU_LOS*wtd_critical_case_rate/(ICU_LOS*wtd_critical_case_rate+ACUTE_LOS*wtd_acute_case_rate),
  exp_prop_cases_hos = wtd_critical_case_rate + wtd_acute_case_rate
  )]

state_agg %>% kable() %>% kable_styling()
state state_population exp_prop_hosp_in_icu exp_prop_cases_hos
AL 4864680 0.3611620 0.0728536
AK 738516 0.3070323 0.0594195
AZ 6946685 0.3701946 0.0719562
AR 2990671 0.3665662 0.0721470
CA 39148760 0.3493473 0.0661899
CO 5531141 0.3388134 0.0662268
CT 3581504 0.3689069 0.0757854
DE 949495 0.3663666 0.0764244
DC 684498 0.3369685 0.0610181
FL 20598139 0.3889573 0.0815067
GA 10297484 0.3360362 0.0650482
HI 1422029 0.3785443 0.0750363
ID 1687809 0.3573852 0.0680004
IL 12821497 0.3581274 0.0700571
IN 6637426 0.3584701 0.0700123
IA 3132499 0.3767625 0.0732518
KS 2908776 0.3663775 0.0693705
KY 4440204 0.3566786 0.0717877
LA 4663616 0.3518923 0.0685007
ME 1332813 0.3757650 0.0838364
MD 6003435 0.3501494 0.0703946
MA 6830193 0.3643359 0.0735064
MI 9957488 0.3644694 0.0742530
MN 5527358 0.3602706 0.0708289
MS 2988762 0.3562193 0.0693892
MO 6090062 0.3664460 0.0730646
MT 1041732 0.3700499 0.0770009
NE 1904760 0.3675282 0.0691997
NV 2922849 0.3469394 0.0694797
NH 1343622 0.3605222 0.0782417
NJ 8881845 0.3619384 0.0730358
NM 2092434 0.3653841 0.0721824
NY 19618453 0.3647079 0.0725284
NC 10155624 0.3551984 0.0710851
ND 752201 0.3715142 0.0682487
OH 11641879 0.3663545 0.0739021
OK 3918137 0.3607114 0.0688876
OR 4081943 0.3633328 0.0742762
PA 12791181 0.3768509 0.0773543
RI 1056611 0.3724280 0.0753384
SC 4955925 0.3603599 0.0738609
SD 864289 0.3724149 0.0713408
TN 6651089 0.3567203 0.0718104
TX 27885195 0.3343842 0.0609522
UT 3045350 0.3320692 0.0540195
VT 624977 0.3691260 0.0803374
VA 8413774 0.3495834 0.0696871
WA 7294336 0.3508452 0.0695623
WV 1829054 0.3741721 0.0804689
WI 5778394 0.3648416 0.0737622
WY 581836 0.3550700 0.0705211
state_ts = merge(raw_dt, state_agg)
state_ts[state == "CA"] %>% kable() %>% kable_styling()
state date positive hospitalizedCurrently hospitalizedCumulative state_population exp_prop_hosp_in_icu exp_prop_cases_hos
CA 2020-04-04 12026 2300 NA 39148760 0.3493473 0.0661899
CA 2020-04-03 10701 2188 NA 39148760 0.3493473 0.0661899
CA 2020-04-02 9191 1922 NA 39148760 0.3493473 0.0661899
CA 2020-04-01 8155 1855 NA 39148760 0.3493473 0.0661899
CA 2020-03-31 7482 1617 NA 39148760 0.3493473 0.0661899
CA 2020-03-30 6447 1432 NA 39148760 0.3493473 0.0661899
CA 2020-03-29 5708 1034 NA 39148760 0.3493473 0.0661899
CA 2020-03-28 4643 1034 NA 39148760 0.3493473 0.0661899
CA 2020-03-27 3879 746 NA 39148760 0.3493473 0.0661899

Convert hospitalizedCurrently to hospitalizedCumulative with assumptions about % in ICU vs Floor, and LOS for each

state_ts[,exp_agg_los := ICU_LOS*exp_prop_hosp_in_icu + ACUTE_LOS*(1-exp_prop_hosp_in_icu)]
state_ts[!is.na(hospitalizedCumulative), imputed_cum_hos := FALSE]
state_ts[is.na(hospitalizedCumulative), `:=`(
  hospitalizedCumulative = hospitalizedCurrently/(1 - 2^(-1*exp_agg_los/PREV_DT)),
  imputed_cum_hos = TRUE
  )]


state_ts = state_ts[,.(state, date, state_population,  exp_prop_cases_hos, positive, hospitalizedCumulative, imputed_cum_hos)]
state_ts[state == "CA"] %>% kable() %>% kable_styling()
state date state_population exp_prop_cases_hos positive hospitalizedCumulative imputed_cum_hos
CA 2020-04-04 39148760 0.0661899 12026 3904 TRUE
CA 2020-04-03 39148760 0.0661899 10701 3714 TRUE
CA 2020-04-02 39148760 0.0661899 9191 3262 TRUE
CA 2020-04-01 39148760 0.0661899 8155 3148 TRUE
CA 2020-03-31 39148760 0.0661899 7482 2744 TRUE
CA 2020-03-30 39148760 0.0661899 6447 2430 TRUE
CA 2020-03-29 39148760 0.0661899 5708 1755 TRUE
CA 2020-03-28 39148760 0.0661899 4643 1755 TRUE
CA 2020-03-27 39148760 0.0661899 3879 1266 TRUE
table(state_ts$imputed_cum_hos)
## 
## FALSE  TRUE 
##   351   115

Plot Cumulative Hospitalizations and Cumulative Positive Results by State

Only plot states with at least 5 data points

state_ts[, state_n_obs := .N, by='state']
state_ts = state_ts[state_n_obs>=5][,state_n_obs := NULL]

ggplot(state_ts, aes(x=date, y=hospitalizedCumulative)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)

Drop the stupid looking ones: AR, IN, ND, NH, OH, SC, SD, TX, VT, WA, WV

state_ts = state_ts[!(state %in% c('AR', 'IN', 'ND', 'NH', 'OH', 'SC', 'SD', 'TX', 'VT', "WA", 'WV'))]
ggplot(state_ts, aes(x=date, y=hospitalizedCumulative)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)

Derive multiplier on cases by day and state to make actual hospitalizations equal expected hospitalizations

Expect cumulative hospitalizations to equal exp_prop_cases_hos*lag(positive, DAYS_FROM_SYMPTOMS_TO_HOSP)

# Case time series
raw_case_dt = fread('data/daily-covidtracker-0404.csv')[,.(date = as.Date(as.character(date), format = '%Y %m %d'), state, positive)]
raw_case_dt[, hosp_date := date + DAYS_FROM_SYMPTOMS_TO_HOSP]
lagged_case_dt = raw_case_dt[,.(state, date = hosp_date, lagged_cases = positive)]

state_ts = merge(state_ts, lagged_case_dt, by = c('state', 'date'))
state_ts[, exp_cum_hosp := lagged_cases * exp_prop_cases_hos]
state_ts[state == "CA"] %>% kable() %>% kable_styling()
state date state_population exp_prop_cases_hos positive hospitalizedCumulative imputed_cum_hos lagged_cases exp_cum_hosp
CA 2020-03-27 39148760 0.0661899 3879 1266 TRUE 1279 84.65689
CA 2020-03-28 39148760 0.0661899 4643 1755 TRUE 1536 101.66769
CA 2020-03-29 39148760 0.0661899 5708 1755 TRUE 1733 114.70710
CA 2020-03-30 39148760 0.0661899 6447 2430 TRUE 2102 139.13118
CA 2020-03-31 39148760 0.0661899 7482 2744 TRUE 2355 155.87723
CA 2020-04-01 39148760 0.0661899 8155 3148 TRUE 3006 198.96685
CA 2020-04-02 39148760 0.0661899 9191 3262 TRUE 3879 256.75064
CA 2020-04-03 39148760 0.0661899 10701 3714 TRUE 4643 307.31973
CA 2020-04-04 39148760 0.0661899 12026 3904 TRUE 5708 377.81198
state_ts[, empirical_case_multiplier := hospitalizedCumulative/exp_cum_hosp]

ggplot(state_ts, aes(x=date, y=empirical_case_multiplier)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)

Apply the derived multipliers to case time series by state and compare empirical and adjusted growth rates

mult_dt = state_ts[,.(state, hosp_date = date, empirical_case_multiplier)]

case_dt = merge(raw_case_dt, mult_dt, by = c('state', 'hosp_date'))

case_dt[, adjusted_positive := positive * empirical_case_multiplier]

case_dt = case_dt[positive>0 & adjusted_positive>0]

ggplot(
  melt(case_dt[,.(state, date, adjusted_positive, positive)], id.vars = c('state', 'date')),
  aes(x=date, y=value, color = variable)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6) +
  scale_y_continuous(trans='log2') + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

calculate_raw_and_adjusted_growth_rates = function(full_dt, s) {
  
  subset = full_dt[state == s]
  subset[, `:=`(
    log_2_raw_cases = log(positive, 2),
    log_2_adj_cases = log(adjusted_positive, 2),
    day = as.numeric(date - min(date))
    )]
  
  raw_coef = lm(log_2_raw_cases ~ day, data = subset)$coefficients[['day']]
  
  adj_coef = lm(log_2_adj_cases ~ day, data = subset)$coefficients[['day']]
  
  return(data.table(
    state = s,
    raw_coef = raw_coef,
    raw_dt = 1/raw_coef,
    adj_coef = adj_coef,
    adj_dt = 1/adj_coef
  ))
}

rbindlist(lapply(case_dt[,unique(state)], function(x) calculate_raw_and_adjusted_growth_rates(case_dt, x))) %>% kable() %>% kable_styling()
state raw_coef raw_dt adj_coef adj_dt
AK 0.3856168 2.593248 0.3269181 3.058870
AZ 0.4796230 2.084971 0.4505924 2.219301
CA 0.2710070 3.689941 0.1984210 5.039788
CO 0.3069926 3.257407 0.3269363 3.058700
CT 0.2994571 3.339377 0.1958279 5.106525
DE 0.3044390 3.284731 0.3825318 2.614162
FL 0.3782962 2.643432 0.2303679 4.340883
GA 0.3212578 3.112765 0.1609529 6.212997
HI 0.3079652 3.247120 0.1855999 5.387933
IA 0.3202706 3.122359 0.2153121 4.644422
ID 0.4334699 2.306965 0.1592757 6.278420
KS 0.3267809 3.060154 0.3596481 2.780496
LA 0.3604374 2.774407 0.2425873 4.122228
MA 0.3849194 2.597946 0.3216632 3.108841
MD 0.3383464 2.955551 0.2858288 3.498597
ME 0.1916580 5.217626 0.1457033 6.863261
MN 0.2602138 3.842994 0.2927941 3.415370
MS 0.4556541 2.194647 0.2652563 3.769938
MT 0.3673990 2.721836 0.3941127 2.537345
NC 0.3431944 2.913800 0.3078194 3.248658
NM 0.2912744 3.433189 0.1689978 5.917235
NY 0.4714018 2.121333 0.2979642 3.356108
OK 0.4020649 2.487161 0.3679125 2.718038
OR 0.2837070 3.524763 0.1858814 5.379774
PA 0.4149173 2.410119 0.2551255 3.919640
RI 0.2690255 3.717119 0.2486289 4.022059
TN 0.3112968 3.212368 0.2411562 4.146690
UT 0.2693007 3.713321 0.1581206 6.324286
VA 0.3180828 3.143835 0.2876329 3.476654
WI 0.2337557 4.277971 0.1897288 5.270680
WY 0.2776034 3.602261 0.1129324 8.854854

Load county case time series and apply state-date multipliers to get adjusted time series

raw_county_ts = fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
raw_county_ts = raw_county_ts[,.(State = state, date = as.Date(date), county, cases)]
raw_county_ts = merge(raw_county_ts, state_abbrev)
raw_county_ts = raw_county_ts[,.(state = Code, county_cases = cases, county, date)]
#raw_county_ts  %>% kable() %>% kable_styling()
county_ts = merge(case_dt[,.(state, date, empirical_case_multiplier)],
                  raw_county_ts, by = c('state', 'date'))
county_ts[, adjusted_cases := county_cases * empirical_case_multiplier]
#county_ts  %>% kable() %>% kable_styling()

Plot some example counties. New York City, Los Angeles, Contra Costa, San Francisco, Santa Clara, San Mateo

#unique(county_ts[, .(county, state)])[state == "CA"]

ggplot(
  melt(county_ts[county %in% c('New York City', 'Los Angeles', 'Contra Costa', 'San Francisco', 'Santa Clara', 'San Mateo'),
                 .(county, date, adjusted_cases, county_cases)], id.vars = c('county', 'date')),
  aes(x=date, y=value, color = variable)) + geom_line() + facet_wrap(~county, scales = "free", ncol = 3) +
  scale_y_continuous(trans='log2') + geom_smooth()
## Warning in
## melt.data.table(county_ts[county
## %in% c("New York
## City", "Los Angeles", :
## 'measure.vars' [adjusted_cases,
## county_cases] are not all
## of the same type. By order
## of hierarchy, the molten
## data value column will be of
## type 'double'. All measure
## variables not of type 'double'
## will be coerced too. Check
## DETAILS in ?melt.data.table
## for more on coercion.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Raw and adjusted doubling times

calculate_raw_and_adjusted_growth_rates = function(full_dt, c) {
  
  subset = full_dt[county == c]
  subset[, `:=`(
    log_2_raw_cases = log(county_cases, 2),
    log_2_adj_cases = log(adjusted_cases, 2),
    day = as.numeric(date - min(date))
    )]
  
  raw_coef = lm(log_2_raw_cases ~ day, data = subset)$coefficients[['day']]
  
  adj_coef = lm(log_2_adj_cases ~ day, data = subset)$coefficients[['day']]
  
  return(data.table(
    county = c,
    raw_coef = raw_coef,
    raw_dt = 1/raw_coef,
    adj_coef = adj_coef,
    adj_dt = 1/adj_coef
  ))
}

rbindlist(lapply(c('New York City', 'Los Angeles', 'Contra Costa', 'San Francisco', 'Santa Clara', 'San Mateo'), 
                 function(x) calculate_raw_and_adjusted_growth_rates(county_ts, x))) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover"))
county raw_coef raw_dt adj_coef adj_dt
New York City 0.4983562 2.006597 0.3249186 3.077694
Los Angeles 0.3416305 2.927139 0.2690445 3.716858
Contra Costa 0.2380726 4.200399 0.1654866 6.042785
San Francisco 0.2542546 3.933065 0.1816686 5.504528
Santa Clara 0.1714294 5.833305 0.0988434 10.117015
San Mateo 0.1798531 5.560094 0.1072671 9.322525