state_true_cases.utf8

Assumed constants

# https://github.com/midas-network/COVID-19/blob/master/parameter_estimates/2019_novel_coronavirus/estimates.csv

ICU_LOS = 9
ACUTE_LOS = 7
DAYS_FROM_SYMPTOMS_TO_HOSP = 6
PREV_DT = 6

Load COVID Tracking Time Series Data by State on Cases and Hospitalizations

For states with any hospitalizedCumulative data that is identical to hospitalizedCurrently, only use the hospitalizedCurrently data. For states with only hospitalizedCumulative data that is never identical to hospitalizedCurrently, use hospitalizedCumulative data. For states with no hospitalizedCumulative, use only hospitalizedCurrently if available.

raw_dt = fread('data/daily-covidtracker-0404.csv')[,.(date, state, positive, hospitalizedCurrently, hospitalizedCumulative)]
raw_dt = raw_dt[!is.na(hospitalizedCurrently) | !is.na(hospitalizedCumulative)]
raw_dt[, date := as.Date(as.character(date), format = '%Y %m %d')]

state_data_types = raw_dt[,.(
  useHospitalizedCurrently = pmax(
    min(ifelse(!is.na(hospitalizedCumulative), 0, 1)),
    max(
    c(ifelse(hospitalizedCumulative == hospitalizedCurrently, 1, 0),0), na.rm = TRUE))
  ), by = 'state']

raw_dt = merge(raw_dt, state_data_types)
raw_dt[useHospitalizedCurrently == 1, hospitalizedCumulative := NA]
raw_dt[,useHospitalizedCurrently := NULL]

raw_dt[state == "CA"] %>% kable() %>% kable_styling()

state	date	positive	hospitalizedCurrently	hospitalizedCumulative
CA	2020-04-04	12026	2300	NA
CA	2020-04-03	10701	2188	NA
CA	2020-04-02	9191	1922	NA
CA	2020-04-01	8155	1855	NA
CA	2020-03-31	7482	1617	NA
CA	2020-03-30	6447	1432	NA
CA	2020-03-29	5708	1034	NA
CA	2020-03-28	4643	1034	NA
CA	2020-03-27	3879	746	NA

Load Our Data on (Expected % of Cases Hospitalized, and Expected % of Hospitalizations in ICU) by County

Aggregate to State-level and combine with Covid Tracking time series

county_estimates = fread('data/county_age_severity_rates_v6.csv')

state_agg = county_estimates[,.(
  state_population = sum(population_in_age_group),
  wtd_critical_case_rate =  weighted.mean(critical_case_rate, population_in_age_group),
  wtd_acute_case_rate =  weighted.mean(severe_cases_rate, population_in_age_group)
  ), by = 'State']

state_abbrev = fread('data/state_abbrev.csv')[,.(State, Code)]

state_agg = merge(state_abbrev, state_agg, by = 'State')
state_agg = state_agg[,.(
  state = Code,
  state_population,
  exp_prop_hosp_in_icu = ICU_LOS*wtd_critical_case_rate/(ICU_LOS*wtd_critical_case_rate+ACUTE_LOS*wtd_acute_case_rate),
  exp_prop_cases_hos = wtd_critical_case_rate + wtd_acute_case_rate
  )]

state_agg %>% kable() %>% kable_styling()

state	state_population	exp_prop_hosp_in_icu	exp_prop_cases_hos
AL	4864680	0.3611620	0.0728536
AK	738516	0.3070323	0.0594195
AZ	6946685	0.3701946	0.0719562
AR	2990671	0.3665662	0.0721470
CA	39148760	0.3493473	0.0661899
CO	5531141	0.3388134	0.0662268
CT	3581504	0.3689069	0.0757854
DE	949495	0.3663666	0.0764244
DC	684498	0.3369685	0.0610181
FL	20598139	0.3889573	0.0815067
GA	10297484	0.3360362	0.0650482
HI	1422029	0.3785443	0.0750363
ID	1687809	0.3573852	0.0680004
IL	12821497	0.3581274	0.0700571
IN	6637426	0.3584701	0.0700123
IA	3132499	0.3767625	0.0732518
KS	2908776	0.3663775	0.0693705
KY	4440204	0.3566786	0.0717877
LA	4663616	0.3518923	0.0685007
ME	1332813	0.3757650	0.0838364
MD	6003435	0.3501494	0.0703946
MA	6830193	0.3643359	0.0735064
MI	9957488	0.3644694	0.0742530
MN	5527358	0.3602706	0.0708289
MS	2988762	0.3562193	0.0693892
MO	6090062	0.3664460	0.0730646
MT	1041732	0.3700499	0.0770009
NE	1904760	0.3675282	0.0691997
NV	2922849	0.3469394	0.0694797
NH	1343622	0.3605222	0.0782417
NJ	8881845	0.3619384	0.0730358
NM	2092434	0.3653841	0.0721824
NY	19618453	0.3647079	0.0725284
NC	10155624	0.3551984	0.0710851
ND	752201	0.3715142	0.0682487
OH	11641879	0.3663545	0.0739021
OK	3918137	0.3607114	0.0688876
OR	4081943	0.3633328	0.0742762
PA	12791181	0.3768509	0.0773543
RI	1056611	0.3724280	0.0753384
SC	4955925	0.3603599	0.0738609
SD	864289	0.3724149	0.0713408
TN	6651089	0.3567203	0.0718104
TX	27885195	0.3343842	0.0609522
UT	3045350	0.3320692	0.0540195
VT	624977	0.3691260	0.0803374
VA	8413774	0.3495834	0.0696871
WA	7294336	0.3508452	0.0695623
WV	1829054	0.3741721	0.0804689
WI	5778394	0.3648416	0.0737622
WY	581836	0.3550700	0.0705211

state_ts = merge(raw_dt, state_agg)
state_ts[state == "CA"] %>% kable() %>% kable_styling()

state	date	positive	hospitalizedCurrently	hospitalizedCumulative	state_population	exp_prop_hosp_in_icu	exp_prop_cases_hos
CA	2020-04-04	12026	2300	NA	39148760	0.3493473	0.0661899
CA	2020-04-03	10701	2188	NA	39148760	0.3493473	0.0661899
CA	2020-04-02	9191	1922	NA	39148760	0.3493473	0.0661899
CA	2020-04-01	8155	1855	NA	39148760	0.3493473	0.0661899
CA	2020-03-31	7482	1617	NA	39148760	0.3493473	0.0661899
CA	2020-03-30	6447	1432	NA	39148760	0.3493473	0.0661899
CA	2020-03-29	5708	1034	NA	39148760	0.3493473	0.0661899
CA	2020-03-28	4643	1034	NA	39148760	0.3493473	0.0661899
CA	2020-03-27	3879	746	NA	39148760	0.3493473	0.0661899

Convert hospitalizedCurrently to hospitalizedCumulative with assumptions about % in ICU vs Floor, and LOS for each

state_ts[,exp_agg_los := ICU_LOS*exp_prop_hosp_in_icu + ACUTE_LOS*(1-exp_prop_hosp_in_icu)]
state_ts[!is.na(hospitalizedCumulative), imputed_cum_hos := FALSE]
state_ts[is.na(hospitalizedCumulative), `:=`(
  hospitalizedCumulative = hospitalizedCurrently/(1 - 2^(-1*exp_agg_los/PREV_DT)),
  imputed_cum_hos = TRUE
  )]


state_ts = state_ts[,.(state, date, state_population,  exp_prop_cases_hos, positive, hospitalizedCumulative, imputed_cum_hos)]
state_ts[state == "CA"] %>% kable() %>% kable_styling()

state	date	state_population	exp_prop_cases_hos	positive	hospitalizedCumulative	imputed_cum_hos
CA	2020-04-04	39148760	0.0661899	12026	3904	TRUE
CA	2020-04-03	39148760	0.0661899	10701	3714	TRUE
CA	2020-04-02	39148760	0.0661899	9191	3262	TRUE
CA	2020-04-01	39148760	0.0661899	8155	3148	TRUE
CA	2020-03-31	39148760	0.0661899	7482	2744	TRUE
CA	2020-03-30	39148760	0.0661899	6447	2430	TRUE
CA	2020-03-29	39148760	0.0661899	5708	1755	TRUE
CA	2020-03-28	39148760	0.0661899	4643	1755	TRUE
CA	2020-03-27	39148760	0.0661899	3879	1266	TRUE

table(state_ts$imputed_cum_hos)

## 
## FALSE  TRUE 
##   351   115

Plot Cumulative Hospitalizations and Cumulative Positive Results by State

Only plot states with at least 5 data points

state_ts[, state_n_obs := .N, by='state']
state_ts = state_ts[state_n_obs>=5][,state_n_obs := NULL]

ggplot(state_ts, aes(x=date, y=hospitalizedCumulative)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)

Drop the stupid looking ones: AR, IN, ND, NH, OH, SC, SD, TX, VT, WA, WV

state_ts = state_ts[!(state %in% c('AR', 'IN', 'ND', 'NH', 'OH', 'SC', 'SD', 'TX', 'VT', "WA", 'WV'))]
ggplot(state_ts, aes(x=date, y=hospitalizedCumulative)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)

Derive multiplier on cases by day and state to make actual hospitalizations equal expected hospitalizations

Expect cumulative hospitalizations to equal exp_prop_cases_hos*lag(positive, DAYS_FROM_SYMPTOMS_TO_HOSP)

# Case time series
raw_case_dt = fread('data/daily-covidtracker-0404.csv')[,.(date = as.Date(as.character(date), format = '%Y %m %d'), state, positive)]
raw_case_dt[, hosp_date := date + DAYS_FROM_SYMPTOMS_TO_HOSP]
lagged_case_dt = raw_case_dt[,.(state, date = hosp_date, lagged_cases = positive)]

state_ts = merge(state_ts, lagged_case_dt, by = c('state', 'date'))
state_ts[, exp_cum_hosp := lagged_cases * exp_prop_cases_hos]
state_ts[state == "CA"] %>% kable() %>% kable_styling()

state	date	state_population	exp_prop_cases_hos	positive	hospitalizedCumulative	imputed_cum_hos	lagged_cases	exp_cum_hosp
CA	2020-03-27	39148760	0.0661899	3879	1266	TRUE	1279	84.65689
CA	2020-03-28	39148760	0.0661899	4643	1755	TRUE	1536	101.66769
CA	2020-03-29	39148760	0.0661899	5708	1755	TRUE	1733	114.70710
CA	2020-03-30	39148760	0.0661899	6447	2430	TRUE	2102	139.13118
CA	2020-03-31	39148760	0.0661899	7482	2744	TRUE	2355	155.87723
CA	2020-04-01	39148760	0.0661899	8155	3148	TRUE	3006	198.96685
CA	2020-04-02	39148760	0.0661899	9191	3262	TRUE	3879	256.75064
CA	2020-04-03	39148760	0.0661899	10701	3714	TRUE	4643	307.31973
CA	2020-04-04	39148760	0.0661899	12026	3904	TRUE	5708	377.81198

state_ts[, empirical_case_multiplier := hospitalizedCumulative/exp_cum_hosp]

ggplot(state_ts, aes(x=date, y=empirical_case_multiplier)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)

Apply the derived multipliers to case time series by state and compare empirical and adjusted growth rates

mult_dt = state_ts[,.(state, hosp_date = date, empirical_case_multiplier)]

case_dt = merge(raw_case_dt, mult_dt, by = c('state', 'hosp_date'))

case_dt[, adjusted_positive := positive * empirical_case_multiplier]

case_dt = case_dt[positive>0 & adjusted_positive>0]

ggplot(
  melt(case_dt[,.(state, date, adjusted_positive, positive)], id.vars = c('state', 'date')),
  aes(x=date, y=value, color = variable)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6) +
  scale_y_continuous(trans='log2') + geom_smooth(method="lm")

## `geom_smooth()` using formula 'y ~ x'

calculate_raw_and_adjusted_growth_rates = function(full_dt, s) {
  
  subset = full_dt[state == s]
  subset[, `:=`(
    log_2_raw_cases = log(positive, 2),
    log_2_adj_cases = log(adjusted_positive, 2),
    day = as.numeric(date - min(date))
    )]
  
  raw_coef = lm(log_2_raw_cases ~ day, data = subset)$coefficients[['day']]
  
  adj_coef = lm(log_2_adj_cases ~ day, data = subset)$coefficients[['day']]
  
  return(data.table(
    state = s,
    raw_coef = raw_coef,
    raw_dt = 1/raw_coef,
    adj_coef = adj_coef,
    adj_dt = 1/adj_coef
  ))
}

rbindlist(lapply(case_dt[,unique(state)], function(x) calculate_raw_and_adjusted_growth_rates(case_dt, x))) %>% kable() %>% kable_styling()

state	raw_coef	raw_dt	adj_coef	adj_dt
AK	0.3856168	2.593248	0.3269181	3.058870
AZ	0.4796230	2.084971	0.4505924	2.219301
CA	0.2710070	3.689941	0.1984210	5.039788
CO	0.3069926	3.257407	0.3269363	3.058700
CT	0.2994571	3.339377	0.1958279	5.106525
DE	0.3044390	3.284731	0.3825318	2.614162
FL	0.3782962	2.643432	0.2303679	4.340883
GA	0.3212578	3.112765	0.1609529	6.212997
HI	0.3079652	3.247120	0.1855999	5.387933
IA	0.3202706	3.122359	0.2153121	4.644422
ID	0.4334699	2.306965	0.1592757	6.278420
KS	0.3267809	3.060154	0.3596481	2.780496
LA	0.3604374	2.774407	0.2425873	4.122228
MA	0.3849194	2.597946	0.3216632	3.108841
MD	0.3383464	2.955551	0.2858288	3.498597
ME	0.1916580	5.217626	0.1457033	6.863261
MN	0.2602138	3.842994	0.2927941	3.415370
MS	0.4556541	2.194647	0.2652563	3.769938
MT	0.3673990	2.721836	0.3941127	2.537345
NC	0.3431944	2.913800	0.3078194	3.248658
NM	0.2912744	3.433189	0.1689978	5.917235
NY	0.4714018	2.121333	0.2979642	3.356108
OK	0.4020649	2.487161	0.3679125	2.718038
OR	0.2837070	3.524763	0.1858814	5.379774
PA	0.4149173	2.410119	0.2551255	3.919640
RI	0.2690255	3.717119	0.2486289	4.022059
TN	0.3112968	3.212368	0.2411562	4.146690
UT	0.2693007	3.713321	0.1581206	6.324286
VA	0.3180828	3.143835	0.2876329	3.476654
WI	0.2337557	4.277971	0.1897288	5.270680
WY	0.2776034	3.602261	0.1129324	8.854854

Load county case time series and apply state-date multipliers to get adjusted time series

raw_county_ts = fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
raw_county_ts = raw_county_ts[,.(State = state, date = as.Date(date), county, cases)]
raw_county_ts = merge(raw_county_ts, state_abbrev)
raw_county_ts = raw_county_ts[,.(state = Code, county_cases = cases, county, date)]
#raw_county_ts  %>% kable() %>% kable_styling()

county_ts = merge(case_dt[,.(state, date, empirical_case_multiplier)],
                  raw_county_ts, by = c('state', 'date'))
county_ts[, adjusted_cases := county_cases * empirical_case_multiplier]
#county_ts  %>% kable() %>% kable_styling()

Plot some example counties. New York City, Los Angeles, Contra Costa, San Francisco, Santa Clara, San Mateo

#unique(county_ts[, .(county, state)])[state == "CA"]

ggplot(
  melt(county_ts[county %in% c('New York City', 'Los Angeles', 'Contra Costa', 'San Francisco', 'Santa Clara', 'San Mateo'),
                 .(county, date, adjusted_cases, county_cases)], id.vars = c('county', 'date')),
  aes(x=date, y=value, color = variable)) + geom_line() + facet_wrap(~county, scales = "free", ncol = 3) +
  scale_y_continuous(trans='log2') + geom_smooth()

## Warning in
## melt.data.table(county_ts[county
## %in% c("New York
## City", "Los Angeles", :
## 'measure.vars' [adjusted_cases,
## county_cases] are not all
## of the same type. By order
## of hierarchy, the molten
## data value column will be of
## type 'double'. All measure
## variables not of type 'double'
## will be coerced too. Check
## DETAILS in ?melt.data.table
## for more on coercion.

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Raw and adjusted doubling times

calculate_raw_and_adjusted_growth_rates = function(full_dt, c) {
  
  subset = full_dt[county == c]
  subset[, `:=`(
    log_2_raw_cases = log(county_cases, 2),
    log_2_adj_cases = log(adjusted_cases, 2),
    day = as.numeric(date - min(date))
    )]
  
  raw_coef = lm(log_2_raw_cases ~ day, data = subset)$coefficients[['day']]
  
  adj_coef = lm(log_2_adj_cases ~ day, data = subset)$coefficients[['day']]
  
  return(data.table(
    county = c,
    raw_coef = raw_coef,
    raw_dt = 1/raw_coef,
    adj_coef = adj_coef,
    adj_dt = 1/adj_coef
  ))
}

rbindlist(lapply(c('New York City', 'Los Angeles', 'Contra Costa', 'San Francisco', 'Santa Clara', 'San Mateo'), 
                 function(x) calculate_raw_and_adjusted_growth_rates(county_ts, x))) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover"))

county	raw_coef	raw_dt	adj_coef	adj_dt
New York City	0.4983562	2.006597	0.3249186	3.077694
Los Angeles	0.3416305	2.927139	0.2690445	3.716858
Contra Costa	0.2380726	4.200399	0.1654866	6.042785
San Francisco	0.2542546	3.933065	0.1816686	5.504528
Santa Clara	0.1714294	5.833305	0.0988434	10.117015
San Mateo	0.1798531	5.560094	0.1072671	9.322525