library(data.table)
library(ggplot2)
library(lubridate)
# install.packages("kableExtra")
library(knitr)
library(kableExtra)
# https://github.com/midas-network/COVID-19/blob/master/parameter_estimates/2019_novel_coronavirus/estimates.csv
ICU_LOS = 9
ACUTE_LOS = 7
DAYS_FROM_SYMPTOMS_TO_HOSP = 6
PREV_DT = 6
For states with any hospitalizedCumulative data that is identical to hospitalizedCurrently, only use the hospitalizedCurrently data. For states with only hospitalizedCumulative data that is never identical to hospitalizedCurrently, use hospitalizedCumulative data. For states with no hospitalizedCumulative, use only hospitalizedCurrently if available.
raw_dt = fread('data/daily-covidtracker-0404.csv')[,.(date, state, positive, hospitalizedCurrently, hospitalizedCumulative)]
raw_dt = raw_dt[!is.na(hospitalizedCurrently) | !is.na(hospitalizedCumulative)]
raw_dt[, date := as.Date(as.character(date), format = '%Y %m %d')]
state_data_types = raw_dt[,.(
useHospitalizedCurrently = pmax(
min(ifelse(!is.na(hospitalizedCumulative), 0, 1)),
max(
c(ifelse(hospitalizedCumulative == hospitalizedCurrently, 1, 0),0), na.rm = TRUE))
), by = 'state']
raw_dt = merge(raw_dt, state_data_types)
raw_dt[useHospitalizedCurrently == 1, hospitalizedCumulative := NA]
raw_dt[,useHospitalizedCurrently := NULL]
raw_dt[state == "CA"] %>% kable() %>% kable_styling()
| state | date | positive | hospitalizedCurrently | hospitalizedCumulative |
|---|---|---|---|---|
| CA | 2020-04-04 | 12026 | 2300 | NA |
| CA | 2020-04-03 | 10701 | 2188 | NA |
| CA | 2020-04-02 | 9191 | 1922 | NA |
| CA | 2020-04-01 | 8155 | 1855 | NA |
| CA | 2020-03-31 | 7482 | 1617 | NA |
| CA | 2020-03-30 | 6447 | 1432 | NA |
| CA | 2020-03-29 | 5708 | 1034 | NA |
| CA | 2020-03-28 | 4643 | 1034 | NA |
| CA | 2020-03-27 | 3879 | 746 | NA |
Aggregate to State-level and combine with Covid Tracking time series
county_estimates = fread('data/county_age_severity_rates_v6.csv')
state_agg = county_estimates[,.(
state_population = sum(population_in_age_group),
wtd_critical_case_rate = weighted.mean(critical_case_rate, population_in_age_group),
wtd_acute_case_rate = weighted.mean(severe_cases_rate, population_in_age_group)
), by = 'State']
state_abbrev = fread('data/state_abbrev.csv')[,.(State, Code)]
state_agg = merge(state_abbrev, state_agg, by = 'State')
state_agg = state_agg[,.(
state = Code,
state_population,
exp_prop_hosp_in_icu = ICU_LOS*wtd_critical_case_rate/(ICU_LOS*wtd_critical_case_rate+ACUTE_LOS*wtd_acute_case_rate),
exp_prop_cases_hos = wtd_critical_case_rate + wtd_acute_case_rate
)]
state_agg %>% kable() %>% kable_styling()
| state | state_population | exp_prop_hosp_in_icu | exp_prop_cases_hos |
|---|---|---|---|
| AL | 4864680 | 0.3611620 | 0.0728536 |
| AK | 738516 | 0.3070323 | 0.0594195 |
| AZ | 6946685 | 0.3701946 | 0.0719562 |
| AR | 2990671 | 0.3665662 | 0.0721470 |
| CA | 39148760 | 0.3493473 | 0.0661899 |
| CO | 5531141 | 0.3388134 | 0.0662268 |
| CT | 3581504 | 0.3689069 | 0.0757854 |
| DE | 949495 | 0.3663666 | 0.0764244 |
| DC | 684498 | 0.3369685 | 0.0610181 |
| FL | 20598139 | 0.3889573 | 0.0815067 |
| GA | 10297484 | 0.3360362 | 0.0650482 |
| HI | 1422029 | 0.3785443 | 0.0750363 |
| ID | 1687809 | 0.3573852 | 0.0680004 |
| IL | 12821497 | 0.3581274 | 0.0700571 |
| IN | 6637426 | 0.3584701 | 0.0700123 |
| IA | 3132499 | 0.3767625 | 0.0732518 |
| KS | 2908776 | 0.3663775 | 0.0693705 |
| KY | 4440204 | 0.3566786 | 0.0717877 |
| LA | 4663616 | 0.3518923 | 0.0685007 |
| ME | 1332813 | 0.3757650 | 0.0838364 |
| MD | 6003435 | 0.3501494 | 0.0703946 |
| MA | 6830193 | 0.3643359 | 0.0735064 |
| MI | 9957488 | 0.3644694 | 0.0742530 |
| MN | 5527358 | 0.3602706 | 0.0708289 |
| MS | 2988762 | 0.3562193 | 0.0693892 |
| MO | 6090062 | 0.3664460 | 0.0730646 |
| MT | 1041732 | 0.3700499 | 0.0770009 |
| NE | 1904760 | 0.3675282 | 0.0691997 |
| NV | 2922849 | 0.3469394 | 0.0694797 |
| NH | 1343622 | 0.3605222 | 0.0782417 |
| NJ | 8881845 | 0.3619384 | 0.0730358 |
| NM | 2092434 | 0.3653841 | 0.0721824 |
| NY | 19618453 | 0.3647079 | 0.0725284 |
| NC | 10155624 | 0.3551984 | 0.0710851 |
| ND | 752201 | 0.3715142 | 0.0682487 |
| OH | 11641879 | 0.3663545 | 0.0739021 |
| OK | 3918137 | 0.3607114 | 0.0688876 |
| OR | 4081943 | 0.3633328 | 0.0742762 |
| PA | 12791181 | 0.3768509 | 0.0773543 |
| RI | 1056611 | 0.3724280 | 0.0753384 |
| SC | 4955925 | 0.3603599 | 0.0738609 |
| SD | 864289 | 0.3724149 | 0.0713408 |
| TN | 6651089 | 0.3567203 | 0.0718104 |
| TX | 27885195 | 0.3343842 | 0.0609522 |
| UT | 3045350 | 0.3320692 | 0.0540195 |
| VT | 624977 | 0.3691260 | 0.0803374 |
| VA | 8413774 | 0.3495834 | 0.0696871 |
| WA | 7294336 | 0.3508452 | 0.0695623 |
| WV | 1829054 | 0.3741721 | 0.0804689 |
| WI | 5778394 | 0.3648416 | 0.0737622 |
| WY | 581836 | 0.3550700 | 0.0705211 |
state_ts = merge(raw_dt, state_agg)
state_ts[state == "CA"] %>% kable() %>% kable_styling()
| state | date | positive | hospitalizedCurrently | hospitalizedCumulative | state_population | exp_prop_hosp_in_icu | exp_prop_cases_hos |
|---|---|---|---|---|---|---|---|
| CA | 2020-04-04 | 12026 | 2300 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-04-03 | 10701 | 2188 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-04-02 | 9191 | 1922 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-04-01 | 8155 | 1855 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-03-31 | 7482 | 1617 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-03-30 | 6447 | 1432 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-03-29 | 5708 | 1034 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-03-28 | 4643 | 1034 | NA | 39148760 | 0.3493473 | 0.0661899 |
| CA | 2020-03-27 | 3879 | 746 | NA | 39148760 | 0.3493473 | 0.0661899 |
state_ts[,exp_agg_los := ICU_LOS*exp_prop_hosp_in_icu + ACUTE_LOS*(1-exp_prop_hosp_in_icu)]
state_ts[!is.na(hospitalizedCumulative), imputed_cum_hos := FALSE]
state_ts[is.na(hospitalizedCumulative), `:=`(
hospitalizedCumulative = hospitalizedCurrently/(1 - 2^(-1*exp_agg_los/PREV_DT)),
imputed_cum_hos = TRUE
)]
state_ts = state_ts[,.(state, date, state_population, exp_prop_cases_hos, positive, hospitalizedCumulative, imputed_cum_hos)]
state_ts[state == "CA"] %>% kable() %>% kable_styling()
| state | date | state_population | exp_prop_cases_hos | positive | hospitalizedCumulative | imputed_cum_hos |
|---|---|---|---|---|---|---|
| CA | 2020-04-04 | 39148760 | 0.0661899 | 12026 | 3904 | TRUE |
| CA | 2020-04-03 | 39148760 | 0.0661899 | 10701 | 3714 | TRUE |
| CA | 2020-04-02 | 39148760 | 0.0661899 | 9191 | 3262 | TRUE |
| CA | 2020-04-01 | 39148760 | 0.0661899 | 8155 | 3148 | TRUE |
| CA | 2020-03-31 | 39148760 | 0.0661899 | 7482 | 2744 | TRUE |
| CA | 2020-03-30 | 39148760 | 0.0661899 | 6447 | 2430 | TRUE |
| CA | 2020-03-29 | 39148760 | 0.0661899 | 5708 | 1755 | TRUE |
| CA | 2020-03-28 | 39148760 | 0.0661899 | 4643 | 1755 | TRUE |
| CA | 2020-03-27 | 39148760 | 0.0661899 | 3879 | 1266 | TRUE |
table(state_ts$imputed_cum_hos)
##
## FALSE TRUE
## 351 115
Only plot states with at least 5 data points
state_ts[, state_n_obs := .N, by='state']
state_ts = state_ts[state_n_obs>=5][,state_n_obs := NULL]
ggplot(state_ts, aes(x=date, y=hospitalizedCumulative)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)
Drop the stupid looking ones: AR, IN, ND, NH, OH, SC, SD, TX, VT, WA, WV
state_ts = state_ts[!(state %in% c('AR', 'IN', 'ND', 'NH', 'OH', 'SC', 'SD', 'TX', 'VT', "WA", 'WV'))]
ggplot(state_ts, aes(x=date, y=hospitalizedCumulative)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)
Expect cumulative hospitalizations to equal exp_prop_cases_hos*lag(positive, DAYS_FROM_SYMPTOMS_TO_HOSP)
# Case time series
raw_case_dt = fread('data/daily-covidtracker-0404.csv')[,.(date = as.Date(as.character(date), format = '%Y %m %d'), state, positive)]
raw_case_dt[, hosp_date := date + DAYS_FROM_SYMPTOMS_TO_HOSP]
lagged_case_dt = raw_case_dt[,.(state, date = hosp_date, lagged_cases = positive)]
state_ts = merge(state_ts, lagged_case_dt, by = c('state', 'date'))
state_ts[, exp_cum_hosp := lagged_cases * exp_prop_cases_hos]
state_ts[state == "CA"] %>% kable() %>% kable_styling()
| state | date | state_population | exp_prop_cases_hos | positive | hospitalizedCumulative | imputed_cum_hos | lagged_cases | exp_cum_hosp |
|---|---|---|---|---|---|---|---|---|
| CA | 2020-03-27 | 39148760 | 0.0661899 | 3879 | 1266 | TRUE | 1279 | 84.65689 |
| CA | 2020-03-28 | 39148760 | 0.0661899 | 4643 | 1755 | TRUE | 1536 | 101.66769 |
| CA | 2020-03-29 | 39148760 | 0.0661899 | 5708 | 1755 | TRUE | 1733 | 114.70710 |
| CA | 2020-03-30 | 39148760 | 0.0661899 | 6447 | 2430 | TRUE | 2102 | 139.13118 |
| CA | 2020-03-31 | 39148760 | 0.0661899 | 7482 | 2744 | TRUE | 2355 | 155.87723 |
| CA | 2020-04-01 | 39148760 | 0.0661899 | 8155 | 3148 | TRUE | 3006 | 198.96685 |
| CA | 2020-04-02 | 39148760 | 0.0661899 | 9191 | 3262 | TRUE | 3879 | 256.75064 |
| CA | 2020-04-03 | 39148760 | 0.0661899 | 10701 | 3714 | TRUE | 4643 | 307.31973 |
| CA | 2020-04-04 | 39148760 | 0.0661899 | 12026 | 3904 | TRUE | 5708 | 377.81198 |
state_ts[, empirical_case_multiplier := hospitalizedCumulative/exp_cum_hosp]
ggplot(state_ts, aes(x=date, y=empirical_case_multiplier)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6)
mult_dt = state_ts[,.(state, hosp_date = date, empirical_case_multiplier)]
case_dt = merge(raw_case_dt, mult_dt, by = c('state', 'hosp_date'))
case_dt[, adjusted_positive := positive * empirical_case_multiplier]
case_dt = case_dt[positive>0 & adjusted_positive>0]
ggplot(
melt(case_dt[,.(state, date, adjusted_positive, positive)], id.vars = c('state', 'date')),
aes(x=date, y=value, color = variable)) + geom_line() + facet_wrap(~state, scales = "free", ncol = 6) +
scale_y_continuous(trans='log2') + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'
calculate_raw_and_adjusted_growth_rates = function(full_dt, s) {
subset = full_dt[state == s]
subset[, `:=`(
log_2_raw_cases = log(positive, 2),
log_2_adj_cases = log(adjusted_positive, 2),
day = as.numeric(date - min(date))
)]
raw_coef = lm(log_2_raw_cases ~ day, data = subset)$coefficients[['day']]
adj_coef = lm(log_2_adj_cases ~ day, data = subset)$coefficients[['day']]
return(data.table(
state = s,
raw_coef = raw_coef,
raw_dt = 1/raw_coef,
adj_coef = adj_coef,
adj_dt = 1/adj_coef
))
}
rbindlist(lapply(case_dt[,unique(state)], function(x) calculate_raw_and_adjusted_growth_rates(case_dt, x))) %>% kable() %>% kable_styling()
| state | raw_coef | raw_dt | adj_coef | adj_dt |
|---|---|---|---|---|
| AK | 0.3856168 | 2.593248 | 0.3269181 | 3.058870 |
| AZ | 0.4796230 | 2.084971 | 0.4505924 | 2.219301 |
| CA | 0.2710070 | 3.689941 | 0.1984210 | 5.039788 |
| CO | 0.3069926 | 3.257407 | 0.3269363 | 3.058700 |
| CT | 0.2994571 | 3.339377 | 0.1958279 | 5.106525 |
| DE | 0.3044390 | 3.284731 | 0.3825318 | 2.614162 |
| FL | 0.3782962 | 2.643432 | 0.2303679 | 4.340883 |
| GA | 0.3212578 | 3.112765 | 0.1609529 | 6.212997 |
| HI | 0.3079652 | 3.247120 | 0.1855999 | 5.387933 |
| IA | 0.3202706 | 3.122359 | 0.2153121 | 4.644422 |
| ID | 0.4334699 | 2.306965 | 0.1592757 | 6.278420 |
| KS | 0.3267809 | 3.060154 | 0.3596481 | 2.780496 |
| LA | 0.3604374 | 2.774407 | 0.2425873 | 4.122228 |
| MA | 0.3849194 | 2.597946 | 0.3216632 | 3.108841 |
| MD | 0.3383464 | 2.955551 | 0.2858288 | 3.498597 |
| ME | 0.1916580 | 5.217626 | 0.1457033 | 6.863261 |
| MN | 0.2602138 | 3.842994 | 0.2927941 | 3.415370 |
| MS | 0.4556541 | 2.194647 | 0.2652563 | 3.769938 |
| MT | 0.3673990 | 2.721836 | 0.3941127 | 2.537345 |
| NC | 0.3431944 | 2.913800 | 0.3078194 | 3.248658 |
| NM | 0.2912744 | 3.433189 | 0.1689978 | 5.917235 |
| NY | 0.4714018 | 2.121333 | 0.2979642 | 3.356108 |
| OK | 0.4020649 | 2.487161 | 0.3679125 | 2.718038 |
| OR | 0.2837070 | 3.524763 | 0.1858814 | 5.379774 |
| PA | 0.4149173 | 2.410119 | 0.2551255 | 3.919640 |
| RI | 0.2690255 | 3.717119 | 0.2486289 | 4.022059 |
| TN | 0.3112968 | 3.212368 | 0.2411562 | 4.146690 |
| UT | 0.2693007 | 3.713321 | 0.1581206 | 6.324286 |
| VA | 0.3180828 | 3.143835 | 0.2876329 | 3.476654 |
| WI | 0.2337557 | 4.277971 | 0.1897288 | 5.270680 |
| WY | 0.2776034 | 3.602261 | 0.1129324 | 8.854854 |
raw_county_ts = fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
raw_county_ts = raw_county_ts[,.(State = state, date = as.Date(date), county, cases)]
raw_county_ts = merge(raw_county_ts, state_abbrev)
raw_county_ts = raw_county_ts[,.(state = Code, county_cases = cases, county, date)]
#raw_county_ts %>% kable() %>% kable_styling()
county_ts = merge(case_dt[,.(state, date, empirical_case_multiplier)],
raw_county_ts, by = c('state', 'date'))
county_ts[, adjusted_cases := county_cases * empirical_case_multiplier]
#county_ts %>% kable() %>% kable_styling()
Plot some example counties. New York City, Los Angeles, Contra Costa, San Francisco, Santa Clara, San Mateo
#unique(county_ts[, .(county, state)])[state == "CA"]
ggplot(
melt(county_ts[county %in% c('New York City', 'Los Angeles', 'Contra Costa', 'San Francisco', 'Santa Clara', 'San Mateo'),
.(county, date, adjusted_cases, county_cases)], id.vars = c('county', 'date')),
aes(x=date, y=value, color = variable)) + geom_line() + facet_wrap(~county, scales = "free", ncol = 3) +
scale_y_continuous(trans='log2') + geom_smooth()
## Warning in
## melt.data.table(county_ts[county
## %in% c("New York
## City", "Los Angeles", :
## 'measure.vars' [adjusted_cases,
## county_cases] are not all
## of the same type. By order
## of hierarchy, the molten
## data value column will be of
## type 'double'. All measure
## variables not of type 'double'
## will be coerced too. Check
## DETAILS in ?melt.data.table
## for more on coercion.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Raw and adjusted doubling times
calculate_raw_and_adjusted_growth_rates = function(full_dt, c) {
subset = full_dt[county == c]
subset[, `:=`(
log_2_raw_cases = log(county_cases, 2),
log_2_adj_cases = log(adjusted_cases, 2),
day = as.numeric(date - min(date))
)]
raw_coef = lm(log_2_raw_cases ~ day, data = subset)$coefficients[['day']]
adj_coef = lm(log_2_adj_cases ~ day, data = subset)$coefficients[['day']]
return(data.table(
county = c,
raw_coef = raw_coef,
raw_dt = 1/raw_coef,
adj_coef = adj_coef,
adj_dt = 1/adj_coef
))
}
rbindlist(lapply(c('New York City', 'Los Angeles', 'Contra Costa', 'San Francisco', 'Santa Clara', 'San Mateo'),
function(x) calculate_raw_and_adjusted_growth_rates(county_ts, x))) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover"))
| county | raw_coef | raw_dt | adj_coef | adj_dt |
|---|---|---|---|---|
| New York City | 0.4983562 | 2.006597 | 0.3249186 | 3.077694 |
| Los Angeles | 0.3416305 | 2.927139 | 0.2690445 | 3.716858 |
| Contra Costa | 0.2380726 | 4.200399 | 0.1654866 | 6.042785 |
| San Francisco | 0.2542546 | 3.933065 | 0.1816686 | 5.504528 |
| Santa Clara | 0.1714294 | 5.833305 | 0.0988434 | 10.117015 |
| San Mateo | 0.1798531 | 5.560094 | 0.1072671 | 9.322525 |