Introduction

I started a Twitter thread earlier on this topic in reply to Gregg Gonsalves’ over-the-top response to David Leonhardt’s recent article arguing racial disparites have essentially closed.

The crux of their argument is that there are still a huge racial disparities after age-adjustments are made, i.e., the white population is simply so much older. Well, I’ve checked and they’re wrong about this. Insofar as I can tell, not one of them actually did any sort of direct age adjustment on the more recent data to support their assertions specifically contradicting Leonhardt.

This post is largely a reiteration of what I’ve already shared in that Twitter thread. However, I’m assembling this in an R Notebook here so that those interested can independently reproduce the analysis.

Data prep

library(tidyverse)
library(janitor)
library(ggplot2)
library(ggrepel)
library(hrbrthemes)
library(reshape2)
library(ggpmisc) 
library(viridis)
library(knitr)
library(scales)
library(lubridate)


getLogbreaks=function(start=-2,end=100) {
  c(
    2*10^(start:end),
    3*10^(start:end),
    4*10^(start:end),
    5*10^(start:end),
    6*10^(start:end),
    7*10^(start:end),
    8*10^(start:end),
    9*10^(start:end)
  ) %>%
  sort()
}

Population statistics

These are derived from census data viaCDC WONDER in three-parts: non-hispanic races, hispanics, and all combined. Please note that I am using 2020 census estimates for 2021-2022 because we do not have actual estimates available for these years at this level of detail yet. The population may have changed slightly in predictable and unpredictable ways, but these differences are likely fairly modest. This is also the approach used by CDC with Provisional Mortality Statistics for 2021 and 2022 at current.

Note: The finished result is reproduced here “inline”, but you can reconstitute from original sources if you follow the code I have commented out.

age_cw=tribble(
  ~age_group,~min_age,~max_age,
  '0-4 years',0,4,
  '5-11 years',5,11,
  '12-17 years',12,17,
  '18-29 years',18,29,
  '30-39 years',30,39,
  '40-49 years',40,49,
  '50-64 years',50,64,
  '65-74 years',65,74,
  '75-84 years',75,84,
  '85 years and over',85,150 # single year estimates end at 85+, but to make it clear!
)



getPopData=function() {
  
  # helper function to ease reading CDC tab delimited files
  readCDCPopData=function(filename,groupName=NA) {
  
  
    race=NA # a hack to stick this in namespace to avoid error when column not used
    data=read_delim(filename,delim='\t') %>%
      clean_names() %>%
      data.frame() %>%
      transmute(
        group=coalesce(groupName,race),
        age=as.integer(str_extract(single_year_ages_code,'\\d+')),
        pop=as.integer(population),
        note=ifelse(single_year_ages_code=='85+','Includes population 85 and older!',NA)
      ) %>%
      filter(
        !is.na(age)
      )
  }
  
  # saved and renamed files from CDC wonder on local disk
  bind_rows(
    readCDCPopData("population_2020_races_nonhispanic.csv"),
    readCDCPopData("population_2020_hispanic.csv",'Hispanic'),
    readCDCPopData("population_2020_all.csv",'ALL')
  )
  

  
  
}

getPopFormatted=function() {
  
    getPopData() %>%
      full_join(age_cw,by=character()) %>%
      filter(
        age >= min_age,
        age <= max_age
      ) %>%
      group_by(age_group,group,min_age,max_age) %>%
      summarise(
        pop=sum(pop),
        num_age_rows=n()
      ) %>%
      ungroup() %>%
      mutate(
        age_group=reorder(age_group,min_age)
      ) %>%
      reshape2::dcast(age_group~group,value.var='pop')
  
}


# pop_wide =getPopFormatted()
## pop_ref = datapasta::dpasta(pop_wide)  # generate dataframe in rstudio from the results

 
pop_ref=data.frame(
                                  check.names = FALSE,
                                    age_group = as.factor(c("0-4 years",
                                                            "5-11 years",
                                                            "12-17 years","18-29 years",
                                                            "30-39 years",
                                                            "40-49 years",
                                                            "50-64 years","65-74 years",
                                                            "75-84 years",
                                                            "85 years and over")),                                  
                                          ALL = c(19301292L,28384878L,25135943L,
                                                  53258240L,44666707L,40278494L,
                                                  62799204L,32549398L,16451547L,
                                                  6658420L),
           `American Indian or Alaska Native` = c(147997L,
                                                  234422L,212251L,449959L,
                                                  341952L,285908L,442303L,
                                                  201422L,87254L,28870L),
                                        Asian = c(1088502L,1555283L,1294372L,
                                                  3220929L,3313531L,2887905L,
                                                  3387171L,1561105L,749059L,309340L),
                  `Black or African American` = c(2681631L,3900784L,3424789L,
                                                  7644764L,5955322L,5124837L,
                                                  7510669L,3246374L,1413506L,524665L),
                                     Hispanic = c(4999801L,7292139L,6339895L,
                                                  11884993L,9178580L,8091692L,
                                                  8643959L,2978586L,1368466L,
                                                  534768L),
                         `More than one race` = c(991183L,
                                                  1356456L,1013863L,1596965L,
                                                  853687L,601197L,690101L,
                                                  284037L,124128L,45854L),
  `Native Hawaiian or Other Pacific Islander` = c(44258L,
                                                  60487L,50873L,109093L,
                                                  103385L,80425L,100650L,41017L,
                                                  17209L,6110L),
                                        White = c(9347920L,13985307L,12799900L,
                                                  28351537L,24920250L,23206530L,
                                                  42024351L,24236857L,12691925L,
                                                  5208813L)

)


kable(pop_ref,caption='2020 Census estimates')

2020 Census estimates
age_group	ALL	American Indian or Alaska Native	Asian	Black or African American	Hispanic	More than one race	Native Hawaiian or Other Pacific Islander	White
0-4 years	19301292	147997	1088502	2681631	4999801	991183	44258	9347920
5-11 years	28384878	234422	1555283	3900784	7292139	1356456	60487	13985307
12-17 years	25135943	212251	1294372	3424789	6339895	1013863	50873	12799900
18-29 years	53258240	449959	3220929	7644764	11884993	1596965	109093	28351537
30-39 years	44666707	341952	3313531	5955322	9178580	853687	103385	24920250
40-49 years	40278494	285908	2887905	5124837	8091692	601197	80425	23206530
50-64 years	62799204	442303	3387171	7510669	8643959	690101	100650	42024351
65-74 years	32549398	201422	1561105	3246374	2978586	284037	41017	24236857
75-84 years	16451547	87254	749059	1413506	1368466	124128	17209	12691925
85 years and over	6658420	28870	309340	524665	534768	45854	6110	5208813

Compute age weights

# compute age weights using entire population in 2020 as reference
age_weights=pop_ref %>%
  transmute(
    age_group,
    age_weight=ALL/sum(ALL)
  )


kable(
  mutate(age_weights,age_weight=round(age_weight*100,2) ),
  caption='2020 Population weights for age-adjustment'
)

2020 Population weights for age-adjustment
age_group	age_weight
0-4 years	5.86
5-11 years	8.61
12-17 years	7.63
18-29 years	16.16
30-39 years	13.56
40-49 years	12.22
50-64 years	19.06
65-74 years	9.88
75-84 years	4.99
85 years and over	2.02

Reformat population data for merging with covid data

pop_narrow=reshape2::melt(pop_ref,id.vars=c('age_group','ALL')) %>%
  transmute(
    age_group,
    group=as.character(variable),
    race_and_hispanic_origin_group=ifelse(group=='Hispanic',group,paste('Non-Hispanic',group,sep=' ')),
    race_and_hispanic_origin_group=str_replace(race_and_hispanic_origin_group,'Black or African American','Black'),
    pop=value
  ) %>%
  merge(age_weights,by='age_group')

Calculate weights for vaccination rate survey

I added this part later on. It’s not necessary to age-adjust deaths.

popd=getPopData()




covid_2020_deaths=tibble::tribble(
  ~age, ~deaths,     ~pop,  ~rate,
   18L,     31L, 4159857L,    0.7,
   19L,     49L, 4272385L,    1.1,
   20L,     51L, 4344572L,    1.2,
   21L,     65L, 4283150L,    1.5,
   22L,     82L, 4293020L,    1.9,
   23L,     90L, 4315220L,    2.1,
   24L,     95L, 4358793L,    2.2,
   25L,    112L, 4461006L,    2.5,
   26L,    133L, 4555511L,    2.9,
   27L,    164L, 4628618L,    3.5,
   28L,    157L, 4751001L,    3.3,
   29L,    226L, 4835107L,    4.7,
   30L,    236L, 4821081L,    4.9,
   31L,    273L, 4627340L,    5.9,
   32L,    266L, 4514736L,    5.9,
   33L,    321L, 4432699L,    7.2,
   34L,    366L, 4442547L,    8.2,
   35L,    365L, 4466747L,    8.2,
   36L,    381L, 4319361L,    8.8,
   37L,    446L, 4374441L,   10.2,
   38L,    519L, 4362662L,   11.9,
   39L,    556L, 4305093L,   12.9,
   40L,    602L, 4378957L,   13.7,
   41L,    750L, 4100150L,   18.3,
   42L,    720L, 4014287L,   17.9,
   43L,    809L, 3967810L,   20.4,
   44L,    931L, 3846684L,   24.2,
   45L,   1051L, 3957972L,   26.6,
   46L,   1114L, 3826694L,   29.1,
   47L,   1269L, 3876923L,   32.7,
   48L,   1482L, 4043448L,   36.7,
   49L,   1682L, 4265569L,   39.4,
   50L,   1833L, 4310670L,   42.5,
   51L,   1891L, 4076434L,   46.4,
   52L,   2051L, 3983727L,   51.5,
   53L,   2191L, 3979962L,   55.1,
   54L,   2400L, 4044734L,   59.3,
   55L,   2674L, 4277856L,   62.5,
   56L,   3033L, 4343881L,   69.8,
   57L,   3263L, 4327862L,   75.4,
   58L,   3545L, 4306615L,   82.3,
   59L,   4116L, 4346885L,   94.7,
   60L,   4427L, 4372095L,  101.3,
   61L,   4641L, 4208976L,  110.3,
   62L,   5169L, 4168433L,    124,
   63L,   5480L, 4106492L,  133.4,
   64L,   5742L, 3944582L,  145.6,
   65L,   6201L, 3897420L,  159.1,
   66L,   6355L, 3720410L,  170.8,
   67L,   6610L, 3562236L,  185.6,
   68L,   6929L, 3407082L,  203.4,
   69L,   7406L, 3286519L,  225.3,
   70L,   7773L, 3192029L,  243.5,
   71L,   8153L, 3074732L,  265.2,
   72L,   8790L, 3017173L,  291.3,
   73L,   9306L, 3117385L,  298.5,
   74L,   8754L, 2274412L,  384.9,
   75L,   8533L, 2219565L,  384.4,
   76L,   9156L, 2130427L,  429.8,
   77L,  10115L, 2147097L,  471.1,
   78L,   9876L, 1839828L,  536.8,
   79L,   9521L, 1649916L,  577.1,
   80L,   9772L, 1526957L,    640,
   81L,   9752L, 1400392L,  696.4,
   82L,  10063L, 1303809L,  771.8,
   83L,  10027L, 1163249L,    862,
   84L,  10215L, 1070307L,  954.4,
   85L, 109529L, 6658420L,   1645
  )




vax_age_cw=tribble(
  ~age_group,~min_age,~max_age,
  '18 - 49 years',18,49,
  '50 - 64 years',50,64,
  '65+ years',65,150
)

vax_pop=popd %>%
  full_join(vax_age_cw,by=character()) %>%
  filter(
    age >= min_age,
    age <= max_age
  ) %>%
  group_by(age_group,group,min_age,max_age) %>%
  summarise(
    pop=sum(pop),
    num_age_rows=n()
  ) %>%
  ungroup() %>%
  mutate(
    age_group=reorder(age_group,min_age)
  )



vax_weights=vax_pop %>%
  filter(
    group=='ALL'
  ) %>%
  transmute(
    age_group,
    pop_weight=pop/sum(pop) # standard age-adjustement weights with 2020 population
  )

death_weights=covid_2020_deaths  %>%
  full_join(vax_age_cw,by=character()) %>%
  filter(
    age >= min_age,
    age <= max_age
  ) %>%
  group_by(age_group,min_age,max_age) %>%
  summarise(
    pop=sum(pop),
    deaths=sum(deaths),
    crude_rate=deaths/pop*1e5,
    num_age_rows=n()
  ) %>%
  ungroup() %>%
  transmute(
    age_group=reorder(age_group,min_age),
    death_weight=deaths/sum(deaths,na.rm=T) # non-standard weights with 2020 covid-19 deaths (counts)
  )

vax_weights_comb=merge(vax_weights,death_weights,by='age_group')




kable(
  vax_weights_comb %>% 
    mutate(
      pop_weight=round(pop_weight*100,2),
      death_weight=round(death_weight*100,2)
    ),
  caption='2020 Population weights for age-adjustment'
)

2020 Population weights for age-adjustment
age_group	pop_weight	death_weight
18 - 49 years	53.85	4.39
50 - 64 years	24.47	14.96
65+ years	21.69	80.65

Get covid-19 death data

This particular dataset, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022, comes from CDC data catalog system.

deaths_csv_url='https://data.cdc.gov/api/views/siwp-yg6m/rows.csv?accessType=DOWNLOAD'


if ( !exists('covid_prov') ) {
  
  covid_prov=read_csv(deaths_csv_url) %>%
    clean_names() %>%
    data.frame() %>%
    merge(pop_narrow,by=c('age_group','race_and_hispanic_origin_group') ) %>%
    transmute(
      date=as.Date.character(week_ending_date,"%m/%d/%Y"), # week ending date
      race_and_hispanic_origin_group,
      re=race_and_hispanic_origin_group,
      age_group=reorder(age_group,as.integer(str_extract(age_group,'\\d+')) ),
      covid_death_rate=covid_19_deaths/pop*1e6, # covid deaths per million
      total_death_rate=total_deaths/pop*1e6,
      covid_death_share=covid_19_deaths/total_deaths,
      covid_19_deaths,
      total_deaths,
      pop,
      age_weight
  )
  
} else {
  warning("Re-using previous download of covid-19 deaths!")
}

Calculate age-adjusted rates

ageAdjRates=covid_prov %>%
  group_by(date,race_and_hispanic_origin_group) %>%
  summarise(
    crude_rate=sum(covid_19_deaths)/sum(pop)*1e6,
    covid_death_rate_adj=sum(covid_death_rate*age_weight),
    num_deaths=sum(covid_19_deaths),
    se=(covid_death_rate_adj)/sqrt(num_deaths),
    ci=(1.96*se),
    rate=covid_death_rate_adj,
    rate_lo=covid_death_rate_adj-ci,
    rate_hi=covid_death_rate_adj+ci,
    num_groups=n()
  ) %>%
  ungroup() %>%
  mutate(
    re=race_and_hispanic_origin_group
  )

Deaths analysis

Long term trend

The long-term trend on age-adjusted rates

# https://data.cdc.gov/NCHS/AH-Provisional-COVID-19-Death-Counts-by-Week-Race-/siwp-yg6m
ageAdjRates %>%
  filter(
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black',
      'Non-Hispanic Asian'
    ),
    T
  ) %>%
  ggplot(aes(date,covid_death_rate_adj,color=re,group=re)) +
    coord_cartesian(xlim=as.Date(c('2020-03-15',NA)),expand=F) +
    scale_x_date(date_breaks='2 months',date_labels = '%b %Y') +
    geom_ribbon(aes(ymin=rate_lo,ymax=rate_hi),alpha=0.3) +
    geom_line(size=1) +
    theme_ipsum() +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90,vjust=0.5)

    ) +
    labs(
      y='weekly covid-19 deaths per million, age-adjusted',
      x='Week ending date (epidemiological weeks)',
      title='Age-adjusted covid-19 death rate by week',
      caption=paste(
        'source: US CDC, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022',
        'and 2020 US census statistics via CDC WONDER',
        'Date adjusted in 10 age-intervals provided by CDC using 2020 census as denominator and population weights',
        sep='\n'
      )
    ) +
    geom_text_repel(aes(label=re),data=~filter(.,date=='2021-01-02'))

The recent trend in age-adjusted rates, zoomed-in as it were

ageAdjRates %>%
  filter(
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black',
      'Non-Hispanic Asian'
    ),
    date >= '2022-03-01',
    
    T
  ) %>%
  ggplot(aes(date,covid_death_rate_adj,color=re,group=re)) +
    geom_line(size=2) +
    theme_ipsum() +
    theme(legend.position='none') +
    labs(
      y='weekly covid-19 deaths per million, age-adjusted',
      x='Week ending date (epidemiological weeks)',
      title='Age-adjusted death rates "flipped" recently',
      caption=paste(
        'source: US CDC, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022',
        'and 2020 US census statistics via CDC WONDER',
        'Date adjusted in 10 age-intervals provided by CDC using 2020 census as denominator and population weights',
        sep='\n'
      )
    ) +
    #scale_y_log10()  +
    geom_text_repel(aes(label=re),data=~filter(.,date=='2022-05-07'))

# fix for missing fonts issue - bug with Rttf2pt1 version (segfaults)
# library(extrafont)
# library(remotes)
# remotes::install_version("Rttf2pt1", version = "1.3.8")
# extrafont::font_import()
#extrafont::loadfonts()

Focus on recent weeks and months

ageAdjRates %>%
  filter(
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black'
    ),
    date >= '2022-04-01',
    T
  ) %>%
  ggplot(aes(date,rate,color=re,group=re)) +
    scale_x_date(date_breaks='1 week',date_labels = '%b %d %Y') +
    geom_ribbon(aes(ymin=rate_lo,ymax=rate_hi),alpha=0.3) +
    geom_line(size=2) +
    theme_ipsum() +
    theme(
      legend.position='bottom',
      axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)
    ) +
    labs(
      color=NULL,
      y='weekly covid-19 deaths per million, age-adjusted',
      x='Week ending date (epidemiological weeks)',
      title='These are also statistically significant differences!',
      caption=paste(
        'source: US CDC, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022',
        'and 2020 US census statistics via CDC WONDER',
        'Date adjusted in 10 age-intervals provided by CDC using 2020 census as denominator and population weights',
        'Shaded area is 95% confidence interval, approximated on standardized rates (Keyfits, 1966)',
        sep='\n'
      )
    ) +
    coord_cartesian(expand=F)

It’s about elderly death rates

This is easily explained by higher elderly death rates.

covid_prov %>%
  filter(
    as.integer(str_extract(age_group,"\\d+")) >= 60,
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black'
    ),
    date >= '2022-04-01'
  ) %>%
  ggplot(aes(date,covid_death_rate,color=re)) +
    geom_line(size=1) +
    theme_ipsum() +
    theme(legend.position='none') +
    facet_wrap(~age_group) +
    geom_text_repel(aes(label=re),data=~filter(.,date=='2022-05-07')) +
    labs(
      title='The higher age-adjusted rate is explained by higher elderly\ncovid-19 death rates amongst whites recently',
      y='covid-19 deaths per million'
    )

It is possible that young black and latinos still suffer more (it’s getting very close!). However, even if the ratio for young blacks and latinos were significantly greater than one, the higher absolute death rate amongst the elderly simply carries vastly more weight even after age-adjustment.

covid_prov %>%
  merge(age_cw,by='age_group') %>%
  filter(
    max_age <= 29,
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black'
    ),
    date >= '2022-02-01'
  ) %>%
  group_by(date,re) %>% 
  summarise(
    covid_death_rate=sum(covid_19_deaths)/sum(pop)*1e6
  ) %>%  
  ungroup() %>%
  ggplot(aes(date,covid_death_rate,color=re)) +
    geom_line(size=1) +
    theme_ipsum() +
    theme(legend.position='none') +
    #facet_wrap(~age_group,scale='free_y') +
    geom_text_repel(aes(label=re),data=~filter(.,date=='2022-05-07')) +
    labs(
      title='Even though young black and latinos may still suffer more',
      subtitle='(it is getting very close!)',
      y='covid-19 deaths per million (crude rate for ages <= 29)'
    )

# this bit is designed to automatically update frame 
# and plot as CDC updates the deaths data set (could change relationship in future weeks!)
# last epiweek ended here May 28 2022

covid_prov %>%
  filter(
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black',
      'Non-Hispanic Asian'
    ),
    date >= '2022-04-15',
    T
  ) %>%
  group_by(re,age_group) %>%
  summarise(
    covid_deaths=sum(covid_death_rate),
    fdate=min(date)-6,
    edate=max(date)
  ) %>%
  ungroup() %>%
  merge(age_cw,by='age_group') %>%
  mutate(
    val=round(covid_deaths,1),
    lbl=sprintf("%0.1f",val),
    lbl=str_pad(lbl, 5, pad = " "),
    revage=reorder(age_group,-min_age),
    re=reorder(re,covid_deaths,sum),
    cv=ifelse(covid_deaths<.01,.01,covid_deaths)
  ) %>% {
  
  fdate=as.character.Date(first(.$fdate),"%b %d %Y")
  edate=as.character.Date(first(.$edate),"%b %d %Y")

  ptitle=sprintf("TOTAL COVID-19 Deaths per million\n%s - %s",fdate,edate)
  
  ggplot(.,aes(re,revage)) +
    geom_tile(aes(fill=cv),alpha=0.6,color='black') +
    geom_text(aes(label=lbl),size=5) +
    theme_ipsum() +
    theme(
      axis.text.x = element_text(angle = 45,hjust=1,size=15),
      legend.position='hide',
      panel.grid.major = element_blank(),
      panel.grid.minor.x = element_blank(),
      panel.grid.minor.y = element_blank()

    ) +
    scale_fill_viridis(trans='log10') +
    labs(
      x=NULL,
      y=NULL,
      title=ptitle,
      caption=paste(
        'source: US CDC, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022',
        'and 2020 population estimates via CDC WONDER.',
        sep='\n'
      )
    )

}

We can decompose the contribution of the differences between racial/ethnic groups within age groups according to its impact on age-adjusted calculations (omitting the youngest age groups for visibility).

covid_prov %>%
  group_by(date,age_group) %>%
  mutate(
    rate_diff=covid_death_rate-mean(ifelse(re=='Non-Hispanic White',covid_death_rate,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  merge(age_cw,by='age_group') %>%
  filter(
    re %in% c('Non-Hispanic Black','Hispanic'),
    date>= '2022-01-01',
    min_age >= 30
  ) %>%
  ggplot(aes(date,rate_diff*age_weight,color=re)) +
    geom_hline(yintercept=0) +
    geom_line(size=1) +
    facet_wrap(~age_group) +
    scale_x_date(date_breaks='1 months', date_labels ='%b %Y') +
    theme_ipsum() +
    theme(
      legend.position='bottom',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.spacing = unit(.1, "lines")
    ) +
    labs(
      title='Differences in rates amongst older age groups\nare likely to dominate any reasonable age-adjustment calculation',
      subtitle='(plot decomposes contribution of racial/ethnic difference by age groups according to its impact)',
      x='MMWR week ending date',
      y='Weekly covid-19 deaths per million, difference from non-hispanic whites\nadjusted to standard 2020 age structure (for all races/ethnicites)',
      color=NULL
    )

We can also trend the ratios of age-adjusted rates

ageAdjRates %>%
  group_by(date) %>%
  mutate(
    ratio_of_white=covid_death_rate_adj/mean(ifelse(re=='Non-Hispanic White',covid_death_rate_adj,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    re %in% c('Non-Hispanic Black','Hispanic'),
    date >= '2020-03-15'
  ) %>%
  ggplot(aes(date,ratio_of_white,color=re)) +
    geom_hline(yintercept=1,size=2) +
  
    geom_line(size=2) +
    geom_smooth(method=lm) +
    scale_y_log10(
      minor_breaks=getLogbreaks(-1,5),
      sec.axis=sec_axis(
        ~.,
        #minor_breaks=getLogbreaks(-1,5)
        breaks=c(seq(.1,1,.1),seq(1,10,1))
      )
    ) +
    annotation_logticks(sides='lr')  +
    coord_cartesian(ylim=c(.1,10),expand=F) +
    scale_x_date(date_breaks='1 months', date_labels ='%b %Y') +

    theme_ipsum() +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank(),
      panel.grid.major.y=element_line(size=1)

    ) +
    labs(
      title='The disparity in age-adjusted death rates has been trending down\nsince the early days of the pandemic',
      subtitle='(current ratios are about where the trend would indicate +/- volatility)',
      y='Age-adjusted rates, ratio of non-hispanic white rate'
    ) +
    geom_label_repel(aes(label=re),data=~filter(.,date=='2020-07-11'))

    #coord_cartesian(expand=F)

Likewise, we can compare the differnces in age-adjusted rates.

ageAdjRates %>%
  group_by(date) %>%
  mutate(
    diff=covid_death_rate_adj-mean(ifelse(re=='Non-Hispanic White',covid_death_rate_adj,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    re %in% c('Non-Hispanic Black','Hispanic'),
    date >= '2020-03-15'
  ) %>%
  ggplot(aes(date,diff,color=re)) +
    geom_hline(yintercept=0,size=2) +
    geom_line(size=2) +
    geom_smooth(method=lm) +
    #coord_cartesian(ylim=c(.1,10),expand=F) +
    scale_x_date(date_breaks='1 months', date_labels ='%b %Y') +
    scale_y_continuous(breaks=seq(-100,100,10)) +
    theme_ipsum() +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank(),
      panel.grid.major.y=element_line(size=1)

    ) +
    labs(
      title='The age-adjusted rates have also been converging',
      y='Age-adjusted covid-19 deaths per million, amount greater than whites'
    ) +
    geom_label_repel(aes(label=re),data=~filter(.,date=='2020-07-11'))

The ‘Simpson’s Paradox’ bit fails to address the claim

Some of them claimed age-adjusted estimates on 2022 year-to-date data directly contradicts Leonhardt and that it’s simply an example of Simpson’s Paradox. While it’s technically true that 2022 YTD data still indicate a disparity, this change is too recent for it to have an affect on YTD data. The disparities were still there early in 2022 and the death rates were simply much higher. Even though the ratios now favor blacks and hispanics, the absolute differences are too small and too recent to have a very large effect on the deaths accumulated in the earlier winter wave.

ageAdjRates %>%
  filter(
    re %in% c(
      'Non-Hispanic White',
      'Hispanic',
      'Non-Hispanic Black'
    ),
    date >= '2022-01-01',
    T
  ) %>%
  ggplot(aes(date,covid_death_rate_adj,color=re,group=re)) +
    geom_line(size=1) +
    theme_ipsum() +
    geom_vline(xintercept=as.Date(c('2022-04-15','2022-01-01')),linetype='dashed') +
    theme(legend.position='none') +
    scale_x_date(date_breaks='1 months', date_labels ='%b %Y') +
    labs(
      y='weekly covid-19 deaths per million, age-adjusted',
      x='Week ending date (epidemiological weeks)',
      title='That the sum of this implies higher minority death rates\nYTD does not imply it has not "flipped" recently!',
      caption=paste(
        'source: US CDC, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022',
        'and 2020 US census statistics via CDC WONDER',
        'Date adjusted in 10 age-intervals provided by CDC using 2020 census as denominator and population weights',
        sep='\n'
      )
    ) +
    #scale_y_log10()  +
    geom_text_repel(aes(label=re),data=~filter(.,date=='2022-05-07'))

This is more obvious if we split 2022 around the time when this ‘flip’ happened.

alt_list=tribble(
  ~label,~start_date,~end_date,
  'First','2022-01-08','2022-04-09',
  'Second','2022-04-16','2022-12-01'
  ) %>%
  mutate(
    start_date=as.Date(start_date),
    end_date=as.Date(end_date)
  )


covid_prov %>%
  full_join(alt_list,by=character()) %>%
  filter(
    date >= start_date,
    date <= end_date,
    re %in% c('Hispanic','Non-Hispanic White','Non-Hispanic Black')
  ) %>%
  group_by(re,age_group,label) %>%
  summarise(
    mean_rate=mean(covid_death_rate,na.rm=T),
    sum_rate=sum(covid_death_rate,na.rm=T),
    num_weeks=n(),
    min_date=min(date)-6,
    max_date=max(date)
  ) %>%
  ungroup() %>%
  mutate(
    time_label=sprintf("%s - %s",as.character.Date(min_date,"%b %d %Y"),as.character(max_date,"%b %d %Y")),
    time_label=reorder(time_label,min_date)
  ) %>%
  ggplot(aes(age_group,sum_rate,color=re,group=re)) +
    geom_line() +
    facet_wrap(~time_label,scale='free_y') +
    theme_ipsum() +
    #scale_y_log10(labels=scales::comma_format(),minor_breaks=getLogbreaks()) +
    #annotation_logticks(sides='lr')  +

    theme(
      axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
      legend.position='bottom'
    ) +
    labs(
      title='The data in the 2022 YTD frame looks very different\nwhen split between before and after mid-April',
      color=NULL,
      y='Total deaths per million',
      #y='Mean weekly deaths per million',
      x=NULL
    )

My earlier analysis on Twitter used the same data source they used for their year-to-date analysis (CDC WONDER), albeit on monthly and even weekly basis, so we can be pretty sure it’s not simply a difference in the CDC data sources. This can be trivially reconciled as a consequence of basic arithmetic. The deaths that occurred prior to the ‘flip’, starting around April 2022, simply outweigh that subsequent to it thus far.

Changes in crude rates predict age-adjusted changes

Another thing those going on about Simpson’s Paradox failed to pick up on is that crude rates are not useless for understanding changes in age adjusted rates. Because we can be reasonably certain age structure has changed little since the start of the pandemic, it’s not at all unreasonable to infer changes in the age-adjusted disparity from changes in the crude disparity. Indeed, the expected proportional change is 1:1. A 50% decline in the crude disparity implies an expected 50% decline in the age adjusted disparity – it may be a little higher or a little lower ex post, but it’s a pretty good unbiased prediction.

lineLabs=tribble(
  ~date,~label,
  '2022-01-01','Leonhardt detractors claimed (wrongly) he argued "flipped" around here',
  '2022-04-01','Age-adjusted RR for "flipped" for both groups by April 2022'
) %>%
  mutate(
    date=as.Date(date),
    rr=1,
    variable=NA
  ) 

eqLabs=tribble(
  ~date,~label,
  '2021-01-01','Line of equality - "Flips" below here'
) %>%
  mutate(
    date=as.Date(date),
    rr=1,
    variable=NA
  ) 

ageAdjRates %>%
  filter(
    date >= '2020-03-15'
  ) %>%
  mutate(
    `Crude`=crude_rate,
    `Age Adjusted`=covid_death_rate_adj
  ) %>%
  melt(measure.vars=c('Age Adjusted','Crude')) %>%
  group_by(date,variable) %>%
  mutate(
    rr=value/mean(ifelse(re=='Non-Hispanic White',value,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    re %in% c('Non-Hispanic Black','Hispanic')
  ) %>%
  ggplot(aes(date,rr,color=variable)) +
    facet_wrap(~re,ncol=2) +
    geom_hline(yintercept=1) +
    geom_vline(
      xintercept=lineLabs$date
    ) +
    geom_text(
      aes(label=label),
      data=lineLabs,
      angle=90,
      nudge_x=10
    ) +
    geom_text(
      aes(label=label),
      data=eqLabs,
      nudge_y=.01,
      size=6
    ) +
    scale_y_log10( 
      minor_breaks=c(seq(2,9,1)*.1,seq(2,9,1)),
      breaks=c(.1,1,10),
      sec.axis=sec_axis(~.,
        breaks=c(seq(1,10,1),seq(1,10,1)*.1)                
      )
    ) +
    annotation_logticks(sides='lr')  +  
    geom_smooth(method=lm,se=F,linetype='dotted') +
    geom_line(size=1) +
    theme_ipsum() +
    scale_x_date(date_breaks='1 months',date_labels = '%b %Y') +
    geom_label_repel(aes(label=variable),data=~filter(.,date=='2020-07-18'),size=6,alpha=0.5) +
    geom_text(
      aes(label=label),
      color='black',
      size=8,
      data=function(.) {
        
        dcast(.,re+date~variable) %>%
          group_by(re) %>%
          clean_names() %>%
          summarise(
            cor=cor(log(age_adjusted),log(crude))
          ) %>%
          ungroup() %>%
          mutate(
            variable='alt',
            date=as.Date('2021-01-01'),
            rr=3.5,
            label=sprintf("r = %0.3f",round(cor,3) )
          )
          
      }
    ) +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank()
    ) +
    labs(
      title='Contra Leonhardt\'s detractors, this is poor example of Simpson\'s paradox',
      subtitle='The crude disparity is a very strong proxy for the age-adjusted disparity. At best, age adjustment has a modest effect on the intercept.\nThe proportion between these crude and adjusted relative risk statistics has been approximately constant over the past 2 years.',
      x='Week ending date (epiweeks)',
      y='Relative risk (reference: white)',
      caption=paste(
        'source: US CDC, AH Provisional COVID-19 Death Counts by Week, Race, and Age, United States 2020-2022',
        'and 2020 US census statistics via CDC WONDER',
        'Date adjusted in 10 age-intervals provided by CDC using 2020 census as denominator and population weights',
        sep='\n'
      )

    ) +
    coord_cartesian(ylim=c(.1,10),expand=F)

ageAdjRates %>%
  filter(
    date >= '2020-03-15'
  ) %>%
  mutate(
    `Crude`=crude_rate,
    `Age Adjusted`=covid_death_rate_adj
  ) %>%
  melt(measure.vars=c('Age Adjusted','Crude')) %>%
  group_by(date,variable) %>%
  mutate(
    rr=value/mean(ifelse(re=='Non-Hispanic White',value,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    re %in% c('Non-Hispanic Black','Hispanic')
  ) %>%
  dcast(re+date~variable) %>%
  clean_names() %>%
  mutate(
    ratio=age_adjusted/crude,
    delta=age_adjusted-crude
  ) %>%
  ggplot(aes(date,ratio,color=re)) +
    geom_line(size=2) +
    scale_x_date(date_breaks='1 months',date_labels = '%b %Y') +
    scale_y_continuous(
      breaks=seq(-2,5,.1)
    ) +
    coord_cartesian(ylim=c(0,3)) +
    theme_ipsum() +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank()
    ) +  
    geom_label_repel(aes(label=re),data=~filter(.,date=='2020-11-28'),size=7) +
    labs(
      title='The ratio between crude and age-adjusted\nrelative risk is approximately constant',
      subtitle='(the age-adjusted disparity can be well approximated by multiplying the crude\ndisparity by the averages between these two statistics)',
      y='Ratio of age adjusted RR to crude RR (reference: white)',
      caption='The constant depends on the groups being compared and the age adjustment methods (pop reference, age intervals, etc)\nObviously, this works for covid-19 because population changes little in 2 years & the mortality slope on age has been fairly stable.'
    )

It’s a good bet that age-adjusted rates have ‘flipped’ once the crude rates ratios have flipped to a sufficiently large degree.

Vax analysis

Vax rates can plausibly explain this

CDC provides vax rates by race and age groups. We can fetch the underlying data and do some additional analysis.

# https://data.cdc.gov/Vaccinations/National-Immunization-Survey-Adult-COVID-Module-NI/akkj-j5ru/data


# this download is rather large and computationally intensive
# so only re-run this once when building the notebook (can manually update if desired)
if ( !exists('vaxsurvey') ) {
  vaxsurvey=read_csv("https://data.cdc.gov/api/views/udsf-9v7b/rows.csv?accessType=DOWNLOAD") %>%
    clean_names()
}




# Monthly data CDC provided at run time did not come with years attached
# Manually crosswalk those strings for 2021 with the rest being 2022 by default
# Note: They might be fixing this soon because I contacted them about this.

month_cw=tribble(
  ~time_period, ~year,
  'April 22-May 29',2021,
  'May 30-June 26',2021,
  'June 27-July 31',2021,
  'August 1-August 28',2021,
  'August 29-September 25',2021,
  'September 26-October 30',2021,
  'October 31-November 27',2021,
  'November 28-December 31',2021
)

# extract the most relevant info from the download
# and reformat
vaxrates=vaxsurvey %>%
  filter(
    geography_type=='National Estimates',
    group_name=='Age by race/ethnicity',
    indicator_category=='Fully vaccinated with 1 or 2 doses',
    time_type=='Monthly'
  ) %>%
  mutate(
    group_category=str_replace(group_category,'\\s*\u2013\\s*',' - '),
    time_period=str_trim(time_period)
  ) %>%
  merge(month_cw,by=c('time_period','year'),all.x=T) %>%
  mutate(
    year=coalesce(year,2022), # month that do not match the constructed crosswalk are assumed to be year 2022
    time_period,
    start_str=sprintf("%s, %d",str_split(time_period,'\\s*-\\s*',simplify=T)[,1],year),
    end_str=sprintf("%s, %d",str_split(time_period,'\\s*-\\s*',simplify=T)[,2],year),
    start_date=as.Date(start_str,"%B %d, %Y"),
    end_date=as.Date(end_str,"%B %d, %Y"),
    age_group=str_split(group_category,',',simplify=T)[,1],
    race=str_trim(str_split(group_category,',',simplify=T)[,2]),
    estimate_lo=as.numeric(str_split(x95_percent_ci_percent,'\\s*-\\s*',simplify=T)[,1]),
    estimate_hi=as.numeric(str_split(x95_percent_ci_percent,'\\s*-\\s*',simplify=T)[,2])
  )

Here are the most recent data without age-adjustment.

vaxrates %>%
  filter(
    !str_detect(race,'Other'),
    start_date >= '2021-09-01',
    T
  ) %>%
  ggplot(aes(end_date,estimate_percent/100,color=race)) +
    geom_ribbon(aes(ymin=estimate_lo/100,ymax=estimate_hi/100),alpha=0.4) +
    geom_line(size=2) +
    geom_point(size=3) +
    facet_wrap(~age_group,scale='free_y') +
    scale_x_date(date_breaks='1 months',date_labels = '%b %Y') +
    scale_y_percent(breaks=seq(0,1,.05)) +
    theme_ipsum() +
    labs(
      title='Blacks and latinos are likely vaccinated at significantly higher rates in age-adjusted terms!',
      subtitle='(this is most obvious for those 50-64 years of age)',
      y='Percent fully vaccinated with 1 or 2 doses',
      x='Month ending date',
      caption=paste(
        'source: National Immunization Survey Adult COVID Module (NIS-ACM): COVIDVaxViews',
        'url: https://data.cdc.gov/Vaccinations/National-Immunization-Survey-Adult-COVID-Module-NI/udsf-9v7b',
        sep='\n'
      )
    ) +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank(),
      panel.spacing = unit(.2, "lines")

    ) +
    geom_label_repel(aes(label=race),data=~filter(.,time_period=='November 28-December 31'),size=6,alpha=0.8)

Calculate age adjusted vax rates

# Calculate age-adjusted adult vaccination rates from 3 age intervals they provide.
# Obviously, this is crude approximation given the size of these age intervals and 
# the monthly sampling.  Still, I don't think it's entirely useless!

vaxradj= vaxrates %>%
  merge(vax_weights_comb,by='age_group',all=T) %>% 
  group_by(time_period,start_date,end_date,race) %>%
  summarise(
    death_weighted=sum(estimate_percent*death_weight),
    pop_weighted=sum(estimate_percent*pop_weight)
  ) %>%
  ungroup()

vaxradj %>%
  filter(
    !str_detect(race,'Other')
  ) %>%
  ggplot(aes(end_date,pop_weighted/100,color=race)) +
    geom_line(size=2) +
    geom_point(size=3) +
    scale_x_date(date_breaks='1 months',date_labels = '%b %Y') +
    scale_y_percent(breaks=seq(0,1,.05)) +
    theme_ipsum() +
    labs(
      title='Vaccination rates have converged on age-adjusted terms',
      subtitle='(these are crudely estimated yet still likely conservative vis-a-vis the white gap!)',
      #title='Age-adjusted vaccination rate, adults 18+ in three age intervals (rough)',
      y='Adult fully vaccinated with 1 or 2 doses, age-adjusted',
      x='Month ending date',
      caption=paste(
        'source: National Immunization Survey Adult COVID Module (NIS-ACM): COVIDVaxViews',
        'url: https://data.cdc.gov/Vaccinations/National-Immunization-Survey-Adult-COVID-Module-NI/udsf-9v7b',
        sep='\n'
      )
    ) +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank(),
      panel.spacing = unit(.2, "lines")

    ) +
    geom_label_repel(aes(label=race),data=~filter(.,time_period=='November 28-December 31'),size=6,alpha=0.8)

If we re-weight these vaccination rates according to covid-19 deaths in 2020, the rates increase dramatically. Obviously, this is a non-standard age-adjustment, but I feel it’s a better indication of the ability of vaccination to move the needle on deaths nationally.

vaxradj %>%
  filter(
    !str_detect(race,'Other')
  ) %>%
  ggplot(aes(end_date,death_weighted/100,color=race)) +
    geom_line(size=2) +
    geom_point(size=3) +
    scale_x_date(date_breaks='1 months',date_labels = '%b %Y') +
    scale_y_percent(breaks=seq(0,1,.05)) +
    theme_ipsum() +
    labs(
      title='Age-adjusted vax rates using covid-19 deaths in 2020',
      subtitle='(The older age groups obviously get a much higher weight here)',
      y='Percent fully vaccinated with 1 or 2 doses\nweighted according to 2020 covid-19 death counts',
      x='Month ending date',
      caption=paste(
        'source: National Immunization Survey Adult COVID Module (NIS-ACM): COVIDVaxViews',
        'url: https://data.cdc.gov/Vaccinations/National-Immunization-Survey-Adult-COVID-Module-NI/udsf-9v7b',
        sep='\n'
      )
    ) +
    theme(
      legend.position='none',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank(),
      panel.spacing = unit(.2, "lines")

    ) +
    geom_label_repel(aes(label=race),data=~filter(.,time_period=='November 28-December 31'),size=6,alpha=0.8)

The crude adjustments are apt to systematically understate the magnitude of the white vaccination rate gap because whites are substantially older within them.

popd %>%
  full_join(vax_age_cw,by=character()) %>%
  filter(
    age >= min_age,
    age <= max_age,
    group %in% c('Black or African American','White','Hispanic')
  ) %>%
  group_by(age_group,group) %>%
  mutate(
    share=pop/sum(pop)
  ) %>%
  ungroup() %>%
  mutate(
    age_group2=reorder(ifelse(age>=85,'65+ years (continued for 85+)',age_group),age,mean,na.rm=T)
    #age < 85
  ) %>%
  ggplot(aes(age,share,color=group)) +
    geom_point(size=2) +
    geom_line() +
    facet_wrap(~age_group2,scale='free') +
    theme_ipsum() +
    theme(legend.position='bottom') +
    scale_x_continuous(breaks=seq(15,85,5)) +
    scale_y_percent(accuracy=0.1) +  
    labs(
      title='Within each of these crude age intervals whites skew older',
      y='Age distribution within age group in 2020',
      x='Age, single-year up to 85'
    )

Please note that such issues are much less likely to affect the conclusions on deaths because we’re utilizing more and smaller age intervals. This recent “flip” is found with 10 year, 5 year, and even single-year.¹

A few percentage points higher vaccination rates may not seem like a very big deal. However, this implies a substantially larger proportion of white people are unvaccinated conditional on age and thus at much higher risk, ceteris paribus.

unvax_rr = vaxradj %>%
  group_by(end_date) %>%
  mutate(
    unvax_dw=100-death_weighted,
    unvax_pw=100-pop_weighted,
    rel_dw=unvax_dw/mean(ifelse(race=='White',unvax_dw,NA),na.rm=T),
    rel_pw=unvax_pw/mean(ifelse(race=='White',unvax_pw,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    !str_detect(race,'White|Other')
  ) %>%
  transmute(
    date=end_date,
    race,
    `Not fully vaxed, age-adjusted with 2020 population weights`=rel_pw,
    `Not fully vaxed,age-adjusted with 2020 covid-19 death weights`=rel_dw
  ) %>%
  melt(id.vars=c('date','race'))

death_rrs=ageAdjRates %>%
  mutate(
    re=str_replace_all(as.character(re),c('Non-Hispanic Black'='Black','Non-Hispanic White'='White'))
  )  %>%  
  filter(
    date > '2020-04-01',
    re %in% c('White','Black','Hispanic')
  ) %>%
  group_by(date) %>%
  mutate(
    rr=covid_death_rate_adj/mean(ifelse(re=='White',covid_death_rate_adj,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    re !='White'
  ) %>%
  transmute(
    date,
    race=re,
    value=rr,
    variable='covid-19 death rate, age-adjusted'
  )



bind_rows(death_rrs,unvax_rr) %>%
  ggplot(aes(date,value,color=variable)) +
    geom_hline(yintercept=1) +
    geom_line(size=2) +
    theme_ipsum() +
    theme(
      legend.position='bottom',
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
      panel.grid.minor.x = element_blank()
    ) +
    facet_wrap(~race,ncol=2) +
    geom_smooth(method=lm,se=F,linetype='dotted') +
    scale_x_date(date_breaks='1 months',date_labels = '%b %Y') +
    scale_y_log10( 
      minor_breaks=c(seq(2,9,1)*.1,seq(2,9,1)),
      breaks=c(.1,1,10),
      sec.axis=sec_axis(~.,
        breaks=c(seq(1,10,1),seq(1,10,1)*.1)                
      )
    ) +
    #annotation_logticks(sides='lr') +  
    #scale_y_log10() +
    labs(
      title='The ballpark estimate of relative risk of not fully vaxed\nseems to have "flipped" prior to death rates',
      y='Relative risk (reference: white)',
      color=NULL,
      caption='Dates for vax data refer to ending date of monthly survey period, deaths weekly'
    ) +
    coord_cartesian(ylim=c(.1,10),expand=F)

Final note

For those that somehow haven’t noticed, it’s worth pointing out that covid-19 is much less of a big deal than it was in April 2020.

Deaths from covid-19 have been declining in the United States and throughout the (developed) world. It’s likely at the lowest level since the the earliest days of the pandemic. Moreover, all major racial/ethnic groups and age groups in the United States have benefited from these declines, likely mostly thanks to vaccines, natural immunity, and better medical treatments.

periods_list=tribble(
  ~label,~start_date,~end_date,
  'First','2020-03-15','2020-05-02',
  'Second','2020-10-03','2021-01-02',
  'Third','2021-11-01','2022-02-01',
  'Fourth','2022-04-01','2022-08-20'
) %>%
  mutate(
    start_date=as.Date(start_date),
    end_date=as.Date(end_date)
  )


quick_ds=covid_prov %>%
  full_join(periods_list,by=character()) %>%
  filter(
    date >= start_date,
    date <= end_date,
    re %in% c('Hispanic','Non-Hispanic White','Non-Hispanic Black')
  ) %>%
  group_by(re,age_group,label) %>%
  summarise(
    mean_rate=mean(covid_death_rate,na.rm=T),
    num_weeks=n(),
    min_date=min(date)-6,
    max_date=max(date)
  ) %>%
  ungroup() %>%
  mutate(
    time_label=sprintf("%s - %s",as.character.Date(min_date,"%b %d %Y"),as.character(max_date,"%b %d %Y")),
    time_label=reorder(time_label,min_date)
  )

quick_ds %>%
  ggplot(aes(age_group,mean_rate,color=re,group=re)) +
    geom_line(size=1) +
    facet_wrap(~time_label) +
    theme_ipsum() +
    scale_y_log10(labels=scales::comma_format(),minor_breaks=getLogbreaks()) +
    annotation_logticks(sides='lr')  +

    theme(
      axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
      legend.position='bottom'
    ) +
    labs(
      title='Rates are declining for everyone!',
      subtitle='(higher vax rates, natural immunity, and better medical treatment matter)',
      color=NULL,
      y='Mean weekly deaths per million',
      x=NULL
    )

(at linear scale for easier interpretation)

quick_ds %>%
  ggplot(aes(age_group,mean_rate,color=re,group=re)) +
    geom_line(size=1) +
    facet_wrap(~time_label) +
    theme_ipsum() +
    theme(
      axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
      legend.position='bottom'
    ) +
    labs(
      title='At linear scale for good measure',
      color=NULL,
      y='Mean weekly deaths per million',
      x=NULL
    )

Even if there was still a large disparity in terms of ratios in age-adjusted deaths, i.e., contrary to what’s currently being indicated in CDC data, it would simply matter much, much less at these much lower rates!

Updated with covid hospitalizations

epiweeks=data.frame( input_date = as.Date('2019-01-01')+seq(0,366*5,1)) %>%
  mutate(
    mmwr_week=lubridate::epiweek(input_date),
    mmwr_year=lubridate::epiyear(input_date)
  ) %>%
  group_by(mmwr_year,mmwr_week) %>%
  summarise(
    week_starting_date=min(input_date),
    week_ending_date=max(input_date),
    num_days=n()
  ) %>%
  ungroup() %>%
  filter(
    num_days == 7
  )

# downloaded from https://gis.cdc.gov/grasp/COVIDNet/COVID19_3.html
# now from https://gis.cdc.gov/grasp/covidnet/covid19_3.html
rhosp=read_csv("COVID-19Surveillance_All_Data.csv",skip=2,show_col_types=F) %>%
  clean_names() %>%
  data.frame() %>%
  mutate(
    cumulative_rate=as.numeric(cumulative_rate),
    weekly_rate=as.numeric(weekly_rate)
  ) %>%
  merge(epiweeks,by=c('mmwr_year','mmwr_week'))

rhosp %>%
  filter(
    #network=='COVID-NET',
    age_category=='Overall',
    catchment=='Entire Network',
    sex=='Overall',
    race!='Overall',
    network != 'IHSP',
    T
  ) %>%
  group_by(network,week_ending_date) %>%
  mutate(
    rel_rate=weekly_rate/mean(ifelse(race=='White',weekly_rate,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
      str_detect(race,'Black|Hispanic'),
      !is.na(rel_rate),
      week_ending_date >'2020-03-15'
  ) %>%
  ggplot(aes(week_ending_date,rel_rate,color=network)) +
    geom_line() +
    geom_smooth(method=lm,linetype='dotted',se=F) +
    geom_hline(yintercept=1) +
    coord_cartesian(expand=F,ylim=c(.1,10)) +
    theme_ipsum() +
    theme(
      legend.position='bottom',
      axis.text.x = element_text(angle = 90,hjust=1,size=15),
      panel.spacing = unit(.2, "lines")
    ) +
    scale_y_log10(
      minor_breaks=getLogbreaks(),
      sec.axis=sec_axis(~.,breaks=c(seq(.1,.9,.1),1:10))
    ) +
    annotation_logticks(sides='lr')  +
    scale_x_date(date_breaks='1 months', date_labels ='%b %Y') +
    facet_wrap(~race,ncol=2)  +
    labs(
      title='Relative risk of COVID-19-Associated Hospitalizations',
      y='Crude relative risk (reference: white)',
      x='Week ending date (MMWR)',
      caption='source: CDC\nhttps://gis.cdc.gov/grasp/COVIDNet/COVID19_3.html'
    )

library(ggplot2)

data_deaths=ageAdjRates %>%
  filter(
    date >= '2020-03-15'
  ) %>%
  mutate(
    `Deaths, crude`=crude_rate,
    `Deaths, age adj.`=covid_death_rate_adj
  ) %>%
  melt(measure.vars=c('Deaths, crude','Deaths, age adj.')) %>%
  group_by(date,variable) %>%
  mutate(
    rr=value/mean(ifelse(re=='Non-Hispanic White',value,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
    re %in% c('Non-Hispanic Black','Hispanic')
  ) %>%
  mutate(
    race=ifelse(re=='Hispanic','Hispanic/Latino','Black'),
    week_ending_date=date,
    value=rr
  )

data_hosp=rhosp %>%
  filter(
    age_category=='Overall',
    catchment=='Entire Network',
    sex=='Overall',
    race!='Overall',
    network != 'IHSP',
    T
  ) %>%
  group_by(network,week_ending_date) %>%
  mutate(
    rel_rate=weekly_rate/mean(ifelse(race=='White',weekly_rate,NA),na.rm=T)
  ) %>%
  ungroup() %>%
  filter(
      str_detect(race,'Black|Hispanic'),
      !is.na(rel_rate),
      week_ending_date >'2020-03-15'
  ) %>%
  mutate(
    variable=sprintf('Hospitalizations, crude (%s)',network),
    value=rel_rate
  )


bind_rows(data_hosp,data_deaths) %>%
  ggplot(aes(week_ending_date,log10(value),color=variable)) +
    geom_line() +
    geom_smooth(method=lm,linetype='dotted',se=F) +
    geom_hline(yintercept=0) +
    coord_cartesian(expand=F) +
    #coord_cartesian(expand=F,ylim=c(.1,10)) +
    theme_ipsum() +
    theme(
      legend.position='bottom',
      axis.text.x = element_text(angle = 90,vjust=0.5),
      panel.spacing.x = unit(.5, "lines"),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.x = element_blank()
      #panel.grid.major.y=element_line()

    ) +
    scale_y_continuous(
      breaks=seq(-1,1,.2),
      minor_breaks = log10(c(1/8,1/4,1/2,1,2,4,8)),
      sec.axis = sec_axis(
        ~10^.,
        breaks=c(1/8,1/4,1/2,1,2,4,8),
        
        
        labels=function(v) {
          sprintf("%5s",ifelse(v<1,sprintf("1/%d",round(1/v) ),sprintf("%d",round(v))))
        },
        name='relative risk (linear)'
      )
    ) +
    scale_x_date(date_breaks='1 months', date_labels ='%b %Y') +
    facet_wrap(~race,ncol=2)  +
    labs(
      title='Relative risk of covid-19-associated hospitalizations and deaths',
      y='log10 Relative risk (reference: white)',
      x='Week ending date (MMWR)',
      color=NULL
      #caption='source: CDC\nhttps://gis.cdc.gov/grasp/COVIDNet/COVID19_3.html'
    )

#ggsave("plottest.pdf",gp,units='in',height=8.5,width=11)

Weekly death counts are available on a single year of age basis. CDC WONDER provisional suppresses counts less than 10 but you should be to accurately derive it if you’re mildly creative.↩︎

David Leonhardt is correct about the ‘flip’ in COVID-19 racial disparities