Flaxman et al. recently published a pre-print claiming covid-19 was near the top of the top-10 leading causes of death for children during the pandemic. Several have others have already pointed out numerous issues with this analysis, most notably Kelley of covid-georgia and @COVIDData3.

In rough order of priority:

Because Flaxman et al. presumably wish to include the most recent covid-19 deaths up through April 2022, the best way to handle this is to compare these covid deaths from March 2020 to April 2022 with other causes of deaths shifted back exactly one year so that we encompass the same times of year (seasons) and span the same number of days. This has the advantage of allowing us to get very close to an “apples-to-apples” comparison with the available data while still trying to reflect the most recent covid deaths.1

Although it’d certainly be even easier to do this in one query for the exact same time span, including the provisional estimates for other causes of death in such a manner would substantially inflate covid’s rank as many other causes of death are not yet available for the last few weeks of 2021 through 2022.2

More specifically, my approach is query this data spanning MMWR weeks like so:

Causes Start Week End Week Number of weeks
Covid-19 2020 Week 10
(ending March 07, 2020)
2022 Week 17
(ending April 30, 2022)
113
All other 2019 Week 10
(ending March 09, 2019)
2021 Week 17
(ending May 01, 2021)
113

One other advantage of using MMWR weeks is that CDC WONDER provides population and crude rates whereas that information is not provided with other sub-annual queries. All of which makes this analysis all that much more straight forward.3.



library(knitr)
library(tidyverse)
library(janitor)
library(stringr)
library(reshape2)
library(hrbrthemes)
library(ggrepel)
library(viridis)

readCDCRates=function(filename,period) {
  read_delim(filename,delim='\t',show_col_type=F) %>%
    clean_names() %>%
    transmute(
      age_group=str_trim(five_year_age_groups),
      cause=str_trim(ucd_icd_10_113_cause_list),
      deaths=as.integer(deaths),
      pop=as.integer(population),
      crude_rate=as.numeric(crude_rate),
      rate=deaths/pop*1e6, # per million
      rate_rounded=round(rate,1), 
      period=period
    ) %>%
    filter(
      !is.na(deaths),
      str_detect(cause,'^#') # only return 'rankable' items from the 113 cause list
    )
}



file_co='leading_cause_comp/2020_wk_10 through 2022_wk_17 only covid-19.txt'
lc_covid_only=readCDCRates(file_co,'MMWR weeks: 2020/10 through 2022/17')


file_cl='leading_cause_comp/2019_wk_10 through 2021_wk_17 all 113 cause list.txt'
lc_causelist=readCDCRates(file_cl,'MMWR weeks: 2019/10 through 2021/17')


# calculate annualized adjustment for CDCs crude rates and/or the population (exposures)
actual_weeks=113
possible_weeks=(3*52)+1 # 157 b/c 2020 has a leap week & it's included in both denominators
annualized_adj=possible_weeks/actual_weeks


lc_comb=bind_rows(lc_covid_only,lc_causelist) %>%
  filter(
    # remove covid-19 item from the period starting in 2019
    !( str_detect(period,'2019') & str_detect(cause,'COVID') )
  ) %>%
  mutate(
    age_only=str_trim(str_remove(age_group,"years?")),
    last_age=case_when(
      age_only == '< 1' ~ as.integer(0),
      age_only == '100+' ~ as.integer(100),
      TRUE ~ as.integer(str_extract(age_only,'\\d+$'))            
    ),
    age_group=reorder(age_group,last_age),
    age_only=reorder(age_only,last_age),

    
    # prettier names
    cause_simp=case_when(
      str_detect(cause,"#Congenital malformations") ~ 'Congenital Anomalies',
      str_detect(cause,"#Accidents") ~ 'Unintentional Injury',
      str_detect(cause,"#Assault") ~ 'Homicide',
      str_detect(cause,"#Intentional self-harm") ~ 'Suicide',
      str_detect(cause,"benign neoplasms") ~ 'Benign Neoplasms',
      T ~ str_remove_all(str_remove(cause,"#"),"\\(.*\\)")
    ),
    cause_simp=str_trim(cause_simp)
    
  ) %>%
  arrange(age_group,rate,cause) %>%
  group_by(age_group) %>%
  mutate(
    rank=rank(-rate,ties.method ='first'),
    rate_annualized=round(rate*annualized_adj,1),
    clab=sprintf("%s\n%0.1f",str_wrap(cause_simp,25),rate_annualized)
  ) %>%
  ungroup() 


lc_comb %>%
  filter(
    rank <= 10,
    last_age <= 19,
    T
  ) %>%
  mutate(
    type=ifelse(str_detect(cause,'COVID'),'COVID','Others')
  ) %>%
  ggplot(aes(age_group,rank,fill=rate)) +
    geom_tile(alpha=0.3,color='black',data=~filter(.,type!='COVID')) +
    geom_tile(color='red',alpha=0.3,size=1,data=~filter(.,type=='COVID')) +  
    scale_fill_viridis_c(trans='log10') +
    geom_text(aes(label=clab),size=4) +
    scale_y_reverse(breaks=seq(1,40,1)) +
    scale_x_discrete(position='top') +
    coord_cartesian(expand=F) +
    theme_ipsum() +
    theme(
      legend.position='none',
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank()
    ) +
    labs(
      x=NULL,
      y='Rank within age group',
      title='Leading causes of death for children: covid-19 since the start of the pandemic\nversus other causes shifted back one MMWR year.',
      subtitle='(the number below is annualized deaths per million)',
      caption=paste(
        'source: CDC WONDER, Provisional Mortality Statistics, 2018 through Last Month Results',
        'Underlying causes, all rankable items from 113 cause list, ranked by rates',
        'Covid-19: MMWR weeks 2020/10 through 2022/17',
        'Other causes: MMWR weeks, 2019/10 through 2021/17',
        'Same number of weeks and same seasons covered',
        sep='\n'
      )
    )



nformats=format.args = list(decimal.mark = ".", big.mark = ",")
lc_comb %>%
  filter(
    str_detect(cause,'COVID'),
    last_age < 85 # dont have population data to sort 85+ as rates, tho deaths will work
  ) %>%
  transmute(
    age_group,
    rank,
    deaths,
    `population`=round(pop),    
    #`population (100k)`=round(pop/1e5),
    `rate (as quoted)`=rate_rounded,
    `rate (annualized)`=rate_annualized
  ) %>% 
  kable(caption='COVID-19 deaths and rank within age group',format.args = nformats)
COVID-19 deaths and rank within age group
age_group rank deaths population rate (as quoted) rate (annualized)
< 1 year 9 172 11,205,030 15.4 21.3
1-4 years 9 103 46,698,846 2.2 3.1
5-9 years 7 106 60,713,133 1.7 2.4
10-14 years 8 143 62,263,269 2.3 3.2
15-19 years 6 564 62,882,787 9.0 12.5
20-24 years 5 1,630 64,784,265 25.2 35.0
25-29 years 4 3,335 69,693,729 47.9 66.5
30-34 years 4 6,279 68,515,209 91.6 127.3
35-39 years 2 9,660 65,484,912 147.5 205.0
40-44 years 3 15,317 60,923,664 251.4 349.3
45-49 years 4 24,214 59,911,818 404.2 561.5
50-54 years 3 37,272 61,186,581 609.2 846.3
55-59 years 3 55,512 64,809,297 856.5 1,190.1
60-64 years 3 79,350 62,401,734 1,271.6 1,766.7
65-69 years 3 95,764 53,621,001 1,785.9 2,481.4
70-74 years 3 114,144 44,027,193 2,592.6 3,602.1
75-79 years 3 116,598 29,960,499 3,891.7 5,407.1
80-84 years 3 113,423 19,394,142 5,848.3 8,125.5

While we are at it, we can also report these rates for adults.


lc_comb %>%
  filter(
    rank <= 10,
    last_age > 19,
    last_age <= 54,
    T
  ) %>%
  mutate(
    type=ifelse(str_detect(cause,'COVID'),'COVID','Others')
  ) %>%
  ggplot(aes(age_group,rank,fill=rate)) +
    geom_tile(alpha=0.3,color='black',data=~filter(.,type!='COVID')) +
    geom_tile(color='red',alpha=0.3,size=1,data=~filter(.,type=='COVID')) +  
    scale_fill_viridis_c(trans='log10') +
    geom_text(aes(label=clab),size=3) +
    scale_y_reverse(breaks=seq(1,40,1)) +
    scale_x_discrete(position='top') +
    coord_cartesian(expand=F) +
    theme_ipsum() +
    theme(
      legend.position='none',
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank()
    ) +
    labs(
      x=NULL,
      y='Rank within age group',
      title='Leading causes of death: covid-19 since the start of the pandemic\nversus other causes shifted back one MMWR year.',
      subtitle='(the number below is annualized deaths per million)',
      caption=paste(
        'source: CDC WONDER, Provisional Mortality Statistics, 2018 through Last Month Results',
        'Underlying causes, all rankable items from 113 cause list, ranked by rates',
        'Covid-19: MMWR weeks 2020/10 through 2022/17',
        'Other causes: MMWR weeks, 2019/10 through 2021/17',
        'Same number of weeks and same seasons covered',
        sep='\n'
      )
    )



lc_comb %>%
  filter(
    rank <= 10,
    last_age >= 55,
    last_age <= 84,
    T
  ) %>%
  mutate(
    type=ifelse(str_detect(cause,'COVID'),'COVID','Others')
  ) %>%
  ggplot(aes(age_group,rank,fill=rate)) +
    geom_tile(alpha=0.3,color='black',data=~filter(.,type!='COVID')) +
    geom_tile(color='red',alpha=0.3,size=1,data=~filter(.,type=='COVID')) +  
    scale_fill_viridis_c(trans='log10') +
    geom_text(aes(label=clab),size=3) +
    scale_y_reverse(breaks=seq(1,40,1)) +
    scale_x_discrete(position='top') +
    coord_cartesian(expand=F) +
    theme_ipsum() +
    theme(
      legend.position='none',
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank()
    ) +
    labs(
      x=NULL,
      y='Rank within age group',
      title='Leading causes of death: covid-19 since the start of the pandemic\nversus other causes shifted back one MMWR year.',
      subtitle='(the number below is annualized deaths per million)',
      caption=paste(
        'source: CDC WONDER, Provisional Mortality Statistics, 2018 through Last Month Results',
        'Underlying causes, all rankable items from 113 cause list, ranked by rates',
        'Covid-19: MMWR weeks 2020/10 through 2022/17',
        'Other causes: MMWR weeks, 2019/10 through 2021/17',
        'Same number of weeks and same seasons covered',
        sep='\n'
      )
    )


  1. Which isn’t to say one cannot quibble with this methodology either. For example, comparing March 2020-April 2022 may fail to reflect some of the seasonality in risk that would be better observed if we compared these statistics across entire years or in such as way as to ensure all seasons are equally weighted.↩︎

  2. As I understand it, this is because covid-19 was added some time ago to the National Notifiable Diseases Surveillance System whereas many other causes have to wait to work their way through for the usual/slower reporting process.↩︎

  3. On the other hand, the populations they provide are based on the sum of population estimates across each of the three years these MMWR weeks span, with the numerator (deaths) unadjusted, so the rates are not quite annualized either. This makes no difference for internal comparisons because each period is exactly 113 weeks long. They can be annualized by multiplying the rate by ~1.39 (157 / 113)↩︎

---
title: "Leading causes comparison"
output:
  html_notebook:
    toc: yes
    code_folding: hide
  html_document:
    toc: yes
    df_print: paged
---



[Flaxman et al.](https://twitter.com/flaxter/status/1538533537355288577) recently published a [pre-print claiming covid-19](https://www.medrxiv.org/content/10.1101/2022.05.23.22275458v2) was near the top of the top-10 leading causes of death for children during the pandemic.  Several have others have already pointed out numerous issues with this analysis, most notably [Kelley of covid-georgia](https://www.covid-georgia.com/pediatric-news/fact-check-covid-is-a-leading-cause-of-death-in-children/) and [@COVIDData3](https://twitter.com/COVIDData3/status/1538391991578411009).

In rough order of priority:

- Comparing covid-19 as a *multiple* cause of death to other causes as *underlying* causes.
- Comparing cumulative rates from covid-19 to annualized rate for other causes.
- Comparing rates for covid-19 across multiple years to a single year for other causes
  - This may be affected seasonality and year-to-year volatility.

Because Flaxman et al. presumably wish to include the most recent covid-19 deaths up through April 2022, the best way to handle this is to compare these covid deaths from March 2020 to April 2022 with other causes of deaths shifted back exactly one year so that we encompass the same times of year (seasons) and span the same number of days.  This has the advantage of allowing us to get very close to an "apples-to-apples" comparison with the available data while still *trying* to reflect the most recent covid deaths.^[Which isn't to say one cannot quibble with this methodology either. For example, comparing March 2020-April 2022 may fail to reflect some of the seasonality in risk that would be better observed if we compared these statistics across entire years or in such as way as to ensure all seasons are equally weighted.]


Although it'd certainly be even easier to do this in one query for the exact same time span, including the provisional estimates for other causes of death in such a manner would substantially inflate covid's rank as many other causes of death are [not yet available](https://wonder.cdc.gov/controller/saved/D176/D296F388) for the last few weeks of 2021 through 2022.^[As I understand it, this is because [covid-19 was added](https://www.cdc.gov/nndss/action/stories/covid-19-response.html) some time ago to the National Notifiable Diseases Surveillance System whereas many other causes have to wait to work their way through for the usual/slower reporting process.]

More specifically, my approach is query this data spanning [MMWR weeks](https://ndc.services.cdc.gov/wp-content/uploads/MMWR_Week_overview.pdf) like so:
  
| Causes | Start Week | End Week | Number of weeks |
| --- | ----------- | --------- | ---------- |
| [Covid-19](https://wonder.cdc.gov/controller/saved/D176/D296F215) | 2020 Week 10<br />(ending March 07, 2020)  | 2022 Week 17<br />(ending April 30, 2022) | 113 
| [All other](https://wonder.cdc.gov/controller/saved/D176/D296F222) | 2019 Week 10<br />(ending March 09, 2019) | 2021 Week 17<br />(ending May 01, 2021) | 113

One other advantage of using MMWR weeks is that CDC WONDER provides population and crude rates whereas that information is not provided with other sub-annual queries.  All of which makes this analysis all that much more straight forward.^[On the other hand, the populations they provide are based on the sum of population estimates across each of the three years these MMWR weeks span, with the numerator (deaths) unadjusted, so the rates are not quite annualized either. This makes no difference for internal comparisons because each period is exactly 113 weeks long.  They can be annualized by multiplying the rate by ~1.39 (157 / 113)].






```{r message=FALSE, warning=FALSE, paged.print=FALSE}


library(knitr)
library(tidyverse)
library(janitor)
library(stringr)
library(reshape2)
library(hrbrthemes)
library(ggrepel)
library(viridis)

readCDCRates=function(filename,period) {
  read_delim(filename,delim='\t',show_col_type=F) %>%
    clean_names() %>%
    transmute(
      age_group=str_trim(five_year_age_groups),
      cause=str_trim(ucd_icd_10_113_cause_list),
      deaths=as.integer(deaths),
      pop=as.integer(population),
      crude_rate=as.numeric(crude_rate),
      rate=deaths/pop*1e6, # per million
      rate_rounded=round(rate,1), 
      period=period
    ) %>%
    filter(
      !is.na(deaths),
      str_detect(cause,'^#') # only return 'rankable' items from the 113 cause list
    )
}



file_co='leading_cause_comp/2020_wk_10 through 2022_wk_17 only covid-19.txt'
lc_covid_only=readCDCRates(file_co,'MMWR weeks: 2020/10 through 2022/17')


file_cl='leading_cause_comp/2019_wk_10 through 2021_wk_17 all 113 cause list.txt'
lc_causelist=readCDCRates(file_cl,'MMWR weeks: 2019/10 through 2021/17')


# calculate annualized adjustment for CDCs crude rates and/or the population (exposures)
actual_weeks=113
possible_weeks=(3*52)+1 # 157 b/c 2020 has a leap week & it's included in both denominators
annualized_adj=possible_weeks/actual_weeks


lc_comb=bind_rows(lc_covid_only,lc_causelist) %>%
  filter(
    # remove covid-19 item from the period starting in 2019
    !( str_detect(period,'2019') & str_detect(cause,'COVID') )
  ) %>%
  mutate(
    age_only=str_trim(str_remove(age_group,"years?")),
    last_age=case_when(
      age_only == '< 1' ~ as.integer(0),
      age_only == '100+' ~ as.integer(100),
      TRUE ~ as.integer(str_extract(age_only,'\\d+$'))            
    ),
    age_group=reorder(age_group,last_age),
    age_only=reorder(age_only,last_age),

    
    # prettier names
    cause_simp=case_when(
      str_detect(cause,"#Congenital malformations") ~ 'Congenital Anomalies',
      str_detect(cause,"#Accidents") ~ 'Unintentional Injury',
      str_detect(cause,"#Assault") ~ 'Homicide',
      str_detect(cause,"#Intentional self-harm") ~ 'Suicide',
      str_detect(cause,"benign neoplasms") ~ 'Benign Neoplasms',
      T ~ str_remove_all(str_remove(cause,"#"),"\\(.*\\)")
    ),
    cause_simp=str_trim(cause_simp)
    
  ) %>%
  arrange(age_group,rate,cause) %>%
  group_by(age_group) %>%
  mutate(
    rank=rank(-rate,ties.method ='first'),
    rate_annualized=round(rate*annualized_adj,1),
    clab=sprintf("%s\n%0.1f",str_wrap(cause_simp,25),rate_annualized)
  ) %>%
  ungroup() 




```





```{r fig.height=12, fig.width=12, message=FALSE, warning=FALSE}


lc_comb %>%
  filter(
    rank <= 10,
    last_age <= 19,
    T
  ) %>%
  mutate(
    type=ifelse(str_detect(cause,'COVID'),'COVID','Others')
  ) %>%
  ggplot(aes(age_group,rank,fill=rate)) +
    geom_tile(alpha=0.3,color='black',data=~filter(.,type!='COVID')) +
    geom_tile(color='red',alpha=0.3,size=1,data=~filter(.,type=='COVID')) +  
    scale_fill_viridis_c(trans='log10') +
    geom_text(aes(label=clab),size=4) +
    scale_y_reverse(breaks=seq(1,40,1)) +
    scale_x_discrete(position='top') +
    coord_cartesian(expand=F) +
    theme_ipsum() +
    theme(
      legend.position='none',
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank()
    ) +
    labs(
      x=NULL,
      y='Rank within age group',
      title='Leading causes of death for children: covid-19 since the start of the pandemic\nversus other causes shifted back one MMWR year.',
      subtitle='(the number below is annualized deaths per million)',
      caption=paste(
        'source: CDC WONDER, Provisional Mortality Statistics, 2018 through Last Month Results',
        'Underlying causes, all rankable items from 113 cause list, ranked by rates',
        'Covid-19: MMWR weeks 2020/10 through 2022/17',
        'Other causes: MMWR weeks, 2019/10 through 2021/17',
        'Same number of weeks and same seasons covered',
        sep='\n'
      )
    )

```




```{r message=FALSE, warning=FALSE, paged.print=FALSE,results='asis'}


nformats=format.args = list(decimal.mark = ".", big.mark = ",")
lc_comb %>%
  filter(
    str_detect(cause,'COVID'),
    last_age < 85 # dont have population data to sort 85+ as rates, tho deaths will work
  ) %>%
  transmute(
    age_group,
    rank,
    deaths,
    `population`=round(pop),    
    #`population (100k)`=round(pop/1e5),
    `rate (as quoted)`=rate_rounded,
    `rate (annualized)`=rate_annualized
  ) %>% 
  kable(caption='COVID-19 deaths and rank within age group',format.args = nformats)

```




While we are at it, we can also report these rates for adults.

```{r fig.height=12, fig.width=12, message=FALSE, warning=FALSE, paged.print=FALSE}

lc_comb %>%
  filter(
    rank <= 10,
    last_age > 19,
    last_age <= 54,
    T
  ) %>%
  mutate(
    type=ifelse(str_detect(cause,'COVID'),'COVID','Others')
  ) %>%
  ggplot(aes(age_group,rank,fill=rate)) +
    geom_tile(alpha=0.3,color='black',data=~filter(.,type!='COVID')) +
    geom_tile(color='red',alpha=0.3,size=1,data=~filter(.,type=='COVID')) +  
    scale_fill_viridis_c(trans='log10') +
    geom_text(aes(label=clab),size=3) +
    scale_y_reverse(breaks=seq(1,40,1)) +
    scale_x_discrete(position='top') +
    coord_cartesian(expand=F) +
    theme_ipsum() +
    theme(
      legend.position='none',
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank()
    ) +
    labs(
      x=NULL,
      y='Rank within age group',
      title='Leading causes of death: covid-19 since the start of the pandemic\nversus other causes shifted back one MMWR year.',
      subtitle='(the number below is annualized deaths per million)',
      caption=paste(
        'source: CDC WONDER, Provisional Mortality Statistics, 2018 through Last Month Results',
        'Underlying causes, all rankable items from 113 cause list, ranked by rates',
        'Covid-19: MMWR weeks 2020/10 through 2022/17',
        'Other causes: MMWR weeks, 2019/10 through 2021/17',
        'Same number of weeks and same seasons covered',
        sep='\n'
      )
    )
```



```{r fig.height=12, fig.width=12, message=FALSE, warning=FALSE}
lc_comb %>%
  filter(
    rank <= 10,
    last_age >= 55,
    last_age <= 84,
    T
  ) %>%
  mutate(
    type=ifelse(str_detect(cause,'COVID'),'COVID','Others')
  ) %>%
  ggplot(aes(age_group,rank,fill=rate)) +
    geom_tile(alpha=0.3,color='black',data=~filter(.,type!='COVID')) +
    geom_tile(color='red',alpha=0.3,size=1,data=~filter(.,type=='COVID')) +  
    scale_fill_viridis_c(trans='log10') +
    geom_text(aes(label=clab),size=3) +
    scale_y_reverse(breaks=seq(1,40,1)) +
    scale_x_discrete(position='top') +
    coord_cartesian(expand=F) +
    theme_ipsum() +
    theme(
      legend.position='none',
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank()
    ) +
    labs(
      x=NULL,
      y='Rank within age group',
      title='Leading causes of death: covid-19 since the start of the pandemic\nversus other causes shifted back one MMWR year.',
      subtitle='(the number below is annualized deaths per million)',
      caption=paste(
        'source: CDC WONDER, Provisional Mortality Statistics, 2018 through Last Month Results',
        'Underlying causes, all rankable items from 113 cause list, ranked by rates',
        'Covid-19: MMWR weeks 2020/10 through 2022/17',
        'Other causes: MMWR weeks, 2019/10 through 2021/17',
        'Same number of weeks and same seasons covered',
        sep='\n'
      )
    )
```

