# joining morbidity data sets (copied from milestone 3)
morbidity_all <- bind_rows(sim_ca_clean2, sim_la_clean2)Project Milestone 4
Joining data sets
# Join morbidity and population datasets into a new object
ca_id_rate <- full_join(
morbidity_all,
ca_pop_collapsed,
by = c("county", "sex", "race_ethnicity", "age_cat")
)Visualizations
Graph
ca_id_rate %>%
group_by(race_ethnicity, sex) %>%
summarise(rate = (sum(new_infections) / sum(population)) * 100000) %>%
ggplot(aes(
x = fct_reorder(race_ethnicity, rate, .fun = max, .desc = FALSE), # highest at top
y = rate,
fill = sex)) +
geom_col(position = "dodge") +
coord_flip() +
labs(
title = "Novel Respiratory Disease in California:
New Infection Rate per 100,000\nby Race/Ethnicity and Sex, 2023",
x = "Race/Ethnicity",
y = "Rate per 100,000",
fill = "Sex",
caption = "Source: SIM CA and LA Morbidity Files + CA DOF Population Estimates") +
theme_minimal() +
theme(
plot.title.position = "plot",
plot.title = element_text(hjust = 0.5))Interpretation: Infection rates for the novel respiratory disease in California show clear racial and ethnic disparities. American Indian or Alaska Native and White, Non-Hispanic populations experience the highest infection rates, followed by Black, Non-Hispanic and Hispanic populations. Native Hawaiian or Pacific Islander, Asian, and Multiracial populations have noticeably lower rates. Differences between males and females within each racial/ethnic group are present but relatively small compared with the larger disparities across groups.
Table
age_table2 <- ca_id_rate %>%
group_by(age_cat, sex) %>%
summarise(rate = (sum(new_infections) / sum(population)) * 100000,
.groups = "drop") %>%
pivot_wider(names_from = sex, values_from = rate) %>%
mutate(
Overall = (Female + Male) / 2
) %>%
arrange(age_cat)
# Define shared palette
pal <- colorRampPalette(c("#FFEA00", "#FFC300","#FFA200","#FF7B00"))(nrow(age_table2))
age_table_color <- age_table2 %>%
mutate(
Female = cell_spec(round(Female, 1), "html",
background = pal[rank(Female, ties.method = "first")]),
Male = cell_spec(round(Male, 1), "html",
background = pal[rank(Male, ties.method = "first")]),
Overall = cell_spec(round(Overall, 1), "html",
background = pal[rank(Overall, ties.method = "first")])
)
caption_html <- "<span style='color:black; font-weight:bold; font-size:20px;'>
Novel Respiratory Disease in California: New Infection Rate per 100,000<br>
by Age Group and Sex
</span>"
kable(age_table_color,
escape = FALSE,
format = "html",
caption = caption_html,
col.names = c("Age Group", "Female", "Male", "Overall")) %>%
kable_styling(full_width = FALSE)| Age Group | Female | Male | Overall |
|---|---|---|---|
| 0-17 | 129.3 | 132.8 | 131 |
| 18-49 | 451.1 | 479.4 | 465.2 |
| 50-64 | 243.7 | 248.4 | 246.1 |
| 65+ | 609.4 | 624.8 | 617.1 |
Interpretation: The age groups with the highest infection rate are those over 65 year of age followed by those 18-49 years. Given that the risk does not strictly increase or decrease proportional with age which would be more likely to suggest a biological basis for difference in infection rate, behavioral differences between the age strata should be examined as part of selecting risk reduction strategies.