library(tidyverse)
library(janitor)
library(dplyr)
library(knitr)
install.packages("scales")
library(scales)Project Milestone 4 - Group 3 - Juliana, Daniel, Jason
Milestone #4 Objective - Juliana, Daniel, Jason
Loading Libraries…
Data Structures
str(sim_CA)tibble [98,952 × 12] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:98952] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
$ age_cat : chr [1:98952] "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr [1:98952] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:98952] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ dt_diagnosis : Date[1:98952], format: "2023-05-29" "2023-06-05" ...
$ time_int : num [1:98952] 202322 202323 202324 202325 202326 ...
$ new_infections : num [1:98952] 6 1 2 10 19 25 23 18 22 35 ...
$ cumulative_infected : num [1:98952] 6 7 9 19 38 63 86 104 126 161 ...
$ new_unrecovered : num [1:98952] 0 1 0 0 0 1 0 1 1 1 ...
$ cumulative_unrecovered: num [1:98952] 0 1 1 1 1 2 2 3 4 5 ...
$ new_severe : num [1:98952] 0 0 1 0 0 0 0 0 0 0 ...
$ cumulative_severe : num [1:98952] 0 0 1 1 1 1 1 1 1 1 ...
str(sim_LA)tibble [1,736 × 11] (S3: tbl_df/tbl/data.frame)
$ dt_diagnosis : Date[1:1736], format: "2023-05-29" "2023-06-05" ...
$ age_cat : chr [1:1736] "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr [1:1736] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:1736] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ new_infections : num [1:1736] 15 17 23 51 67 75 106 83 91 173 ...
$ cumulative_infected : num [1:1736] 15 32 55 106 173 248 354 437 528 701 ...
$ new_unrecovered : num [1:1736] 0 0 0 0 1 1 4 3 1 3 ...
$ cumulative_unrecovered: num [1:1736] 0 0 0 0 1 2 6 9 10 13 ...
$ new_severe : num [1:1736] 0 0 0 0 0 0 0 1 0 0 ...
$ cumulative_severe : num [1:1736] 0 0 0 0 0 0 0 1 1 1 ...
$ county : chr [1:1736] "Los Angeles County" "Los Angeles County" "Los Angeles County" "Los Angeles County" ...
str(combined_data)tibble [100,688 × 12] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
$ age_cat : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ dt_diagnosis : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
$ time_int : num [1:100688] 202322 202323 202324 202325 202326 ...
$ new_infections : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
$ cumulative_infected : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
$ new_unrecovered : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
$ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
$ new_severe : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
$ cumulative_severe : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
str(ca_pop)tibble [90,132 × 6] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:90132] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
$ health_officer_region: chr [1:90132] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
$ age_cat : chr [1:90132] "0-4" "0-4" "0-4" "0-4" ...
$ sex : chr [1:90132] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:90132] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ pop : num [1:90132] 2008 2128 2142 2057 1965 ...
Collapsing Data in Preparation for Aggregation
ca_pop_collapsed <- ca_pop %>%
mutate(age_cat = case_when(
age_cat %in% c("0-4", "5-11", "12-17") ~ "0-17",
age_cat == "18-49" ~ "18-49",
age_cat == "50-64" ~ "50-64",
age_cat == "65+" ~ "65+",
TRUE ~ NA_character_
)) %>%
group_by(county, health_officer_region, sex, race_ethnicity, age_cat) %>%
summarise(pop = sum(pop), .groups = "drop")Data Structures of Collapsed Dataset
str(ca_pop_collapsed)tibble [3,248 × 6] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:3248] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
$ health_officer_region: chr [1:3248] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
$ sex : chr [1:3248] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:3248] "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" ...
$ age_cat : chr [1:3248] "0-17" "18-49" "50-64" "65+" ...
$ pop : num [1:3248] 533 945 558 351 52080 ...
Data Cleaning + Data Structures for Cleaned Data
combined_data <- combined_data %>% select(-time_int)combined_data <- combined_data %>%
left_join(ca_pop_collapsed,
by = c("county", "age_cat", "sex", "race_ethnicity"))
str(combined_data)tibble [100,688 × 13] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
$ age_cat : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ dt_diagnosis : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
$ new_infections : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
$ cumulative_infected : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
$ new_unrecovered : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
$ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
$ new_severe : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
$ cumulative_severe : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
$ health_officer_region : chr [1:100688] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
$ pop : num [1:100688] 34155 34155 34155 34155 34155 ...
combined_data <- combined_data %>%
mutate(
rate_infection = (new_infections / pop) * 100000,
rate_unrecovered = (new_unrecovered / pop) * 100000,
rate_severe = (new_severe / pop) * 100000,
rate_cum_infection = (cumulative_infected / pop) * 100000,
rate_cum_unrecovered = (cumulative_unrecovered / pop) * 100000,
rate_cum_severe = (cumulative_severe / pop) * 100000)
str(combined_data)tibble [100,688 × 19] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
$ age_cat : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_ethnicity : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ dt_diagnosis : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
$ new_infections : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
$ cumulative_infected : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
$ new_unrecovered : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
$ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
$ new_severe : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
$ cumulative_severe : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
$ health_officer_region : chr [1:100688] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
$ pop : num [1:100688] 34155 34155 34155 34155 34155 ...
$ rate_infection : num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...
$ rate_unrecovered : num [1:100688] 0 2.93 0 0 0 ...
$ rate_severe : num [1:100688] 0 0 2.93 0 0 ...
$ rate_cum_infection : num [1:100688] 17.6 20.5 26.4 55.6 111.3 ...
$ rate_cum_unrecovered : num [1:100688] 0 2.93 2.93 2.93 2.93 ...
$ rate_cum_severe : num [1:100688] 0 0 2.93 2.93 2.93 ...
combined_data <- combined_data %>%
mutate(across(starts_with("rate"), ~ round(.x, 2)))VISUALIZATION #1:
This figure displays epidemic curves using histograms summarizing the incidence rate of infection in the Bay Area among adults 65+ compared to those under 65. Across the entire observation period, adults 65+ consistently experienced higher incidence rates, indicating a disproportionate burden of disease in older populations, regardless of sex or race.
library(dplyr)
library(ggplot2)
library(lubridate)
bay_data <- combined_data %>%
filter(health_officer_region == "Bay Area") %>%
mutate(age_group = ifelse(age_cat == "65+", "65+", "<65"))
bay_summary <- bay_data %>%
group_by(dt_diagnosis, age_group) %>%
summarise(
new_infections = sum(new_infections, na.rm = TRUE),
pop = sum(pop, na.rm = TRUE),
.groups = "drop") %>%
mutate(incidence_rate = (new_infections / pop) * 100000)
ggplot(bay_summary, aes(x = dt_diagnosis, y = incidence_rate, fill = age_group)) +
geom_col(position = "dodge") +
labs(
title = "Weekly Incidence Rate of Infection in the Bay Area",
subtitle = "Comparison between adults 65+ and those under 65 (all races and sexes)",
x = "Week of Diagnosis",
y = "Incidence Rate per 100,000 Population",
fill = "Age Group"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")VISUALIZATION #2:
This table summarizes the total number of new infections, population size, and incidence rates across Bay Area counties. By comparing incidence per 100,000 residents, it shows which counties are experiencing higher relative infection burdens within the region.
bay_area_counties <- c(
"Alameda County", "Contra Costa County", "Marin County", "Napa County", "Monterey County", "San Benito County", "San Francisco County", "San Mateo County", "Santa Clara County", "Santa Cruz County", "Solano County", "Sonoma County")
bay_area_table <- combined_data %>%
filter(county %in% bay_area_counties) %>%
group_by(county) %>%
summarise(
total_cases = sum(new_infections, na.rm = TRUE),
total_population = sum(pop, na.rm = TRUE)
) %>%
mutate(
incidence_per_100k = (total_cases / total_population) * 100000
) %>%
mutate(
total_cases = comma(total_cases),
total_population = comma(total_population),
incidence_per_100k = comma(round(incidence_per_100k, 1))
) %>%
arrange(county)
kable(
bay_area_table,
col.names = c("County", "Total Cases", "Total Population", "Incidence per 100,000"),
align = "c",
caption = "Incidence rate per 100,000 in Bay Area counties"
)| County | Total Cases | Total Population | Incidence per 100,000 |
|---|---|---|---|
| Alameda County | 149,041 | 51,337,147 | 290.3 |
| Contra Costa County | 103,042 | 35,542,616 | 289.9 |
| Marin County | 24,318 | 7,897,064 | 307.9 |
| Monterey County | 44,136 | 13,530,756 | 326.2 |
| Napa County | 12,673 | 4,169,004 | 304.0 |
| San Benito County | 5,291 | 2,017,542 | 262.2 |
| San Francisco County | 85,116 | 26,288,589 | 323.8 |
| San Mateo County | 67,986 | 23,221,325 | 292.8 |
| Santa Clara County | 173,074 | 59,264,219 | 292.0 |
| Santa Cruz County | 26,333 | 8,156,131 | 322.9 |
| Solano County | 84,328 | 13,849,095 | 608.9 |
| Sonoma County | 46,322 | 14,874,606 | 311.4 |
VISUALIZATION #3:
In the previous two plots, we explored health officer region wide data analyzing health disparities among age groups and counties. In the third and final plot, we built on those existing visualizations to explore the relationships that race/ethnicity may have on the health outcomes of people in the Bay Area. For purposes of visualization, the data are aggregated by month rather than by week.
bay_race_plot <- combined_data %>%
filter(health_officer_region == "Bay Area") %>%
mutate(month = floor_date(dt_diagnosis, "month")) %>%
group_by(month, race_ethnicity) %>%
summarise(
new_infections = sum(new_infections, na.rm = TRUE),
total_pop = sum(pop, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(incidence_per_100k = (new_infections / total_pop) * 100000) %>%
mutate(race_ethnicity = case_when(
race_ethnicity == "American Indian or Alaska Native, Non-Hispanic" ~ "AI/AN",
race_ethnicity == "Native Hawaiian or Pacific Islander, Non-Hispanic" ~ "NH/PI",
race_ethnicity == "Black, Non-Hispanic" ~ "Black",
race_ethnicity == "Hispanic (any race)" ~ "Hispanic/Latino",
race_ethnicity == "Asian, Non-Hispanic" ~ "Asian",
race_ethnicity == "White, Non-Hispanic" ~ "White",
race_ethnicity == "Multiracial (two or more of above races), Non-Hispanic" ~ "Multiracial",
TRUE ~ race_ethnicity
))
ggplot(bay_race_plot, aes(x = month, y = incidence_per_100k, fill = race_ethnicity)) +
geom_col(position = "dodge") +
labs(
title = "Monthly Incidence Rate by Race/Ethnicity in the Bay Area",
x = "Month of Diagnosis",
y = "Incidence Rate per 100,000",
fill = "Race/Ethnicity"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")