Project Milestone 4 - Group 3 - Juliana, Daniel, Jason

Milestone #4 Objective - Juliana, Daniel, Jason

Loading Libraries…

library(tidyverse)
library(janitor)
library(dplyr)
library(knitr)
install.packages("scales")
library(scales)

Data Structures

str(sim_CA)

tibble [98,952 × 12] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:98952] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:98952] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:98952] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:98952] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:98952], format: "2023-05-29" "2023-06-05" ...
 $ time_int              : num [1:98952] 202322 202323 202324 202325 202326 ...
 $ new_infections        : num [1:98952] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:98952] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:98952] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:98952] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:98952] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:98952] 0 0 1 1 1 1 1 1 1 1 ...

str(sim_LA)

tibble [1,736 × 11] (S3: tbl_df/tbl/data.frame)
 $ dt_diagnosis          : Date[1:1736], format: "2023-05-29" "2023-06-05" ...
 $ age_cat               : chr [1:1736] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:1736] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:1736] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ new_infections        : num [1:1736] 15 17 23 51 67 75 106 83 91 173 ...
 $ cumulative_infected   : num [1:1736] 15 32 55 106 173 248 354 437 528 701 ...
 $ new_unrecovered       : num [1:1736] 0 0 0 0 1 1 4 3 1 3 ...
 $ cumulative_unrecovered: num [1:1736] 0 0 0 0 1 2 6 9 10 13 ...
 $ new_severe            : num [1:1736] 0 0 0 0 0 0 0 1 0 0 ...
 $ cumulative_severe     : num [1:1736] 0 0 0 0 0 0 0 1 1 1 ...
 $ county                : chr [1:1736] "Los Angeles County" "Los Angeles County" "Los Angeles County" "Los Angeles County" ...

str(combined_data)

tibble [100,688 × 12] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ time_int              : num [1:100688] 202322 202323 202324 202325 202326 ...
 $ new_infections        : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...

str(ca_pop)

tibble [90,132 × 6] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:90132] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ health_officer_region: chr [1:90132] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ age_cat              : chr [1:90132] "0-4" "0-4" "0-4" "0-4" ...
 $ sex                  : chr [1:90132] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity       : chr [1:90132] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ pop                  : num [1:90132] 2008 2128 2142 2057 1965 ...

Collapsing Data in Preparation for Aggregation

ca_pop_collapsed <- ca_pop %>%
  mutate(age_cat = case_when(
    age_cat %in% c("0-4", "5-11", "12-17") ~ "0-17",
    age_cat == "18-49" ~ "18-49",
    age_cat == "50-64" ~ "50-64",
    age_cat == "65+"   ~ "65+",
    TRUE ~ NA_character_
  )) %>%
  group_by(county, health_officer_region, sex, race_ethnicity, age_cat) %>%
  summarise(pop = sum(pop), .groups = "drop")

Data Structures of Collapsed Dataset

str(ca_pop_collapsed)

tibble [3,248 × 6] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:3248] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ health_officer_region: chr [1:3248] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ sex                  : chr [1:3248] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity       : chr [1:3248] "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" ...
 $ age_cat              : chr [1:3248] "0-17" "18-49" "50-64" "65+" ...
 $ pop                  : num [1:3248] 533 945 558 351 52080 ...

Data Cleaning + Data Structures for Cleaned Data

combined_data <- combined_data %>% select(-time_int)

combined_data <- combined_data %>%
  left_join(ca_pop_collapsed,
            by = c("county", "age_cat", "sex", "race_ethnicity"))
str(combined_data)

tibble [100,688 × 13] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ new_infections        : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
 $ health_officer_region : chr [1:100688] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ pop                   : num [1:100688] 34155 34155 34155 34155 34155 ...

combined_data <- combined_data %>%
  mutate(
    rate_infection = (new_infections / pop) * 100000,
    rate_unrecovered = (new_unrecovered / pop) * 100000,
    rate_severe = (new_severe / pop) * 100000,
    rate_cum_infection = (cumulative_infected / pop) * 100000,
    rate_cum_unrecovered = (cumulative_unrecovered / pop) * 100000,
    rate_cum_severe = (cumulative_severe / pop) * 100000)
str(combined_data)

tibble [100,688 × 19] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ new_infections        : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
 $ health_officer_region : chr [1:100688] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ pop                   : num [1:100688] 34155 34155 34155 34155 34155 ...
 $ rate_infection        : num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...
 $ rate_unrecovered      : num [1:100688] 0 2.93 0 0 0 ...
 $ rate_severe           : num [1:100688] 0 0 2.93 0 0 ...
 $ rate_cum_infection    : num [1:100688] 17.6 20.5 26.4 55.6 111.3 ...
 $ rate_cum_unrecovered  : num [1:100688] 0 2.93 2.93 2.93 2.93 ...
 $ rate_cum_severe       : num [1:100688] 0 0 2.93 2.93 2.93 ...

combined_data <- combined_data %>%
  mutate(across(starts_with("rate"), ~ round(.x, 2)))

VISUALIZATION #1:

This figure displays epidemic curves using histograms summarizing the incidence rate of infection in the Bay Area among adults 65+ compared to those under 65. Across the entire observation period, adults 65+ consistently experienced higher incidence rates, indicating a disproportionate burden of disease in older populations, regardless of sex or race.

library(dplyr)
library(ggplot2)
library(lubridate)

bay_data <- combined_data %>%
  filter(health_officer_region == "Bay Area") %>% 
  mutate(age_group = ifelse(age_cat == "65+", "65+", "<65"))

bay_summary <- bay_data %>%
  group_by(dt_diagnosis, age_group) %>% 
  summarise(
    new_infections = sum(new_infections, na.rm = TRUE),
    pop = sum(pop, na.rm = TRUE),
    .groups = "drop") %>%
  mutate(incidence_rate = (new_infections / pop) * 100000)

ggplot(bay_summary, aes(x = dt_diagnosis, y = incidence_rate, fill = age_group)) +
  geom_col(position = "dodge") +
  labs(
    title = "Weekly Incidence Rate of Infection in the Bay Area",
    subtitle = "Comparison between adults 65+ and those under 65 (all races and sexes)",
    x = "Week of Diagnosis",
    y = "Incidence Rate per 100,000 Population",
    fill = "Age Group"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

VISUALIZATION #2:

This table summarizes the total number of new infections, population size, and incidence rates across Bay Area counties. By comparing incidence per 100,000 residents, it shows which counties are experiencing higher relative infection burdens within the region.

bay_area_counties <- c(
  "Alameda County", "Contra Costa County", "Marin County", "Napa County", "Monterey County", "San Benito County", "San Francisco County", "San Mateo County", "Santa Clara County", "Santa Cruz County", "Solano County", "Sonoma County")

bay_area_table <- combined_data %>%
  filter(county %in% bay_area_counties) %>%
  group_by(county) %>% 
  summarise(
    total_cases = sum(new_infections, na.rm = TRUE),
    total_population = sum(pop, na.rm = TRUE)
  ) %>%
  mutate(
    incidence_per_100k = (total_cases / total_population) * 100000
  ) %>%
  mutate(
    total_cases = comma(total_cases),
    total_population = comma(total_population),
    incidence_per_100k = comma(round(incidence_per_100k, 1))
  ) %>%
  arrange(county)

kable(
  bay_area_table,
  col.names = c("County", "Total Cases", "Total Population", "Incidence per 100,000"),
  align = "c",
  caption = "Incidence rate per 100,000 in Bay Area counties"
)

Incidence rate per 100,000 in Bay Area counties
County	Total Cases	Total Population	Incidence per 100,000
Alameda County	149,041	51,337,147	290.3
Contra Costa County	103,042	35,542,616	289.9
Marin County	24,318	7,897,064	307.9
Monterey County	44,136	13,530,756	326.2
Napa County	12,673	4,169,004	304.0
San Benito County	5,291	2,017,542	262.2
San Francisco County	85,116	26,288,589	323.8
San Mateo County	67,986	23,221,325	292.8
Santa Clara County	173,074	59,264,219	292.0
Santa Cruz County	26,333	8,156,131	322.9
Solano County	84,328	13,849,095	608.9
Sonoma County	46,322	14,874,606	311.4

VISUALIZATION #3:

In the previous two plots, we explored health officer region wide data analyzing health disparities among age groups and counties. In the third and final plot, we built on those existing visualizations to explore the relationships that race/ethnicity may have on the health outcomes of people in the Bay Area. For purposes of visualization, the data are aggregated by month rather than by week.

bay_race_plot <- combined_data %>%
  filter(health_officer_region == "Bay Area") %>%
  mutate(month = floor_date(dt_diagnosis, "month")) %>%
  group_by(month, race_ethnicity) %>%
  summarise(
    new_infections = sum(new_infections, na.rm = TRUE),
    total_pop = sum(pop, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(incidence_per_100k = (new_infections / total_pop) * 100000) %>%
  mutate(race_ethnicity = case_when(
    race_ethnicity == "American Indian or Alaska Native, Non-Hispanic" ~ "AI/AN",
    race_ethnicity == "Native Hawaiian or Pacific Islander, Non-Hispanic" ~ "NH/PI",
    race_ethnicity == "Black, Non-Hispanic" ~ "Black",
    race_ethnicity == "Hispanic (any race)" ~ "Hispanic/Latino",
    race_ethnicity == "Asian, Non-Hispanic" ~ "Asian",
    race_ethnicity == "White, Non-Hispanic" ~ "White",
    race_ethnicity == "Multiracial (two or more of above races), Non-Hispanic" ~ "Multiracial",
    TRUE ~ race_ethnicity 
  ))

ggplot(bay_race_plot, aes(x = month, y = incidence_per_100k, fill = race_ethnicity)) +
  geom_col(position = "dodge") +
  labs(
    title = "Monthly Incidence Rate by Race/Ethnicity in the Bay Area",
    x = "Month of Diagnosis",
    y = "Incidence Rate per 100,000",
    fill = "Race/Ethnicity"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")