Project Milestone 4 - Group 3 - Juliana, Daniel, Jason

Milestone #4 Objective - Juliana, Daniel, Jason

Loading Libraries…

library(tidyverse)
library(janitor)
library(dplyr)
library(knitr)
install.packages("scales")
library(scales)

Data Structures

str(sim_CA)
tibble [98,952 × 12] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:98952] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:98952] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:98952] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:98952] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:98952], format: "2023-05-29" "2023-06-05" ...
 $ time_int              : num [1:98952] 202322 202323 202324 202325 202326 ...
 $ new_infections        : num [1:98952] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:98952] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:98952] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:98952] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:98952] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:98952] 0 0 1 1 1 1 1 1 1 1 ...
str(sim_LA)
tibble [1,736 × 11] (S3: tbl_df/tbl/data.frame)
 $ dt_diagnosis          : Date[1:1736], format: "2023-05-29" "2023-06-05" ...
 $ age_cat               : chr [1:1736] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:1736] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:1736] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ new_infections        : num [1:1736] 15 17 23 51 67 75 106 83 91 173 ...
 $ cumulative_infected   : num [1:1736] 15 32 55 106 173 248 354 437 528 701 ...
 $ new_unrecovered       : num [1:1736] 0 0 0 0 1 1 4 3 1 3 ...
 $ cumulative_unrecovered: num [1:1736] 0 0 0 0 1 2 6 9 10 13 ...
 $ new_severe            : num [1:1736] 0 0 0 0 0 0 0 1 0 0 ...
 $ cumulative_severe     : num [1:1736] 0 0 0 0 0 0 0 1 1 1 ...
 $ county                : chr [1:1736] "Los Angeles County" "Los Angeles County" "Los Angeles County" "Los Angeles County" ...
str(combined_data)
tibble [100,688 × 12] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ time_int              : num [1:100688] 202322 202323 202324 202325 202326 ...
 $ new_infections        : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
str(ca_pop)
tibble [90,132 × 6] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:90132] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ health_officer_region: chr [1:90132] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ age_cat              : chr [1:90132] "0-4" "0-4" "0-4" "0-4" ...
 $ sex                  : chr [1:90132] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity       : chr [1:90132] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ pop                  : num [1:90132] 2008 2128 2142 2057 1965 ...

Collapsing Data in Preparation for Aggregation

ca_pop_collapsed <- ca_pop %>%
  mutate(age_cat = case_when(
    age_cat %in% c("0-4", "5-11", "12-17") ~ "0-17",
    age_cat == "18-49" ~ "18-49",
    age_cat == "50-64" ~ "50-64",
    age_cat == "65+"   ~ "65+",
    TRUE ~ NA_character_
  )) %>%
  group_by(county, health_officer_region, sex, race_ethnicity, age_cat) %>%
  summarise(pop = sum(pop), .groups = "drop")

Data Structures of Collapsed Dataset

str(ca_pop_collapsed)
tibble [3,248 × 6] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:3248] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ health_officer_region: chr [1:3248] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ sex                  : chr [1:3248] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity       : chr [1:3248] "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" ...
 $ age_cat              : chr [1:3248] "0-17" "18-49" "50-64" "65+" ...
 $ pop                  : num [1:3248] 533 945 558 351 52080 ...

Data Cleaning + Data Structures for Cleaned Data

combined_data <- combined_data %>% select(-time_int)
combined_data <- combined_data %>%
  left_join(ca_pop_collapsed,
            by = c("county", "age_cat", "sex", "race_ethnicity"))
str(combined_data)
tibble [100,688 × 13] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ new_infections        : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
 $ health_officer_region : chr [1:100688] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ pop                   : num [1:100688] 34155 34155 34155 34155 34155 ...
combined_data <- combined_data %>%
  mutate(
    rate_infection = (new_infections / pop) * 100000,
    rate_unrecovered = (new_unrecovered / pop) * 100000,
    rate_severe = (new_severe / pop) * 100000,
    rate_cum_infection = (cumulative_infected / pop) * 100000,
    rate_cum_unrecovered = (cumulative_unrecovered / pop) * 100000,
    rate_cum_severe = (cumulative_severe / pop) * 100000)
str(combined_data)
tibble [100,688 × 19] (S3: tbl_df/tbl/data.frame)
 $ county                : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
 $ age_cat               : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                   : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_ethnicity        : chr [1:100688] "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ dt_diagnosis          : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ new_infections        : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected   : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_unrecovered       : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
 $ cumulative_unrecovered: num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
 $ new_severe            : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe     : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
 $ health_officer_region : chr [1:100688] "Bay Area" "Bay Area" "Bay Area" "Bay Area" ...
 $ pop                   : num [1:100688] 34155 34155 34155 34155 34155 ...
 $ rate_infection        : num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...
 $ rate_unrecovered      : num [1:100688] 0 2.93 0 0 0 ...
 $ rate_severe           : num [1:100688] 0 0 2.93 0 0 ...
 $ rate_cum_infection    : num [1:100688] 17.6 20.5 26.4 55.6 111.3 ...
 $ rate_cum_unrecovered  : num [1:100688] 0 2.93 2.93 2.93 2.93 ...
 $ rate_cum_severe       : num [1:100688] 0 0 2.93 2.93 2.93 ...
combined_data <- combined_data %>%
  mutate(across(starts_with("rate"), ~ round(.x, 2)))

VISUALIZATION #1:

This figure displays epidemic curves using histograms summarizing the incidence rate of infection in the Bay Area among adults 65+ compared to those under 65. Across the entire observation period, adults 65+ consistently experienced higher incidence rates, indicating a disproportionate burden of disease in older populations, regardless of sex or race.

library(dplyr)
library(ggplot2)
library(lubridate)

bay_data <- combined_data %>%
  filter(health_officer_region == "Bay Area") %>% 
  mutate(age_group = ifelse(age_cat == "65+", "65+", "<65"))

bay_summary <- bay_data %>%
  group_by(dt_diagnosis, age_group) %>% 
  summarise(
    new_infections = sum(new_infections, na.rm = TRUE),
    pop = sum(pop, na.rm = TRUE),
    .groups = "drop") %>%
  mutate(incidence_rate = (new_infections / pop) * 100000)

ggplot(bay_summary, aes(x = dt_diagnosis, y = incidence_rate, fill = age_group)) +
  geom_col(position = "dodge") +
  labs(
    title = "Weekly Incidence Rate of Infection in the Bay Area",
    subtitle = "Comparison between adults 65+ and those under 65 (all races and sexes)",
    x = "Week of Diagnosis",
    y = "Incidence Rate per 100,000 Population",
    fill = "Age Group"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

VISUALIZATION #2:

This table summarizes the total number of new infections, population size, and incidence rates across Bay Area counties. By comparing incidence per 100,000 residents, it shows which counties are experiencing higher relative infection burdens within the region.

bay_area_counties <- c(
  "Alameda County", "Contra Costa County", "Marin County", "Napa County", "Monterey County", "San Benito County", "San Francisco County", "San Mateo County", "Santa Clara County", "Santa Cruz County", "Solano County", "Sonoma County")

bay_area_table <- combined_data %>%
  filter(county %in% bay_area_counties) %>%
  group_by(county) %>% 
  summarise(
    total_cases = sum(new_infections, na.rm = TRUE),
    total_population = sum(pop, na.rm = TRUE)
  ) %>%
  mutate(
    incidence_per_100k = (total_cases / total_population) * 100000
  ) %>%
  mutate(
    total_cases = comma(total_cases),
    total_population = comma(total_population),
    incidence_per_100k = comma(round(incidence_per_100k, 1))
  ) %>%
  arrange(county)

kable(
  bay_area_table,
  col.names = c("County", "Total Cases", "Total Population", "Incidence per 100,000"),
  align = "c",
  caption = "Incidence rate per 100,000 in Bay Area counties"
)
Incidence rate per 100,000 in Bay Area counties
County Total Cases Total Population Incidence per 100,000
Alameda County 149,041 51,337,147 290.3
Contra Costa County 103,042 35,542,616 289.9
Marin County 24,318 7,897,064 307.9
Monterey County 44,136 13,530,756 326.2
Napa County 12,673 4,169,004 304.0
San Benito County 5,291 2,017,542 262.2
San Francisco County 85,116 26,288,589 323.8
San Mateo County 67,986 23,221,325 292.8
Santa Clara County 173,074 59,264,219 292.0
Santa Cruz County 26,333 8,156,131 322.9
Solano County 84,328 13,849,095 608.9
Sonoma County 46,322 14,874,606 311.4

VISUALIZATION #3:

In the previous two plots, we explored health officer region wide data analyzing health disparities among age groups and counties. In the third and final plot, we built on those existing visualizations to explore the relationships that race/ethnicity may have on the health outcomes of people in the Bay Area. For purposes of visualization, the data are aggregated by month rather than by week.

bay_race_plot <- combined_data %>%
  filter(health_officer_region == "Bay Area") %>%
  mutate(month = floor_date(dt_diagnosis, "month")) %>%
  group_by(month, race_ethnicity) %>%
  summarise(
    new_infections = sum(new_infections, na.rm = TRUE),
    total_pop = sum(pop, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(incidence_per_100k = (new_infections / total_pop) * 100000) %>%
  mutate(race_ethnicity = case_when(
    race_ethnicity == "American Indian or Alaska Native, Non-Hispanic" ~ "AI/AN",
    race_ethnicity == "Native Hawaiian or Pacific Islander, Non-Hispanic" ~ "NH/PI",
    race_ethnicity == "Black, Non-Hispanic" ~ "Black",
    race_ethnicity == "Hispanic (any race)" ~ "Hispanic/Latino",
    race_ethnicity == "Asian, Non-Hispanic" ~ "Asian",
    race_ethnicity == "White, Non-Hispanic" ~ "White",
    race_ethnicity == "Multiracial (two or more of above races), Non-Hispanic" ~ "Multiracial",
    TRUE ~ race_ethnicity 
  ))

ggplot(bay_race_plot, aes(x = month, y = incidence_per_100k, fill = race_ethnicity)) +
  geom_col(position = "dodge") +
  labs(
    title = "Monthly Incidence Rate by Race/Ethnicity in the Bay Area",
    x = "Month of Diagnosis",
    y = "Incidence Rate per 100,000",
    fill = "Race/Ethnicity"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")