PHW251 – Milestone 4

````markdown

str(joined_data_ca)

## tibble [100,688 × 15] (S3: tbl_df/tbl/data.frame)
##  $ county             : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
##  $ week_start         : POSIXct[1:100688], format: "2023-05-29" "2023-06-05" ...
##  $ age_group          : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
##  $ sex                : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
##  $ race_eth           : chr [1:100688] "1" "1" "1" "1" ...
##  $ cases_new          : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
##  $ cases_cum          : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
##  $ unrecovered_new    : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
##  $ unrecovered_cum    : num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
##  $ severe_new         : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
##  $ severe_cum         : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
##  $ source             : chr [1:100688] "CA" "CA" "CA" "CA" ...
##  $ race               : chr [1:100688] "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" ...
##  $ population         : num [1:100688] 34155 34155 34155 34155 34155 ...
##  $ incidence_rate_100k: num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...

# Load data using HERE -> knitting always sees the same files

# 1) california population 
ca_pop   <- read_csv(here::here("ca_pop_2023.csv")) %>% clean_names() %>%
  rename(race = race7)

# 2) statewide dataset
ca_state <- read_csv(here::here("sim_novelid_CA.csv")) %>% clean_names()

# 3) los angeles county dataset
la_county <- read_csv(here::here("sim_novelid_LACounty.csv")) %>% clean_names()

# Basic Cleaning


# Remove dt_report ONLY if it exists
if ("dt_report" %in% names(la_county)) {
  la_county <- la_county %>% select(-dt_report)
}


# Convert race_ethnicity into a factor
#    (these numbers represent categories.)
ca_state <- ca_state %>%
  mutate(race_ethnicity = as.factor(race_ethnicity))

#Clean column names 

# Make sure all names in datasets use snake_case 

# Convert LA County column names - all lowercase
names(la_county) <- tolower(names(la_county))

# ca_state columns are already in snake_case - used clean_names() when importing

# check the cleaned names:
names(ca_pop)

## [1] "county"                "health_officer_region" "age_cat"              
## [4] "sex"                   "race"                  "pop"

names(ca_state)

##  [1] "county"                 "age_cat"                "sex"                   
##  [4] "race_ethnicity"         "dt_diagnosis"           "time_int"              
##  [7] "new_infections"         "cumulative_infected"    "new_unrecovered"       
## [10] "cumulative_unrecovered" "new_severe"             "cumulative_severe"

names(la_county)

##  [1] "dt_dx"                  "age_category"           "sex"                   
##  [4] "race_eth"               "dx_new"                 "infected_cumulative"   
##  [7] "unrecovered_new"        "unrecovered_cumulative" "severe_new"            
## [10] "severe_cumulative"

# Clean + Combine Morbidity Data

# Standardize CA statewide dataset

ca_std <- ca_state %>%
  rename(
    week_start        = dt_diagnosis,
    age_group         = age_cat,
    race_eth          = race_ethnicity,
    cases_new         = new_infections,
    cases_cum         = cumulative_infected,
    unrecovered_new   = new_unrecovered,
    unrecovered_cum   = cumulative_unrecovered,
    severe_new        = new_severe,
    severe_cum        = cumulative_severe
  ) %>%
  mutate(
    county     = as.character(county),
    age_group  = as.character(age_group),
    sex        = toupper(sex),
    race_eth   = as.character(race_eth),
    week_start = ymd(week_start),
    source     = "CA"
  )

# Standardize LA County dataset

la_std <- la_county %>%
  rename(
    week_start      = dt_dx,
    age_group       = age_category,
    race_eth        = race_eth,
    cases_new       = dx_new,
    cases_cum       = infected_cumulative,
    unrecovered_cum = unrecovered_cumulative,
    severe_cum      = severe_cumulative
  ) %>%
  mutate(
    county     = "Los Angeles County",
    age_group  = as.character(age_group),
    sex        = toupper(sex),
    race_eth   = as.character(race_eth),

    # LA dates are in weird formats like "29MAY2023"
    week_start = suppressWarnings(
      parse_date_time(week_start, orders = c("dmy", "bdY"))
    ),

    source     = "LA"
  ) %>%
  mutate(
    # If these columns do not exist in LA, fill with NA
    unrecovered_new = ifelse("unrecovered_new" %in% names(.), unrecovered_new, NA),
    severe_new      = ifelse("severe_new" %in% names(.), severe_new, NA)
  )

# Align columns and combine datasets

core_cols <- c(
  "county", "week_start", "age_group", "sex", "race_eth",
  "cases_new", "cases_cum",
  "unrecovered_new", "unrecovered_cum",
  "severe_new", "severe_cum",
  "source"
)

add_missing_cols <- function(df, cols) {
  missing <- setdiff(cols, names(df))
  df[missing] <- NA
  df %>% select(all_of(cols))
}

ca_std2 <- add_missing_cols(ca_std, core_cols)
la_std2 <- add_missing_cols(la_std, core_cols)

combined <- bind_rows(ca_std2, la_std2)

# Quick checks

list(
  rows = nrow(combined),
  counties = n_distinct(combined$county),
  date_range = range(combined$week_start, na.rm = TRUE),
  missing_dates = sum(is.na(combined$week_start))
)

## $rows
## [1] 100688
## 
## $counties
## [1] 58
## 
## $date_range
## [1] "2023-05-29 UTC" "2023-12-25 UTC"
## 
## $missing_dates
## [1] 0

# Join Morbidity and Population Data

# Standardize population dataset

ca_pop_std <- ca_pop %>%
  
  # Make county names match morbidity dataset
  mutate(county = paste0(county, " County")) %>%
  
  # Convert age_cat -> age_group that matches combined dataset
  mutate(age_group = case_when(
    age_cat %in% c("0-4", "5-11", "12-17") ~ "0-17",
    age_cat == "18-49" ~ "18-49",
    age_cat == "50-64" ~ "50-64",
    age_cat == "65+"   ~ "65+",
    TRUE ~ NA_character_
  )) %>%

  # Standardize sex
  mutate(sex = toupper(sex)) %>%

  # Standardize race categories
  mutate(race = case_when(
    race == "WhiteTE NH" ~ "WHITE NON-HISPANIC",
    race == "BlackTE NH" ~ "BLACK NON-HISPANIC",
    race == "AsianTE NH" ~ "ASIAN NON-HISPANIC",
    race == "AIAN NH" ~ "AMERICAN INDIAN OR ALASKA NATIVE NON-HISPANIC",
    race == "NHPI NH" ~ "NATIVE HAWAIIAN OR PACIFIC ISLANDER NON-HISPANIC",
    race == "Hispanic" ~ "HISPANIC",
    race == "MR NH" ~ "MULTIRACIAL NON-HISPANIC",
    TRUE ~ NA_character_
  )) %>%

  # Keep only relevant variables
  select(county, age_group, sex, race, pop) %>%
  
  # Aggregate population to one row per group
  group_by(county, age_group, sex, race) %>%
  summarise(population = sum(pop), .groups = "drop")



# STANDARDIZE MORBIDITY DATASET

combined_std <- combined %>%
  
  # Standardize sex
  mutate(sex = toupper(sex)) %>%

  # Convert race_eth into same race names used in population dataset
  mutate(race = case_when(
    race_eth == "1" | str_detect(race_eth, "White") ~ "WHITE NON-HISPANIC",
    race_eth == "2" | str_detect(race_eth, "Black") ~ "BLACK NON-HISPANIC",
    race_eth == "3" | str_detect(race_eth, "Asian") ~ "ASIAN NON-HISPANIC",
    race_eth == "4" | str_detect(race_eth, "American|Alaska") ~
         "AMERICAN INDIAN OR ALASKA NATIVE NON-HISPANIC",
    race_eth == "5" | str_detect(race_eth, "Pacific|Hawai") ~
         "NATIVE HAWAIIAN OR PACIFIC ISLANDER NON-HISPANIC",
    race_eth == "6" | str_detect(race_eth, "Hispanic|Latino") ~ "HISPANIC",
    race_eth == "7" | str_detect(race_eth, "Multi") ~ "MULTIRACIAL NON-HISPANIC",
    TRUE ~ NA_character_
  ))


# JOIN MORBIDITY + POPULATION

joined_data_ca <- combined_std %>%
  left_join(
    ca_pop_std,
    by = c("county", "age_group", "sex", "race")
  )


# CALCULATE INCIDENCE RATE PER 100k

joined_data_ca <- joined_data_ca %>%
  mutate(
    incidence_rate_100k = if_else(
      !is.na(population) & population > 0,
      (cases_new / population) * 100000,
      NA_real_
    )
  )

# Quick Check

summary(joined_data_ca$population)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      0.0    119.8    754.0  14809.4   5921.0 980387.0    28768

summary(joined_data_ca$incidence_rate_100k)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      0.0      0.0     40.5   2896.6    819.7 618548.4    29791

head(
  joined_data_ca %>%
    select(
      week_start, county, age_group, sex, race,
      cases_new, population, incidence_rate_100k
    ),
  20
)

## # A tibble: 20 × 8
##    week_start          county         age_group sex   race  cases_new population
##    <dttm>              <chr>          <chr>     <chr> <chr>     <dbl>      <dbl>
##  1 2023-05-29 00:00:00 Alameda County 0-17      FEMA… WHIT…         6      34155
##  2 2023-06-05 00:00:00 Alameda County 0-17      FEMA… WHIT…         1      34155
##  3 2023-06-12 00:00:00 Alameda County 0-17      FEMA… WHIT…         2      34155
##  4 2023-06-19 00:00:00 Alameda County 0-17      FEMA… WHIT…        10      34155
##  5 2023-06-26 00:00:00 Alameda County 0-17      FEMA… WHIT…        19      34155
##  6 2023-07-03 00:00:00 Alameda County 0-17      FEMA… WHIT…        25      34155
##  7 2023-07-10 00:00:00 Alameda County 0-17      FEMA… WHIT…        23      34155
##  8 2023-07-17 00:00:00 Alameda County 0-17      FEMA… WHIT…        18      34155
##  9 2023-07-24 00:00:00 Alameda County 0-17      FEMA… WHIT…        22      34155
## 10 2023-07-31 00:00:00 Alameda County 0-17      FEMA… WHIT…        35      34155
## 11 2023-08-07 00:00:00 Alameda County 0-17      FEMA… WHIT…        29      34155
## 12 2023-08-14 00:00:00 Alameda County 0-17      FEMA… WHIT…        43      34155
## 13 2023-08-21 00:00:00 Alameda County 0-17      FEMA… WHIT…        69      34155
## 14 2023-08-28 00:00:00 Alameda County 0-17      FEMA… WHIT…       109      34155
## 15 2023-09-04 00:00:00 Alameda County 0-17      FEMA… WHIT…       100      34155
## 16 2023-09-11 00:00:00 Alameda County 0-17      FEMA… WHIT…        98      34155
## 17 2023-09-18 00:00:00 Alameda County 0-17      FEMA… WHIT…       113      34155
## 18 2023-09-25 00:00:00 Alameda County 0-17      FEMA… WHIT…       103      34155
## 19 2023-10-02 00:00:00 Alameda County 0-17      FEMA… WHIT…        93      34155
## 20 2023-10-09 00:00:00 Alameda County 0-17      FEMA… WHIT…       106      34155
## # ℹ 1 more variable: incidence_rate_100k <dbl>

sex_summary <- joined_data_ca %>%
  group_by(sex) %>%
  summarise(
    total_cases = sum(cases_new, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    incidence_rate_100k = (total_cases / total_population) * 100000
  )

ggplot(sex_summary,
       aes(x = sex, y = incidence_rate_100k, fill = sex)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = round(incidence_rate_100k, 1)),
            vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("FEMALE" = "coral", "MALE" = "cyan3")) +
  labs(
    title = "Incidence Rate per 100,000 by Sex",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
    x = "Sex",
    y = "Incidence Rate per 100,000"
  ) +
  theme_minimal(base_size = 15)

Incidence rates were nearly identical between males and females, suggesting no major sex-based differences in infection risk during this outbreak.

library(gt)

sex_summary %>%
  gt() %>%
  tab_header(
    title = "Incidence Rate per 100,000 by Sex",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County"
  ) %>%
  fmt_number(
    columns = c(total_cases, total_population, incidence_rate_100k),
    decimals = 1
  )

sex	total_cases	total_population	incidence_rate_100k
Incidence Rate per 100,000 by Sex
Simulated Infectious Disease Outbreak, California & LA County
FEMALE	2,294,166.0	540,075,676.0	424.8
MALE	2,255,417.0	525,013,644.0	429.6

The total number of cases and overall incidence rate are similar for males and females, indicating that sex did not meaningfully influence the burden of disease in this population.

age_summary <- joined_data_ca %>%
  group_by(age_group) %>%
  summarise(
    total_cases = sum(cases_new, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    incidence_rate_100k = (total_cases / total_population) * 100000
  )

ggplot(age_summary,
       aes(x = age_group, y = incidence_rate_100k, fill = age_group)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(incidence_rate_100k, 1)), 
            vjust = -0.5, size = 4) +
  labs(
    title = "Incidence Rate per 100,000 by Age Group",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
    x = "Age Group",
    y = "Incidence Rate per 100,000",
    fill = "Age Group"
  ) +
  theme_minimal(base_size = 15)

Incidence rates increase steadily with age, with the highest burden occurring in adults 65 and older. This pattern suggests greater vulnerability or exposure among older populations.

race_summary <- joined_data_ca %>%
  group_by(race) %>%
  summarise(
    total_cases = sum(cases_new, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    incidence_rate_100k = (total_cases / total_population) * 100000
  ) %>%
  arrange(desc(incidence_rate_100k))

ggplot(race_summary,
       aes(x = reorder(race, incidence_rate_100k),
           y = incidence_rate_100k,
           fill = race)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(incidence_rate_100k, 1)),
            vjust = -0.5, size = 3.8) +
  labs(
    title = "Incidence Rate per 100,000 by Race/Ethnicity",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
    x = "Race/Ethnicity",
    y = "Incidence Rate per 100,000",
    fill = "Race/Ethnicity"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 25, hjust = 1))

Incidence rates varied widely across racial and ethnic groups, indicating potential differences in exposure, underlying health status, or access to testing and care.

Additional

Incidence rates were comparable across sex, with only a very small difference between females and males. This suggests that transmission and overall disease burden were distributed fairly evenly across both groups during this outbreak. These patterns indicate that sex alone may not be a major driver of risk in this dataset, and other factors such as age, race or exposure patterns may be more influential.

Discussion

Overall, the incidence trends show that sex does not appear to meaningfully influence infection risk in this outbreak, as the rates between males and females are nearly identical. This consistency suggests broad and similar exposure patterns across both groups. These descriptive findings help build a high-level understanding of the outbreak before moving into more advanced or stratified analyses, and they highlight the importance of evaluating additional demographic or structural factors that may better explain differences in disease burden.

PHW251 – Milestone 4

Group 6

2025-11-17

Additional

Discussion