Milestone #5

R Markdown

Milestone #4

str(joined_data_ca)

## tibble [100,688 × 15] (S3: tbl_df/tbl/data.frame)
##  $ county             : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
##  $ week_start         : POSIXct[1:100688], format: "2023-05-29" "2023-06-05" ...
##  $ age_group          : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
##  $ sex                : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
##  $ race_eth           : chr [1:100688] "1" "1" "1" "1" ...
##  $ cases_new          : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
##  $ cases_cum          : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
##  $ unrecovered_new    : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
##  $ unrecovered_cum    : num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
##  $ severe_new         : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
##  $ severe_cum         : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
##  $ source             : chr [1:100688] "CA" "CA" "CA" "CA" ...
##  $ race               : chr [1:100688] "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" ...
##  $ population         : num [1:100688] 34155 34155 34155 34155 34155 ...
##  $ incidence_rate_100k: num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...

# Load data using HERE -> knitting always sees the same files

# 1) california population 
ca_pop   <- read_csv(here::here("ca_pop_2023.csv")) %>% clean_names() %>%
  rename(race = race7)

## Rows: 90132 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): county, health_officer_region, age_cat, sex, race7
## dbl (1): pop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 2) statewide dataset
ca_state <- read_csv(here::here("sim_novelid_CA.csv")) %>% clean_names()

## Rows: 98952 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): county, age_cat, sex
## dbl  (8): race_ethnicity, time_int, new_infections, cumulative_infected, new...
## date (1): dt_diagnosis
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 3) los angeles county dataset
la_county <- read_csv(here::here("sim_novelid_LACounty.csv")) %>% clean_names()

## Rows: 1736 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): DT_DX, AGE_CATEGORY, SEX, RACE_ETH
## dbl (6): DX_NEW, INFECTED_CUMULATIVE, UNRECOVERED_NEW, UNRECOVERED_CUMULATIV...
## lgl (1): DT_REPORT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Basic Cleaning


# Remove dt_report ONLY if it exists
if ("dt_report" %in% names(la_county)) {
  la_county <- la_county %>% select(-dt_report)
}


# Convert race_ethnicity into a factor
#    (these numbers represent categories.)
ca_state <- ca_state %>%
  mutate(race_ethnicity = as.factor(race_ethnicity))

#Clean column names 

# Make sure all names in datasets use snake_case 

# Convert LA County column names - all lowercase
names(la_county) <- tolower(names(la_county))

# ca_state columns are already in snake_case - used clean_names() when importing

# check the cleaned names:
names(ca_pop)

## [1] "county"                "health_officer_region" "age_cat"              
## [4] "sex"                   "race"                  "pop"

names(ca_state)

##  [1] "county"                 "age_cat"                "sex"                   
##  [4] "race_ethnicity"         "dt_diagnosis"           "time_int"              
##  [7] "new_infections"         "cumulative_infected"    "new_unrecovered"       
## [10] "cumulative_unrecovered" "new_severe"             "cumulative_severe"

names(la_county)

##  [1] "dt_dx"                  "age_category"           "sex"                   
##  [4] "race_eth"               "dx_new"                 "infected_cumulative"   
##  [7] "unrecovered_new"        "unrecovered_cumulative" "severe_new"            
## [10] "severe_cumulative"

# Clean + Combine Morbidity Data

# Standardize CA statewide dataset

ca_std <- ca_state %>%
  rename(
    week_start        = dt_diagnosis,
    age_group         = age_cat,
    race_eth          = race_ethnicity,
    cases_new         = new_infections,
    cases_cum         = cumulative_infected,
    unrecovered_new   = new_unrecovered,
    unrecovered_cum   = cumulative_unrecovered,
    severe_new        = new_severe,
    severe_cum        = cumulative_severe
  ) %>%
  mutate(
    county     = as.character(county),
    age_group  = as.character(age_group),
    sex        = toupper(sex),
    race_eth   = as.character(race_eth),
    week_start = ymd(week_start),
    source     = "CA"
  )

# Standardize LA County dataset

la_std <- la_county %>%
  rename(
    week_start      = dt_dx,
    age_group       = age_category,
    race_eth        = race_eth,
    cases_new       = dx_new,
    cases_cum       = infected_cumulative,
    unrecovered_cum = unrecovered_cumulative,
    severe_cum      = severe_cumulative
  ) %>%
  mutate(
    county     = "Los Angeles County",
    age_group  = as.character(age_group),
    sex        = toupper(sex),
    race_eth   = as.character(race_eth),

    # LA dates are in weird formats like "29MAY2023"
    week_start = suppressWarnings(
      parse_date_time(week_start, orders = c("dmy", "bdY"))
    ),

    source     = "LA"
  ) %>%
  mutate(
    # If these columns do not exist in LA, fill with NA
    unrecovered_new = ifelse("unrecovered_new" %in% names(.), unrecovered_new, NA),
    severe_new      = ifelse("severe_new" %in% names(.), severe_new, NA)
  )

# Align columns and combine datasets

core_cols <- c(
  "county", "week_start", "age_group", "sex", "race_eth",
  "cases_new", "cases_cum",
  "unrecovered_new", "unrecovered_cum",
  "severe_new", "severe_cum",
  "source"
)

add_missing_cols <- function(df, cols) {
  missing <- setdiff(cols, names(df))
  df[missing] <- NA
  df %>% select(all_of(cols))
}

ca_std2 <- add_missing_cols(ca_std, core_cols)
la_std2 <- add_missing_cols(la_std, core_cols)

combined <- bind_rows(ca_std2, la_std2)

# Quick checks

list(
  rows = nrow(combined),
  counties = n_distinct(combined$county),
  date_range = range(combined$week_start, na.rm = TRUE),
  missing_dates = sum(is.na(combined$week_start))
)

## $rows
## [1] 100688
## 
## $counties
## [1] 58
## 
## $date_range
## [1] "2023-05-29 UTC" "2023-12-25 UTC"
## 
## $missing_dates
## [1] 0

# Join Morbidity and Population Data

# Standardize population dataset

ca_pop_std <- ca_pop %>%
  
  # Make county names match morbidity dataset
  mutate(county = paste0(county, " County")) %>%
  
  # Convert age_cat -> age_group that matches combined dataset
  mutate(age_group = case_when(
    age_cat %in% c("0-4", "5-11", "12-17") ~ "0-17",
    age_cat == "18-49" ~ "18-49",
    age_cat == "50-64" ~ "50-64",
    age_cat == "65+"   ~ "65+",
    TRUE ~ NA_character_
  )) %>%

  # Standardize sex
  mutate(sex = toupper(sex)) %>%

  # Standardize race categories
  mutate(race = case_when(
    race == "WhiteTE NH" ~ "WHITE NON-HISPANIC",
    race == "BlackTE NH" ~ "BLACK NON-HISPANIC",
    race == "AsianTE NH" ~ "ASIAN NON-HISPANIC",
    race == "AIAN NH" ~ "AMERICAN INDIAN OR ALASKA NATIVE NON-HISPANIC",
    race == "NHPI NH" ~ "NATIVE HAWAIIAN OR PACIFIC ISLANDER NON-HISPANIC",
    race == "Hispanic" ~ "HISPANIC",
    race == "MR NH" ~ "MULTIRACIAL NON-HISPANIC",
    TRUE ~ NA_character_
  )) %>%

  # Keep only relevant variables
  select(county, age_group, sex, race, pop) %>%
  
  # Aggregate population to one row per group
  group_by(county, age_group, sex, race) %>%
  summarise(population = sum(pop), .groups = "drop")



# STANDARDIZE MORBIDITY DATASET

combined_std <- combined %>%
  
  # Standardize sex
  mutate(sex = toupper(sex)) %>%

  # Convert race_eth into same race names used in population dataset
  mutate(race = case_when(
    race_eth == "1" | str_detect(race_eth, "White") ~ "WHITE NON-HISPANIC",
    race_eth == "2" | str_detect(race_eth, "Black") ~ "BLACK NON-HISPANIC",
    race_eth == "3" | str_detect(race_eth, "Asian") ~ "ASIAN NON-HISPANIC",
    race_eth == "4" | str_detect(race_eth, "American|Alaska") ~
         "AMERICAN INDIAN OR ALASKA NATIVE NON-HISPANIC",
    race_eth == "5" | str_detect(race_eth, "Pacific|Hawai") ~
         "NATIVE HAWAIIAN OR PACIFIC ISLANDER NON-HISPANIC",
    race_eth == "6" | str_detect(race_eth, "Hispanic|Latino") ~ "HISPANIC",
    race_eth == "7" | str_detect(race_eth, "Multi") ~ "MULTIRACIAL NON-HISPANIC",
    TRUE ~ NA_character_
  ))


# JOIN MORBIDITY + POPULATION

joined_data_ca <- combined_std %>%
  left_join(
    ca_pop_std,
    by = c("county", "age_group", "sex", "race")
  )


# CALCULATE INCIDENCE RATE PER 100k

joined_data_ca <- joined_data_ca %>%
  mutate(
    incidence_rate_100k = if_else(
      !is.na(population) & population > 0,
      (cases_new / population) * 100000,
      NA_real_
    )
  )

# Quick Check

summary(joined_data_ca$population)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      0.0    119.8    754.0  14809.4   5921.0 980387.0    28768

summary(joined_data_ca$incidence_rate_100k)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      0.0      0.0     40.5   2896.6    819.7 618548.4    29791

head(
  joined_data_ca %>%
    select(
      week_start, county, age_group, sex, race,
      cases_new, population, incidence_rate_100k
    ),
  20
)

## # A tibble: 20 × 8
##    week_start          county         age_group sex   race  cases_new population
##    <dttm>              <chr>          <chr>     <chr> <chr>     <dbl>      <dbl>
##  1 2023-05-29 00:00:00 Alameda County 0-17      FEMA… WHIT…         6      34155
##  2 2023-06-05 00:00:00 Alameda County 0-17      FEMA… WHIT…         1      34155
##  3 2023-06-12 00:00:00 Alameda County 0-17      FEMA… WHIT…         2      34155
##  4 2023-06-19 00:00:00 Alameda County 0-17      FEMA… WHIT…        10      34155
##  5 2023-06-26 00:00:00 Alameda County 0-17      FEMA… WHIT…        19      34155
##  6 2023-07-03 00:00:00 Alameda County 0-17      FEMA… WHIT…        25      34155
##  7 2023-07-10 00:00:00 Alameda County 0-17      FEMA… WHIT…        23      34155
##  8 2023-07-17 00:00:00 Alameda County 0-17      FEMA… WHIT…        18      34155
##  9 2023-07-24 00:00:00 Alameda County 0-17      FEMA… WHIT…        22      34155
## 10 2023-07-31 00:00:00 Alameda County 0-17      FEMA… WHIT…        35      34155
## 11 2023-08-07 00:00:00 Alameda County 0-17      FEMA… WHIT…        29      34155
## 12 2023-08-14 00:00:00 Alameda County 0-17      FEMA… WHIT…        43      34155
## 13 2023-08-21 00:00:00 Alameda County 0-17      FEMA… WHIT…        69      34155
## 14 2023-08-28 00:00:00 Alameda County 0-17      FEMA… WHIT…       109      34155
## 15 2023-09-04 00:00:00 Alameda County 0-17      FEMA… WHIT…       100      34155
## 16 2023-09-11 00:00:00 Alameda County 0-17      FEMA… WHIT…        98      34155
## 17 2023-09-18 00:00:00 Alameda County 0-17      FEMA… WHIT…       113      34155
## 18 2023-09-25 00:00:00 Alameda County 0-17      FEMA… WHIT…       103      34155
## 19 2023-10-02 00:00:00 Alameda County 0-17      FEMA… WHIT…        93      34155
## 20 2023-10-09 00:00:00 Alameda County 0-17      FEMA… WHIT…       106      34155
## # ℹ 1 more variable: incidence_rate_100k <dbl>

sex_summary <- joined_data_ca %>%
  group_by(sex) %>%
  summarise(
    total_cases = sum(cases_new, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    incidence_rate_100k = (total_cases / total_population) * 100000
  )

library(gt)

sex_summary %>%
  # Make sex labels prettier in the table
  mutate(
    sex = recode(sex,
                 "FEMALE" = "Female",
                 "MALE"   = "Male")
  ) %>%
  gt() %>%
  tab_header(
    title = "Incidence Rate per 100,000 by Sex",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County"
  ) %>%
  # Clean column names for the table
  cols_label(
    sex                = "Sex",
    total_cases        = "Total cases",
    total_population   = "Total population",
    incidence_rate_100k = "Incidence rate per 100,000"
  ) %>%
  # No decimals for counts
  fmt_integer(
    columns = c(total_cases, total_population)
  ) %>%
  # Keep one decimal for the rate (you can change to 0 if you prefer)
  fmt_number(
    columns = incidence_rate_100k,
    decimals = 1
  )

Sex	Total cases	Total population	Incidence rate per 100,000
Incidence Rate per 100,000 by Sex
Simulated Infectious Disease Outbreak, California & LA County
Female	2,294,166	540,075,676	424.8
Male	2,255,417	525,013,644	429.6

The total number of cases and overall incidence rate were very similar for males and females, suggesting that sex did not meaningfully influence the burden of disease in this population.

age_summary <- joined_data_ca %>%
  group_by(age_group) %>%
  summarise(
    total_cases = sum(cases_new, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    incidence_rate_100k = (total_cases / total_population) * 100000
  )

ggplot(age_summary,
       aes(x = age_group, y = incidence_rate_100k, fill = age_group)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(incidence_rate_100k, 1)), 
            vjust = -0.5, size = 4) +
  labs(
    title = "Incidence Rate per 100,000 by Age Group",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
    x = "Age group",
    y = "Incidence rate per 100,000"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    legend.position = "none"  # remove legend, keep x-axis labels
  )

Incidence rates increased steadily with age, with the highest burden seen in adults 65 and older. This pattern suggests greater vulnerability or exposure among older adults.

race_summary <- joined_data_ca %>%
  group_by(race) %>%
  summarise(
    total_cases = sum(cases_new, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    incidence_rate_100k = (total_cases / total_population) * 100000
  ) %>%
  arrange(desc(incidence_rate_100k))

max_rate <- max(race_summary$incidence_rate_100k, na.rm = TRUE)

ggplot(race_summary,
       aes(x = reorder(race, incidence_rate_100k),
           y = incidence_rate_100k,
           fill = race)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(incidence_rate_100k, 1)),
            vjust = -0.5, size = 3.8) +
  labs(
    title = "Incidence Rate per 100,000 by Race/Ethnicity",
    subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
    x = NULL,
    y = "Incidence rate per 100,000",
    fill = "Race/ethnicity"
  ) +
  theme_minimal(base_size = 14) +
  # hide x-axis labels and ticks but keep legend
  theme(
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  # give extra headroom so all text labels show
  scale_y_continuous(
    limits = c(0, max_rate * 1.1),  # 10% headroom
    expand = expansion(mult = c(0, 0.05))
  )

Incidence rates varied across racial and ethnic groups, suggesting possible differences in exposure, underlying health status, or access to testing and care.

Problem Statement

The California Department of Public Health is monitoring a simulated outbreak of a novel infectious respiratory disease and must quickly assess its impact across the state. Multiple data sources have been provided including weekly morbidity and severity counts for all California counties except Los Angeles, detailed demographic and population data for Los Angeles County, and 2023 county-level population estimates (to support statewide surveillance activities). To effectively guide the public health response, these datasets must be merged and analyzed to produce a unified view of the outbreak.

Public health leadership seeks to understand the trajectory of the outbreak, identify whether certain demographic or geographic groups are disproportionately affected, and determine how prevention and treatment resources should be allocated. The task requires integrating the three datasets, generating summary measures such as morbidity rates and severity distributions, and producing clear, print-ready tables and graphical visualizations that communicate key trends across counties and demographic categories. This analysis will inform data-driven decision-making for targeted interventions and resource deployment.

Methods for Each Data Source

The sim_novelid_CA.csv dataset contained weekly morbidity records for all counties in California except Los Angeles County. It included cases, severity indicators, and demographic variables, and was stratified by age category, county, race, sex, and week start date. We cleaned and standardized the dataset, converting label cases, renaming the race variable, and ensuring that groups matched the format of the population data. We removed duplicate columns and looked for missing values before joining it.

The sim_novelid_LACounty.csv dataset was formatted in the same manner as the California state dataset, but it contained separate data for Los Angeles County. We utilized the same cleaning steps to ensure that both of the morbidity datasets were consistent and uniform. After we standardized column names and categories, we combined this dataset with the California dataset to create a morbidity dataset that included all counties.

The ca_pop_2023.csv dataset contained population estimates by age group, county, race, and sex. The labels in this dataset did not match those in the morbidity data, so they required more cleaning. We added County and re-coded population age categories to have them match age groups, sex labels, and race labels used in the morbidity datasets.

After all of the datasets had uniform variable names and categories, we grouped and summed the population, then joined it to the morbidity data using county, age group, sex, and race. Then we created a new variable for incidence per 100,000. Other minor cleaning steps were completed to prepare for the development of tables and plot visualizations.

Results

The incidence rates for males and females were almost identical, showing that sex did not seem to play a meaningful role in infection risk in this simulated outbreak. More noticeable differences appeared when looking at age groups. Incidence increased steadily with age, with the highest burden seen in adults 65 years and older, while younger groups had much lower rates. There were also clear differences across racial and ethnic groups, with some groups experiencing higher incidence rates than others. These patterns help give an early picture of how the outbreak affected different populations before moving into more detailed analyses.

Discussion

These results suggest that sex did not have a major influence on infection risk, since the rates for males and females were essentially the same. Stronger patterns were seen with age and race and ethnicity. Older adults consistently carried the highest burden, which is expected for many respiratory infections. The variation across racial and ethnic groups also points to the need to explore factors like underlying health conditions, exposure risks, and differences in access to care that may contribute to these disparities. Altogether, these descriptive findings help guide our initial understanding of who is most impacted and support future work focused on prevention efforts, resource allocation, and examining demographic groups that may be at higher risk.