````markdown
str(joined_data_ca)
## tibble [100,688 × 15] (S3: tbl_df/tbl/data.frame)
## $ county : chr [1:100688] "Alameda County" "Alameda County" "Alameda County" "Alameda County" ...
## $ week_start : POSIXct[1:100688], format: "2023-05-29" "2023-06-05" ...
## $ age_group : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
## $ sex : chr [1:100688] "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
## $ race_eth : chr [1:100688] "1" "1" "1" "1" ...
## $ cases_new : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
## $ cases_cum : num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
## $ unrecovered_new : num [1:100688] 0 1 0 0 0 1 0 1 1 1 ...
## $ unrecovered_cum : num [1:100688] 0 1 1 1 1 2 2 3 4 5 ...
## $ severe_new : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
## $ severe_cum : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
## $ source : chr [1:100688] "CA" "CA" "CA" "CA" ...
## $ race : chr [1:100688] "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" "WHITE NON-HISPANIC" ...
## $ population : num [1:100688] 34155 34155 34155 34155 34155 ...
## $ incidence_rate_100k: num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...
# Load data using HERE -> knitting always sees the same files
# 1) california population
ca_pop <- read_csv(here::here("ca_pop_2023.csv")) %>% clean_names() %>%
rename(race = race7)
# 2) statewide dataset
ca_state <- read_csv(here::here("sim_novelid_CA.csv")) %>% clean_names()
# 3) los angeles county dataset
la_county <- read_csv(here::here("sim_novelid_LACounty.csv")) %>% clean_names()
# Basic Cleaning
# Remove dt_report ONLY if it exists
if ("dt_report" %in% names(la_county)) {
la_county <- la_county %>% select(-dt_report)
}
# Convert race_ethnicity into a factor
# (these numbers represent categories.)
ca_state <- ca_state %>%
mutate(race_ethnicity = as.factor(race_ethnicity))
#Clean column names
# Make sure all names in datasets use snake_case
# Convert LA County column names - all lowercase
names(la_county) <- tolower(names(la_county))
# ca_state columns are already in snake_case - used clean_names() when importing
# check the cleaned names:
names(ca_pop)
## [1] "county" "health_officer_region" "age_cat"
## [4] "sex" "race" "pop"
names(ca_state)
## [1] "county" "age_cat" "sex"
## [4] "race_ethnicity" "dt_diagnosis" "time_int"
## [7] "new_infections" "cumulative_infected" "new_unrecovered"
## [10] "cumulative_unrecovered" "new_severe" "cumulative_severe"
names(la_county)
## [1] "dt_dx" "age_category" "sex"
## [4] "race_eth" "dx_new" "infected_cumulative"
## [7] "unrecovered_new" "unrecovered_cumulative" "severe_new"
## [10] "severe_cumulative"
# Clean + Combine Morbidity Data
# Standardize CA statewide dataset
ca_std <- ca_state %>%
rename(
week_start = dt_diagnosis,
age_group = age_cat,
race_eth = race_ethnicity,
cases_new = new_infections,
cases_cum = cumulative_infected,
unrecovered_new = new_unrecovered,
unrecovered_cum = cumulative_unrecovered,
severe_new = new_severe,
severe_cum = cumulative_severe
) %>%
mutate(
county = as.character(county),
age_group = as.character(age_group),
sex = toupper(sex),
race_eth = as.character(race_eth),
week_start = ymd(week_start),
source = "CA"
)
# Standardize LA County dataset
la_std <- la_county %>%
rename(
week_start = dt_dx,
age_group = age_category,
race_eth = race_eth,
cases_new = dx_new,
cases_cum = infected_cumulative,
unrecovered_cum = unrecovered_cumulative,
severe_cum = severe_cumulative
) %>%
mutate(
county = "Los Angeles County",
age_group = as.character(age_group),
sex = toupper(sex),
race_eth = as.character(race_eth),
# LA dates are in weird formats like "29MAY2023"
week_start = suppressWarnings(
parse_date_time(week_start, orders = c("dmy", "bdY"))
),
source = "LA"
) %>%
mutate(
# If these columns do not exist in LA, fill with NA
unrecovered_new = ifelse("unrecovered_new" %in% names(.), unrecovered_new, NA),
severe_new = ifelse("severe_new" %in% names(.), severe_new, NA)
)
# Align columns and combine datasets
core_cols <- c(
"county", "week_start", "age_group", "sex", "race_eth",
"cases_new", "cases_cum",
"unrecovered_new", "unrecovered_cum",
"severe_new", "severe_cum",
"source"
)
add_missing_cols <- function(df, cols) {
missing <- setdiff(cols, names(df))
df[missing] <- NA
df %>% select(all_of(cols))
}
ca_std2 <- add_missing_cols(ca_std, core_cols)
la_std2 <- add_missing_cols(la_std, core_cols)
combined <- bind_rows(ca_std2, la_std2)
# Quick checks
list(
rows = nrow(combined),
counties = n_distinct(combined$county),
date_range = range(combined$week_start, na.rm = TRUE),
missing_dates = sum(is.na(combined$week_start))
)
## $rows
## [1] 100688
##
## $counties
## [1] 58
##
## $date_range
## [1] "2023-05-29 UTC" "2023-12-25 UTC"
##
## $missing_dates
## [1] 0
# Join Morbidity and Population Data
# Standardize population dataset
ca_pop_std <- ca_pop %>%
# Make county names match morbidity dataset
mutate(county = paste0(county, " County")) %>%
# Convert age_cat -> age_group that matches combined dataset
mutate(age_group = case_when(
age_cat %in% c("0-4", "5-11", "12-17") ~ "0-17",
age_cat == "18-49" ~ "18-49",
age_cat == "50-64" ~ "50-64",
age_cat == "65+" ~ "65+",
TRUE ~ NA_character_
)) %>%
# Standardize sex
mutate(sex = toupper(sex)) %>%
# Standardize race categories
mutate(race = case_when(
race == "WhiteTE NH" ~ "WHITE NON-HISPANIC",
race == "BlackTE NH" ~ "BLACK NON-HISPANIC",
race == "AsianTE NH" ~ "ASIAN NON-HISPANIC",
race == "AIAN NH" ~ "AMERICAN INDIAN OR ALASKA NATIVE NON-HISPANIC",
race == "NHPI NH" ~ "NATIVE HAWAIIAN OR PACIFIC ISLANDER NON-HISPANIC",
race == "Hispanic" ~ "HISPANIC",
race == "MR NH" ~ "MULTIRACIAL NON-HISPANIC",
TRUE ~ NA_character_
)) %>%
# Keep only relevant variables
select(county, age_group, sex, race, pop) %>%
# Aggregate population to one row per group
group_by(county, age_group, sex, race) %>%
summarise(population = sum(pop), .groups = "drop")
# STANDARDIZE MORBIDITY DATASET
combined_std <- combined %>%
# Standardize sex
mutate(sex = toupper(sex)) %>%
# Convert race_eth into same race names used in population dataset
mutate(race = case_when(
race_eth == "1" | str_detect(race_eth, "White") ~ "WHITE NON-HISPANIC",
race_eth == "2" | str_detect(race_eth, "Black") ~ "BLACK NON-HISPANIC",
race_eth == "3" | str_detect(race_eth, "Asian") ~ "ASIAN NON-HISPANIC",
race_eth == "4" | str_detect(race_eth, "American|Alaska") ~
"AMERICAN INDIAN OR ALASKA NATIVE NON-HISPANIC",
race_eth == "5" | str_detect(race_eth, "Pacific|Hawai") ~
"NATIVE HAWAIIAN OR PACIFIC ISLANDER NON-HISPANIC",
race_eth == "6" | str_detect(race_eth, "Hispanic|Latino") ~ "HISPANIC",
race_eth == "7" | str_detect(race_eth, "Multi") ~ "MULTIRACIAL NON-HISPANIC",
TRUE ~ NA_character_
))
# JOIN MORBIDITY + POPULATION
joined_data_ca <- combined_std %>%
left_join(
ca_pop_std,
by = c("county", "age_group", "sex", "race")
)
# CALCULATE INCIDENCE RATE PER 100k
joined_data_ca <- joined_data_ca %>%
mutate(
incidence_rate_100k = if_else(
!is.na(population) & population > 0,
(cases_new / population) * 100000,
NA_real_
)
)
# Quick Check
summary(joined_data_ca$population)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 119.8 754.0 14809.4 5921.0 980387.0 28768
summary(joined_data_ca$incidence_rate_100k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 40.5 2896.6 819.7 618548.4 29791
head(
joined_data_ca %>%
select(
week_start, county, age_group, sex, race,
cases_new, population, incidence_rate_100k
),
20
)
## # A tibble: 20 × 8
## week_start county age_group sex race cases_new population
## <dttm> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2023-05-29 00:00:00 Alameda County 0-17 FEMA… WHIT… 6 34155
## 2 2023-06-05 00:00:00 Alameda County 0-17 FEMA… WHIT… 1 34155
## 3 2023-06-12 00:00:00 Alameda County 0-17 FEMA… WHIT… 2 34155
## 4 2023-06-19 00:00:00 Alameda County 0-17 FEMA… WHIT… 10 34155
## 5 2023-06-26 00:00:00 Alameda County 0-17 FEMA… WHIT… 19 34155
## 6 2023-07-03 00:00:00 Alameda County 0-17 FEMA… WHIT… 25 34155
## 7 2023-07-10 00:00:00 Alameda County 0-17 FEMA… WHIT… 23 34155
## 8 2023-07-17 00:00:00 Alameda County 0-17 FEMA… WHIT… 18 34155
## 9 2023-07-24 00:00:00 Alameda County 0-17 FEMA… WHIT… 22 34155
## 10 2023-07-31 00:00:00 Alameda County 0-17 FEMA… WHIT… 35 34155
## 11 2023-08-07 00:00:00 Alameda County 0-17 FEMA… WHIT… 29 34155
## 12 2023-08-14 00:00:00 Alameda County 0-17 FEMA… WHIT… 43 34155
## 13 2023-08-21 00:00:00 Alameda County 0-17 FEMA… WHIT… 69 34155
## 14 2023-08-28 00:00:00 Alameda County 0-17 FEMA… WHIT… 109 34155
## 15 2023-09-04 00:00:00 Alameda County 0-17 FEMA… WHIT… 100 34155
## 16 2023-09-11 00:00:00 Alameda County 0-17 FEMA… WHIT… 98 34155
## 17 2023-09-18 00:00:00 Alameda County 0-17 FEMA… WHIT… 113 34155
## 18 2023-09-25 00:00:00 Alameda County 0-17 FEMA… WHIT… 103 34155
## 19 2023-10-02 00:00:00 Alameda County 0-17 FEMA… WHIT… 93 34155
## 20 2023-10-09 00:00:00 Alameda County 0-17 FEMA… WHIT… 106 34155
## # ℹ 1 more variable: incidence_rate_100k <dbl>
sex_summary <- joined_data_ca %>%
group_by(sex) %>%
summarise(
total_cases = sum(cases_new, na.rm = TRUE),
total_population = sum(population, na.rm = TRUE),
incidence_rate_100k = (total_cases / total_population) * 100000
)
ggplot(sex_summary,
aes(x = sex, y = incidence_rate_100k, fill = sex)) +
geom_col(width = 0.6) +
geom_text(aes(label = round(incidence_rate_100k, 1)),
vjust = -0.5, size = 5) +
scale_fill_manual(values = c("FEMALE" = "coral", "MALE" = "cyan3")) +
labs(
title = "Incidence Rate per 100,000 by Sex",
subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
x = "Sex",
y = "Incidence Rate per 100,000"
) +
theme_minimal(base_size = 15)
Incidence rates were nearly identical between males and females, suggesting no major sex-based differences in infection risk during this outbreak.
library(gt)
sex_summary %>%
gt() %>%
tab_header(
title = "Incidence Rate per 100,000 by Sex",
subtitle = "Simulated Infectious Disease Outbreak, California & LA County"
) %>%
fmt_number(
columns = c(total_cases, total_population, incidence_rate_100k),
decimals = 1
)
| Incidence Rate per 100,000 by Sex | |||
| Simulated Infectious Disease Outbreak, California & LA County | |||
| sex | total_cases | total_population | incidence_rate_100k |
|---|---|---|---|
| FEMALE | 2,294,166.0 | 540,075,676.0 | 424.8 |
| MALE | 2,255,417.0 | 525,013,644.0 | 429.6 |
The total number of cases and overall incidence rate are similar for males and females, indicating that sex did not meaningfully influence the burden of disease in this population.
age_summary <- joined_data_ca %>%
group_by(age_group) %>%
summarise(
total_cases = sum(cases_new, na.rm = TRUE),
total_population = sum(population, na.rm = TRUE),
incidence_rate_100k = (total_cases / total_population) * 100000
)
ggplot(age_summary,
aes(x = age_group, y = incidence_rate_100k, fill = age_group)) +
geom_col(width = 0.7) +
geom_text(aes(label = round(incidence_rate_100k, 1)),
vjust = -0.5, size = 4) +
labs(
title = "Incidence Rate per 100,000 by Age Group",
subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
x = "Age Group",
y = "Incidence Rate per 100,000",
fill = "Age Group"
) +
theme_minimal(base_size = 15)
Incidence rates increase steadily with age, with the highest burden occurring in adults 65 and older. This pattern suggests greater vulnerability or exposure among older populations.
race_summary <- joined_data_ca %>%
group_by(race) %>%
summarise(
total_cases = sum(cases_new, na.rm = TRUE),
total_population = sum(population, na.rm = TRUE),
incidence_rate_100k = (total_cases / total_population) * 100000
) %>%
arrange(desc(incidence_rate_100k))
ggplot(race_summary,
aes(x = reorder(race, incidence_rate_100k),
y = incidence_rate_100k,
fill = race)) +
geom_col(width = 0.7) +
geom_text(aes(label = round(incidence_rate_100k, 1)),
vjust = -0.5, size = 3.8) +
labs(
title = "Incidence Rate per 100,000 by Race/Ethnicity",
subtitle = "Simulated Infectious Disease Outbreak, California & LA County",
x = "Race/Ethnicity",
y = "Incidence Rate per 100,000",
fill = "Race/Ethnicity"
) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 25, hjust = 1))
Incidence rates varied widely across racial and ethnic groups, indicating potential differences in exposure, underlying health status, or access to testing and care.
Incidence rates were comparable across sex, with only a very small difference between females and males. This suggests that transmission and overall disease burden were distributed fairly evenly across both groups during this outbreak. These patterns indicate that sex alone may not be a major driver of risk in this dataset, and other factors such as age, race or exposure patterns may be more influential.
Overall, the incidence trends show that sex does not appear to meaningfully influence infection risk in this outbreak, as the rates between males and females are nearly identical. This consistency suggests broad and similar exposure patterns across both groups. These descriptive findings help build a high-level understanding of the outbreak before moving into more advanced or stratified analyses, and they highlight the importance of evaluating additional demographic or structural factors that may better explain differences in disease burden.