| Variable Name | Data Type | Description |
|---|---|---|
| county | Character | California county of residence of novel infectious respiratory disease cases |
| race_eth | Character | Race-Ethnicity categorization as defined by California Department of Finance |
| total_infections | Integer | Cumulative number of diagnosed individuals from May 28th-Dec 30th, 2023, per race category |
| total_unrecovered | Integer | Cumulative number of individuals reported as unrecovered one week after diagnosis, from May 28th-Dec 30th, 2023, per race category |
| total_severe | Integer | Cumulative number of identified individuals having severe disease requiring hospitalization from May 28th-Dec 30th, 2023, per race category |
| pop | Integer | Population of each California county, per race category |
Project Milestone 3
Group 1- Project Milestone #3
For Scenario 1: Infectious disease outbreak (simulated) in California
Data Dictionary
Import Datasets
# Dataset 1 (sim_novelid_CA.csv), read directly from GitHub.
# Column names are normalised on import: spaces become underscores and
# everything is lower-cased.
CA_data <- read.csv(
  "https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/sim_novelid_CA.csv"
) %>%
  rename_with(function(nm) tolower(gsub(" ", "_", nm, fixed = TRUE)))

# Dataset 2 (sim_novelid_LACounty.csv), same column-name normalisation.
LACounty_data <- read.csv(
  "https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/sim_novelid_LACounty.csv"
) %>%
  rename_with(function(nm) tolower(gsub(" ", "_", nm, fixed = TRUE)))

# Dataset 3 (ca_pop_2023.csv, the population dataset), same normalisation.
DS3 <- read.csv(
  "https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/ca_pop_2023.csv"
) %>%
  rename_with(function(nm) tolower(gsub(" ", "_", nm, fixed = TRUE)))
Morbidity Datasets
1) Recode Column Names
# Structure checks, left disabled:
# str(CA_data)
# str(LACounty_data)
# List the distinct race/ethnicity labels present in the LA County data
unique(LACounty_data[["race_eth"]])
[1] "White, Non-Hispanic"
[2] "Black, Non-Hispanic"
[3] "American Indian or Alaska Native, Non-Hispanic"
[4] "Asian, Non-Hispanic"
[5] "Native Hawaiian or Pacific Islander, Non-Hispanic"
[6] "Multiracial (two or more of above races), Non-Hispanic"
[7] "Hispanic (any race)"
# Give the CA columns their standardized names and add a `date` column of
# class Date built from the reporting-week field.
CA_data_clean <- CA_data %>%
  rename(
    race_eth = race_ethnicity,
    infections_new = new_infections,
    infected_cum = cumulative_infected,
    unrecovered_new = new_unrecovered,
    unrecovered_cum = cumulative_unrecovered,
    severe_new = new_severe,
    severe_cum = cumulative_severe
  ) %>%
  mutate(
    # time_int is split by position: characters 1-4 are the year,
    # characters 5-6 are the MMWR week number
    year = as.numeric(substr(time_int, 1, 4)),
    mmwr_week = as.numeric(substr(time_int, 5, 6)),
    # Turn the (year, week) pair into a calendar date
    date = MMWRweek2Date(MMWRyear = year, MMWRweek = mmwr_week)
  ) %>%
  # Drop the original time fields and the two temporary helper columns
  select(-c(dt_diagnosis, year, mmwr_week, time_int))
# Remove the word "County" from the county column.
# The pattern also consumes the whitespace in front of "County", and trimws()
# strips any leftover padding, so values come out as "Alameda" rather than
# "Alameda ". Without this the county names carry a trailing space and do not
# match the county values in the population data ("Alameda") or the
# "Los Angeles" label used for the LA County data.
CA_data_clean <- CA_data_clean %>%
  mutate(county = trimws(gsub("\\s*\\bCounty\\b", "", county)))
# Translate the numeric race/ethnicity codes into the same text labels that
# the LA County data uses, so the two tables can be combined on race_eth.
CA_data_clean$race_eth <- recode(
  CA_data_clean$race_eth,
  `1` = "White, Non-Hispanic",
  `2` = "Black, Non-Hispanic",
  `3` = "American Indian or Alaska Native, Non-Hispanic",
  `4` = "Asian, Non-Hispanic",
  `5` = "Native Hawaiian or Pacific Islander, Non-Hispanic",
  `6` = "Multiracial (two or more of above races), Non-Hispanic",
  `7` = "Hispanic (any race)",
  `9` = "Unknown"
)
# head(CA_data_clean)
str(CA_data_clean)
'data.frame': 98952 obs. of 11 variables:
$ county : chr "Alameda " "Alameda " "Alameda " "Alameda " ...
$ age_cat : chr "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_eth : chr "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ infections_new : int 6 1 2 10 19 25 23 18 22 35 ...
$ infected_cum : int 6 7 9 19 38 63 86 104 126 161 ...
$ unrecovered_new: int 0 1 0 0 0 1 0 1 1 1 ...
$ unrecovered_cum: int 0 1 1 1 1 2 2 3 4 5 ...
$ severe_new : int 0 0 1 0 0 0 0 0 0 0 ...
$ severe_cum : int 0 0 1 1 1 1 1 1 1 1 ...
$ date : Date, format: "2023-05-28" "2023-06-04" ...
# unique(CA_data_clean$race_eth)

# LA dataset cleaned: label every row with the county name, parse the
# diagnosis date (format "%d%b%Y") into class Date, then rename and order the
# columns to line up with CA_data_clean.
LA_data_clean <- LACounty_data %>%
  mutate(
    county = "Los Angeles",
    dt_dx = as.Date(dt_dx, format = "%d%b%Y")
  ) %>%
  select(
    county,
    age_cat = age_category,
    sex,
    race_eth,
    infections_new = dx_new,
    infected_cum = infected_cumulative,
    unrecovered_new,
    unrecovered_cum = unrecovered_cumulative,
    severe_new,
    severe_cum = severe_cumulative,
    date = dt_dx
  )
# head(LA_data_clean)
str(LA_data_clean)
'data.frame': 1736 obs. of 11 variables:
$ county : chr "Los Angeles" "Los Angeles" "Los Angeles" "Los Angeles" ...
$ age_cat : chr "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_eth : chr "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ infections_new : int 15 17 23 51 67 75 106 83 91 173 ...
$ infected_cum : int 15 32 55 106 173 248 354 437 528 701 ...
$ unrecovered_new: int 0 0 0 0 1 1 4 3 1 3 ...
$ unrecovered_cum: int 0 0 0 0 1 2 6 9 10 13 ...
$ severe_new : int 0 0 0 0 0 0 0 1 0 0 ...
$ severe_cum : int 0 0 0 0 0 0 0 1 1 1 ...
$ date : Date, format: "2023-05-29" "2023-06-05" ...
2) Join Morbidity Datasets & Aggregate by Race
# Combine CA_data_clean with LA_data_clean.
# Both tables have the same 11 columns and hold rows for different counties,
# so they are stacked with bind_rows(). A full_join() without `by` would key
# on every shared column, which only behaves like a row-stack when no row
# happens to match, and prints an implicit-join message.
Counties_join <- bind_rows(CA_data_clean, LA_data_clean) %>%
  arrange(county)
# Aggregate by Race/Ethnicity: one row per county x race_eth holding the sum
# of the weekly new counts. `.groups = "drop"` returns an ungrouped tibble.
Counties_race <- Counties_join %>%
  group_by(county, race_eth) %>%
  summarize(
    total_infections = sum(infections_new),
    total_unrecovered = sum(unrecovered_new),
    total_severe = sum(severe_new),
    .groups = "drop"
  )
head(Counties_race)
# A tibble: 6 × 5
county race_eth total_infections total_unrecovered total_severe
<chr> <chr> <int> <int> <int>
1 "Alameda " American Indian or… 444 56 12
2 "Alameda " Asian, Non-Hispanic 39069 4884 1108
3 "Alameda " Black, Non-Hispanic 17161 2209 510
4 "Alameda " Hispanic (any race) 33568 3028 661
5 "Alameda " Multiracial (two o… 5719 451 100
6 "Alameda " Native Hawaiian or… 1183 132 31
Population Dataset
1) Rename racial ethnic groups
# Replace the population file's abbreviated race7 codes with the full
# race/ethnicity labels used in the morbidity datasets.
DS3 <- DS3 %>%
  mutate(
    race7 = recode(
      race7,
      "WhiteTE NH" = "White, Non-Hispanic",
      "Black NH" = "Black, Non-Hispanic",
      "AIAN NH" = "American Indian or Alaska Native, Non-Hispanic",
      "Asian NH" = "Asian, Non-Hispanic",
      "NHPI NH" = "Native Hawaiian or Pacific Islander, Non-Hispanic",
      "MR NH" = "Multiracial (two or more of above races), Non-Hispanic",
      "Hispanic" = "Hispanic (any race)"
    )
  )
# Rename the race column so it matches the morbidity datasets
DS3 <- DS3 %>%
  rename(race_eth = race7)
2) Recategorize ages
# Collapse the three child age bands into the single "0-17" band
DS3 <- DS3 %>%
  mutate(
    age_cat = recode(
      age_cat,
      `0-4` = "0-17",
      `5-11` = "0-17",
      `12-17` = "0-17"
    )
  )
# Drop the health_officer_region column, which is not used further
DS3_clean <- select(DS3, -health_officer_region)
str(DS3_clean)
'data.frame': 90132 obs. of 5 variables:
$ county : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ age_cat : chr "0-17" "0-17" "0-17" "0-17" ...
$ sex : chr "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
$ race_eth: chr "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
$ pop : int 2008 2128 2142 2057 1965 2017 1835 1840 1911 1935 ...
3) Aggregate by Race/Ethnicity
# Aggregate by race for each county: one row per county x race_eth with the
# summed population.
# `.groups = "drop"` returns an ungrouped tibble, consistent with
# Counties_race; otherwise the result stays grouped by county and later
# mutate()/summarize() calls would silently operate per county.
pop_aggregate <- DS3_clean %>%
  group_by(county, race_eth) %>%
  summarize(population_aggregate = sum(pop), .groups = "drop")
Data Element Statistics
| Variable Name | Statistics |
|---|---|
| county | 58 counties |
| race/ethnicity | 7 categories |
| total infections | Range = 0 to 429,165; Mean = 11,206 |
| total unrecovered | Range = 0 to 43,063; Mean = 1,384 |
| total severe | Range = 0 to 9,482; Mean = 313 |
| population | Range = 0 to 4,089,110; Mean = 96,328 |