Project Milestone 3

Group 1 - Project Milestone #3

For Scenario 1: Infectious disease outbreak (simulated) in California

Data Dictionary

Data Dictionary
Variable Name	Data Type	Description
county	Character	California county of residence of cases of the novel infectious respiratory disease
race_eth	Character	Race-Ethnicity categorization as defined by California Department of Finance
total_infections	Integer	Cumulative number of diagnosed individuals from May 28th-Dec 30th, 2023, per race category
total_unrecovered	Integer	Cumulative number of individuals reported as unrecovered one week after diagnosis, from May 28th-Dec 30th, 2023, per race category
total_severe	Integer	Cumulative number of identified individuals having severe disease requiring hospitalization from May 28th-Dec 30th, 2023, per race category
pop	Integer	Population of each California county, per race category

Import Datasets

# Column-name normalizer shared by all three imports: lower-case every name
# and replace literal spaces with underscores.
# NOTE(review): read.csv() defaults to check.names = TRUE, which may already
# rewrite names containing spaces before this function sees them -- confirm.
normalize_names <- function(nm) {
  tolower(gsub(" ", "_", nm, fixed = TRUE))
}

# Dataset 1 (sim_novelid_CA.csv), read from GitHub
CA_data <- rename_with(
  read.csv("https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/sim_novelid_CA.csv"),
  normalize_names
)

# Dataset 2 (sim_novelid_LACounty.csv)
LACounty_data <- rename_with(
  read.csv("https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/sim_novelid_LACounty.csv"),
  normalize_names
)

# Dataset 3 (population dataset, ca_pop_2023.csv)
DS3 <- rename_with(
  read.csv("https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/ca_pop_2023.csv"),
  normalize_names
)

Morbidity Datasets

1) Recode Column Names

# Structure checks kept for reference (disabled):
# str(CA_data)
# str(LACounty_data)

# Distinct race/ethnicity labels in the LA County data; the CA data's numeric
# codes are recoded to these same labels further down.
unique(LACounty_data[["race_eth"]])

[1] "White, Non-Hispanic"                                   
[2] "Black, Non-Hispanic"                                   
[3] "American Indian or Alaska Native, Non-Hispanic"        
[4] "Asian, Non-Hispanic"                                   
[5] "Native Hawaiian or Pacific Islander, Non-Hispanic"     
[6] "Multiracial (two or more of above races), Non-Hispanic"
[7] "Hispanic (any race)"

# Standardize the CA morbidity column names and derive a calendar date.
# time_int is split by position: characters 1-4 are taken as the MMWR year and
# characters 5-6 as the MMWR week; MMWRweek2Date() turns that pair into a Date.
CA_data_clean <- CA_data %>%
  rename(
    race_eth = race_ethnicity,
    infections_new = new_infections,
    infected_cum = cumulative_infected,
    unrecovered_new = new_unrecovered,
    unrecovered_cum = cumulative_unrecovered,
    severe_new = new_severe,
    severe_cum = cumulative_severe
  ) %>%
  mutate(
    year = as.numeric(substr(time_int, 1, 4)),
    mmwr_week = as.numeric(substr(time_int, 5, 6)),
    date = MMWRweek2Date(MMWRyear = year, MMWRweek = mmwr_week)
  ) %>%
  # Drop the source time fields and the helper columns now that `date` exists.
  select(-c(dt_diagnosis, year, mmwr_week, time_int))

# Strip the "County" suffix from county names so they match the bare county
# names used by the other datasets (e.g. "Alameda", "Los Angeles").
# The pattern also consumes the whitespace in front of "County", and trimws()
# removes any remaining leading/trailing blanks. The earlier pattern left
# values such as "Alameda " (trailing space), which do not equal "Alameda"
# when the tables are joined or grouped by county.
CA_data_clean <- CA_data_clean %>%
  mutate(county = trimws(gsub("\\s*\\bCounty\\b", "", county)))

# Text labels for the numeric race/ethnicity codes in the CA data, worded the
# same as the labels in the LA County data; code 9 is labelled "Unknown".
race_eth_labels <- c(
  `1` = "White, Non-Hispanic",
  `2` = "Black, Non-Hispanic",
  `3` = "American Indian or Alaska Native, Non-Hispanic",
  `4` = "Asian, Non-Hispanic",
  `5` = "Native Hawaiian or Pacific Islander, Non-Hispanic",
  `6` = "Multiracial (two or more of above races), Non-Hispanic",
  `7` = "Hispanic (any race)",
  `9` = "Unknown"
)

# Replace each code with its label (the lookup is spliced into recode()).
CA_data_clean <- mutate(
  CA_data_clean,
  race_eth = recode(race_eth, !!!race_eth_labels)
)

# head(CA_data_clean)
str(CA_data_clean)

'data.frame':   98952 obs. of  11 variables:
 $ county         : chr  "Alameda " "Alameda " "Alameda " "Alameda " ...
 $ age_cat        : chr  "0-17" "0-17" "0-17" "0-17" ...
 $ sex            : chr  "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_eth       : chr  "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ infections_new : int  6 1 2 10 19 25 23 18 22 35 ...
 $ infected_cum   : int  6 7 9 19 38 63 86 104 126 161 ...
 $ unrecovered_new: int  0 1 0 0 0 1 0 1 1 1 ...
 $ unrecovered_cum: int  0 1 1 1 1 2 2 3 4 5 ...
 $ severe_new     : int  0 0 1 0 0 0 0 0 0 0 ...
 $ severe_cum     : int  0 0 1 1 1 1 1 1 1 1 ...
 $ date           : Date, format: "2023-05-28" "2023-06-04" ...

#unique(CA_data_clean$race_eth)

# Clean the LA County dataset into the same column layout as CA_data_clean.
LA_data_clean <- LACounty_data %>%
  mutate(
    # Every row in this file belongs to one county.
    county = "Los Angeles",
    # dt_dx is parsed as day + abbreviated month name + 4-digit year.
    date = as.Date(dt_dx, format = "%d%b%Y")
  ) %>%
  # Keep the shared columns in a fixed order, renaming as they are selected.
  select(
    county,
    age_cat = age_category,
    sex,
    race_eth,
    infections_new = dx_new,
    infected_cum = infected_cumulative,
    unrecovered_new,
    unrecovered_cum = unrecovered_cumulative,
    severe_new,
    severe_cum = severe_cumulative,
    date
  )

# head(LA_data_clean)
str(LA_data_clean)

'data.frame':   1736 obs. of  11 variables:
 $ county         : chr  "Los Angeles" "Los Angeles" "Los Angeles" "Los Angeles" ...
 $ age_cat        : chr  "0-17" "0-17" "0-17" "0-17" ...
 $ sex            : chr  "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_eth       : chr  "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ infections_new : int  15 17 23 51 67 75 106 83 91 173 ...
 $ infected_cum   : int  15 32 55 106 173 248 354 437 528 701 ...
 $ unrecovered_new: int  0 0 0 0 1 1 4 3 1 3 ...
 $ unrecovered_cum: int  0 0 0 0 1 2 6 9 10 13 ...
 $ severe_new     : int  0 0 0 0 0 0 0 1 0 0 ...
 $ severe_cum     : int  0 0 0 0 0 0 0 1 1 1 ...
 $ date           : Date, format: "2023-05-29" "2023-06-05" ...

2) Join Morbidity Datasets & Aggregate by Race

# Combine the CA and LA County records into one table, ordered by county.
# No `by` is supplied, so full_join() matches on every column name the two
# tables have in common.
# NOTE(review): if the intent is simply to stack the rows, bind_rows() would
# say so directly -- confirm no row appears in both inputs before switching.
Counties_join <- CA_data_clean %>%
  full_join(LA_data_clean) %>%
  arrange(county)

# Sum the weekly "new" counts within each county and race/ethnicity group to
# get totals of infections, unrecovered cases, and severe cases.
Counties_race <- Counties_join %>%
  group_by(county, race_eth) %>%
  summarise(
    total_infections = sum(infections_new),
    total_unrecovered = sum(unrecovered_new),
    total_severe = sum(severe_new)
  ) %>%
  ungroup()

head(Counties_race)

# A tibble: 6 × 5
  county     race_eth            total_infections total_unrecovered total_severe
  <chr>      <chr>                          <int>             <int>        <int>
1 "Alameda " American Indian or…              444                56           12
2 "Alameda " Asian, Non-Hispanic            39069              4884         1108
3 "Alameda " Black, Non-Hispanic            17161              2209          510
4 "Alameda " Hispanic (any race)            33568              3028          661
5 "Alameda " Multiracial (two o…             5719               451          100
6 "Alameda " Native Hawaiian or…             1183               132           31

Population Dataset

1) Rename racial ethnic groups

# Recode the population file's race7 abbreviations to the full race/ethnicity
# labels used in the morbidity data, then rename the column to race_eth so the
# column name matches as well.
DS3 <- DS3 %>%
  mutate(
    race7 = recode(
      race7,
      `WhiteTE NH` = "White, Non-Hispanic",
      `Black NH` = "Black, Non-Hispanic",
      `AIAN NH` = "American Indian or Alaska Native, Non-Hispanic",
      `Asian NH` = "Asian, Non-Hispanic",
      `NHPI NH` = "Native Hawaiian or Pacific Islander, Non-Hispanic",
      `MR NH` = "Multiracial (two or more of above races), Non-Hispanic",
      `Hispanic` = "Hispanic (any race)"
    )
  ) %>%
  rename(race_eth = race7)

2) Recategorize ages

# Collapse the three child age bands into the single "0-17" band that the
# morbidity data uses; every other age category is left as it is.
DS3 <- DS3 %>%
  mutate(
    age_cat = if_else(age_cat %in% c("0-4", "5-11", "12-17"), "0-17", age_cat)
  )

# Drop the health_officer_region column, which is not carried forward.
DS3_clean <- select(DS3, -health_officer_region)

str(DS3_clean)

'data.frame':   90132 obs. of  5 variables:
 $ county  : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ age_cat : chr  "0-17" "0-17" "0-17" "0-17" ...
 $ sex     : chr  "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
 $ race_eth: chr  "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" "White, Non-Hispanic" ...
 $ pop     : int  2008 2128 2142 2057 1965 2017 1835 1840 1911 1935 ...

3) Aggregate by Race/Ethnicity

# Total population per county and race/ethnicity, summing `pop` over the age
# and sex rows within each group.
# ungroup() returns a plain, ungrouped table, consistent with Counties_race.
# Without it the result stays grouped by county (summarize() drops only the
# last grouping level), which silently changes how later verbs behave.
pop_aggregate <- DS3_clean %>%
  group_by(county, race_eth) %>%
  summarize(population_aggregate = sum(pop)) %>%
  ungroup()

Data Element Statistics

Summary Statistics
Variable Name	Statistics
county	58 counties
race/ethnicity	7 categories
total infections	Range = 0 to 429,165; Mean = 11,206
total unrecovered	Range = 0 to 43,063; Mean = 1,384
total severe	Range = 0 to 9,482; Mean = 313
population	Range = 0 to 4,089,110; Mean = 96,328