Project Milestone 4

Author

Group 1

Group 1- Project Milestone #4

For Scenario 1: Infectious disease outbreak (simulated) in California

Data Dictionary

Data Dictionary
Variable Name	Data Type	Description
county	Character	California county of residence of novel infectious respiratory ID cases
race_eth	Character	Race-Ethnicity categorization as defined by California Department of Finance
total_infections	Integer	Cumulative number of diagnosed individuals from May 28th-Dec 30th, 2023, per race category
total_unrecovered	Integer	Cumulative number of individuals reported as unrecovered after a week of being diagnosed from May 28th-Dec 30th, 2023, per race category
total_severe	Integer	Cumulative number of identified individuals having severe disease requiring hospitalization from May 28th-Dec 30th, 2023, per race category
pop	Integer	Population of each California county, per race category

Import Datasets

Morbidity Datasets

1) Recode Column Names

2) Join Morbidity Datasets & Aggregate by Race

Population Dataset

1) Rename racial ethnic groups

2) Recategorize ages

3) Aggregate by Race/Ethnicity

Milestone 4 - Join all Datasets and Add Rate Columns

#str(Counties_race)
#str(pop_aggregate)

#make sure data matches for all datasets
library(stringr)

Counties_race <- Counties_race %>%
  mutate(county = str_to_title(str_trim(county)),
         race_eth = str_to_title(str_trim(race_eth)))

pop_aggregate <- pop_aggregate %>%
  mutate(county = str_to_title(str_trim(county)),
         race_eth = str_to_title(str_trim(race_eth)))


#join County aggregated by race with population dataset
Counties_final <- Counties_race %>%
  inner_join (pop_aggregate, by = c("county", "race_eth")) %>%
  arrange(county) 

#Add rate (% population columns)
Counties_final <- Counties_final %>%
  mutate(
    total_infections_pct = round((total_infections / total_population) * 100,2),
    total_unrecovered_pct = round((total_unrecovered / total_population) * 100,2),
    total_severe_pct = round((total_severe / total_population) * 100,2)
  )

Counties_final <- Counties_final %>%
  mutate(across(everything(), ~replace(., is.nan(.), 0)))
  #mutate(race_eth = case_when(
   # race_eth == "American Indian Or Alaska Native, Non-Hispanic" ~ "American Indian or Alaska Native",
    #TRUE ~ race_eth
#  ))

str (Counties_final)

tibble [406 × 9] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:406] "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ race_eth             : chr [1:406] "American Indian Or Alaska Native, Non-Hispanic" "Asian, Non-Hispanic" "Black, Non-Hispanic" "Hispanic (Any Race)" ...
 $ total_infections     : num [1:406] 444 39069 17161 33568 5719 ...
 $ total_unrecovered    : num [1:406] 56 4884 2209 3028 451 ...
 $ total_severe         : num [1:406] 12 1108 510 661 100 ...
 $ total_population     : num [1:406] 4569 568612 157817 335452 88937 ...
 $ total_infections_pct : num [1:406] 9.72 6.87 10.87 10.01 6.43 ...
 $ total_unrecovered_pct: num [1:406] 1.23 0.86 1.4 0.9 0.51 0.9 1.59 2.39 0 0 ...
 $ total_severe_pct     : num [1:406] 0.26 0.19 0.32 0.2 0.11 0.21 0.35 0.48 0 0 ...

Visualizations

Graph of Infections by Race

CA_agg_by_race_only <- Counties_final %>%
  group_by(race_eth) %>%
  summarize (total_infections = sum(total_infections),
             total_unrecovered = sum (total_unrecovered),
             total_severe = sum (total_severe),
             total_population = sum(total_population))

CA_race <- CA_agg_by_race_only %>% 
  mutate(
    total_infections_pct = round((total_infections / total_population) * 100,2),
    total_unrecovered_pct = round((total_unrecovered / total_population) * 100,2),
    total_severe_pct = round((total_severe / total_population) * 100,2)
  )

#Pivot data for side by side bar graph
data_long_pct <- CA_race %>%
  pivot_longer(cols = c(total_infections_pct, total_unrecovered_pct, total_severe_pct),
               names_to = "metric", values_to = "value")

#Bar Graph
Graph_2 <- ggplot(data_long_pct, aes(x = race_eth, y = value, fill = metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Novel Respiratory Disease Infections by Race/Ethnicity",
    x = "Race Category",
    y = "Percent of Total Population",
    fill = "Infection Category",  # Renaming the legend title
    caption = "Interpretation: Data represents percentages of infection categories calculated based on total population
    \n per race/ethnicity. The graph shows the Race Categories most affected are Alaska Native, White, 
    \n Black and Hispanic with Asian and Multiracial least affected. The percentage of severe and unaffected
    \n cases follow the same pattern." # Adding a footnote
  ) +
  scale_fill_manual(
    values = c("total_infections_pct" = "skyblue", 
               "total_unrecovered_pct" = "orange", 
               "total_severe_pct" = "red"),
    labels = c("Total Infections", "Total Unrecovered", "Total Severe")  # Renaming the legend categories
  ) +
  scale_x_discrete(labels = function(x) ifelse(nchar(x) > 19, paste(substr(x, 1, 19), "\n", substr(x, 20, nchar(x))), x)) +  
  # Add line break for long labels
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.caption = element_text(hjust = 0, size = 10)  # Left-align the footnote
  )

Graph_2

Graph of top ten most affected counties

Counties_county <- Counties_final %>%
  mutate(
    total_infections_pct = round((total_infections / total_population) * 100,2),
    total_unrecovered_pct = round((total_unrecovered / total_population) * 100,2),
    total_severe_pct = round((total_severe / total_population) * 100,2)
  )

# Summary data

counties_county_only <- Counties_county %>%
  group_by(county) %>%
  summarize(
    total_infections = sum(total_infections),
    total_unrecovered = sum(total_unrecovered),
    total_severe = sum(total_severe),
    total_population = sum(total_population)
  )

#Percentage of cases and of top 10 counties


CA_county_only <- counties_county_only %>% 
  mutate(
    total_infections_pct = round((total_infections / total_population) * 100, 2),
    total_unrecovered_pct = round((total_unrecovered / total_population) * 100, 2),
    total_severe_pct = round((total_severe / total_population) * 100, 2)
  ) %>% 
  group_by(county) %>% # Ensure that summarization is done by groups if `county` exists.
  summarize(
    Total_Unrecovered = sum(total_unrecovered, na.rm = TRUE),
    Total_Severe = sum(total_severe, na.rm = TRUE),
    Total_Infections = sum(total_infections, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Infections), desc(Total_Unrecovered), desc(Total_Severe))%>%
  slice_head(n=10)


# Pivot the data for plotting

top_10_counties_pct <- CA_county_only %>%
  pivot_longer(
    cols = c(Total_Infections, Total_Unrecovered, Total_Severe),
    names_to = "metric", values_to = "value"
  )

# Create the bar plot

ggplot(top_10_counties_pct, aes(x = reorder(county, -value), y = value, fill = metric)) +
  geom_bar(stat = "identity", position = "dodge") +  # Create side-by-side bars
  labs(
    title = "Top Ten Counties Affected by Novel Respiratory Disease",
    x = "County",                   # Corrected to match the axis
    y = "Percent of Total Population",
    fill = "Category",           # Legend title for infection categories
    caption = "Interpretation: 
    Graph shows top ten most affected counties with Los Angeles
    having the highest percentage of infected people, 
    along with severe and unrecovered cases."  # Added caption
  ) +
  scale_fill_manual(
    values = c(
      Total_Infections = "skyblue", 
      Total_Unrecovered = "orange", 
      Total_Severe = "red"
    ),
    labels = c("Total Infections", "Total Unrecovered", "Total Severe")  # Renaming the legend categories
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),  # Rotate x-axis labels
    axis.text.y = element_text(size = 10),  # Adjust y-axis labels
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    plot.caption = element_text(size = 10, hjust = 0)  # Style the caption
  )

Percent Infection and Infection Type by Race

#percentage stats

CA_race_table <- CA_race %>% mutate(percent_infected = (total_infections/total_population)*100, percent_severe_of_infected = (total_severe/total_infections) * 100, percent_unrecovered_of_infected = (total_unrecovered/total_infections)*100)

#cleaning dataframe

CA_race_table <- select(CA_race_table, -total_infections, -total_unrecovered, -total_severe, -total_population, -total_infections_pct, -total_unrecovered_pct, -total_severe_pct)

#percent infection table by race

race_kable <- kable(
  CA_race_table,
  col.names = c('Race/Ethnicity', 'Percent Infected', 'Percentage of Severe Cases among Infected', 'Percentage of Unrecovered Cases among Infected'),
  digits = 1,
  align = c('l', 'c', 'c', 'c'),
  caption = "Novel Respiratory Disease Infection Percentages by Race/Ethnicity"
) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  footnote(
    general_title = "<b>Interpretation:</b>",
    general = "The group with the highest percent of infection is American Indian or Alaska Native. However, the group with the highest percentage of severe cases and unrecovered cases is White. The Multiracial group has the lowest percentage of infection, severe infections, and unrecovered infections.",
    escape = FALSE
  ) 

race_kable

Novel Respiratory Disease Infection Percentages by Race/Ethnicity
Race/Ethnicity	Percent Infected	Percentage of Severe Cases among Infected	Percentage of Unrecovered Cases among Infected
American Indian Or Alaska Native, Non-Hispanic	14.0	3.0	13.3
Asian, Non-Hispanic	8.7	3.0	13.4
Black, Non-Hispanic	12.3	2.6	11.8
Hispanic (Any Race)	12.1	2.0	9.2
Multiracial (Two Or More Of Above Races), Non-Hispanic	7.2	1.9	8.6
Native Hawaiian Or Pacific Islander, Non-Hispanic	11.0	2.4	10.5
White, Non-Hispanic	12.8	3.6	15.6
Interpretation:
The group with the highest percent of infection is American Indian or Alaska Native. However, the group with the highest percentage of severe cases and unrecovered cases is White. The Multiracial group has the lowest percentage of infection, severe infections, and unrecovered infections.

Data Element Statistics

tibble [406 × 3] (S3: tbl_df/tbl/data.frame)
 $ county          : chr [1:406] "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ race_eth        : chr [1:406] "American Indian Or Alaska Native, Non-Hispanic" "Asian, Non-Hispanic" "Black, Non-Hispanic" "Hispanic (Any Race)" ...
 $ total_population: int [1:406] 4569 568612 157817 335452 88937 14710 485940 209 13 14 ...

Summary Statistics
Variable Name	Statistics
county	58 counties
race/ethnicity	7 categories
total infections	Range = 0 to 429,165; Mean = 11,206
total unrecovered	Range = 0 to 43,063; Mean = 1,384
total severe	Range = 0 to 9,482; Mean = 313
population	Range = 0 to 4,089,110; Mean = 96,328