Milestone 4

Final Dataset

See code in Milestone 3 for joining all three datasets into one final dataset.

final_dataset <- read_csv("final_dataset.csv")

Here is the structure of our final dataset:

str(final_dataset)

## spc_tbl_ [812 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ county                 : chr [1:812] "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ race_ethnicity         : chr [1:812] "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "Asian, Non-Hispanic" "Asian, Non-Hispanic" ...
##  $ sex                    : chr [1:812] "FEMALE" "MALE" "FEMALE" "MALE" ...
##  $ total_new_infections   : num [1:812] 229 215 20375 18694 9070 ...
##  $ total_new_severe       : num [1:812] 4 8 589 519 283 227 339 322 55 45 ...
##  $ pop                    : num [1:812] 2387 2182 295285 273327 84798 ...
##  $ infection_rate_per_100k: num [1:812] 9594 9853 6900 6839 10696 ...
##  $ severity_rate          : num [1:812] 0.0175 0.0372 0.0289 0.0278 0.0312 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   county = col_character(),
##   ..   race_ethnicity = col_character(),
##   ..   sex = col_character(),
##   ..   total_new_infections = col_double(),
##   ..   total_new_severe = col_double(),
##   ..   pop = col_double(),
##   ..   infection_rate_per_100k = col_double(),
##   ..   severity_rate = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Add Health Officer Region

After further review of the data, we decided to include the “health_officer_region” column in our dataset to allow for easier aggregation and visualization.

ca_pop_2023 <- read_csv("https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/ca_pop_2023.csv")

health_officer_region_data <- data.frame(health_officer_region = ca_pop_2023$health_officer_region, county = ca_pop_2023$county) %>%
  distinct(county, .keep_all = TRUE)

final_dataset <- left_join(x = final_dataset, y = health_officer_region_data, by = "county")

Calculate New Variable

Add new severe infection rate column per 100,000 population.

Prior new variables created in milestone 3:

1. new infection rate column per 100,000 population

2. severe infection rate per total infections

Visualizations (at least 3 - one per group member)

Total Number of New Infections per Health Officer Region by Race/Ethnicity

county_plot <- final_dataset %>%
  group_by(health_officer_region) %>%
  summarize(new_cases = sum(total_new_infections)) %>%
  arrange(new_cases)

plot_ly(
  county_plot,
  x = ~health_officer_region,
  y = ~new_cases,
  type = ~"bar"
) %>%
  layout(title = "Total Number of New Cases per Health Officer Region",
         yaxis = list(title = "New Cases of Novel Infectious Disease"),
         xaxis = list(title = "Health Officer Region")
  )

Interpretation: This data shows that Southern California has the largest number of new cases of novel infectious disease. However, this could be due to the fact that Southern California has the largest population.

Case Rate for New Cases of Novel Infectious Disease by Health Officer Region

case_rate_region <- final_dataset %>%
  group_by(health_officer_region) %>%
  summarize(new_cases = sum(total_new_infections),
            total_pop = sum(pop),
            case_rate = round((new_cases/total_pop)*10000, 3)) %>%
  ungroup()

datatable(
  case_rate_region,
  options = list(
    order = list(3, 'desc'), 
    columnDefs = list(
      list(className = 'dt-center', targets = 1:2) 
    ), 
    dom = 'ti'
  ),
  rownames = FALSE,
  colnames = c("Health Officer Region", "Number of New Cases", "Total Population", "Case Rate per 10,000")
)

Interpretation: Central California has the highest case rate for new cases of the novel infectious disease - 1815.191 new cases per 10,000.

Total Number of New Severe Infections per Health Officer Region by Race/Ethnicity

county_plot_severe <- final_dataset %>%
  group_by(health_officer_region) %>%
  summarize(new_cases = sum(total_new_severe)) %>%
  arrange(new_cases)

plot_ly(
  county_plot_severe,
  x = ~health_officer_region,
  y = ~new_cases,
  type = ~"bar"
) %>%
  layout(title = "Total Number of New Severe Cases per Health Officer Region",
         yaxis = list(title = "New Cases of Severe Novel Infectious Disease"),
         xaxis = list(title = "Health Officer Region")
  )

Interpretation: This data shows that Southern California has the largest number of severe cases of novel infectious disease.

Case Rate for Severe Infectious Disease

severe_case_rate_region <- final_dataset %>%
  group_by(health_officer_region) %>%
  summarize(new_cases = sum(total_new_severe),
            total_pop = sum(pop),
            case_rate = round(new_cases/total_pop, 4)*10000) %>%
  ungroup()

datatable(
  severe_case_rate_region,
  options = list(
    order = list(3, 'desc'), 
    columnDefs = list(
      list(className = 'dt-center', targets = 1:3) 
    ), 
    dom = 'ti'
  ),
  rownames = FALSE,
  colnames = c("Health Officer Region", "Number of New Severe Cases", "Total Population", "Case Rate per 10,000")
)

Interpretation: Central California has the highest severe case rate per 10,000 population at 45, indicating a relatively higher burden of severe cases compared to other regions.

Average Infection Rate per 100k per Race/Ethnicity

agg_data <- final_dataset %>%
  group_by(race_ethnicity) %>%
  summarise(average_infection_rate = mean(infection_rate_per_100k, na.rm = TRUE))

agg_data$agg_data_wrapped <- c(
  "American Indian or\nAlaska Native,\nNon-Hispanic",
  "Asian,\nNon-Hispanic",
  "Black,\nNon-Hispanic",
  "Hispanic",
  "Multiracial,\nNon-Hispanic", 
  "Native Hawaiian \nor \nPacific Islander,\nNon-Hispanic", 
  "White,\nNon-Hispanic" 
)

average_infection_by_race <- ggplot(agg_data, aes(x = agg_data_wrapped, y = average_infection_rate, fill = average_infection_rate)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Infection Rates by Race/Ethnicity",
       x = "Race/Ethnicity", y = "Average Infection Rate per 100,000") +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        legend.position = "none") +
  scale_fill_gradient(low = "#A7C7E7", high = "#2D6A4F")

print(average_infection_by_race)

Cumulative Incidence of Severe Novel Infection by Race/Ethnicity

race_ethnicity_severe <- final_dataset %>%
  group_by(race_ethnicity) %>%
  summarize(new_severe_infections = sum(total_new_severe),
            race_ethnicity_population = sum(pop),
            proportion = (new_severe_infections/race_ethnicity_population)*100000) %>%
  ungroup()

race_ethnicity_severe <- arrange(race_ethnicity_severe, race_ethnicity_severe$proportion)

race_ethnicity_severe$race_ethnicity_wrapped <- c(
  "Multiracial,\nNon-Hispanic", 
  "Hispanic",
  "Asian,\nNon-Hispanic", 
  "Native Hawaiian \nor \nPacific Islander,\nNon-Hispanic", 
  "Black,\nNon-Hispanic",
  "American Indian or\nAlaska Native,\nNon-Hispanic",
  "White,\nNon-Hispanic" 
)

race_ethnicity_severe %>%
  ggplot(aes(x = race_ethnicity_wrapped, y = proportion, fill = proportion)) +
  geom_bar(stat = "identity")+
  theme(
    # axis.text.x = element_text(a, hjust = 1),
    legend.position = "none") +
  labs(title = "Novel Severe Infectious Disease Incidence Rate by Race/Ethnicity Group", x = "Race/Ethnicity Group", y = "Incidence Rate per 100,000", fill = "Race/Ethnicity Group") +
  scale_fill_gradient(low = "#A7C7E7", high = "#2D6A4F")

Interpretation: This graph shows that White, Non-Hispanic and American Indian or Alaska Natives, Non-Hispanic are the most affected by severe novel infectious disease. ## Infection by Race and Sex

Infection_by_race_sex <- final_dataset %>%
  group_by(race_ethnicity, sex) %>%
  summarize(mean_infection_rate = mean(infection_rate_per_100k, na.rm = TRUE))

## `summarise()` has grouped output by 'race_ethnicity'. You can override using
## the `.groups` argument.

Infection_by_race_sex$wrapped_race_ethnicity <- str_wrap(Infection_by_race_sex$race_ethnicity, width = 25)
  
infection_by_race_sex_plot <- Infection_by_race_sex %>%
  ggplot(aes(x = wrapped_race_ethnicity, y = mean_infection_rate, fill = sex)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  labs(title = "Mean Infection Rate by Race/Ethnicity and Sex",
       x = "Race/Ethnicity", y = "Mean Infection Rate per 100,000") +
  scale_fill_manual(values = c("lightblue", "lightpink")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.background = element_rect(fill = "grey90", color = NA),
        strip.text = element_text(face = "bold"))

print(infection_by_race_sex_plot)

Interpretation: This bar plot shows the distribution of mean infection rates across counties for each race/ethnicity, with varying infection rates between sexes across different groups. The graph shows Native Hawaiian or Pacific Islander females are disproportionately affected. However, the mean infection rates seem to be equal between males and females per race/ethnicity group.

Total Number of New Infections per Health Officer Region by Sex

county_sex_plot <- final_dataset %>%
  group_by(health_officer_region, sex) %>%
  summarize(new_cases = sum(total_new_infections)) %>%
  arrange(new_cases) %>%
  ungroup()

plot_ly(
  county_sex_plot,
  x= ~health_officer_region,
  y= ~new_cases,
  color= ~sex,
  type = "bar",
  colors = c("lightpink", "lightblue")
) %>%
  layout(barmode = "",
         title = "Total Number of Cases per Health Officer Region by Sex",
         yaxis = list(title = "New Cases of Novel Infectious Disease"),
         xaxis = list(title = "Health Officer Region")
  )

Interpretation: Overall, males and females seem to be equally affected by novel infectious disease, regardless of region.

Case Rate by Region and Sex

case_rate_sex <- final_dataset %>%
  group_by(health_officer_region,sex) %>%
  summarize(new_cases = sum(total_new_infections),
            total_pop = sum(pop),
            case_rate = round(new_cases/total_pop, 3)) %>%
  ungroup()

datatable(
  case_rate_sex, 
  options = list(
    pageLength = 6,
    lengthMenu = c(6,12),
    order = list(4, 'desc'),
    columnDefs = list(
    list(className = 'dt-center', targets=1:3)
  )),
  rownames = FALSE,
  colnames = c("Health Officer Region", "Sex", "Number of New Cases", "Total Population", "Case Rate")
)

Interpretation: The highest case rates for new cases of disease are in Central Calfornia, with a case rate of 0.186 for males and 0.177 for females.

Case Rate by Region and Race/Ethnicity

case_rate_race <- final_dataset %>%
  group_by(health_officer_region,race_ethnicity) %>%
  summarize(new_cases = sum(total_new_infections),
            total_pop = sum(pop),
            case_rate = round(new_cases/total_pop,3)) %>%
  ungroup()

datatable(
  case_rate_race, 
  options = list(
    pageLength = 7,
    lengthMenu = c(7,14,21),
    order = list(4, 'desc'),
    columnDefs = list(
     list(className = 'dt-center', targets=1:3)
   )),
  rownames = FALSE,
  colnames = c("Health Officer Region", "Race/Ethnicity", "Number of New Cases", "Total Population", "Case Rate"))

Interpretation: The highest case rates for new cases of disease are in Central California, disproprotionately affecting Non-Hispanic Whites, American Indian or Alaska Natives, and Blacks.