See code in Milestone 3 for joining all three datasets into one final dataset.
final_dataset <- read_csv("final_dataset.csv")
Here is the structure of our final dataset:
str(final_dataset)
## spc_tbl_ [812 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ county : chr [1:812] "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ race_ethnicity : chr [1:812] "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "Asian, Non-Hispanic" "Asian, Non-Hispanic" ...
## $ sex : chr [1:812] "FEMALE" "MALE" "FEMALE" "MALE" ...
## $ total_new_infections : num [1:812] 229 215 20375 18694 9070 ...
## $ total_new_severe : num [1:812] 4 8 589 519 283 227 339 322 55 45 ...
## $ pop : num [1:812] 2387 2182 295285 273327 84798 ...
## $ infection_rate_per_100k: num [1:812] 9594 9853 6900 6839 10696 ...
## $ severity_rate : num [1:812] 0.0175 0.0372 0.0289 0.0278 0.0312 ...
## - attr(*, "spec")=
## .. cols(
## .. county = col_character(),
## .. race_ethnicity = col_character(),
## .. sex = col_character(),
## .. total_new_infections = col_double(),
## .. total_new_severe = col_double(),
## .. pop = col_double(),
## .. infection_rate_per_100k = col_double(),
## .. severity_rate = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
After further review of the data, we decided to include the “health_officer_region” column in our dataset to allow for easier aggregation and visualization.
ca_pop_2023 <- read_csv("https://raw.githubusercontent.com/PHW290/phw251_projectdata/refs/heads/main/scenario_1/ca_pop_2023.csv")
health_officer_region_data <- data.frame(health_officer_region = ca_pop_2023$health_officer_region, county = ca_pop_2023$county) %>%
distinct(county, .keep_all = TRUE)
final_dataset <- left_join(x = final_dataset, y = health_officer_region_data, by = "county")
Add new severe infection rate column per 100,000 population.
Prior new variables created in milestone 3:
1. new infection rate column per 100,000 population
2. severe infection rate per total infections
county_plot <- final_dataset %>%
group_by(health_officer_region) %>%
summarize(new_cases = sum(total_new_infections)) %>%
arrange(new_cases)
plot_ly(
county_plot,
x = ~health_officer_region,
y = ~new_cases,
type = ~"bar"
) %>%
layout(title = "Total Number of New Cases per Health Officer Region",
yaxis = list(title = "New Cases of Novel Infectious Disease"),
xaxis = list(title = "Health Officer Region")
)
Interpretation: This data shows that Southern California has the largest number of new cases of novel infectious disease. However, this could be due to the fact that Southern California has the largest population.
case_rate_region <- final_dataset %>%
group_by(health_officer_region) %>%
summarize(new_cases = sum(total_new_infections),
total_pop = sum(pop),
case_rate = round((new_cases/total_pop)*10000, 3)) %>%
ungroup()
datatable(
case_rate_region,
options = list(
order = list(3, 'desc'),
columnDefs = list(
list(className = 'dt-center', targets = 1:2)
),
dom = 'ti'
),
rownames = FALSE,
colnames = c("Health Officer Region", "Number of New Cases", "Total Population", "Case Rate per 10,000")
)
Interpretation: Central California has the highest case rate for new cases of the novel infectious disease - 1815.191 new cases per 10,000.
county_plot_severe <- final_dataset %>%
group_by(health_officer_region) %>%
summarize(new_cases = sum(total_new_severe)) %>%
arrange(new_cases)
plot_ly(
county_plot_severe,
x = ~health_officer_region,
y = ~new_cases,
type = ~"bar"
) %>%
layout(title = "Total Number of New Severe Cases per Health Officer Region",
yaxis = list(title = "New Cases of Severe Novel Infectious Disease"),
xaxis = list(title = "Health Officer Region")
)
Interpretation: This data shows that Southern California has the largest number of severe cases of novel infectious disease.
severe_case_rate_region <- final_dataset %>%
group_by(health_officer_region) %>%
summarize(new_cases = sum(total_new_severe),
total_pop = sum(pop),
case_rate = round(new_cases/total_pop, 4)*10000) %>%
ungroup()
datatable(
severe_case_rate_region,
options = list(
order = list(3, 'desc'),
columnDefs = list(
list(className = 'dt-center', targets = 1:3)
),
dom = 'ti'
),
rownames = FALSE,
colnames = c("Health Officer Region", "Number of New Severe Cases", "Total Population", "Case Rate per 10,000")
)
Interpretation: Central California has the highest severe case rate per 10,000 population at 45, indicating a relatively higher burden of severe cases compared to other regions.
agg_data <- final_dataset %>%
group_by(race_ethnicity) %>%
summarise(average_infection_rate = mean(infection_rate_per_100k, na.rm = TRUE))
agg_data$agg_data_wrapped <- c(
"American Indian or\nAlaska Native,\nNon-Hispanic",
"Asian,\nNon-Hispanic",
"Black,\nNon-Hispanic",
"Hispanic",
"Multiracial,\nNon-Hispanic",
"Native Hawaiian \nor \nPacific Islander,\nNon-Hispanic",
"White,\nNon-Hispanic"
)
average_infection_by_race <- ggplot(agg_data, aes(x = agg_data_wrapped, y = average_infection_rate, fill = average_infection_rate)) +
geom_bar(stat = "identity") +
labs(title = "Average Infection Rates by Race/Ethnicity",
x = "Race/Ethnicity", y = "Average Infection Rate per 100,000") +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
legend.position = "none") +
scale_fill_gradient(low = "#A7C7E7", high = "#2D6A4F")
print(average_infection_by_race)
race_ethnicity_severe <- final_dataset %>%
group_by(race_ethnicity) %>%
summarize(new_severe_infections = sum(total_new_severe),
race_ethnicity_population = sum(pop),
proportion = (new_severe_infections/race_ethnicity_population)*100000) %>%
ungroup()
race_ethnicity_severe <- arrange(race_ethnicity_severe, race_ethnicity_severe$proportion)
race_ethnicity_severe$race_ethnicity_wrapped <- c(
"Multiracial,\nNon-Hispanic",
"Hispanic",
"Asian,\nNon-Hispanic",
"Native Hawaiian \nor \nPacific Islander,\nNon-Hispanic",
"Black,\nNon-Hispanic",
"American Indian or\nAlaska Native,\nNon-Hispanic",
"White,\nNon-Hispanic"
)
race_ethnicity_severe %>%
ggplot(aes(x = race_ethnicity_wrapped, y = proportion, fill = proportion)) +
geom_bar(stat = "identity")+
theme(
# axis.text.x = element_text(a, hjust = 1),
legend.position = "none") +
labs(title = "Novel Severe Infectious Disease Incidence Rate by Race/Ethnicity Group", x = "Race/Ethnicity Group", y = "Incidence Rate per 100,000", fill = "Race/Ethnicity Group") +
scale_fill_gradient(low = "#A7C7E7", high = "#2D6A4F")
Interpretation: This graph shows that White, Non-Hispanic and American Indian or Alaska Natives, Non-Hispanic are the most affected by severe novel infectious disease. ## Infection by Race and Sex
Infection_by_race_sex <- final_dataset %>%
group_by(race_ethnicity, sex) %>%
summarize(mean_infection_rate = mean(infection_rate_per_100k, na.rm = TRUE))
## `summarise()` has grouped output by 'race_ethnicity'. You can override using
## the `.groups` argument.
Infection_by_race_sex$wrapped_race_ethnicity <- str_wrap(Infection_by_race_sex$race_ethnicity, width = 25)
infection_by_race_sex_plot <- Infection_by_race_sex %>%
ggplot(aes(x = wrapped_race_ethnicity, y = mean_infection_rate, fill = sex)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
labs(title = "Mean Infection Rate by Race/Ethnicity and Sex",
x = "Race/Ethnicity", y = "Mean Infection Rate per 100,000") +
scale_fill_manual(values = c("lightblue", "lightpink")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.background = element_rect(fill = "grey90", color = NA),
strip.text = element_text(face = "bold"))
print(infection_by_race_sex_plot)
Interpretation: This bar plot shows the distribution of mean infection
rates across counties for each race/ethnicity, with varying infection
rates between sexes across different groups. The graph shows Native
Hawaiian or Pacific Islander females are disproportionately affected.
However, the mean infection rates seem to be equal between males and
females per race/ethnicity group.
county_sex_plot <- final_dataset %>%
group_by(health_officer_region, sex) %>%
summarize(new_cases = sum(total_new_infections)) %>%
arrange(new_cases) %>%
ungroup()
plot_ly(
county_sex_plot,
x= ~health_officer_region,
y= ~new_cases,
color= ~sex,
type = "bar",
colors = c("lightpink", "lightblue")
) %>%
layout(barmode = "",
title = "Total Number of Cases per Health Officer Region by Sex",
yaxis = list(title = "New Cases of Novel Infectious Disease"),
xaxis = list(title = "Health Officer Region")
)
Interpretation: Overall, males and females seem to be equally affected by novel infectious disease, regardless of region.
case_rate_sex <- final_dataset %>%
group_by(health_officer_region,sex) %>%
summarize(new_cases = sum(total_new_infections),
total_pop = sum(pop),
case_rate = round(new_cases/total_pop, 3)) %>%
ungroup()
datatable(
case_rate_sex,
options = list(
pageLength = 6,
lengthMenu = c(6,12),
order = list(4, 'desc'),
columnDefs = list(
list(className = 'dt-center', targets=1:3)
)),
rownames = FALSE,
colnames = c("Health Officer Region", "Sex", "Number of New Cases", "Total Population", "Case Rate")
)
Interpretation: The highest case rates for new cases of disease are in Central Calfornia, with a case rate of 0.186 for males and 0.177 for females.
case_rate_race <- final_dataset %>%
group_by(health_officer_region,race_ethnicity) %>%
summarize(new_cases = sum(total_new_infections),
total_pop = sum(pop),
case_rate = round(new_cases/total_pop,3)) %>%
ungroup()
datatable(
case_rate_race,
options = list(
pageLength = 7,
lengthMenu = c(7,14,21),
order = list(4, 'desc'),
columnDefs = list(
list(className = 'dt-center', targets=1:3)
)),
rownames = FALSE,
colnames = c("Health Officer Region", "Race/Ethnicity", "Number of New Cases", "Total Population", "Case Rate"))
Interpretation: The highest case rates for new cases of disease are in Central California, disproprotionately affecting Non-Hispanic Whites, American Indian or Alaska Natives, and Blacks.