Analysis
knitr::opts_chunk$set(echo = TRUE)
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Location of the pipe-delimited CADD demographics summary.
file_path <- "/oscar/data/methods2025/0_data/team09_data/CADD_demographics.txt"

# Column names are supplied explicitly, so the first line of the file is read
# as data. Whitespace around each field is trimmed and the column-type
# message is silenced.
CADD_demo <- file_path %>%
  read_delim(
    delim = "|",
    col_names = c("Category", "Count"),
    trim_ws = TRUE,
    show_col_types = FALSE
  )
print(CADD_demo)
## # A tibble: 8 × 2
## Category Count
## <chr> <dbl>
## 1 Male 2493
## 2 Female 1118
## 3 White 3085
## 4 Black or African American 264
## 5 Asian 135
## 6 Unknown 127
## 7 Non-Hispanic 3121
## 8 Hispanic 490
# Categorize by each demographic
# Every row is tagged with the demographic dimension its category belongs to;
# any category not listed below is labelled "Other".
CADD_demo <- mutate(
  CADD_demo,
  Group = case_when(
    Category %in% c("Male", "Female") ~ "Gender",
    Category %in% c(
      "White", "Black or African American", "Asian", "Unknown"
    ) ~ "Race",
    Category %in% c("Non-Hispanic", "Hispanic") ~ "Ethnicity",
    .default = "Other"
  )
)
# Calculate proportions
# Total is the summed Count within each demographic Group, and Proportion is
# each category's share of that total, in percent.
total_demo <- CADD_demo %>%
  mutate(
    Total = sum(Count),
    Proportion = (Count / Total) * 100,
    .by = Group
  )
# Print the table that carries the new Total and Proportion columns; printing
# CADD_demo here would show the input without the proportions just computed.
print(total_demo)
## # A tibble: 8 × 5
## Category Count Group Total Proportion
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Male 2493 Gender 3611 69.0
## 2 Female 1118 Gender 3611 31.0
## 3 White 3085 Race 3611 85.4
## 4 Black or African American 264 Race 3611 7.31
## 5 Asian 135 Race 3611 3.74
## 6 Unknown 127 Race 3611 3.52
## 7 Non-Hispanic 3121 Ethnicity 3611 86.4
## 8 Hispanic 490 Ethnicity 3611 13.6
# Compare ratios for gender
# Look up the case count recorded for each gender, then express the male
# count as a multiple of the female count.
female_count <- CADD_demo$Count[CADD_demo$Category %in% "Female"]
male_count <- CADD_demo$Count[CADD_demo$Category %in% "Male"]
gender_ratio <- male_count / female_count
print(gender_ratio)
## [1] 2.229875
## [1] 2.229875 : Males had 2.23 times the number of anxiety cases compared to females
# Chi-Square Test
## Gender
### Create table for gender
# Two-row, one-column matrix of case counts, labelled in a single call.
gender_table <- matrix(
  c(male_count, female_count),
  nrow = 2,
  dimnames = list(c("Male", "Female"), "Anxiety Cases")
)
# Keep the test result so it can be printed and inspected later.
Chi_test_gender <- chisq.test(gender_table)
print(Chi_test_gender)
##
## Chi-squared test for given probabilities
##
## data: gender_table
## X-squared = 523.57, df = 1, p-value < 2.2e-16
# Chi-squared test for given probabilities
# data: gender_table
# X-squared = 523.57, df = 1, p-value < 2.2e-16
# There is a statistically significant difference in the number of CADD cases between males and females.
# The p-value is less than 0.05, so we reject the null hypothesis of this goodness-of-fit test (that cases are split equally between males and females)
# The imbalance in case counts between genders is unlikely to be due to chance alone
# Males account for significantly more documented CADD cases than females; without population denominators this does not by itself show a higher per-person risk.
## Race
### Create table for race
# One-column count matrix with the race categories as row names.
race_table <- CADD_demo %>%
  filter(Group == "Race") %>%
  select(Category, Count) %>%
  column_to_rownames("Category") %>%
  as.matrix()
# Fit the test once and keep the result, so the printed summary and the
# residuals inspected below come from the same object.
chi_race <- chisq.test(race_table)
print(chi_race)
##
## Chi-squared test for given probabilities
##
## data: race_table
## X-squared = 7046.7, df = 3, p-value < 2.2e-16
# Chi-squared test for given probabilities
# X-squared = 7046.7, df = 3, p-value < 2.2e-16
# The standardized residuals print without names; the row names of
# race_table are printed right after so each value can be matched to a
# category by position.
chi_race$stdres
## [1] 83.86681 -24.54802 -29.50567 -29.81312
rownames(race_table)
## [1] "White" "Black or African American"
## [3] "Asian" "Unknown"
#[1] 83.86681 -24.54802 -29.50567 -29.81312 ("White" "Black or African American" "Asian" "Unknown")
# There is a strongly significant difference (p-value well below 0.05) in the distribution of CADD cases
# across racial groups. The probability that this distribution arose by chance is almost zero.
# The standardized residuals show that White individuals were significantly overrepresented, while
# Unknown, Black or African American, and Asian individuals were underrepresented, relative to the equal split across categories that this test assumes.
## Ethnicity
### Create table for ethnicity
# One-column count matrix with the ethnicity categories as row names.
eth_table <- CADD_demo %>%
  filter(Group == "Ethnicity") %>%
  select(Category, Count) %>%
  column_to_rownames("Category") %>%
  as.matrix()
# Fit the test once and keep the result, so the printed summary and the
# residuals inspected below come from the same object.
chi_ethnicity <- chisq.test(eth_table)
print(chi_ethnicity)
##
## Chi-squared test for given probabilities
##
## data: eth_table
## X-squared = 1917, df = 1, p-value < 2.2e-16
# The standardized residuals print without names; the row names of
# eth_table are printed right after so each value can be matched to a
# category by position.
chi_ethnicity$stdres
## [1] 43.78316 -43.78316
rownames(eth_table)
## [1] "Non-Hispanic" "Hispanic"
#Chi-squared test for given probabilities
# X-squared = 1917, df = 1, p-value < 2.2e-16
# 43.78316 -43.78316 ("Non-Hispanic" "Hispanic")
# There is a highly statistically significant difference in the number of anxiety cases between Hispanic
# and Non-Hispanic individuals. The difference is very unlikely to be due to chance.
# Based on the standardized residuals, Non-Hispanics are significantly overrepresented
# among CADD cases relative to the equal split this test assumes.
Bar Plots
# Keep the rows of one demographic group and add each category's share of
# that group's cases, in percent.
#
# data:       table with Category, Count and Group columns.
# group_name: value of Group to keep, e.g. "Gender".
# Returns the filtered table with an added Proportion column.
group_proportions <- function(data, group_name) {
  data %>%
    # .env makes sure group_name refers to the function argument, never to a
    # column that happens to share the name.
    filter(Group == .env$group_name) %>%
    mutate(Proportion = (Count / sum(Count)) * 100)
}

# Bar chart of Proportion by Category, bars ordered from largest to smallest
# and each labelled with its percentage rounded to one decimal place.
#
# plot_data:  table with Category and Proportion columns.
# group_name: used for the x-axis label and the end of the title.
# Returns the ggplot object; calling it at top level draws the plot.
plot_group_proportions <- function(plot_data, group_name) {
  # fill is a fixed colour rather than a mapped aesthetic, so no legend is
  # produced and none needs to be hidden.
  ggplot(plot_data, aes(x = reorder(Category, -Proportion), y = Proportion)) +
    geom_col(fill = "red") +
    geom_text(aes(label = paste0(round(Proportion, 1), "%")), vjust = -0.5) +
    theme_minimal() +
    labs(
      title = paste("Distribution of CADD Cases by", group_name),
      y = "Proportion of Cases (%)",
      x = group_name
    )
}

# Bar plot for gender
gender_plot <- group_proportions(CADD_demo, "Gender")
plot_group_proportions(gender_plot, "Gender")

# Bar Plot for race
race_plot <- group_proportions(CADD_demo, "Race")
plot_group_proportions(race_plot, "Race")

# Bar plot for ethnicity
ethnicity_plot <- group_proportions(CADD_demo, "Ethnicity")
plot_group_proportions(ethnicity_plot, "Ethnicity")