Analysis

knitr::opts_chunk$set(echo = TRUE)
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Path to the pipe-delimited CADD demographics summary.
file_path <- "/oscar/data/methods2025/0_data/team09_data/CADD_demographics.txt"

# Column names are supplied explicitly, so every line of the file is read as
# data; whitespace around each field is trimmed.
CADD_demo <- read_delim(
  file = file_path,
  delim = "|",
  col_names = c("Category", "Count"),
  trim_ws = TRUE,
  show_col_types = FALSE
)
print(CADD_demo)
## # A tibble: 8 × 2
##   Category                  Count
##   <chr>                     <dbl>
## 1 Male                       2493
## 2 Female                     1118
## 3 White                      3085
## 4 Black or African American   264
## 5 Asian                       135
## 6 Unknown                     127
## 7 Non-Hispanic               3121
## 8 Hispanic                    490
# Tag each category with the demographic dimension it belongs to;
# anything unrecognised is labelled "Other".
CADD_demo <- CADD_demo %>%
  mutate(
    Group = case_when(
      Category %in% c("Male", "Female") ~ "Gender",
      Category %in% c(
        "White", "Black or African American", "Asian", "Unknown"
      ) ~ "Race",
      Category %in% c("Non-Hispanic", "Hispanic") ~ "Ethnicity",
      .default = "Other"
    )
  )

# Per-group totals and each category's percentage share of its group.
# NOTE(review): total_demo is not printed or used again in the visible code;
# the print() below shows CADD_demo, which has no Total/Proportion columns.
total_demo <- CADD_demo %>%
  mutate(
    Total = sum(Count),
    Proportion = (Count / Total) * 100,
    .by = Group
  )

print(CADD_demo)
## # A tibble: 8 × 3
##   Category                  Count Group    
##   <chr>                     <dbl> <chr>    
## 1 Male                       2493 Gender   
## 2 Female                     1118 Gender   
## 3 White                      3085 Race     
## 4 Black or African American   264 Race     
## 5 Asian                       135 Race     
## 6 Unknown                     127 Race     
## 7 Non-Hispanic               3121 Ethnicity
## 8 Hispanic                    490 Ethnicity
# Compare ratios for gender

# Look up the case count recorded for one demographic category in CADD_demo.
count_for <- function(category) {
  CADD_demo %>%
    filter(Category == category) %>%
    pull(Count)
}

female_count <- count_for("Female")
male_count <- count_for("Male")

# Number of male cases per female case.
gender_ratio <- male_count / female_count
print(gender_ratio)
## [1] 2.229875
## [1] 2.229875 : Males had 2.23 times the number of anxiety cases compared to females

# Chi-Square Test
## Gender
### Build a 2 x 1 table of case counts (rows: Male, Female)
gender_table <- matrix(
  c(male_count, female_count),
  nrow = 2,
  byrow = TRUE,
  dimnames = list(c("Male", "Female"), "Anxiety Cases")
)

# With a single-column matrix, chisq.test() runs a goodness-of-fit test
# against equal expected counts rather than a test of independence.
Chi_test_gender <- chisq.test(gender_table)
print(Chi_test_gender)
## 
##  Chi-squared test for given probabilities
## 
## data:  gender_table
## X-squared = 523.57, df = 1, p-value < 2.2e-16
# Chi-squared test for given probabilities
# data:  gender_table
# X-squared = 523.57, df = 1, p-value < 2.2e-16

# There is a statistically significant difference in the number of CADD cases between males and females.
# The p-value is less than 0.05, so we reject the null hypothesis (which, for this goodness-of-fit test,
# assumed CADD cases would be split equally between males and females).
# The imbalance in case counts by gender is therefore unlikely to be due to chance alone.
# Males account for significantly more documented CADD cases than females; because the table holds only
# case counts (no non-cases), this does not by itself show that males are more likely to have CADD.

## Race
### Single-column count matrix for race, with categories as row names
race_counts <- CADD_demo %>%
  filter(Group == "Race")

race_table <- race_counts %>%
  select(Category, Count) %>%
  column_to_rownames(var = "Category") %>%
  as.matrix()

# Printed at top level; a one-column matrix gives a goodness-of-fit test
# against equal expected counts across the race categories.
chisq.test(race_table)
## 
##  Chi-squared test for given probabilities
## 
## data:  race_table
## X-squared = 7046.7, df = 3, p-value < 2.2e-16
# Chi-squared test for given probabilities
# X-squared = 7046.7, df = 3, p-value < 2.2e-16


# Run the same test again, this time storing the result object so that its
# components can be inspected.
chi_race <- chisq.test(race_table)
# Standardized residuals of the test; they print as an unnamed vector.
chi_race$stdres
## [1]  83.86681 -24.54802 -29.50567 -29.81312
# Row names of race_table, printed to label the residuals above
# (assumes the residuals follow the table's row order - TODO confirm).
rownames(race_table)
## [1] "White"                     "Black or African American"
## [3] "Asian"                     "Unknown"
#[1]  83.86681 -24.54802 -29.50567 -29.81312 ("White"   "Black or African American"   "Asian"   "Unknown")

# There is a highly significant difference (p < 0.05) in the distribution of CADD cases
# across racial groups. The probability of this distribution occurring by chance is almost zero.
# The standardized residuals show that White individuals were significantly overrepresented while
# Unknown, Black or African American, and Asian individuals were underrepresented, relative to the
# equal split across the four categories that this test uses as its null hypothesis.

## Ethnicity
### Single-column count matrix for ethnicity, with categories as row names
eth_counts <- CADD_demo %>%
  filter(Group == "Ethnicity")

eth_table <- eth_counts %>%
  select(Category, Count) %>%
  column_to_rownames(var = "Category") %>%
  as.matrix()

# Printed at top level; a one-column matrix gives a goodness-of-fit test
# against equal expected counts for the two ethnicity categories.
chisq.test(eth_table)
## 
##  Chi-squared test for given probabilities
## 
## data:  eth_table
## X-squared = 1917, df = 1, p-value < 2.2e-16
# Run the same test again, this time storing the result object so that its
# components can be inspected.
chi_ethnicity <- chisq.test(eth_table)
# Standardized residuals of the test; they print as an unnamed vector.
chi_ethnicity$stdres
## [1]  43.78316 -43.78316
# Row names of eth_table, printed to label the residuals above
# (assumes the residuals follow the table's row order - TODO confirm).
rownames(eth_table)
## [1] "Non-Hispanic" "Hispanic"
#Chi-squared test for given probabilities
# X-squared = 1917, df = 1, p-value < 2.2e-16
# 43.78316 -43.78316 ("Non-Hispanic" "Hispanic")

# There is a highly statistically significant difference in CADD (anxiety) case counts between Hispanic and
# Non-Hispanic individuals. The difference is very unlikely to be due to chance.
# Based on the standardized residuals, Non-Hispanics are significantly overrepresented
# among CADD cases, relative to the equal (50/50) split that this test uses as its null hypothesis.

Bar Plots

# Bar plots of each demographic group's share of CADD cases.
# The three plots differ only in the group shown, so the shared data
# preparation and plotting code lives in two small helpers.

# Keep the rows of `demo` whose Group equals `group_name` and add
# `Proportion`: each category's percentage of that group's total Count.
#   demo       - data frame with Category, Count and Group columns
#   group_name - single string, e.g. "Gender"
# Returns the filtered data frame with the extra Proportion column.
group_proportions <- function(demo, group_name) {
  demo %>%
    filter(Group == group_name) %>%
    mutate(Proportion = (Count / sum(Count)) * 100)
}

# Bar chart of Proportion by Category, bars ordered from largest to smallest
# share and labelled with the percentage rounded to one decimal place.
#   plot_data  - output of group_proportions()
#   group_name - used for the x-axis label and the plot title
# Returns a ggplot object (auto-printed when called at top level).
plot_group_distribution <- function(plot_data, group_name) {
  ggplot(plot_data, aes(x = reorder(Category, -Proportion), y = Proportion)) +
    geom_col(fill = "red") +
    geom_text(aes(label = paste0(round(Proportion, 1), "%")), vjust = -0.5) +
    theme_minimal() +
    labs(
      title = paste("Distribution of CADD Cases by", group_name),
      y = "Proportion of Cases (%)",
      x = group_name
    ) +
    theme(legend.position = "none")
}

# Bar plot for gender
gender_plot <- group_proportions(CADD_demo, "Gender")
plot_group_distribution(gender_plot, "Gender")

# Bar plot for race
race_plot <- group_proportions(CADD_demo, "Race")
plot_group_distribution(race_plot, "Race")

# Bar plot for ethnicity
ethnicity_plot <- group_proportions(CADD_demo, "Ethnicity")
plot_group_distribution(ethnicity_plot, "Ethnicity")