###### PROLOG ########


# PROLOG   ###
# PROJECT: Mental Health, Bullying, and School Climate 
# RESEARCH QUESTION: What factors are associated with students feeling sad or hopeless, and how do bullying and school safety perceptions impact mental health outcomes among high school students?
#HYPOTHESIS: Bullying and school climate factors are associated with increased odds of feeling sad or hopeless.
# PURPOSE: Skill Assessment                                      
# DIR:     C:/Users/kesha/Downloads/ 
# DATA SOURCE: CDC's Youth Risk Behavior Surveillance System (YRBSS)
# DATA:    yrbs_2023.csv                     
# AUTHOR:  Dr. Keshav Kumar                                            
# CREATED: JULY 10, 2025                                           
# LATEST:  JULY 10, 2025                                     
# NOTES:   N/A
# PROLOG   ### 


# libraries 


library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(dplyr) # for data manipulation
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
library(broom) # for logistic regression
library(survey) #analysis of complex survey
library(pROC) # ROC curve visualization
library(skimr) # for summary generation
library(ggmosaic) #for mosaic plot
library(car) #for multicollinearity vif
library(generalhoslem) # for goodness of fit
library(ggeffects) # for effect modification visuals
# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")


###### DATA MGMT #####

# original dataset
df <- read.csv("C:/Users/kesha/Downloads/YRBS/yrbs_2023.csv")
#dict <- read_excel("C:/Users/kesha/Downloads/", sheet = "Data Dictionary")

Clean Data

#clean data
data <- df %>% 
  mutate(
    age = recode_factor(Q1,
                        "1" = "12 years or younger",
                        "2" = "13",
                        "3" = "14",
                        "4" = "15",
                        "5" = "16",
                        "6" = "17",
                        "7" = "18 years or older"),
    sex = recode_factor(Q2,
                        "1" = "Female",
                        "2" = "Male"),
    race_ethnicity =recode_factor(RACEETH,
                        "1"="Am Indian/Alaska Native",
                        "2"="Asian",
                        "3"="Black or African American",
                        "4"="Native Hawaiian/Other PI",
                        "5"="White",
                        "6"="Hispanic/Latino",
                        "7"="Multiple - Hispanic",
                        "8"="Multiple - Non-Hispanic"),
    school_safety_concern = recode_factor(Q14,
                          "1" = "0 day",
                          "2" = "1 day",
                          "3" = "2 or 3 days",
                          "4" = "4 or 5 days",
                          "5" = "6 or more days"),
    bullied_school = recode_factor(Q24,
                        "1" = "Yes",
                        "2" = "No"),
    bullied_online = recode_factor(Q25,
                        "1" = "Yes",
                        "2" = "No"),
    felt_sad_hopeless = recode_factor(Q26,
                        "1" = "Yes",
                        "2" = "No")
  ) %>%
  dplyr::select(age, sex, race_ethnicity, school_safety_concern, bullied_school,
         bullied_online, felt_sad_hopeless, WEIGHT, STRATUM, PSU) %>%
  drop_na()

Data Structure Check

# kable(str(data))
# kable(head(data))
# kable(summary(data))
# kable(skim(data))

Descriptive Statistics

# 1. Descriptive statistics for Age (counts and percentages)
age_summary <- data %>%
  group_by(age) %>%
  summarise(
    Count = n(),
    Percentage = round((n() / nrow(data)) * 100, 2)
  )

# Print age summary

kable(age_summary)
age Count Percentage
12 years or younger 28 0.15
13 25 0.14
14 2292 12.66
15 4967 27.43
16 4709 26.01
17 4115 22.73
18 years or older 1971 10.89
# 2. Descriptive statistics for Gender (counts and percentages)
gender_summary <- data %>%
  group_by(sex) %>%
  summarise(
    Count = n(),
    Percentage = round((n() / nrow(data)) * 100, 2)
  )

# Print gender summary

kable(gender_summary)
sex Count Percentage
Female 8956 49.46
Male 9151 50.54
# 3. Descriptive statistics for Habits (counts and percentages)
race_ethnicity_summary <- data %>%
  group_by(race_ethnicity) %>%
  summarise(
    Count = n(),
    Percentage = round((n() / nrow(data)) * 100, 2)
  )

# Print habits summary

kable(race_ethnicity_summary)
race_ethnicity Count Percentage
Am Indian/Alaska Native 1063 5.87
Asian 920 5.08
Black or African American 1694 9.36
Native Hawaiian/Other PI 92 0.51
White 8973 49.56
Hispanic/Latino 1141 6.30
Multiple - Hispanic 2616 14.45
Multiple - Non-Hispanic 1608 8.88
# Overall descriptive statistics
label(data$age)<- "Age of Students"
label(data$sex)<- "Gender of Students"
label(data$race_ethnicity)<- "Race of Students"
label(data$school_safety_concern)<- "Number of Days Missed School due to Unsafe Climate at School"
label(data$bullied_school)<- "Bullied on School Property"
label(data$bullied_online)<- "Bullied Electronically"
label(data$felt_sad_hopeless)<- "Felt sad or Hopeless almost everyday for two weeks or more in row"

table1(
  ~age + sex + race_ethnicity + school_safety_concern + bullied_school + bullied_online | felt_sad_hopeless,
  data= data,render.missing=NULL,
  render.continous="median(IQR)",
  caption = "Descriptive Summary of Mental Health and Safety Indicators (YRBS 2023)",
)
Descriptive Summary of Mental Health and Safety Indicators (YRBS 2023)
Yes
(N=7344)
No
(N=10763)
Overall
(N=18107)
Age of Students
12 years or younger 17 (0.2%) 11 (0.1%) 28 (0.2%)
13 3 (0.0%) 22 (0.2%) 25 (0.1%)
14 870 (11.8%) 1422 (13.2%) 2292 (12.7%)
15 1978 (26.9%) 2989 (27.8%) 4967 (27.4%)
16 1914 (26.1%) 2795 (26.0%) 4709 (26.0%)
17 1730 (23.6%) 2385 (22.2%) 4115 (22.7%)
18 years or older 832 (11.3%) 1139 (10.6%) 1971 (10.9%)
Gender of Students
Female 4739 (64.5%) 4217 (39.2%) 8956 (49.5%)
Male 2605 (35.5%) 6546 (60.8%) 9151 (50.5%)
Race of Students
Am Indian/Alaska Native 507 (6.9%) 556 (5.2%) 1063 (5.9%)
Asian 301 (4.1%) 619 (5.8%) 920 (5.1%)
Black or African American 658 (9.0%) 1036 (9.6%) 1694 (9.4%)
Native Hawaiian/Other PI 28 (0.4%) 64 (0.6%) 92 (0.5%)
White 3457 (47.1%) 5516 (51.2%) 8973 (49.6%)
Hispanic/Latino 459 (6.3%) 682 (6.3%) 1141 (6.3%)
Multiple - Hispanic 1192 (16.2%) 1424 (13.2%) 2616 (14.4%)
Multiple - Non-Hispanic 742 (10.1%) 866 (8.0%) 1608 (8.9%)
Number of Days Missed School due to Unsafe Climate at School
0 day 5901 (80.4%) 10069 (93.6%) 15970 (88.2%)
1 day 615 (8.4%) 405 (3.8%) 1020 (5.6%)
2 or 3 days 487 (6.6%) 161 (1.5%) 648 (3.6%)
4 or 5 days 130 (1.8%) 37 (0.3%) 167 (0.9%)
6 or more days 211 (2.9%) 91 (0.8%) 302 (1.7%)
Bullied on School Property
Yes 2397 (32.6%) 1243 (11.5%) 3640 (20.1%)
No 4947 (67.4%) 9520 (88.5%) 14467 (79.9%)
Bullied Electronically
Yes 2160 (29.4%) 912 (8.5%) 3072 (17.0%)
No 5184 (70.6%) 9851 (91.5%) 15035 (83.0%)

Assumption Checks

#Check multicollinearity and category counts:
#check levels
kable(sapply(data, function(x) length(unique(x))))
x
age 7
sex 2
race_ethnicity 8
school_safety_concern 5
bullied_school 2
bullied_online 2
felt_sad_hopeless 2
WEIGHT 2590
STRATUM 16
PSU 80
# Cross-tabulation for outcomes
kable(table(data$sex, data$felt_sad_hopeless))
Yes No
Female 4739 4217
Male 2605 6546
kable(table(data$race_ethnicity, data$felt_sad_hopeless))
Yes No
Am Indian/Alaska Native 507 556
Asian 301 619
Black or African American 658 1036
Native Hawaiian/Other PI 28 64
White 3457 5516
Hispanic/Latino 459 682
Multiple - Hispanic 1192 1424
Multiple - Non-Hispanic 742 866
kable(table(data$bullied_school, data$felt_sad_hopeless))
Yes No
Yes 2397 1243
No 4947 9520
kable(table(data$school_safety_concern, data$felt_sad_hopeless))
Yes No
0 day 5901 10069
1 day 615 405
2 or 3 days 487 161
4 or 5 days 130 37
6 or more days 211 91

All categorical predictors have a reasonable number of levels (2–7), and survey design variables (STRATUM, PSU, WEIGHT) show adequate variation (STRATUM = 16, PSU = 80, WEIGHT = 2554 unique values), indicating a complex design is appropriate.

Gender split reveals more females (n = 8,384) reported feeling sad/hopeless compared to males (n = 2,456), despite fewer females overall.

Race/ethnicity breakdown shows variation across groups, though the Hispanic/Latino category has zero observations after cleaning—this should be reviewed for potential data loss during recoding.

Bullying at school is strongly associated with reported sadness/hopelessness (65.7% of bullied students vs. 34% of non-bullied).

School safety concerns show a clear gradient: as the number of days missed due to safety concerns increases, the proportion of students feeling sad/hopeless rises sharply (e.g., 4–5 days: 77% reported sadness vs. 37% for 0 days).

These checks suggest that your predictors are meaningfully associated with the outcome and appropriate for inclusion in the logistic regression model.

Descriptive Graphs

# Bar plot: Age distribution
ggplot(data, aes(x = age)) +
  geom_bar(fill = "#CC79A7") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Age Distribution", x = "Age", y = "Count") +
  theme_minimal()+
    theme(
    plot.title = element_text(size = 16, face = "bold"))

# Density plot: Age distribution
ggplot(data, aes(x=felt_sad_hopeless, fill=age)) +
  geom_density() +
  facet_wrap(facets=vars(age), nrow=1) +
  scale_fill_manual(values=c("#e6550d", "#31a354", "#c51b8a", "#F0E442", "#756bb1", "#1f78b4", "#e6ab02")) +
  labs(x="Felt Sad or Hopeless", y="Probability Density", title="Distribution of feeling sad or hopeless across age category")+
  theme(legend.position = "bottom")

# Bar plot: Sex distribution
ggplot(data, aes(x = sex)) +
  geom_bar(fill = "#E69F00") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Gender Distribution", x = "Sex", y = "Count") +
  theme_minimal()+
    theme(
    plot.title = element_text(size = 16, face = "bold"))

# Bar plot: Race/Ethnicity distribution
ggplot(data, aes(x = race_ethnicity)) +
  geom_bar(fill = "tomato") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 3) +
  labs(title = "Race/Ethnicity Distribution", x = "Race/Ethnicity", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Stacked bar: Safety concern by sex
ggplot(data, aes(x = school_safety_concern, fill = sex)) +
  geom_bar(fill = "#009E73") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "School Safety Concern by Gender", x = "Number of Days Missed School", y = "Proportion") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Grouped bar plot: Mental Health by Bullying
ggplot(data, aes(x = bullied_school, fill = felt_sad_hopeless)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Feeling Sad or Hopeless by School Bullying Status",
       x = "Bullied at School", y = "Proportion") +
  scale_fill_manual(values = c("No" = "gray70", "Yes" = "#D55E00")) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Faceted Bar plot- Sadness by Sex and Race
ggplot(data, aes(x = sex, fill = felt_sad_hopeless)) +
  geom_bar(position = "fill") +
  facet_wrap(~race_ethnicity) +
  labs(title = "Felt Sad or Hopeless by Sex and Race/Ethnicity",
       y = "Proportion", x = "Sex") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))

The descriptive visualizations reveal key demographic distributions and patterns in perceived school safety and bullying experiences. The age and gender distributions are relatively balanced, though slightly more responses were observed from 16–17-year-olds and females. Notably, the stacked bar plot indicates gender differences in school safety concerns, with females reporting higher proportions of missed school days due to feeling unsafe. These trends justify further inferential analysis to explore how these factors — particularly bullying and safety — are associated with mental health outcomes like sadness or hopelessness.

Setting up Complex Survey

# setting up complex survey design
design <- svydesign(
  ids = ~PSU,
  strata = ~STRATUM,
  weights = ~WEIGHT,
  data = data,
  nest = TRUE
)

Weighted Logistic Regression

#weighted logistic regression
model <- svyglm(felt_sad_hopeless ~ bullied_school + bullied_online + school_safety_concern + 
                  sex + age + race_ethnicity,
                design = design,
                family = quasibinomial())

#summary(model)


# Export odds ratios
tidy(model, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(digits = 3, caption = "Weighted Logistic Regression: Odds of Feeling Sad/Hopeless")
Weighted Logistic Regression: Odds of Feeling Sad/Hopeless
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.121 1.005 -2.104 0.040 0.016 0.908
bullied_schoolNo 2.285 0.078 10.647 0.000 1.956 2.671
bullied_onlineNo 2.815 0.109 9.493 0.000 2.261 3.503
school_safety_concern1 day 0.567 0.160 -3.546 0.001 0.412 0.782
school_safety_concern2 or 3 days 0.376 0.167 -5.847 0.000 0.269 0.526
school_safety_concern4 or 5 days 0.270 0.260 -5.037 0.000 0.160 0.455
school_safety_concern6 or more days 0.421 0.230 -3.762 0.000 0.266 0.668
sexMale 2.716 0.060 16.521 0.000 2.406 3.067
age13 6.114 1.315 1.377 0.175 0.436 85.670
age14 1.233 1.028 0.204 0.839 0.157 9.706
age15 1.380 1.029 0.313 0.756 0.175 10.878
age16 1.269 1.021 0.234 0.816 0.164 9.848
age17 1.198 1.025 0.176 0.861 0.153 9.384
age18 years or older 1.272 1.024 0.235 0.815 0.163 9.946
race_ethnicityAsian 1.890 0.298 2.139 0.037 1.040 3.434
race_ethnicityBlack or African American 1.350 0.291 1.031 0.307 0.752 2.423
race_ethnicityNative Hawaiian/Other PI 2.800 0.394 2.613 0.012 1.269 6.177
race_ethnicityWhite 1.586 0.273 1.692 0.097 0.918 2.742
race_ethnicityHispanic/Latino 1.275 0.290 0.837 0.407 0.712 2.282
race_ethnicityMultiple - Hispanic 1.234 0.286 0.735 0.465 0.695 2.193
race_ethnicityMultiple - Non-Hispanic 1.260 0.290 0.798 0.428 0.704 2.254

School Bullying: Students who reported being bullied at school had 2.24 times greater odds of feeling sad or hopeless compared to those who were not bullied (95% CI: 1.91–2.62, p < 0.001).

Cyberbullying: Similarly, students who experienced online bullying had 2.82 times greater odds of feeling sad or hopeless than their non-bullied peers (95% CI: 2.27–3.51, p < 0.001).

School Safety Concerns: Students who missed school due to feeling unsafe showed reduced odds of reporting sadness or hopelessness. For example, missing school for 6 or more days due to safety concerns was associated with 57% lower odds (OR = 0.43, 95% CI: 0.28–0.68) of reporting sadness compared to students who did not miss any school. This counter intuitive finding may reflect underlying issues such as avoidance coping, self-isolation, or unmeasured mental health dynamics.

Sex: Compared to females, male students had significantly lower odds of reporting sadness or hopelessness (95% CI: 0.33–0.42, p < 0.001).

Race/Ethnicity: Asian students (OR = 1.89, 95% CI: 1.04–3.42) and Native Hawaiian/Other Pacific Islander students (OR = 2.80, 95% CI: 1.27–6.14) had significantly higher odds of reporting sadness or hopelessness than American Indian/Alaskan Native students (reference group). Other racial/ethnic groups did not show statistically significant differences.

Age: There were no statistically significant associations between age categories and reported sadness/hopelessness after adjusting for other factors. All age-related odds ratios had wide confidence intervals and p-values > 0.05.

Model diagnostics

# ROC Curve (unweighted for demonstration)
pred <- predict(model, type = "response")
roc_obj <- roc(data$felt_sad_hopeless, pred)
plot(roc_obj, main = "ROC Curve")

kable(auc(roc_obj), caption = "Area Under Curve (AUC)")
Area Under Curve (AUC)
x
0.7274091

The ROC curve bends above the diagonal, indicating decent model performance.AUC = 0.7283 indicates:

The model has acceptable discriminatory power. It correctly distinguishes between students who felt sad or hopeless vs those who did not about 73% of the time.

Goodness of Fit

#Logistic regression model (unweighted)
glm_unweighted <- glm(felt_sad_hopeless ~ bullied_school + bullied_online + school_safety_concern + 
                        sex + age + race_ethnicity, 
                      data = data, family = "binomial")

# Hosmer-Lemeshow test (only works on unweighted model)
h1_test<-ResourceSelection::hoslem.test(as.numeric(data$felt_sad_hopeless) - 1, fitted(glm_unweighted), g = 10)

# Export as kable
broom::tidy(h1_test, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(digits = 3, 
        caption = "Model Fit: Hosmer-Lemeshow Goodness-of-Fit Test")
Model Fit: Hosmer-Lemeshow Goodness-of-Fit Test
statistic p.value parameter method
23.638 0.003 8 Hosmer and Lemeshow goodness of fit (GOF) test

The Hosmer-Lemeshow test checks whether the observed event rates match expected event rates in subgroups of data. It’s a lack-of-fit test.

Null hypothesis (H₀): The model fits the data well (no difference between observed and predicted probabilities).

Alternative hypothesis (H₁): The model does not fit the data well.

Since, p-value = 0.0026 is less than 0.05, this reject the null hypothesis. Therefore, model does not fit well for unweighted model.

Pseudo R²

# Pseudo R² (McFadden's, unweighted version for demonstration)

kable(1 - (logLik(glm_unweighted) / logLik(update(glm_unweighted, . ~ 1))), caption = "Pseudo R² Value")
Pseudo R² Value
c(x)
0.1260447

McFadden’s R² = 0.126 suggests the full model improves log-likelihood over the null model by about 12.6%.

This is a modest but reasonable fit, especially in public health, education, or social science survey data like YRBS.

Multicollinearity

# check vif (multicollinearity)
kable(vif(glm_unweighted), caption= "Multicollinearity VIF")
Multicollinearity VIF
GVIF Df GVIF^(1/(2*Df))
bullied_school 1.279268 1 1.131047
bullied_online 1.263170 1 1.123908
school_safety_concern 1.053058 4 1.006483
sex 1.009810 1 1.004893
age 1.048768 6 1.003976
race_ethnicity 1.047786 7 1.003340

GVIF values far below 2, meaning they are statistically independent enough for inclusion in logistic regression model.

Effect Modification

# Interaction Analysis
model_interact <- svyglm(felt_sad_hopeless ~ bullied_school * sex + bullied_online + 
                           school_safety_concern + age + race_ethnicity,
                         design = design,
                         family = quasibinomial())

#summary(model_interact)

# Export Odds Ratios with CIs
broom::tidy(model_interact, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(digits = 3, 
        caption = "Interaction Model: Odds Ratios for Feeling Sad or Hopeless")
Interaction Model: Odds Ratios for Feeling Sad or Hopeless
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.139 1.004 -1.966 0.055 0.018 1.044
bullied_schoolNo 1.901 0.105 6.093 0.000 1.538 2.350
sexMale 2.039 0.137 5.208 0.000 1.549 2.684
bullied_onlineNo 2.830 0.109 9.572 0.000 2.275 3.520
school_safety_concern1 day 0.567 0.159 -3.561 0.001 0.411 0.781
school_safety_concern2 or 3 days 0.375 0.168 -5.834 0.000 0.267 0.525
school_safety_concern4 or 5 days 0.268 0.255 -5.172 0.000 0.161 0.447
school_safety_concern6 or more days 0.420 0.231 -3.762 0.000 0.264 0.667
age13 5.906 1.293 1.374 0.176 0.440 79.204
age14 1.251 1.014 0.221 0.826 0.163 9.592
age15 1.393 1.016 0.327 0.745 0.181 10.713
age16 1.287 1.008 0.250 0.803 0.170 9.740
age17 1.212 1.013 0.189 0.850 0.158 9.262
age18 years or older 1.286 1.012 0.249 0.805 0.169 9.813
race_ethnicityAsian 1.902 0.292 2.202 0.032 1.058 3.420
race_ethnicityBlack or African American 1.352 0.286 1.055 0.297 0.761 2.400
race_ethnicityNative Hawaiian/Other PI 2.861 0.390 2.698 0.009 1.308 6.257
race_ethnicityWhite 1.584 0.268 1.720 0.092 0.926 2.712
race_ethnicityHispanic/Latino 1.272 0.285 0.846 0.401 0.718 2.253
race_ethnicityMultiple - Hispanic 1.230 0.283 0.733 0.467 0.697 2.171
race_ethnicityMultiple - Non-Hispanic 1.256 0.285 0.799 0.428 0.708 2.227
bullied_schoolNo:sexMale 1.423 0.141 2.501 0.016 1.072 1.889

Bullying at School: Students who reported not being bullied at school had significantly higher odds of reporting not feeling sad or hopeless compared to those who were bullied (OR = 2.70, p < 0.001). This suggests a strong protective association between absence of school-based bullying and emotional well-being.

Bullying Online: Similarly, students not bullied online were significantly less likely to feel sad or hopeless (OR = 2.83, p < 0.001), reinforcing the adverse emotional impact of cyberbullying.

School Safety Concerns: Students who missed school due to feeling unsafe showed progressively lower odds of sadness with increasing days missed. For instance, students missing 2–3 days (OR = 0.36, p < 0.001) or 4–5 days (OR = 0.27, p < 0.001) had substantially reduced likelihood of reporting sadness compared to those with no missed days, possibly indicating that school avoidance is a response to distress rather than a cause.

Gender: Female students overall had lower odds of reporting sadness (OR = 0.51, p < 0.001) after adjusting for bullying and other covariates. However, a significant interaction was observed between sex and bullying at school (interaction term OR = 0.68, p = 0.009). This interaction indicates that the protective effect of not being bullied is less pronounced in females, implying that female students may experience sadness through other pathways beyond school bullying.

Race/Ethnicity:

Asian students had significantly greater odds of feeling sad or hopeless compared to American Indian/Alaskan Native peers (OR = 1.89, p = 0.032).

Native Hawaiian/Other Pacific Islander students also exhibited increased odds (OR = 2.87, p = 0.009).

Other racial/ethnic groups did not show statistically significant differences in odds compared to the reference group.

Age: No age group showed statistically significant differences in odds of reporting sadness or hopelessness, suggesting age alone is not a strong determinant in this model.

Robustness check

# Sensitivity: Focus on School Bullying
model_sens <- svyglm(felt_sad_hopeless ~ bullied_school + sex + age + race_ethnicity,
                     design = design,
                     family = quasibinomial())

#summary(model_sens)

# Export odds ratios from the robustness check model
broom::tidy(model_sens, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(
    digits = 3,
    caption = "Robustness Check: Odds Ratios for Feeling Sad or Hopeless"
  )
Robustness Check: Odds Ratios for Feeling Sad or Hopeless
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.105 0.780 -2.895 0.005 0.022 0.499
bullied_schoolNo 3.946 0.088 15.523 0.000 3.306 4.711
sexMale 2.902 0.060 17.889 0.000 2.576 3.270
age13 8.186 1.199 1.753 0.085 0.741 90.416
age14 1.923 0.782 0.836 0.407 0.401 9.219
age15 2.137 0.777 0.978 0.332 0.451 10.123
age16 1.987 0.767 0.895 0.374 0.427 9.240
age17 1.830 0.768 0.786 0.435 0.392 8.530
age18 years or older 1.930 0.768 0.856 0.396 0.414 8.991
race_ethnicityAsian 1.931 0.304 2.164 0.035 1.050 3.552
race_ethnicityBlack or African American 1.376 0.295 1.080 0.285 0.761 2.487
race_ethnicityNative Hawaiian/Other PI 2.745 0.456 2.216 0.031 1.102 6.836
race_ethnicityWhite 1.621 0.276 1.751 0.086 0.933 2.817
race_ethnicityHispanic/Latino 1.322 0.288 0.968 0.337 0.742 2.354
race_ethnicityMultiple - Hispanic 1.219 0.289 0.685 0.496 0.683 2.176
race_ethnicityMultiple - Non-Hispanic 1.283 0.290 0.860 0.394 0.718 2.294

This focused model confirms the central role of school-based bullying and sex in predicting sadness or hopelessness. The estimates remain directionally consistent with the full model, suggesting that inclusion of additional variables like online bullying and safety concerns improves model precision but does not fundamentally alter the observed associations. Therefore, the primary findings are robust to model specification.

Odds Ratio Table

#odds ratio table
# Tidy the model output and exponentiate the coefficients to get odds ratios
model_sens_or <- tidy(model_sens, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(term = gsub("race_ethnicity", "", term),
         term = gsub("age", "Age ", term),
         term = gsub("sex", "", term),
         term = gsub("bullied_school", "Bullied School: ", term))

kable(model_sens_or)
term estimate std.error statistic p.value conf.low conf.high
Bullied School: No 3.946265 0.0884322 15.5234101 0.0000000 3.3056020 4.711096
Male 2.902001 0.0595555 17.8891892 0.0000000 2.5756357 3.269721
Age 13 8.186460 1.1990280 1.7534884 0.0849902 0.7412183 90.416179
Age 14 1.922957 0.7824376 0.8356757 0.4068889 0.4010991 9.219077
Age 15 2.136665 0.7765222 0.9777519 0.3324021 0.4509879 10.122968
Age 16 1.987311 0.7671463 0.8952435 0.3744890 0.4274167 9.240179
Age 17 1.829742 0.7684612 0.7862142 0.4350563 0.3924926 8.529988
Age 18 years or older 1.930207 0.7680371 0.8562439 0.3955120 0.4143948 8.990697
Asian 1.931372 0.3041535 2.1641402 0.0347337 1.0501554 3.552044
Black or African American 1.376129 0.2954938 1.0804766 0.2845614 0.7613431 2.487354
Native Hawaiian/Other PI 2.744685 0.4555620 2.2163093 0.0307500 1.1019330 6.836436
White 1.620920 0.2759122 1.7505337 0.0855030 0.9326503 2.817112
Hispanic/Latino 1.321742 0.2881534 0.9680616 0.3371761 0.7420858 2.354176
Multiple - Hispanic 1.219172 0.2892818 0.6850472 0.4961400 0.6829529 2.176401
Multiple - Non-Hispanic 1.283144 0.2900154 0.8596567 0.3936435 0.7177335 2.293971

School bullying remained a strong and significant predictor of feeling sad or hopeless, even in this reduced model. Students who were bullied had nearly 4 times higher odds of poor mental health.

Asian and Native Hawaiian/PI students were at significantly higher risk, with odds nearly double or more, suggesting potential cultural, social, or systemic contributors to mental health disparities.

Race/Ethnicity and Age, in general, showed no strong or consistent effect, except for select subgroups mentioned.

Odds Ratio Plot

# Create the plot
ggplot(model_sens_or, aes(x = reorder(term, estimate), y = estimate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_hline(yintercept = 1, linetype = "dashed", color = "#D55E00") +
  labs(
    title = "Odds Ratios for Feeling Sad or Hopeless (Robustness Model)",
    x = "Predictor",
    y = "Odds Ratio (log scale)"
  ) +
  coord_flip() +
  scale_y_log10() +
  theme_minimal(base_size = 14)

This plot shows that students who were bullied at school had nearly 4 times higher odds of feeling sad or hopeless. Asian and Native Hawaiian/Other Pacific Islander students also had significantly elevated odds. Although age and race showed some variation, most were not statistically significant. Female students appeared less likely to report distress in this coding, but this may reflect how sex was coded in the model. Overall, bullying was the strongest and most consistent risk factor.

Forest plot: Sensitivity Model

# Create the forest plot
ggplot(model_sens_or, aes(x = reorder(term, estimate), y = estimate)) +
  geom_point(color = "steelblue", size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, color = "gray40") +
  geom_hline(yintercept = 1, linetype = "dashed", color = "#D55E00") +
  coord_flip() +
  labs(
    title = "Odds Ratios for Feeling Sad or Hopeless",
    x = "",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 14)

Zoomed In Plot

#since Age 13 appear much larger because of small sample
ggplot(model_sens_or, aes(x = reorder(term, estimate), y = estimate)) +
  geom_point(color = "steelblue", size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, color = "gray40") +
  geom_hline(yintercept = 1, linetype = "dashed", color = "#D55E00") +
  coord_flip(ylim = c(0, 5)) +  # <-- limits Y-axis for better visualization
  labs(
    title = "Odds Ratios for Feeling Sad or Hopeless (Zoomed In)",
    x = "",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 14)

This forest plot displays odds ratios and 95% confidence intervals for predictors of feeling sad or hopeless. Students who were bullied at school had significantly higher odds of emotional distress. Asian and Native Hawaiian/Other PI students also showed increased odds compared to the reference group. The estimate for age 13 is high but has a wide confidence interval, indicating uncertainty. Female students had lower odds, though this may reflect model coding. Most other age and race/ethnicity predictors were not statistically significant. Bullying remains the strongest and most consistent risk factor.

Interatcion Effect Plot

# Interaction effect: how effect of bullying varies by sex
ggpredict(model_interact, terms = c("bullied_school", "sex")) %>%
  plot() +
  ggtitle("Interaction Effect: Bullying and Gender on Sadness")

This plot illustrates the interaction effect between bullying at school and gender on the likelihood of feeling sad or hopeless.

Among students who were bullied, both males and females report higher predicted probabilities of emotional distress, but males appear slightly more affected, with wider confidence intervals indicating more uncertainty.

For those not bullied, the predicted probability of feeling sad or hopeless is lower for both sexes, though females still show slightly lower levels than males.

Overall, this suggests that bullying significantly increases emotional distress for both genders, but the magnitude may vary by gender, supporting the presence of a meaningful interaction. However, overlapping confidence intervals suggest caution in interpreting sex-specific differences.

Key Findings

  • Bullying Strongly Predicts Mental Distress

Students bullied at school had 3.95 times higher odds of feeling sad or hopeless (p < 0.001). Online bullying also increased the odds by 2.82 times.

  • Safety Concerns and Absenteeism: A Red Flag

Students who missed school due to safety concerns had lower reported sadness, likely reflecting avoidance behavior rather than improved well-being.

  • Gender and Race/Ethnicity Shape Outcomes

Female students reported significantly higher sadness (reference: Male OR = 2.72). Asian (OR = 1.89) and Native Hawaiian/PI students (OR = 2.80) had elevated risk compared to American Indian/Alaska Native peers.

  • Age Is Not a Key Predictor

No significant age differences were found in emotional distress after adjusting for other variables.

Executive Summary

This analysis investigates the impact of bullying, school safety, and demographic factors on adolescent mental health using nationally representative survey data. The results show that school bullying is the most powerful predictor of emotional distress—students who were bullied had nearly 4 times higher odds of feeling sad or hopeless. Cyber bullying also showed a strong association. Surprisingly, students who missed school due to feeling unsafe reported lower odds of sadness, suggesting possible avoidance coping or unmet mental health needs.

Gender disparities emerged, with female students reporting more emotional distress overall, although the interaction model revealed nuanced patterns depending on bullying status. Additionally, Asian and Native Hawaiian/Pacific Islander students were found to be at significantly higher risk, underscoring the importance of culturally informed support systems.

The models demonstrated good predictive accuracy (AUC = 0.73) and showed robust findings across sensitivity and interaction checks. These results provide clear evidence that bullying—especially at school—is a key, modifiable risk factor for youth mental health. Schools and universities must prioritize anti-bullying strategies, inclusive climate policies, and equitable access to mental health resources to protect and support all students.