Import packages

# Load packages
library(tidyverse)
library(haven)
library(kableExtra)
library(broom)
library(GGally)
library(car)
library(gtsummary)
library(ggeffects)
library(dplyr)

Section 1: Data Loading and Analytical Sample

Load in data from NHANES

#Load datasets from NHANES 2021-2023 survey
Demo <- read_xpt("~/Desktop/EPI553Final/DEMO_L.xpt")
Depression <- read_xpt("~/Desktop/EPI553Final/DPQ_L.xpt")
Alcohol <- read_xpt("~/Desktop/EPI553Final/ALQ_L.xpt")
Vitamin <- read_xpt("~/Desktop/EPI553Final/VID_L.xpt")
Sleep <- read_xpt("~/Desktop/EPI553Final/SLQ_L.xpt")

nrow(Demo)

## [1] 11933

nrow(Alcohol)

## [1] 6337

nrow(Depression)

## [1] 6337

nrow(Vitamin)

## [1] 8727

nrow(Sleep)

## [1] 8501

Total observations in each raw dataset: Demo 11,933 Alcohol 6,337 Depression 6,337 Vitamin 8,727 Sleep 8,501

Merge datasets based on common variable SEQN

#Merge datasets
nhanes_merged <- Demo %>%
  inner_join(Depression, by = "SEQN") %>%
  inner_join(Alcohol, by = "SEQN") %>%
  inner_join(Vitamin, by = "SEQN") %>%
  inner_join(Sleep, by = "SEQN")

#Sample size
nrow(nhanes_merged)

## [1] 6337

Total observations in merged dataset is 6337

Restrict dataset to adults (18 years or older)

#Filter out participants under 18 years old
nhanes_age <- nhanes_merged %>%
  filter(RIDAGEYR >= 18) 

nrow(nhanes_age)

## [1] 6337

Missing outcome observations

nhanes_age <- nhanes_age |>
  filter(!is.na(DPQ010),
         !is.na(DPQ020),
         !is.na(DPQ030), 
         !is.na(DPQ040), 
         !is.na(DPQ050), 
         !is.na(DPQ060), 
         !is.na(DPQ070),
         !is.na(DPQ080), 
         !is.na(DPQ090)
         )
nrow(nhanes_age)

## [1] 5506

After removing all missing observations for all depression variables for the PHQ-9 scale there are 5506 observations.

Section 2: Variable Selection, Recoding, and Data Cleaning

View all variables in data

names(nhanes_age)

##  [1] "SEQN"     "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
##  [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM" "DMQMILIZ" "DMDBORN4"
## [13] "DMDYRUSR" "DMDEDUC2" "DMDMARTZ" "RIDEXPRG" "DMDHHSIZ" "DMDHRGND"
## [19] "DMDHRAGZ" "DMDHREDZ" "DMDHRMAZ" "DMDHSEDZ" "WTINT2YR" "WTMEC2YR"
## [25] "SDMVSTRA" "SDMVPSU"  "INDFMPIR" "DPQ010"   "DPQ020"   "DPQ030"  
## [31] "DPQ040"   "DPQ050"   "DPQ060"   "DPQ070"   "DPQ080"   "DPQ090"  
## [37] "DPQ100"   "ALQ111"   "ALQ121"   "ALQ130"   "ALQ142"   "ALQ270"  
## [43] "ALQ280"   "ALQ151"   "ALQ170"   "WTPH2YR"  "LBXVIDMS" "LBDVIDLC"
## [49] "LBXVD2MS" "LBDVD2LC" "LBXVD3MS" "LBDVD3LC" "LBXVE3MS" "LBDVE3LC"
## [55] "SLQ300"   "SLQ310"   "SLD012"   "SLQ320"   "SLQ330"   "SLD013"

Select variables of interest

#Select variables
nhanes_subset <- nhanes_age %>%
  select(SEQN, RIDAGEYR, RIAGENDR, RIDRETH3, DMDEDUC2, DMDMARTZ, DPQ010, DPQ020, DPQ030, DPQ040, DPQ050, DPQ060, DPQ070, DPQ080, DPQ090, LBXVIDMS, SLD012, ALQ121)

Outcome variable(s): Depression Kept all PHQ-9 items (DPQ010-DPQ090) Exposure: Vitamin D Confounders: Age, Sex, Race/ethnicity, education level, marital status, alcohol consumption, sleep

Recode variables

nhanes_clean <- nhanes_subset |>
  mutate(
    
    #Outcome: Depression 
    DPQ010= case_when(
      DPQ010 %in% c(7, 9) ~ NA_real_,
      DPQ010 >= 0 & DPQ010 <= 3 ~ as.numeric(DPQ010),
      TRUE ~ NA_real_
    ),
    DPQ020= case_when(
      DPQ020 %in% c(7, 9) ~ NA_real_,
      DPQ020 >= 0 & DPQ020 <= 3 ~ as.numeric(DPQ020),
      TRUE ~ NA_real_
    ),
    DPQ030= case_when(
      DPQ030 %in% c(7, 9) ~ NA_real_,
      DPQ030 >= 0 & DPQ030 <= 3 ~ as.numeric(DPQ030),
      TRUE ~ NA_real_
    ),
    DPQ040= case_when(
      DPQ040 %in% c(7, 9) ~ NA_real_,
      DPQ040 >= 0 & DPQ040 <= 3 ~ as.numeric(DPQ040),
      TRUE ~ NA_real_
    ),
    DPQ050= case_when(
      DPQ050 %in% c(7, 9) ~ NA_real_,
      DPQ050 >= 0 & DPQ050 <= 3 ~ as.numeric(DPQ050),
      TRUE ~ NA_real_
    ),
    DPQ060= case_when(
      DPQ060 %in% c(7, 9) ~ NA_real_,
      DPQ060 >= 0 & DPQ060 <= 3 ~ as.numeric(DPQ060),
      TRUE ~ NA_real_
    ),
    DPQ070= case_when(
      DPQ070 %in% c(7, 9) ~ NA_real_,
      DPQ070 >= 0 & DPQ070 <= 3 ~ as.numeric(DPQ070),
      TRUE ~ NA_real_
    ),
    DPQ080= case_when(
      DPQ080 %in% c(7, 9) ~ NA_real_,
      DPQ080 >= 0 & DPQ080 <= 3 ~ as.numeric(DPQ080),
      TRUE ~ NA_real_
    ),
    DPQ090= case_when(
      DPQ090 %in% c(7, 9) ~ NA_real_,
      DPQ090 >= 0 & DPQ090 <= 3 ~ as.numeric(DPQ090),
      TRUE ~ NA_real_
    ),
    
    #Create PHQ-9 score 
    phq9_score= DPQ010 + DPQ020 + DPQ030 + DPQ040 +
                DPQ050 + DPQ060 + DPQ070 + DPQ080 + DPQ090,

    depression = factor(
      ifelse(phq9_score >= 10, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),

  #Exposure: Vitamin D
  vitaminD= LBXVIDMS,
  
  #Predictors
  age= RIDAGEYR,
  
  sex= factor(RIAGENDR, levels= c(1, 2), labels= c("Male", "Female")),
  
  race_ethnicity= factor(RIDRETH3,
      levels= c(1, 2, 3, 4, 6, 7),
      labels= c("Mexican American", "Other Hispanic", "NH White", "NH Black", "NH Asian", "Other Race")),
  
  education= factor(case_when(
      DMDEDUC2 %in% c(1, 2) ~ "Less than HS",
      DMDEDUC2 == 3 ~ "HS diploma or GED",
      DMDEDUC2 == 4 ~ "Some college or technical school",
      DMDEDUC2 == 5 ~ "College graduate or above",
      DMDEDUC2 %in% c(7,9) ~ NA_character_,
      TRUE ~ NA_character_
    )), 
  
  sleep_hrs= SLD012,
  
  alcohol = case_when(
    ALQ121 == 0 ~ "Never",
    ALQ121 %in% c(1,2) ~ "Daily/Nearly daily",
    ALQ121 == 3 ~ "Frequent",
    ALQ121 %in% c(4, 5) ~ "Weekly",
    ALQ121 %in% c(6, 7, 8) ~ "Occasional",
    ALQ121 %in% c(9, 10) ~ "Rare",
    ALQ121 %in% c(77, 99) ~ NA_character_,
    TRUE ~ NA_character_
  ),
  
  alcohol= factor(
    alcohol, 
    levels= c(
      "Never",
      "Rare",
      "Occasional",
      "Weekly",
      "Frequent",
      "Daily/Nearly daily"
    )
  ),
    
  marital= factor(case_when(
      DMDMARTZ == 1 ~ "Married/Living with Partner",
      DMDMARTZ == 2 ~ "Divorced/Separated",
      DMDMARTZ == 3 ~ "Never Married",
      DMDMARTZ %in% c(77,99) ~ NA_character_,
      TRUE ~ NA_character_
    )), 
  )

Distribution of Depression Variable

nhanes_clean %>%
  gtsummary::tbl_summary(
    include = depression,
    statistic = all_categorical() ~ "{n} ({p}%)"
  )

Characteristic	N = 5,506¹
depression	723 (13%)
Unknown	51
¹ n (%)

References

#Create reference categories for variables
nhanes_clean <- nhanes_clean %>%
  mutate(
    sex = relevel(sex, ref = "Male"),
    race_ethnicity= relevel(race_ethnicity, ref= "NH White"),
    education = relevel(education, ref = "Less than HS"),
    marital = relevel(marital, ref = "Married/Living with Partner"),
    alcohol= relevel(alcohol, ref= "Never")
  )

Missingness

# Report missing values for the key variables
cat("Total N:", nrow(nhanes_clean), "\n\n")

## Total N: 5506

miss_info <- function(x) {
  n <- sum(is.na(x))
  pct <- round(100 * n / length(x), 2)
  paste0(n, " (", pct, "%)")
}

cat("Missing depression:", miss_info(nhanes_clean$depression), "\n")

## Missing depression: 51 (0.93%)

cat("Missing vitaminD:", miss_info(nhanes_clean$vitaminD), "\n")

## Missing vitaminD: 417 (7.57%)

cat("Missing sex:", miss_info(nhanes_clean$sex), "\n")

## Missing sex: 0 (0%)

cat("Missing age:", miss_info(nhanes_clean$age), "\n")

## Missing age: 0 (0%)

cat("Missing race_ethnicity:", miss_info(nhanes_clean$race_ethnicity), "\n")

## Missing race_ethnicity: 0 (0%)

cat("Missing sleep_hrs:", miss_info(nhanes_clean$sleep_hrs), "\n")

## Missing sleep_hrs: 55 (1%)

cat("Missing education:", miss_info(nhanes_clean$education), "\n")

## Missing education: 247 (4.49%)

cat("Missing marital:", miss_info(nhanes_clean$marital), "\n")

## Missing marital: 249 (4.52%)

cat("Missing alcohol:", miss_info(nhanes_clean$alcohol), "\n")

## Missing alcohol: 589 (10.7%)

nhanes_final <- nhanes_clean %>%
  drop_na(depression, vitaminD, age, sex, race_ethnicity, education, marital, sleep_hrs, alcohol)

Random Sample

#Create random sample of data
set.seed(1220)

nhanes_final <- nhanes_final |>
slice_sample(n = 4000)

summary(nhanes_final)

##       SEQN           RIDAGEYR       RIAGENDR       RIDRETH3        DMDEDUC2    
##  Min.   :130379   Min.   :20.0   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:133292   1st Qu.:39.0   1st Qu.:1.00   1st Qu.:3.000   1st Qu.:3.000  
##  Median :136315   Median :57.0   Median :2.00   Median :3.000   Median :4.000  
##  Mean   :136316   Mean   :53.6   Mean   :1.54   Mean   :3.248   Mean   :4.015  
##  3rd Qu.:139296   3rd Qu.:67.0   3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :142310   Max.   :80.0   Max.   :2.00   Max.   :7.000   Max.   :5.000  
##     DMDMARTZ         DPQ010           DPQ020           DPQ030      
##  Min.   :1.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :1.648   Mean   :0.4435   Mean   :0.4415   Mean   :0.7602  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :3.0000   Max.   :3.0000   Max.   :3.0000  
##      DPQ040           DPQ050          DPQ060          DPQ070      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.000   Median :0.0000  
##  Mean   :0.8323   Mean   :0.471   Mean   :0.387   Mean   :0.3685  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :3.0000   Max.   :3.000   Max.   :3.000   Max.   :3.0000  
##      DPQ080          DPQ090         LBXVIDMS          SLD012      
##  Min.   :0.000   Min.   :0.000   Min.   :  9.13   Min.   : 2.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.: 57.48   1st Qu.: 7.000  
##  Median :0.000   Median :0.000   Median : 78.55   Median : 8.000  
##  Mean   :0.163   Mean   :0.058   Mean   : 83.17   Mean   : 7.666  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:103.00   3rd Qu.: 8.500  
##  Max.   :3.000   Max.   :3.000   Max.   :424.00   Max.   :14.000  
##      ALQ121         phq9_score     depression    vitaminD           age      
##  Min.   : 0.000   Min.   : 0.000   No :3522   Min.   :  9.13   Min.   :20.0  
##  1st Qu.: 2.000   1st Qu.: 0.000   Yes: 478   1st Qu.: 57.48   1st Qu.:39.0  
##  Median : 5.000   Median : 2.000              Median : 78.55   Median :57.0  
##  Mean   : 4.893   Mean   : 3.925              Mean   : 83.17   Mean   :53.6  
##  3rd Qu.: 8.000   3rd Qu.: 6.000              3rd Qu.:103.00   3rd Qu.:67.0  
##  Max.   :10.000   Max.   :26.000              Max.   :424.00   Max.   :80.0  
##      sex                race_ethnicity                            education   
##  Male  :1838   NH White        :2577   Less than HS                    : 355  
##  Female:2162   Mexican American: 242   College graduate or above       :1625  
##                Other Hispanic  : 373   HS diploma or GED               : 760  
##                NH Black        : 412   Some college or technical school:1260  
##                NH Asian        : 145                                          
##                Other Race      : 251                                          
##    sleep_hrs                    alcohol                           marital    
##  Min.   : 2.000   Never             :673   Married/Living with Partner:2212  
##  1st Qu.: 7.000   Rare              :837   Divorced/Separated         : 984  
##  Median : 8.000   Occasional        :951   Never Married              : 804  
##  Mean   : 7.666   Weekly            :767                                     
##  3rd Qu.: 8.500   Frequent          :381                                     
##  Max.   :14.000   Daily/Nearly daily:391

Section 3: Descriptive Statistics (Table 1)

nhanes_final %>%
  select(depression, age, sex, race_ethnicity, marital, education, sleep_hrs, vitaminD, alcohol) %>%
  tbl_summary(
    by= depression,
    label = list(
      age ~ "Age",
      race_ethnicity ~ "Race-ethnicity",
      sex ~ "Sex",
      marital ~ "Marital Status",
      education ~ "Highest level of education completed",
      sleep_hrs ~ "Average sleep hours per night",
      alcohol ~ "Alcohol use",
      vitaminD ~ "Vitamin D status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
  ) %>%
  add_n() %>%
  # add_overall() %>%
  bold_labels() %>%
  italicize_levels() %>%
  modify_caption("**Table 1. Descriptive Statistics — NHANES 2021-2023 Analytic Sample, Stratified by Depression Status (n = 4000)**") %>%
  modify_footnote(
    all_stat_cols()~ "Total analytic sample size = 4000, missing values were removed prior to analysis. Mean and sd were produced for continuous variables and freuqency and % for categorical"
  ) %>%
  as_flex_table()

**Table 1. Descriptive Statistics — NHANES 2021-2023 Analytic Sample, Stratified by Depression Status (n = 4000)**
Characteristic	N	No N = 3,5221	Yes N = 4781
Age	4,000	54.4 (16.7)	47.9 (17.1)
Sex	4,000
Male		1,658 (47%)	180 (38%)
Female		1,864 (53%)	298 (62%)
Race-ethnicity	4,000
NH White		2,289 (65%)	288 (60%)
Mexican American		205 (5.8%)	37 (7.7%)
Other Hispanic		331 (9.4%)	42 (8.8%)
NH Black		360 (10%)	52 (11%)
NH Asian		135 (3.8%)	10 (2.1%)
Other Race		202 (5.7%)	49 (10%)
Marital Status	4,000
Married/Living with Partner		2,033 (58%)	179 (37%)
Divorced/Separated		845 (24%)	139 (29%)
Never Married		644 (18%)	160 (33%)
Highest level of education completed	4,000
Less than HS		291 (8.3%)	64 (13%)
College graduate or above		1,496 (42%)	129 (27%)
HS diploma or GED		655 (19%)	105 (22%)
Some college or technical school		1,080 (31%)	180 (38%)
Average sleep hours per night	4,000	7.7 (1.4)	7.4 (2.1)
Vitamin D status	4,000	84.5 (38.3)	73.3 (34.3)
Alcohol use	4,000
Never		579 (16%)	94 (20%)
Rare		721 (20%)	116 (24%)
Occasional		826 (23%)	125 (26%)
Weekly		706 (20%)	61 (13%)
Frequent		340 (9.7%)	41 (8.6%)
Daily/Nearly daily		350 (9.9%)	41 (8.6%)
1Total analytic sample size = 4000, missing values were removed prior to analysis. Mean and sd were produced for continuous variables and freuqency and % for categorical

Section 4: Exploratory Data Analysis (EDA)

Distribution of phq-9 score (outcome variable)

ggplot(nhanes_final, aes(x = phq9_score)) +
  geom_histogram(binwidth = 2, color = "Black", fill = "lightgreen") +
  labs(
    title = "Plot 1. Distribution of PHQ-9 Scores",
    x = "PHQ-9 Score",
    y = "Frequency"
  ) +
  theme_minimal()

Association between the outcome and primary exposure Boxplot of phq-9 score by vitamin D levels

ggplot(nhanes_final, aes(x = depression, y = vitaminD)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "Plot 2. Vitamin D Levels by Depression Status",
    x = "Depression Status",
    y = "Vitamin D (nmol/L)"
  ) +
  theme_minimal()

Exploratory plot Boxplot of phq-9 score by alcohol use

ggplot(nhanes_final, aes(x = factor(alcohol), y = phq9_score)) +
  geom_boxplot(fill= "lightblue") +
  labs(
    title = "Plot 3. PHQ-9 Score by Alcohol Use",
    x = "Alcohol Use",
    y = "PHQ-9 Score"
  ) +
  theme_minimal()

EPI553 Final Project

Jackie Korosec

2026-04-15

Import packages

Section 1: Data Loading and Analytical Sample

Section 2: Variable Selection, Recoding, and Data Cleaning

Section 3: Descriptive Statistics (Table 1)

Section 4: Exploratory Data Analysis (EDA)