# Load packages
library(tidyverse)
library(haven)
library(kableExtra)
library(broom)
library(GGally)
library(car)
library(gtsummary)
library(ggeffects)
library(dplyr)Load in data from NHANES
#Load datasets from NHANES 2021-2023 survey
Demo <- read_xpt("~/Desktop/EPI553Final/DEMO_L.xpt")
Depression <- read_xpt("~/Desktop/EPI553Final/DPQ_L.xpt")
Alcohol <- read_xpt("~/Desktop/EPI553Final/ALQ_L.xpt")
Vitamin <- read_xpt("~/Desktop/EPI553Final/VID_L.xpt")
Sleep <- read_xpt("~/Desktop/EPI553Final/SLQ_L.xpt")
nrow(Demo)## [1] 11933
## [1] 6337
## [1] 6337
## [1] 8727
## [1] 8501
Total observations in each raw dataset: Demo 11,933 Alcohol 6,337 Depression 6,337 Vitamin 8,727 Sleep 8,501
Merge datasets based on common variable SEQN
#Merge datasets
nhanes_merged <- Demo %>%
inner_join(Depression, by = "SEQN") %>%
inner_join(Alcohol, by = "SEQN") %>%
inner_join(Vitamin, by = "SEQN") %>%
inner_join(Sleep, by = "SEQN")
#Sample size
nrow(nhanes_merged)## [1] 6337
Total observations in merged dataset is 6337
Restrict dataset to adults (18 years or older)
#Filter out participants under 18 years old
nhanes_age <- nhanes_merged %>%
filter(RIDAGEYR >= 18)
nrow(nhanes_age)## [1] 6337
Missing outcome observations
nhanes_age <- nhanes_age |>
filter(!is.na(DPQ010),
!is.na(DPQ020),
!is.na(DPQ030),
!is.na(DPQ040),
!is.na(DPQ050),
!is.na(DPQ060),
!is.na(DPQ070),
!is.na(DPQ080),
!is.na(DPQ090)
)
nrow(nhanes_age)## [1] 5506
After removing all missing observations for all depression variables for the PHQ-9 scale there are 5506 observations.
View all variables in data
## [1] "SEQN" "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
## [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM" "DMQMILIZ" "DMDBORN4"
## [13] "DMDYRUSR" "DMDEDUC2" "DMDMARTZ" "RIDEXPRG" "DMDHHSIZ" "DMDHRGND"
## [19] "DMDHRAGZ" "DMDHREDZ" "DMDHRMAZ" "DMDHSEDZ" "WTINT2YR" "WTMEC2YR"
## [25] "SDMVSTRA" "SDMVPSU" "INDFMPIR" "DPQ010" "DPQ020" "DPQ030"
## [31] "DPQ040" "DPQ050" "DPQ060" "DPQ070" "DPQ080" "DPQ090"
## [37] "DPQ100" "ALQ111" "ALQ121" "ALQ130" "ALQ142" "ALQ270"
## [43] "ALQ280" "ALQ151" "ALQ170" "WTPH2YR" "LBXVIDMS" "LBDVIDLC"
## [49] "LBXVD2MS" "LBDVD2LC" "LBXVD3MS" "LBDVD3LC" "LBXVE3MS" "LBDVE3LC"
## [55] "SLQ300" "SLQ310" "SLD012" "SLQ320" "SLQ330" "SLD013"
Select variables of interest
#Select variables
nhanes_subset <- nhanes_age %>%
select(SEQN, RIDAGEYR, RIAGENDR, RIDRETH3, DMDEDUC2, DMDMARTZ, DPQ010, DPQ020, DPQ030, DPQ040, DPQ050, DPQ060, DPQ070, DPQ080, DPQ090, LBXVIDMS, SLD012, ALQ121)Outcome variable(s): Depression Kept all PHQ-9 items (DPQ010-DPQ090) Exposure: Vitamin D Confounders: Age, Sex, Race/ethnicity, education level, marital status, alcohol consumption, sleep
Recode variables
nhanes_clean <- nhanes_subset |>
mutate(
#Outcome: Depression
DPQ010= case_when(
DPQ010 %in% c(7, 9) ~ NA_real_,
DPQ010 >= 0 & DPQ010 <= 3 ~ as.numeric(DPQ010),
TRUE ~ NA_real_
),
DPQ020= case_when(
DPQ020 %in% c(7, 9) ~ NA_real_,
DPQ020 >= 0 & DPQ020 <= 3 ~ as.numeric(DPQ020),
TRUE ~ NA_real_
),
DPQ030= case_when(
DPQ030 %in% c(7, 9) ~ NA_real_,
DPQ030 >= 0 & DPQ030 <= 3 ~ as.numeric(DPQ030),
TRUE ~ NA_real_
),
DPQ040= case_when(
DPQ040 %in% c(7, 9) ~ NA_real_,
DPQ040 >= 0 & DPQ040 <= 3 ~ as.numeric(DPQ040),
TRUE ~ NA_real_
),
DPQ050= case_when(
DPQ050 %in% c(7, 9) ~ NA_real_,
DPQ050 >= 0 & DPQ050 <= 3 ~ as.numeric(DPQ050),
TRUE ~ NA_real_
),
DPQ060= case_when(
DPQ060 %in% c(7, 9) ~ NA_real_,
DPQ060 >= 0 & DPQ060 <= 3 ~ as.numeric(DPQ060),
TRUE ~ NA_real_
),
DPQ070= case_when(
DPQ070 %in% c(7, 9) ~ NA_real_,
DPQ070 >= 0 & DPQ070 <= 3 ~ as.numeric(DPQ070),
TRUE ~ NA_real_
),
DPQ080= case_when(
DPQ080 %in% c(7, 9) ~ NA_real_,
DPQ080 >= 0 & DPQ080 <= 3 ~ as.numeric(DPQ080),
TRUE ~ NA_real_
),
DPQ090= case_when(
DPQ090 %in% c(7, 9) ~ NA_real_,
DPQ090 >= 0 & DPQ090 <= 3 ~ as.numeric(DPQ090),
TRUE ~ NA_real_
),
#Create PHQ-9 score
phq9_score= DPQ010 + DPQ020 + DPQ030 + DPQ040 +
DPQ050 + DPQ060 + DPQ070 + DPQ080 + DPQ090,
depression = factor(
ifelse(phq9_score >= 10, 1, 0),
levels = c(0, 1),
labels = c("No", "Yes")
),
#Exposure: Vitamin D
vitaminD= LBXVIDMS,
#Predictors
age= RIDAGEYR,
sex= factor(RIAGENDR, levels= c(1, 2), labels= c("Male", "Female")),
race_ethnicity= factor(RIDRETH3,
levels= c(1, 2, 3, 4, 6, 7),
labels= c("Mexican American", "Other Hispanic", "NH White", "NH Black", "NH Asian", "Other Race")),
education= factor(case_when(
DMDEDUC2 %in% c(1, 2) ~ "Less than HS",
DMDEDUC2 == 3 ~ "HS diploma or GED",
DMDEDUC2 == 4 ~ "Some college or technical school",
DMDEDUC2 == 5 ~ "College graduate or above",
DMDEDUC2 %in% c(7,9) ~ NA_character_,
TRUE ~ NA_character_
)),
sleep_hrs= SLD012,
alcohol = case_when(
ALQ121 == 0 ~ "Never",
ALQ121 %in% c(1,2) ~ "Daily/Nearly daily",
ALQ121 == 3 ~ "Frequent",
ALQ121 %in% c(4, 5) ~ "Weekly",
ALQ121 %in% c(6, 7, 8) ~ "Occasional",
ALQ121 %in% c(9, 10) ~ "Rare",
ALQ121 %in% c(77, 99) ~ NA_character_,
TRUE ~ NA_character_
),
alcohol= factor(
alcohol,
levels= c(
"Never",
"Rare",
"Occasional",
"Weekly",
"Frequent",
"Daily/Nearly daily"
)
),
marital= factor(case_when(
DMDMARTZ == 1 ~ "Married/Living with Partner",
DMDMARTZ == 2 ~ "Divorced/Separated",
DMDMARTZ == 3 ~ "Never Married",
DMDMARTZ %in% c(77,99) ~ NA_character_,
TRUE ~ NA_character_
)),
)Distribution of Depression Variable
nhanes_clean %>%
gtsummary::tbl_summary(
include = depression,
statistic = all_categorical() ~ "{n} ({p}%)"
)| Characteristic | N = 5,5061 |
|---|---|
| depression | 723 (13%) |
| Â Â Â Â Unknown | 51 |
| 1 n (%) | |
References
#Create reference categories for variables
nhanes_clean <- nhanes_clean %>%
mutate(
sex = relevel(sex, ref = "Male"),
race_ethnicity= relevel(race_ethnicity, ref= "NH White"),
education = relevel(education, ref = "Less than HS"),
marital = relevel(marital, ref = "Married/Living with Partner"),
alcohol= relevel(alcohol, ref= "Never")
)Missingness
## Total N: 5506
miss_info <- function(x) {
n <- sum(is.na(x))
pct <- round(100 * n / length(x), 2)
paste0(n, " (", pct, "%)")
}
cat("Missing depression:", miss_info(nhanes_clean$depression), "\n")## Missing depression: 51 (0.93%)
## Missing vitaminD: 417 (7.57%)
## Missing sex: 0 (0%)
## Missing age: 0 (0%)
## Missing race_ethnicity: 0 (0%)
## Missing sleep_hrs: 55 (1%)
## Missing education: 247 (4.49%)
## Missing marital: 249 (4.52%)
## Missing alcohol: 589 (10.7%)
nhanes_final <- nhanes_clean %>%
drop_na(depression, vitaminD, age, sex, race_ethnicity, education, marital, sleep_hrs, alcohol)Random Sample
#Create random sample of data
set.seed(1220)
nhanes_final <- nhanes_final |>
slice_sample(n = 4000)
summary(nhanes_final)## SEQN RIDAGEYR RIAGENDR RIDRETH3 DMDEDUC2
## Min. :130379 Min. :20.0 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:133292 1st Qu.:39.0 1st Qu.:1.00 1st Qu.:3.000 1st Qu.:3.000
## Median :136315 Median :57.0 Median :2.00 Median :3.000 Median :4.000
## Mean :136316 Mean :53.6 Mean :1.54 Mean :3.248 Mean :4.015
## 3rd Qu.:139296 3rd Qu.:67.0 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :142310 Max. :80.0 Max. :2.00 Max. :7.000 Max. :5.000
## DMDMARTZ DPQ010 DPQ020 DPQ030
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :1.648 Mean :0.4435 Mean :0.4415 Mean :0.7602
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :3.000 Max. :3.0000 Max. :3.0000 Max. :3.0000
## DPQ040 DPQ050 DPQ060 DPQ070
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.000 Median :0.0000
## Mean :0.8323 Mean :0.471 Mean :0.387 Mean :0.3685
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :3.000 Max. :3.000 Max. :3.0000
## DPQ080 DPQ090 LBXVIDMS SLD012
## Min. :0.000 Min. :0.000 Min. : 9.13 Min. : 2.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.: 57.48 1st Qu.: 7.000
## Median :0.000 Median :0.000 Median : 78.55 Median : 8.000
## Mean :0.163 Mean :0.058 Mean : 83.17 Mean : 7.666
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:103.00 3rd Qu.: 8.500
## Max. :3.000 Max. :3.000 Max. :424.00 Max. :14.000
## ALQ121 phq9_score depression vitaminD age
## Min. : 0.000 Min. : 0.000 No :3522 Min. : 9.13 Min. :20.0
## 1st Qu.: 2.000 1st Qu.: 0.000 Yes: 478 1st Qu.: 57.48 1st Qu.:39.0
## Median : 5.000 Median : 2.000 Median : 78.55 Median :57.0
## Mean : 4.893 Mean : 3.925 Mean : 83.17 Mean :53.6
## 3rd Qu.: 8.000 3rd Qu.: 6.000 3rd Qu.:103.00 3rd Qu.:67.0
## Max. :10.000 Max. :26.000 Max. :424.00 Max. :80.0
## sex race_ethnicity education
## Male :1838 NH White :2577 Less than HS : 355
## Female:2162 Mexican American: 242 College graduate or above :1625
## Other Hispanic : 373 HS diploma or GED : 760
## NH Black : 412 Some college or technical school:1260
## NH Asian : 145
## Other Race : 251
## sleep_hrs alcohol marital
## Min. : 2.000 Never :673 Married/Living with Partner:2212
## 1st Qu.: 7.000 Rare :837 Divorced/Separated : 984
## Median : 8.000 Occasional :951 Never Married : 804
## Mean : 7.666 Weekly :767
## 3rd Qu.: 8.500 Frequent :381
## Max. :14.000 Daily/Nearly daily:391
nhanes_final %>%
select(depression, age, sex, race_ethnicity, marital, education, sleep_hrs, vitaminD, alcohol) %>%
tbl_summary(
by= depression,
label = list(
age ~ "Age",
race_ethnicity ~ "Race-ethnicity",
sex ~ "Sex",
marital ~ "Marital Status",
education ~ "Highest level of education completed",
sleep_hrs ~ "Average sleep hours per night",
alcohol ~ "Alcohol use",
vitaminD ~ "Vitamin D status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
) %>%
add_n() %>%
# add_overall() %>%
bold_labels() %>%
italicize_levels() %>%
modify_caption("**Table 1. Descriptive Statistics — NHANES 2021-2023 Analytic Sample, Stratified by Depression Status (n = 4000)**") %>%
modify_footnote(
all_stat_cols()~ "Total analytic sample size = 4000, missing values were removed prior to analysis. Mean and sd were produced for continuous variables and freuqency and % for categorical"
) %>%
as_flex_table()Characteristic | N | No | Yes |
|---|---|---|---|
Age | 4,000 | 54.4 (16.7) | 47.9 (17.1) |
Sex | 4,000 | ||
Male | 1,658 (47%) | 180 (38%) | |
Female | 1,864 (53%) | 298 (62%) | |
Race-ethnicity | 4,000 | ||
NH White | 2,289 (65%) | 288 (60%) | |
Mexican American | 205 (5.8%) | 37 (7.7%) | |
Other Hispanic | 331 (9.4%) | 42 (8.8%) | |
NH Black | 360 (10%) | 52 (11%) | |
NH Asian | 135 (3.8%) | 10 (2.1%) | |
Other Race | 202 (5.7%) | 49 (10%) | |
Marital Status | 4,000 | ||
Married/Living with Partner | 2,033 (58%) | 179 (37%) | |
Divorced/Separated | 845 (24%) | 139 (29%) | |
Never Married | 644 (18%) | 160 (33%) | |
Highest level of education completed | 4,000 | ||
Less than HS | 291 (8.3%) | 64 (13%) | |
College graduate or above | 1,496 (42%) | 129 (27%) | |
HS diploma or GED | 655 (19%) | 105 (22%) | |
Some college or technical school | 1,080 (31%) | 180 (38%) | |
Average sleep hours per night | 4,000 | 7.7 (1.4) | 7.4 (2.1) |
Vitamin D status | 4,000 | 84.5 (38.3) | 73.3 (34.3) |
Alcohol use | 4,000 | ||
Never | 579 (16%) | 94 (20%) | |
Rare | 721 (20%) | 116 (24%) | |
Occasional | 826 (23%) | 125 (26%) | |
Weekly | 706 (20%) | 61 (13%) | |
Frequent | 340 (9.7%) | 41 (8.6%) | |
Daily/Nearly daily | 350 (9.9%) | 41 (8.6%) | |
1Total analytic sample size = 4000, missing values were removed prior to analysis. Mean and sd were produced for continuous variables and freuqency and % for categorical | |||
Distribution of phq-9 score (outcome variable)
ggplot(nhanes_final, aes(x = phq9_score)) +
geom_histogram(binwidth = 2, color = "Black", fill = "lightgreen") +
labs(
title = "Plot 1. Distribution of PHQ-9 Scores",
x = "PHQ-9 Score",
y = "Frequency"
) +
theme_minimal()Association between the outcome and primary exposure Boxplot of phq-9 score by vitamin D levels
ggplot(nhanes_final, aes(x = depression, y = vitaminD)) +
geom_boxplot(fill = "lightgreen") +
labs(
title = "Plot 2. Vitamin D Levels by Depression Status",
x = "Depression Status",
y = "Vitamin D (nmol/L)"
) +
theme_minimal()Exploratory plot Boxplot of phq-9 score by alcohol use
ggplot(nhanes_final, aes(x = factor(alcohol), y = phq9_score)) +
geom_boxplot(fill= "lightblue") +
labs(
title = "Plot 3. PHQ-9 Score by Alcohol Use",
x = "Alcohol Use",
y = "PHQ-9 Score"
) +
theme_minimal()