This data is from the NHANES (National Health and Nutrition Examination Survey) from the CDC (https://www.cdc.gov/nchs/nhanes/index.htm). Here, my aim is to tidy the data in order to recreate an analysis on the effect of sleep on metabolic syndrome, seen here: https://pubmed.ncbi.nlm.nih.gov/31717770/.
# Libraries
library(NHANES)
library(foreign)
library(tidyverse)
library(stats)
library(tibble)
library(table1)
library(Hmisc)
library(mgcv)
library(visibly)
# Importing the data
#demographic data
demog = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
destfile = demog,
mode = "wb")
demog = read.xport(demog)
demog = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
destfile = demog,
mode = "wb")
demog = read.xport(demog)
#body measurements
size = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BMX_J.XPT",
destfile = size,
mode = "wb")
size = read.xport(size)
#blood pressure
bp = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPX_J.XPT",
destfile = bp,
mode = "wb")
bp = read.xport(bp)
#HDL
hdl = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HDL_J.XPT",
destfile = hdl,
mode = "wb")
hdl = read.xport(hdl)
#Triglycerides
fat = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TRIGLY_J.XPT",
destfile = fat,
mode = "wb")
fat = read.xport(fat)
#fasting glucose
f_gluc = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/GLU_J.XPT",
destfile = f_gluc,
mode = "wb")
f_gluc = read.xport(f_gluc)
#prescription medications
##### refer to appendix of meds documentation for code definitions
meds = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/RXQ_RX_J.XPT",
destfile = meds,
mode = "wb")
meds = read.xport(meds) %>%
filter(RXDRSC1 == "R73" | RXDRSC1 == "I10" | RXDRSC1 == "I10.P") %>%
mutate(RelMeds = case_when(
RXDRSC1 %in% c("R73") ~ "Rx_Glucose",
RXDRSC1 %in% c("I10", "I10.P") ~ "Rx_BP")) %>%
pivot_wider(names_from = RelMeds, values_from = RXDUSE) %>%
select(SEQN, Rx_Glucose)%>%
distinct(SEQN, Rx_Glucose)
#sleep time
sleep = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SLQ_J.XPT",
destfile = sleep,
mode = "wb")
sleep = read.xport(sleep)
#Time sitting
sit = tempfile()
download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/PAQ_J.XPT",
destfile = sit,
mode = "wb")
sit = read.xport(sit)
Note that there are quite a few systolic and diastolic blood pressure measurements; most subjects will have data for only some of these. Therefore we coalesce these columns into a single set of variables: systolic and diastolic blood pressure.
#
bp = bp %>%
mutate(Systolic_coalesced = coalesce(BPXSY1, BPXSY2, BPXSY3, BPXSY4)) %>%
mutate(Dias_coalesced = coalesce(BPXDI1, BPXDI2, BPXDI3, BPXDI4))
Next, I do the bulk of my data tidying. Here I merge the datasets, re-code missing values, factorize categorical variables, etc. In the source paper, metabolic syndrome is defined as having 3 or more of the following criteria: obesity (measured by waist circumference), elevated serum triglycerides, elevated serum HDL cholesterol, high blood pressure, and high fasting blood glucose.
A note on my approach: if you read through the source paper, you will see that the authors have done a ‘completer’ analysis, i.e., they removed any subject missing any of the data they were interested in. My opinion (backed up by statistical theory) is that this is not a great approach. Completers tend to be different from the original population, so a completer sample is no longer a random sample, and, critically, the ways it diverges from a random sample are unknowable. This is a Bad Thing if your aim is inference.
#Merging data sets by subject id (SEQN)
data2 = inner_join(demog, sleep, by="SEQN") %>% # keep all common to demog and sleep
left_join(bp, by="SEQN") %>%
left_join(f_gluc, by="SEQN") %>%
left_join(fat, by="SEQN") %>%
left_join(hdl, by="SEQN") %>%
left_join(size, by="SEQN") %>%
left_join(meds, by="SEQN") %>%
left_join(sit, by="SEQN")
# Select the variables of interest:
data2 = data2 %>%
select(SEQN,
RIAGENDR,#Gender
RIDAGEYR,#Age in years at screening
RIDRETH3,#Race/Hispanic origin w/ NH Asian
DMDEDUC3,#Education level - Children/Youth 6-19
DMDEDUC2,#Education level - Adults 20+
DMDMARTL,#Marital status
WTINT2YR, #Full sample 2 year interview weight
WTMEC2YR, #Full sample 2 year MEC exam weight
BMXWAIST, #Waist circumference
Systolic_coalesced, #Systolic: Blood pres coalesced
Dias_coalesced, #Diastolic: Blood pres coalesced
LBDHDD, #Direct HDL-Cholesterol (mg/dL)
LBXTR, #Triglyceride (mg/dL)
LBXGLU, #Fasting Glucose (mg/dL)
Rx_Glucose, #Treatment for high blood glucose
SLD012, #Sleep hours - weekdays or workdays
SLD013, #Sleep hours - weekends
PAD680, #minutes sitting
#Survey design variables
SDMVPSU,
SDMVSTRA,
WTMEC2YR)%>%
# Re-code missing data to "na"
mutate_at(c("SLD012", "SLD013"),
na_if, 77777) %>%
mutate_at(c("SLD012", "SLD013"),
na_if, 99999) %>%
mutate_at(c("DMDEDUC3", "DMDEDUC2", "DMDMARTL"),
na_if, 77) %>%
mutate_at(c("DMDEDUC3", "DMDEDUC2", "DMDMARTL"),
na_if, 99) %>%
mutate_at(c("DMDEDUC2"),
na_if, 7) %>%
mutate_at(c("DMDEDUC2"),
na_if, 9) %>%
mutate_at(vars(starts_with("BPX")),
na_if, 7) %>%
mutate_at(vars(starts_with("DPQ")),
na_if, 9) %>%
mutate_at(vars(starts_with("RXDR")),
na_if, 77777) %>%
mutate_at(vars(starts_with("RXDR")),
na_if, 99999) %>%
mutate_at(vars(starts_with("RXDR")),
na_if, 55555) %>%
mutate_at(c("PAD680"),
na_if, 7777) %>%
mutate_at(c("PAD680"),
na_if, 9999) %>%
# Filter out subjects under the age of 18
filter(RIDAGEYR >= 18) %>%
# Factorize gender variable
mutate(RIAGENDR = factor(RIAGENDR,
labels = c('Male', 'Female'))) %>%
# Creating binary variables for diagnostic criteria
# factor 1: high waist circumference; cut-off is 102 cm for males and 88 cm for females
mutate(mets1 = if_else(BMXWAIST >= 102, 1,
if_else(BMXWAIST >= 88 & RIAGENDR == 'Female', 1, 0))) %>%
# factor 2: elevated triglycerides
mutate(mets2 = if_else(LBXTR >= 150, 1,0)) %>% #no meds indicated for elev triglycer
# factor 3: Direct HDL-Cholesterol (mg/dL)
mutate(mets3 = if_else(LBDHDD < 50 & RIAGENDR == 'Female', 1,
if_else(LBDHDD < 40 & RIAGENDR == 'Male', 1, 0))) %>%
# factor 4: high blood pressure
mutate(mets4 = if_else(Systolic_coalesced >= 130, 1,
if_else(Dias_coalesced >= 85, 1, 0))) %>%
# factor 5: high blood glucose
mutate(mets5 = if_else(LBXGLU >= 100, 1,
if_else(Rx_Glucose == 1, 1,0))) %>%
# Average sleep duration
mutate(avgsleep = (5/7)*SLD012 + (2/7)*SLD013) %>%
# Average time sitting
mutate(hoursit = PAD680/60) %>%
# Filter out subjects missing sleep variable
filter(!is.na(avgsleep)) %>%
# Filter out subjects missing all 5 mets categories
filter(!(is.na(mets1) & is.na(mets2) & is.na(mets3) & is.na(mets4) &
is.na(mets5))) %>% #no one missing 4 categories
#filter(avgsleep < (2*sd(avgsleep, na.rm = T) + mean(avgsleep))) %>%
# age categories
mutate(age_cat = cut(RIDAGEYR,
breaks = quantile(RIDAGEYR, probs = c(0, 1/4, 2/4, 3/4, 1)),
include.lowest = T,
labels = c("18-34", "35-52", "53-65", "66+"))) %>%
# Sleep categories
mutate(sleep_cat = cut(avgsleep,
breaks = quantile(avgsleep, probs = c(0, 1/4, 2/4, 3/4, 1)),
include.lowest = T,
#labels = c("<3 - 7", ">7 - 7.86", ">7.86 - 8.57", ">8.57 - 10.9")
)) %>%
# Tallying up diagnostic criteria
mutate(metstot = rowSums(select(., starts_with("mets")), na.rm = T)) %>%
mutate(MetS = factor(metstot >= 3, labels = c("No", "Yes"))) %>%
# Race/Ethnicity
mutate(RIDRETH3 = case_when(
RIDRETH3 %in% c(1,2) ~ "Hispanic",
RIDRETH3 %in% c(3) ~ "White",
RIDRETH3 %in% c(4) ~ "Black",
RIDRETH3 %in% c(6) ~ "Asian",
RIDRETH3 %in% c(7) ~ "Other/Multi-Racial")) %>%
mutate(RIDRETH3 = factor(RIDRETH3)) %>%
# Education
mutate(Ed_status = case_when(
DMDEDUC3 %in% c(0:8, 55, 66) ~ "< 9th Grade",
DMDEDUC3 %in% c(9:12) ~ "Some High School",
DMDEDUC3 %in% c(13:14) ~ "High School Graduate/GED",
DMDEDUC3 %in% c(15) ~ "Some College",
DMDEDUC2 %in% c(1) ~ "< 9th Grade",
DMDEDUC2 %in% c(2) ~ "Some High School",
DMDEDUC2 %in% c(3) ~ "High School Graduate/GED",
DMDEDUC2 %in% c(4) ~ "Some College",
DMDEDUC2 %in% c(5) ~ "College Graduate")) %>%
mutate(Ed_status = factor(Ed_status)) %>%
# Marital Status
mutate(DMDMARTL = case_when(
DMDMARTL %in% c(1,6) ~ "Married/Living with Partner",
DMDMARTL %in% c(2) ~ "Widowed",
DMDMARTL %in% c(3:4) ~ "Divorced/Separated",
DMDMARTL %in% c(5) ~ "Never Married")) %>%
mutate(DMDMARTL = factor(DMDMARTL))
view(data2)
save(data2, file="nhanesdata2.RData")
summary(data2$sleep_cat)
## [2,7] (7,7.93] (7.93,8.71] (8.71,14]
## 1675 1173 1345 1354
tabledata = data2 %>%
select(SEQN,
RIAGENDR,
RIDAGEYR,
RIDRETH3,
DMDMARTL,
mets1,
mets2,
mets3,
mets4,
mets5,
metstot,
MetS,
avgsleep,
age_cat,
sleep_cat,
Ed_status,
BMXWAIST,
Systolic_coalesced,
Dias_coalesced,
LBDHDD,
LBXTR,
LBXGLU,
hoursit)
Then I recreate Table 1:
load("tabledata.RData")
label(tabledata$RIAGENDR) <- "Sex"
label(tabledata$age_cat) <- "Age"
label(tabledata$RIDRETH3) <- "Ethnicity"
label(tabledata$DMDMARTL) <- "Marital Status"
label(tabledata$MetS) <- "Metabolic Syndrome"
label(tabledata$avgsleep) <- "Average Sleep Duration"
label(tabledata$Ed_status) <- "Education"
label(tabledata$BMXWAIST) <- "Waist Circumference"
label(tabledata$BP_coalesced) <- "Systolic Blood Pressure"
label(tabledata$Dias_coalesced) <- "Diastolic Blood Pressure"
label(tabledata$LBDHDD) <- "High Density Lipoprotein"
label(tabledata$LBXTR) <- "Triglyceride"
label(tabledata$LBXGLU) <- "Fasting Blood Glucose"
label(tabledata$RIAGENDR) <- "Gender"
label(tabledata$hoursit) <- "Time Sitting"
units(tabledata$age_cat) <- "years"
units(tabledata$avgsleep) <- "hours"
units(tabledata$BMXWAIST) <- "cm"
units(tabledata$BP_coalesced) <- "mmHg"
units(tabledata$Dias_coalesced) <- "mmHg"
units(tabledata$LBDHDD) <- "mg/dL"
units(tabledata$LBXTR) <- "mg/dL"
units(tabledata$LBXGLU) <- "mg/dL"
units(tabledata$hoursit) <- "hours/day"
table1(~ age_cat + RIAGENDR + RIDRETH3 + DMDMARTL + Ed_status + hoursit + avgsleep +
Dias_coalesced + BP_coalesced + LBXGLU + LBDHDD + LBXTR + BMXWAIST
| MetS, data=tabledata, overall="Total", caption = "Table 1",
render.continuous=c(.="Mean (SD)", .="Median (IQR)"))
| No (N=4214) |
Yes (N=1160) |
Total (N=5374) |
|
|---|---|---|---|
| Age (years) | |||
| 18-34 | 1275 (30.3%) | 123 (10.6%) | 1398 (26.0%) |
| 35-52 | 1067 (25.3%) | 303 (26.1%) | 1370 (25.5%) |
| 53-65 | 972 (23.1%) | 374 (32.2%) | 1346 (25.0%) |
| 66+ | 900 (21.4%) | 360 (31.0%) | 1260 (23.4%) |
| Gender | |||
| Male | 2086 (49.5%) | 525 (45.3%) | 2611 (48.6%) |
| Female | 2128 (50.5%) | 635 (54.7%) | 2763 (51.4%) |
| Ethnicity | |||
| Asian | 658 (15.6%) | 123 (10.6%) | 781 (14.5%) |
| Black | 1023 (24.3%) | 219 (18.9%) | 1242 (23.1%) |
| Hispanic | 902 (21.4%) | 324 (27.9%) | 1226 (22.8%) |
| Other/Multi-Racial | 204 (4.8%) | 74.0 (6.4%) | 278 (5.2%) |
| White | 1427 (33.9%) | 420 (36.2%) | 1847 (34.4%) |
| Marital Status | |||
| Divorced/Separated | 553 (13.1%) | 207 (17.8%) | 760 (14.1%) |
| Married/Living with Partner | 2360 (56.0%) | 698 (60.2%) | 3058 (56.9%) |
| Never Married | 761 (18.1%) | 138 (11.9%) | 899 (16.7%) |
| Widowed | 301 (7.1%) | 107 (9.2%) | 408 (7.6%) |
| Missing | 239 (5.7%) | 10.0 (0.9%) | 249 (4.6%) |
| Education | |||
| < 9th Grade | 296 (7.0%) | 132 (11.4%) | 428 (8.0%) |
| College Graduate | 1049 (24.9%) | 208 (17.9%) | 1257 (23.4%) |
| High School Graduate/GED | 1045 (24.8%) | 293 (25.3%) | 1338 (24.9%) |
| Some College | 1361 (32.3%) | 363 (31.3%) | 1724 (32.1%) |
| Some High School | 456 (10.8%) | 163 (14.1%) | 619 (11.5%) |
| Missing | 7.00 (0.2%) | 1.00 (0.1%) | 8.00 (0.1%) |
| Time Sitting (hours/day) | |||
| Mean (SD) | 5.49 (3.30) | 5.65 (3.43) | 5.53 (3.33) |
| Median (IQR) | 5.00 (5.00) | 5.00 (5.00) | 5.00 (5.00) |
| Missing | 27.0 (0.6%) | 6.00 (0.5%) | 33.0 (0.6%) |
| Average Sleep Duration (hours) | |||
| Mean (SD) | 7.70 (1.38) | 7.64 (1.41) | 7.69 (1.39) |
| Median (IQR) | 7.86 (1.57) | 7.71 (1.71) | 7.86 (1.57) |
| Diastolic Blood Pressure (mmHg) | |||
| Mean (SD) | 71.1 (13.0) | 75.5 (15.1) | 72.1 (13.6) |
| Median (IQR) | 72.0 (14.0) | 76.0 (16.0) | 72.0 (16.0) |
| Missing | 226 (5.4%) | 22.0 (1.9%) | 248 (4.6%) |
| Systolic Blood Pressure (mmHg) | |||
| Mean (SD) | 124 (19.4) | 138 (20.2) | 127 (20.4) |
| Median (IQR) | 120 (22.0) | 136 (22.0) | 124 (26.0) |
| Missing | 226 (5.4%) | 22.0 (1.9%) | 248 (4.6%) |
| Fasting Blood Glucose (mg/dL) | |||
| Mean (SD) | 105 (25.5) | 131 (47.9) | 114 (37.4) |
| Median (IQR) | 99.0 (14.0) | 114 (29.8) | 104 (20.0) |
| Missing | 2626 (62.3%) | 258 (22.2%) | 2884 (53.7%) |
| High Density Lipoprotein (mg/dL) | |||
| Mean (SD) | 55.7 (15.4) | 44.3 (11.8) | 53.1 (15.4) |
| Median (IQR) | 54.0 (19.0) | 42.0 (11.8) | 51.0 (19.0) |
| Missing | 321 (7.6%) | 6.00 (0.5%) | 327 (6.1%) |
| Triglyceride (mg/dL) | |||
| Mean (SD) | 83.6 (43.5) | 163 (147) | 113 (103) |
| Median (IQR) | 75.0 (52.5) | 137 (97.0) | 92.0 (73.5) |
| Missing | 2663 (63.2%) | 264 (22.8%) | 2927 (54.5%) |
| Waist Circumference (cm) | |||
| Mean (SD) | 97.1 (16.4) | 111 (15.3) | 100 (17.2) |
| Median (IQR) | 95.8 (21.9) | 109 (18.8) | 99.0 (22.9) |
| Missing | 281 (6.7%) | 15.0 (1.3%) | 296 (5.5%) |
Here I recreate the first model from the source paper, a univariate generalized additive model of metabolic syndrome (yes/no) on average sleep duration:
gam.fit = gam(MetS ~ s(avgsleep),family=binomial(),data=data2)
summary(gam.fit)
##
## Family: binomial
## Link function: logit
##
## Formula:
## MetS ~ s(avgsleep)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.28967 0.03264 -39.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(avgsleep) 1.001 1.003 1.243 0.265
##
## R-sq.(adj) = 4.69e-05 Deviance explained = 0.0215%
## UBRE = 0.044023 Scale est. = 1 n = 5547
fits = predict(gam.fit, newdata=data2, type='response', se=T)
predicts = data.frame(data2, fits) %>%
mutate(lower = fit - 1.96*se.fit,
upper = fit + 1.96*se.fit)
(plot_gam = ggplot(aes(x=avgsleep,y=fit), data=predicts) +
geom_ribbon(aes(ymin = lower, ymax=upper), fill='gray90') +
geom_line(color='#00aaff') +
theme_trueMinimal() + labs(y= "Metabolic Syndrome Prediction", x = "Average Sleep (hours)", title = "Predicting Metabolic Syndrome"))