This data is from the NHANES (National Health and Nutrition Examination Survey) from the CDC (https://www.cdc.gov/nchs/nhanes/index.htm). Here, my aim is to tidy the data in order to recreate an analysis on the effect of sleep on metabolic syndrome, seen here: https://pubmed.ncbi.nlm.nih.gov/31717770/.

Data Source:

# Libraries 
library(NHANES)
library(foreign)
library(tidyverse)
library(stats)
library(tibble)
library(table1)
library(Hmisc)
library(mgcv)
library(visibly)

# Importing the data

#demographic data
demog = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
              destfile = demog,
              mode = "wb")
demog = read.xport(demog)

demog = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
              destfile = demog,
              mode = "wb")
demog = read.xport(demog)


#body measurements
size = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BMX_J.XPT",
              destfile = size,
              mode = "wb")
size = read.xport(size)

#blood pressure
bp = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPX_J.XPT",
              destfile = bp,
              mode = "wb")
bp = read.xport(bp)

#HDL
hdl = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HDL_J.XPT",
              destfile = hdl,
              mode = "wb")
hdl = read.xport(hdl)

#Triglycerides 
fat = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TRIGLY_J.XPT",
              destfile = fat,
              mode = "wb")
fat = read.xport(fat)

#fasting glucose
f_gluc = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/GLU_J.XPT",
              destfile = f_gluc,
              mode = "wb")
f_gluc = read.xport(f_gluc)

#prescription medications
##### refer to appendix of meds documentation for code definitions

meds = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/RXQ_RX_J.XPT",
              destfile = meds,
              mode = "wb")
meds = read.xport(meds) %>%
  
  filter(RXDRSC1 == "R73" | RXDRSC1 == "I10" | RXDRSC1 == "I10.P") %>%
  
  mutate(RelMeds = case_when(
    RXDRSC1 %in% c("R73") ~ "Rx_Glucose",
    RXDRSC1 %in% c("I10", "I10.P") ~ "Rx_BP")) %>%
  
  pivot_wider(names_from = RelMeds, values_from = RXDUSE) %>%
  
  select(SEQN, Rx_Glucose)%>%
  
  distinct(SEQN, Rx_Glucose) 


#sleep time
sleep = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SLQ_J.XPT",
              destfile = sleep,
              mode = "wb")
sleep = read.xport(sleep)


#Time sitting
sit = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/PAQ_J.XPT",
              destfile = sit,
              mode = "wb")

sit = read.xport(sit)

Note that there are quite a few systolic and diastolic blood pressure measurements; most subjects will have data for only some of these. Therefore we coalesce these columns into a single set of variables: systolic and diastolic blood pressure.

#
bp = bp %>% 
  mutate(Systolic_coalesced = coalesce(BPXSY1, BPXSY2, BPXSY3, BPXSY4)) %>%
  mutate(Dias_coalesced = coalesce(BPXDI1, BPXDI2, BPXDI3, BPXDI4))

Data Tidying

Next, I do the bulk of my data tidying. Here I merge the datasets, re-code missing values, factorize categorical variables, etc. In the source paper, metabolic syndrome is defined as having 3 or more of the following criteria: obesity (measured by waist circumference), elevated serum triglycerides, elevated serum HDL cholesterol, high blood pressure, and high fasting blood glucose.

A note on my approach: if you read through the source paper, you will see that the authors have done a ‘completer’ analysis, i.e., they removed any subject missing any of the data they were interested in. My opinion (backed up by statistical theory) is that this is not a great approach. Completers tend to be different from the original population, so a completer sample is no longer a random sample, and, critically, the ways it diverges from a random sample are unknowable. This is a Bad Thing if your aim is inference.

#Merging data sets by subject id (SEQN)
data2 = inner_join(demog, sleep, by="SEQN") %>% # keep all common to demog and sleep
  left_join(bp, by="SEQN") %>%    
  left_join(f_gluc, by="SEQN") %>%      
  left_join(fat, by="SEQN") %>% 
  left_join(hdl, by="SEQN") %>%
  left_join(size, by="SEQN") %>%
  left_join(meds, by="SEQN") %>%
  left_join(sit, by="SEQN")

# Select the variables of interest:
data2 = data2 %>%
select(SEQN, 
       RIAGENDR,#Gender
       RIDAGEYR,#Age in years at screening
       RIDRETH3,#Race/Hispanic origin w/ NH Asian
       DMDEDUC3,#Education level - Children/Youth 6-19
       DMDEDUC2,#Education level - Adults 20+
       DMDMARTL,#Marital status
       WTINT2YR, #Full sample 2 year interview weight
       WTMEC2YR, #Full sample 2 year MEC exam weight
       BMXWAIST, #Waist circumference 
       Systolic_coalesced, #Systolic: Blood pres coalesced
       Dias_coalesced, #Diastolic: Blood pres coalesced
       LBDHDD, #Direct HDL-Cholesterol (mg/dL)
       LBXTR, #Triglyceride (mg/dL)
       LBXGLU, #Fasting Glucose (mg/dL)
       Rx_Glucose, #Treatment for high blood glucose
       SLD012, #Sleep hours - weekdays or workdays
       SLD013, #Sleep hours - weekends
       PAD680, #minutes sitting
       #Survey design variables
       SDMVPSU,
       SDMVSTRA,
       WTMEC2YR)%>%
  
# Re-code missing data to "na"
  
  mutate_at(c("SLD012", "SLD013"),
            na_if, 77777) %>%
  mutate_at(c("SLD012", "SLD013"),
            na_if, 99999) %>%
  mutate_at(c("DMDEDUC3", "DMDEDUC2", "DMDMARTL"),
            na_if, 77) %>%
  mutate_at(c("DMDEDUC3", "DMDEDUC2", "DMDMARTL"),
            na_if, 99) %>%
  mutate_at(c("DMDEDUC2"),
            na_if, 7) %>%
  mutate_at(c("DMDEDUC2"),
            na_if, 9) %>%
  mutate_at(vars(starts_with("BPX")),
            na_if, 7) %>%
  mutate_at(vars(starts_with("DPQ")),
            na_if, 9) %>%
  mutate_at(vars(starts_with("RXDR")),
            na_if, 77777) %>%
  mutate_at(vars(starts_with("RXDR")),
            na_if, 99999) %>%
  mutate_at(vars(starts_with("RXDR")),
            na_if, 55555) %>%  
  mutate_at(c("PAD680"),
            na_if, 7777) %>%
  mutate_at(c("PAD680"),
            na_if, 9999) %>%

# Filter out subjects under the age of 18  
  filter(RIDAGEYR >= 18) %>%

# Factorize gender variable  
  mutate(RIAGENDR = factor(RIAGENDR, 
                         labels = c('Male', 'Female'))) %>%

# Creating binary variables for diagnostic criteria
  
# factor 1: high waist circumference; cut-off is 102 cm for males and 88 cm for females  
mutate(mets1 = if_else(BMXWAIST >= 102, 1,
                       if_else(BMXWAIST >= 88 & RIAGENDR == 'Female', 1, 0))) %>%

# factor 2: elevated triglycerides  
mutate(mets2 = if_else(LBXTR >= 150, 1,0)) %>% #no meds indicated for elev triglycer       

# factor 3: Direct HDL-Cholesterol (mg/dL)   
mutate(mets3 = if_else(LBDHDD < 50 & RIAGENDR == 'Female', 1, 
                       if_else(LBDHDD < 40 & RIAGENDR == 'Male', 1, 0))) %>%  

# factor 4: high blood pressure 
mutate(mets4 = if_else(Systolic_coalesced >= 130, 1,
               if_else(Dias_coalesced >= 85, 1, 0))) %>%

# factor 5: high blood glucose   
mutate(mets5 =  if_else(LBXGLU >= 100, 1, 
                if_else(Rx_Glucose == 1, 1,0))) %>% 

# Average sleep duration    
mutate(avgsleep = (5/7)*SLD012 + (2/7)*SLD013) %>%

# Average time sitting    
mutate(hoursit = PAD680/60) %>% 
 
# Filter out subjects missing sleep variable 
filter(!is.na(avgsleep)) %>%

# Filter out subjects missing all 5 mets categories  
filter(!(is.na(mets1) & is.na(mets2) & is.na(mets3) & is.na(mets4) & 
           is.na(mets5))) %>%    #no one missing 4 categories
  

#filter(avgsleep < (2*sd(avgsleep, na.rm = T) + mean(avgsleep))) %>%

# age categories 
mutate(age_cat = cut(RIDAGEYR, 
                     breaks = quantile(RIDAGEYR, probs = c(0, 1/4, 2/4, 3/4, 1)), 
                     include.lowest = T,
                     labels = c("18-34", "35-52", "53-65", "66+"))) %>%

# Sleep categories  
mutate(sleep_cat = cut(avgsleep, 
                       breaks = quantile(avgsleep, probs = c(0, 1/4, 2/4, 3/4, 1)), 
                       include.lowest = T,
                       #labels = c("<3 - 7", ">7 - 7.86", ">7.86 - 8.57", ">8.57 - 10.9")
                       )) %>%   

# Tallying up diagnostic criteria    
  mutate(metstot = rowSums(select(., starts_with("mets")), na.rm = T)) %>%
  mutate(MetS = factor(metstot >= 3, labels = c("No", "Yes"))) %>%

# Race/Ethnicity
  mutate(RIDRETH3 = case_when(
    RIDRETH3 %in% c(1,2) ~ "Hispanic",
    RIDRETH3 %in% c(3) ~ "White",
    RIDRETH3 %in% c(4) ~ "Black",
    RIDRETH3 %in% c(6) ~ "Asian",
    RIDRETH3 %in% c(7) ~ "Other/Multi-Racial"))  %>%
  
  mutate(RIDRETH3 = factor(RIDRETH3)) %>%
  
# Education
mutate(Ed_status = case_when(
  DMDEDUC3 %in% c(0:8, 55, 66) ~ "< 9th Grade",
  DMDEDUC3 %in% c(9:12) ~ "Some High School",
  DMDEDUC3 %in% c(13:14) ~ "High School Graduate/GED",
  DMDEDUC3 %in% c(15) ~ "Some College",
  DMDEDUC2 %in% c(1) ~ "< 9th Grade",
  DMDEDUC2 %in% c(2) ~ "Some High School",
  DMDEDUC2 %in% c(3) ~ "High School Graduate/GED",
  DMDEDUC2 %in% c(4) ~ "Some College",
  DMDEDUC2 %in% c(5) ~ "College Graduate")) %>%
  
  mutate(Ed_status = factor(Ed_status))  %>%

# Marital Status    
  mutate(DMDMARTL = case_when(
    DMDMARTL %in% c(1,6) ~ "Married/Living with Partner",
    DMDMARTL %in% c(2) ~ "Widowed",
    DMDMARTL %in% c(3:4) ~ "Divorced/Separated",
    DMDMARTL %in% c(5) ~ "Never Married")) %>%
  
  mutate(DMDMARTL = factor(DMDMARTL))      
view(data2)
save(data2, file="nhanesdata2.RData")

summary(data2$sleep_cat)
##       [2,7]    (7,7.93] (7.93,8.71]   (8.71,14] 
##        1675        1173        1345        1354
tabledata = data2 %>%
  select(SEQN, 
         RIAGENDR,
         RIDAGEYR,
         RIDRETH3,
         DMDMARTL,
         mets1,
         mets2,
         mets3,
         mets4,
         mets5,
         metstot,
         MetS,
         avgsleep,
         age_cat,
         sleep_cat,
         Ed_status, 
         BMXWAIST,
         Systolic_coalesced,
         Dias_coalesced,
         LBDHDD,
         LBXTR,
         LBXGLU, 
         hoursit)

Then I recreate Table 1:

load("tabledata.RData")
label(tabledata$RIAGENDR)       <- "Sex" 
label(tabledata$age_cat)       <- "Age" 
label(tabledata$RIDRETH3)     <- "Ethnicity" 
label(tabledata$DMDMARTL) <- "Marital Status"
label(tabledata$MetS) <- "Metabolic Syndrome"
label(tabledata$avgsleep) <- "Average Sleep Duration"
label(tabledata$Ed_status) <- "Education"
label(tabledata$BMXWAIST) <- "Waist Circumference" 
label(tabledata$BP_coalesced) <- "Systolic Blood Pressure" 
label(tabledata$Dias_coalesced) <- "Diastolic Blood Pressure"
label(tabledata$LBDHDD) <- "High Density Lipoprotein" 
label(tabledata$LBXTR) <- "Triglyceride" 
label(tabledata$LBXGLU) <- "Fasting Blood Glucose" 
label(tabledata$RIAGENDR)     <- "Gender" 
label(tabledata$hoursit) <- "Time Sitting"
  units(tabledata$age_cat)       <- "years" 
  units(tabledata$avgsleep) <- "hours"
  units(tabledata$BMXWAIST) <- "cm" 
  units(tabledata$BP_coalesced) <- "mmHg" 
  units(tabledata$Dias_coalesced) <- "mmHg"
  units(tabledata$LBDHDD) <- "mg/dL" 
  units(tabledata$LBXTR) <- "mg/dL" 
  units(tabledata$LBXGLU) <- "mg/dL" 
  units(tabledata$hoursit) <- "hours/day"
table1(~ age_cat + RIAGENDR + RIDRETH3 + DMDMARTL + Ed_status + hoursit + avgsleep + 
         Dias_coalesced + BP_coalesced + LBXGLU + LBDHDD + LBXTR + BMXWAIST 
       | MetS, data=tabledata, overall="Total", caption = "Table 1",
       render.continuous=c(.="Mean (SD)", .="Median (IQR)"))
Table 1
No
(N=4214)
Yes
(N=1160)
Total
(N=5374)
Age (years)
18-34 1275 (30.3%) 123 (10.6%) 1398 (26.0%)
35-52 1067 (25.3%) 303 (26.1%) 1370 (25.5%)
53-65 972 (23.1%) 374 (32.2%) 1346 (25.0%)
66+ 900 (21.4%) 360 (31.0%) 1260 (23.4%)
Gender
Male 2086 (49.5%) 525 (45.3%) 2611 (48.6%)
Female 2128 (50.5%) 635 (54.7%) 2763 (51.4%)
Ethnicity
Asian 658 (15.6%) 123 (10.6%) 781 (14.5%)
Black 1023 (24.3%) 219 (18.9%) 1242 (23.1%)
Hispanic 902 (21.4%) 324 (27.9%) 1226 (22.8%)
Other/Multi-Racial 204 (4.8%) 74.0 (6.4%) 278 (5.2%)
White 1427 (33.9%) 420 (36.2%) 1847 (34.4%)
Marital Status
Divorced/Separated 553 (13.1%) 207 (17.8%) 760 (14.1%)
Married/Living with Partner 2360 (56.0%) 698 (60.2%) 3058 (56.9%)
Never Married 761 (18.1%) 138 (11.9%) 899 (16.7%)
Widowed 301 (7.1%) 107 (9.2%) 408 (7.6%)
Missing 239 (5.7%) 10.0 (0.9%) 249 (4.6%)
Education
< 9th Grade 296 (7.0%) 132 (11.4%) 428 (8.0%)
College Graduate 1049 (24.9%) 208 (17.9%) 1257 (23.4%)
High School Graduate/GED 1045 (24.8%) 293 (25.3%) 1338 (24.9%)
Some College 1361 (32.3%) 363 (31.3%) 1724 (32.1%)
Some High School 456 (10.8%) 163 (14.1%) 619 (11.5%)
Missing 7.00 (0.2%) 1.00 (0.1%) 8.00 (0.1%)
Time Sitting (hours/day)
Mean (SD) 5.49 (3.30) 5.65 (3.43) 5.53 (3.33)
Median (IQR) 5.00 (5.00) 5.00 (5.00) 5.00 (5.00)
Missing 27.0 (0.6%) 6.00 (0.5%) 33.0 (0.6%)
Average Sleep Duration (hours)
Mean (SD) 7.70 (1.38) 7.64 (1.41) 7.69 (1.39)
Median (IQR) 7.86 (1.57) 7.71 (1.71) 7.86 (1.57)
Diastolic Blood Pressure (mmHg)
Mean (SD) 71.1 (13.0) 75.5 (15.1) 72.1 (13.6)
Median (IQR) 72.0 (14.0) 76.0 (16.0) 72.0 (16.0)
Missing 226 (5.4%) 22.0 (1.9%) 248 (4.6%)
Systolic Blood Pressure (mmHg)
Mean (SD) 124 (19.4) 138 (20.2) 127 (20.4)
Median (IQR) 120 (22.0) 136 (22.0) 124 (26.0)
Missing 226 (5.4%) 22.0 (1.9%) 248 (4.6%)
Fasting Blood Glucose (mg/dL)
Mean (SD) 105 (25.5) 131 (47.9) 114 (37.4)
Median (IQR) 99.0 (14.0) 114 (29.8) 104 (20.0)
Missing 2626 (62.3%) 258 (22.2%) 2884 (53.7%)
High Density Lipoprotein (mg/dL)
Mean (SD) 55.7 (15.4) 44.3 (11.8) 53.1 (15.4)
Median (IQR) 54.0 (19.0) 42.0 (11.8) 51.0 (19.0)
Missing 321 (7.6%) 6.00 (0.5%) 327 (6.1%)
Triglyceride (mg/dL)
Mean (SD) 83.6 (43.5) 163 (147) 113 (103)
Median (IQR) 75.0 (52.5) 137 (97.0) 92.0 (73.5)
Missing 2663 (63.2%) 264 (22.8%) 2927 (54.5%)
Waist Circumference (cm)
Mean (SD) 97.1 (16.4) 111 (15.3) 100 (17.2)
Median (IQR) 95.8 (21.9) 109 (18.8) 99.0 (22.9)
Missing 281 (6.7%) 15.0 (1.3%) 296 (5.5%)

Model

Here I recreate the first model from the source paper, a univariate generalized additive model of metabolic syndrome (yes/no) on average sleep duration:

gam.fit = gam(MetS ~ s(avgsleep),family=binomial(),data=data2)

summary(gam.fit)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## MetS ~ s(avgsleep)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.28967    0.03264  -39.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value
## s(avgsleep) 1.001  1.003  1.243   0.265
## 
## R-sq.(adj) =  4.69e-05   Deviance explained = 0.0215%
## UBRE = 0.044023  Scale est. = 1         n = 5547
fits = predict(gam.fit, newdata=data2, type='response', se=T)
predicts = data.frame(data2, fits) %>% 
  mutate(lower = fit - 1.96*se.fit,
         upper = fit + 1.96*se.fit)

(plot_gam = ggplot(aes(x=avgsleep,y=fit), data=predicts) +
  geom_ribbon(aes(ymin = lower, ymax=upper), fill='gray90') +
  geom_line(color='#00aaff') +
  theme_trueMinimal() + labs(y= "Metabolic Syndrome Prediction", x = "Average Sleep (hours)", title = "Predicting Metabolic Syndrome"))