Tidying and Generalized Additive Models

This data is from the NHANES (National Health and Nutrition Examination Survey) from the CDC (https://www.cdc.gov/nchs/nhanes/index.htm). Here, my aim is to tidy the data in order to recreate an analysis on the effect of sleep on metabolic syndrome, seen here: https://pubmed.ncbi.nlm.nih.gov/31717770/.

Data Source:

# Libraries 
library(NHANES)
library(foreign)
library(tidyverse)
library(stats)
library(tibble)
library(table1)
library(Hmisc)
library(mgcv)
library(visibly)

# Importing the data

#demographic data
demog = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
              destfile = demog,
              mode = "wb")
demog = read.xport(demog)

demog = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
              destfile = demog,
              mode = "wb")
demog = read.xport(demog)


#body measurements
size = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BMX_J.XPT",
              destfile = size,
              mode = "wb")
size = read.xport(size)

#blood pressure
bp = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPX_J.XPT",
              destfile = bp,
              mode = "wb")
bp = read.xport(bp)

#HDL
hdl = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HDL_J.XPT",
              destfile = hdl,
              mode = "wb")
hdl = read.xport(hdl)

#Triglycerides 
fat = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TRIGLY_J.XPT",
              destfile = fat,
              mode = "wb")
fat = read.xport(fat)

#fasting glucose
f_gluc = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/GLU_J.XPT",
              destfile = f_gluc,
              mode = "wb")
f_gluc = read.xport(f_gluc)

#prescription medications
##### refer to appendix of meds documentation for code definitions

meds = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/RXQ_RX_J.XPT",
              destfile = meds,
              mode = "wb")
meds = read.xport(meds) %>%
  
  filter(RXDRSC1 == "R73" | RXDRSC1 == "I10" | RXDRSC1 == "I10.P") %>%
  
  mutate(RelMeds = case_when(
    RXDRSC1 %in% c("R73") ~ "Rx_Glucose",
    RXDRSC1 %in% c("I10", "I10.P") ~ "Rx_BP")) %>%
  
  pivot_wider(names_from = RelMeds, values_from = RXDUSE) %>%
  
  select(SEQN, Rx_Glucose)%>%
  
  distinct(SEQN, Rx_Glucose) 


#sleep time
sleep = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SLQ_J.XPT",
              destfile = sleep,
              mode = "wb")
sleep = read.xport(sleep)


#Time sitting
sit = tempfile()

download.file("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/PAQ_J.XPT",
              destfile = sit,
              mode = "wb")

sit = read.xport(sit)

Note that there are quite a few systolic and diastolic blood pressure measurements; most subjects will have data for only some of these. Therefore we coalesce these columns into a single set of variables: systolic and diastolic blood pressure.

#
bp = bp %>% 
  mutate(Systolic_coalesced = coalesce(BPXSY1, BPXSY2, BPXSY3, BPXSY4)) %>%
  mutate(Dias_coalesced = coalesce(BPXDI1, BPXDI2, BPXDI3, BPXDI4))

Data Tidying

Next, I do the bulk of my data tidying. Here I merge the datasets, re-code missing values, factorize categorical variables, etc. In the source paper, metabolic syndrome is defined as having 3 or more of the following criteria: obesity (measured by waist circumference), elevated serum triglycerides, elevated serum HDL cholesterol, high blood pressure, and high fasting blood glucose.

A note on my approach: if you read through the source paper, you will see that the authors have done a ‘completer’ analysis, i.e., they removed any subject missing any of the data they were interested in. My opinion (backed up by statistical theory) is that this is not a great approach. Completers tend to be different from the original population, so a completer sample is no longer a random sample, and, critically, the ways it diverges from a random sample are unknowable. This is a Bad Thing if your aim is inference.

#Merging data sets by subject id (SEQN)
data2 = inner_join(demog, sleep, by="SEQN") %>% # keep all common to demog and sleep
  left_join(bp, by="SEQN") %>%    
  left_join(f_gluc, by="SEQN") %>%      
  left_join(fat, by="SEQN") %>% 
  left_join(hdl, by="SEQN") %>%
  left_join(size, by="SEQN") %>%
  left_join(meds, by="SEQN") %>%
  left_join(sit, by="SEQN")

# Select the variables of interest:
data2 = data2 %>%
select(SEQN, 
       RIAGENDR,#Gender
       RIDAGEYR,#Age in years at screening
       RIDRETH3,#Race/Hispanic origin w/ NH Asian
       DMDEDUC3,#Education level - Children/Youth 6-19
       DMDEDUC2,#Education level - Adults 20+
       DMDMARTL,#Marital status
       WTINT2YR, #Full sample 2 year interview weight
       WTMEC2YR, #Full sample 2 year MEC exam weight
       BMXWAIST, #Waist circumference 
       Systolic_coalesced, #Systolic: Blood pres coalesced
       Dias_coalesced, #Diastolic: Blood pres coalesced
       LBDHDD, #Direct HDL-Cholesterol (mg/dL)
       LBXTR, #Triglyceride (mg/dL)
       LBXGLU, #Fasting Glucose (mg/dL)
       Rx_Glucose, #Treatment for high blood glucose
       SLD012, #Sleep hours - weekdays or workdays
       SLD013, #Sleep hours - weekends
       PAD680, #minutes sitting
       #Survey design variables
       SDMVPSU,
       SDMVSTRA,
       WTMEC2YR)%>%
  
# Re-code missing data to "na"
  
  mutate_at(c("SLD012", "SLD013"),
            na_if, 77777) %>%
  mutate_at(c("SLD012", "SLD013"),
            na_if, 99999) %>%
  mutate_at(c("DMDEDUC3", "DMDEDUC2", "DMDMARTL"),
            na_if, 77) %>%
  mutate_at(c("DMDEDUC3", "DMDEDUC2", "DMDMARTL"),
            na_if, 99) %>%
  mutate_at(c("DMDEDUC2"),
            na_if, 7) %>%
  mutate_at(c("DMDEDUC2"),
            na_if, 9) %>%
  mutate_at(vars(starts_with("BPX")),
            na_if, 7) %>%
  mutate_at(vars(starts_with("DPQ")),
            na_if, 9) %>%
  mutate_at(vars(starts_with("RXDR")),
            na_if, 77777) %>%
  mutate_at(vars(starts_with("RXDR")),
            na_if, 99999) %>%
  mutate_at(vars(starts_with("RXDR")),
            na_if, 55555) %>%  
  mutate_at(c("PAD680"),
            na_if, 7777) %>%
  mutate_at(c("PAD680"),
            na_if, 9999) %>%

# Filter out subjects under the age of 18  
  filter(RIDAGEYR >= 18) %>%

# Factorize gender variable  
  mutate(RIAGENDR = factor(RIAGENDR, 
                         labels = c('Male', 'Female'))) %>%

# Creating binary variables for diagnostic criteria
  
# factor 1: high waist circumference; cut-off is 102 cm for males and 88 cm for females  
mutate(mets1 = if_else(BMXWAIST >= 102, 1,
                       if_else(BMXWAIST >= 88 & RIAGENDR == 'Female', 1, 0))) %>%

# factor 2: elevated triglycerides  
mutate(mets2 = if_else(LBXTR >= 150, 1,0)) %>% #no meds indicated for elev triglycer       

# factor 3: Direct HDL-Cholesterol (mg/dL)   
mutate(mets3 = if_else(LBDHDD < 50 & RIAGENDR == 'Female', 1, 
                       if_else(LBDHDD < 40 & RIAGENDR == 'Male', 1, 0))) %>%  

# factor 4: high blood pressure 
mutate(mets4 = if_else(Systolic_coalesced >= 130, 1,
               if_else(Dias_coalesced >= 85, 1, 0))) %>%

# factor 5: high blood glucose   
mutate(mets5 =  if_else(LBXGLU >= 100, 1, 
                if_else(Rx_Glucose == 1, 1,0))) %>% 

# Average sleep duration    
mutate(avgsleep = (5/7)*SLD012 + (2/7)*SLD013) %>%

# Average time sitting    
mutate(hoursit = PAD680/60) %>% 
 
# Filter out subjects missing sleep variable 
filter(!is.na(avgsleep)) %>%

# Filter out subjects missing all 5 mets categories  
filter(!(is.na(mets1) & is.na(mets2) & is.na(mets3) & is.na(mets4) & 
           is.na(mets5))) %>%    #no one missing 4 categories
  

#filter(avgsleep < (2*sd(avgsleep, na.rm = T) + mean(avgsleep))) %>%

# age categories 
mutate(age_cat = cut(RIDAGEYR, 
                     breaks = quantile(RIDAGEYR, probs = c(0, 1/4, 2/4, 3/4, 1)), 
                     include.lowest = T,
                     labels = c("18-34", "35-52", "53-65", "66+"))) %>%

# Sleep categories  
mutate(sleep_cat = cut(avgsleep, 
                       breaks = quantile(avgsleep, probs = c(0, 1/4, 2/4, 3/4, 1)), 
                       include.lowest = T,
                       #labels = c("<3 - 7", ">7 - 7.86", ">7.86 - 8.57", ">8.57 - 10.9")
                       )) %>%   

# Tallying up diagnostic criteria    
  mutate(metstot = rowSums(select(., starts_with("mets")), na.rm = T)) %>%
  mutate(MetS = factor(metstot >= 3, labels = c("No", "Yes"))) %>%

# Race/Ethnicity
  mutate(RIDRETH3 = case_when(
    RIDRETH3 %in% c(1,2) ~ "Hispanic",
    RIDRETH3 %in% c(3) ~ "White",
    RIDRETH3 %in% c(4) ~ "Black",
    RIDRETH3 %in% c(6) ~ "Asian",
    RIDRETH3 %in% c(7) ~ "Other/Multi-Racial"))  %>%
  
  mutate(RIDRETH3 = factor(RIDRETH3)) %>%
  
# Education
mutate(Ed_status = case_when(
  DMDEDUC3 %in% c(0:8, 55, 66) ~ "< 9th Grade",
  DMDEDUC3 %in% c(9:12) ~ "Some High School",
  DMDEDUC3 %in% c(13:14) ~ "High School Graduate/GED",
  DMDEDUC3 %in% c(15) ~ "Some College",
  DMDEDUC2 %in% c(1) ~ "< 9th Grade",
  DMDEDUC2 %in% c(2) ~ "Some High School",
  DMDEDUC2 %in% c(3) ~ "High School Graduate/GED",
  DMDEDUC2 %in% c(4) ~ "Some College",
  DMDEDUC2 %in% c(5) ~ "College Graduate")) %>%
  
  mutate(Ed_status = factor(Ed_status))  %>%

# Marital Status    
  mutate(DMDMARTL = case_when(
    DMDMARTL %in% c(1,6) ~ "Married/Living with Partner",
    DMDMARTL %in% c(2) ~ "Widowed",
    DMDMARTL %in% c(3:4) ~ "Divorced/Separated",
    DMDMARTL %in% c(5) ~ "Never Married")) %>%
  
  mutate(DMDMARTL = factor(DMDMARTL))      
view(data2)
save(data2, file="nhanesdata2.RData")

summary(data2$sleep_cat)

##       [2,7]    (7,7.93] (7.93,8.71]   (8.71,14] 
##        1675        1173        1345        1354

tabledata = data2 %>%
  select(SEQN, 
         RIAGENDR,
         RIDAGEYR,
         RIDRETH3,
         DMDMARTL,
         mets1,
         mets2,
         mets3,
         mets4,
         mets5,
         metstot,
         MetS,
         avgsleep,
         age_cat,
         sleep_cat,
         Ed_status, 
         BMXWAIST,
         Systolic_coalesced,
         Dias_coalesced,
         LBDHDD,
         LBXTR,
         LBXGLU, 
         hoursit)

Then I recreate Table 1:

load("tabledata.RData")
label(tabledata$RIAGENDR)       <- "Sex" 
label(tabledata$age_cat)       <- "Age" 
label(tabledata$RIDRETH3)     <- "Ethnicity" 
label(tabledata$DMDMARTL) <- "Marital Status"
label(tabledata$MetS) <- "Metabolic Syndrome"
label(tabledata$avgsleep) <- "Average Sleep Duration"
label(tabledata$Ed_status) <- "Education"
label(tabledata$BMXWAIST) <- "Waist Circumference" 
label(tabledata$BP_coalesced) <- "Systolic Blood Pressure" 
label(tabledata$Dias_coalesced) <- "Diastolic Blood Pressure"
label(tabledata$LBDHDD) <- "High Density Lipoprotein" 
label(tabledata$LBXTR) <- "Triglyceride" 
label(tabledata$LBXGLU) <- "Fasting Blood Glucose" 
label(tabledata$RIAGENDR)     <- "Gender" 
label(tabledata$hoursit) <- "Time Sitting"
  units(tabledata$age_cat)       <- "years" 
  units(tabledata$avgsleep) <- "hours"
  units(tabledata$BMXWAIST) <- "cm" 
  units(tabledata$BP_coalesced) <- "mmHg" 
  units(tabledata$Dias_coalesced) <- "mmHg"
  units(tabledata$LBDHDD) <- "mg/dL" 
  units(tabledata$LBXTR) <- "mg/dL" 
  units(tabledata$LBXGLU) <- "mg/dL" 
  units(tabledata$hoursit) <- "hours/day"
table1(~ age_cat + RIAGENDR + RIDRETH3 + DMDMARTL + Ed_status + hoursit + avgsleep + 
         Dias_coalesced + BP_coalesced + LBXGLU + LBDHDD + LBXTR + BMXWAIST 
       | MetS, data=tabledata, overall="Total", caption = "Table 1",
       render.continuous=c(.="Mean (SD)", .="Median (IQR)"))

Table 1
	No (N=4214)	Yes (N=1160)	Total (N=5374)
Age (years)
18-34	1275 (30.3%)	123 (10.6%)	1398 (26.0%)
35-52	1067 (25.3%)	303 (26.1%)	1370 (25.5%)
53-65	972 (23.1%)	374 (32.2%)	1346 (25.0%)
66+	900 (21.4%)	360 (31.0%)	1260 (23.4%)
Gender
Male	2086 (49.5%)	525 (45.3%)	2611 (48.6%)
Female	2128 (50.5%)	635 (54.7%)	2763 (51.4%)
Ethnicity
Asian	658 (15.6%)	123 (10.6%)	781 (14.5%)
Black	1023 (24.3%)	219 (18.9%)	1242 (23.1%)
Hispanic	902 (21.4%)	324 (27.9%)	1226 (22.8%)
Other/Multi-Racial	204 (4.8%)	74.0 (6.4%)	278 (5.2%)
White	1427 (33.9%)	420 (36.2%)	1847 (34.4%)
Marital Status
Divorced/Separated	553 (13.1%)	207 (17.8%)	760 (14.1%)
Married/Living with Partner	2360 (56.0%)	698 (60.2%)	3058 (56.9%)
Never Married	761 (18.1%)	138 (11.9%)	899 (16.7%)
Widowed	301 (7.1%)	107 (9.2%)	408 (7.6%)
Missing	239 (5.7%)	10.0 (0.9%)	249 (4.6%)
Education
< 9th Grade	296 (7.0%)	132 (11.4%)	428 (8.0%)
College Graduate	1049 (24.9%)	208 (17.9%)	1257 (23.4%)
High School Graduate/GED	1045 (24.8%)	293 (25.3%)	1338 (24.9%)
Some College	1361 (32.3%)	363 (31.3%)	1724 (32.1%)
Some High School	456 (10.8%)	163 (14.1%)	619 (11.5%)
Missing	7.00 (0.2%)	1.00 (0.1%)	8.00 (0.1%)
Time Sitting (hours/day)
Mean (SD)	5.49 (3.30)	5.65 (3.43)	5.53 (3.33)
Median (IQR)	5.00 (5.00)	5.00 (5.00)	5.00 (5.00)
Missing	27.0 (0.6%)	6.00 (0.5%)	33.0 (0.6%)
Average Sleep Duration (hours)
Mean (SD)	7.70 (1.38)	7.64 (1.41)	7.69 (1.39)
Median (IQR)	7.86 (1.57)	7.71 (1.71)	7.86 (1.57)
Diastolic Blood Pressure (mmHg)
Mean (SD)	71.1 (13.0)	75.5 (15.1)	72.1 (13.6)
Median (IQR)	72.0 (14.0)	76.0 (16.0)	72.0 (16.0)
Missing	226 (5.4%)	22.0 (1.9%)	248 (4.6%)
Systolic Blood Pressure (mmHg)
Mean (SD)	124 (19.4)	138 (20.2)	127 (20.4)
Median (IQR)	120 (22.0)	136 (22.0)	124 (26.0)
Missing	226 (5.4%)	22.0 (1.9%)	248 (4.6%)
Fasting Blood Glucose (mg/dL)
Mean (SD)	105 (25.5)	131 (47.9)	114 (37.4)
Median (IQR)	99.0 (14.0)	114 (29.8)	104 (20.0)
Missing	2626 (62.3%)	258 (22.2%)	2884 (53.7%)
High Density Lipoprotein (mg/dL)
Mean (SD)	55.7 (15.4)	44.3 (11.8)	53.1 (15.4)
Median (IQR)	54.0 (19.0)	42.0 (11.8)	51.0 (19.0)
Missing	321 (7.6%)	6.00 (0.5%)	327 (6.1%)
Triglyceride (mg/dL)
Mean (SD)	83.6 (43.5)	163 (147)	113 (103)
Median (IQR)	75.0 (52.5)	137 (97.0)	92.0 (73.5)
Missing	2663 (63.2%)	264 (22.8%)	2927 (54.5%)
Waist Circumference (cm)
Mean (SD)	97.1 (16.4)	111 (15.3)	100 (17.2)
Median (IQR)	95.8 (21.9)	109 (18.8)	99.0 (22.9)
Missing	281 (6.7%)	15.0 (1.3%)	296 (5.5%)

Model

Here I recreate the first model from the source paper, a univariate generalized additive model of metabolic syndrome (yes/no) on average sleep duration:

gam.fit = gam(MetS ~ s(avgsleep),family=binomial(),data=data2)

summary(gam.fit)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## MetS ~ s(avgsleep)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.28967    0.03264  -39.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value
## s(avgsleep) 1.001  1.003  1.243   0.265
## 
## R-sq.(adj) =  4.69e-05   Deviance explained = 0.0215%
## UBRE = 0.044023  Scale est. = 1         n = 5547

fits = predict(gam.fit, newdata=data2, type='response', se=T)
predicts = data.frame(data2, fits) %>% 
  mutate(lower = fit - 1.96*se.fit,
         upper = fit + 1.96*se.fit)

(plot_gam = ggplot(aes(x=avgsleep,y=fit), data=predicts) +
  geom_ribbon(aes(ymin = lower, ymax=upper), fill='gray90') +
  geom_line(color='#00aaff') +
  theme_trueMinimal() + labs(y= "Metabolic Syndrome Prediction", x = "Average Sleep (hours)", title = "Predicting Metabolic Syndrome"))

Tidying and Generalized Additive Models

Rachel Cabrera

1/3/2022

Data Source:

Data Tidying

Model