Using splines in NHANES Data

library(splines)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
library(foreign)
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.3
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some

Load in NHANES data

load("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/nhanes_10_18_merged.Rdata")

1. Define an outcome and 1 continuous predictor

# The outcome variable used will be a1c levels of diabetes. The continuous predictor will be age and bmi.

2. Using the gam() function, estimate a model with only linear terms in your model

library(mgcv) #one library for GAMs

nhanes_all$diab2 <- Recode(nhanes_all$diab, recodes= "'yes'=1; 'no'=0")

nhns_gam <- gam(diab2~ridageyr+bmxbmi+sex+race_eth+educ,
                data=nhanes_all,
                weights=weight/mean(weight, na.rm=T),
                family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(nhns_gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## diab2 ~ ridageyr + bmxbmi + sex + race_eth + educ
## 
## Parametric coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -8.498970   0.258785 -32.842  < 2e-16 ***
## ridageyr            0.057154   0.002456  23.267  < 2e-16 ***
## bmxbmi              0.093525   0.004511  20.734  < 2e-16 ***
## sexMale             0.439629   0.070168   6.265 3.72e-10 ***
## race_eth2 hispanic  0.684258   0.101269   6.757 1.41e-11 ***
## race_eth3 black     0.667812   0.103956   6.424 1.33e-10 ***
## race_eth4 other     1.019279   0.137630   7.406 1.30e-13 ***
## educ2 hs           -0.199424   0.103231  -1.932  0.05338 .  
## educ3 some col     -0.329817   0.100149  -3.293  0.00099 ***
## educ4 col grad     -0.587749   0.108780  -5.403 6.55e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.0913   Deviance explained = 16.2%
## UBRE = -0.474  Scale est. = 1         n = 11335

3. Repeat Step 2, but include a smooth term for at least one continuous variable

nhns_gam2 <- gam(diab2~s(ridageyr)+s(bmxbmi)+sex+race_eth+educ,
                data=nhanes_all,
                weights=weight/mean(weight, na.rm=T),
                family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(nhns_gam2)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## diab2 ~ s(ridageyr) + s(bmxbmi) + sex + race_eth + educ
## 
## Parametric coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.01399    0.10740 -28.063  < 2e-16 ***
## sexMale             0.41899    0.07056   5.938 2.89e-09 ***
## race_eth2 hispanic  0.64861    0.10212   6.351 2.14e-10 ***
## race_eth3 black     0.66136    0.10361   6.383 1.74e-10 ***
## race_eth4 other     1.08309    0.13951   7.764 8.26e-15 ***
## educ2 hs           -0.22085    0.10380  -2.128 0.033373 *  
## educ3 some col     -0.37283    0.10079  -3.699 0.000216 ***
## educ4 col grad     -0.64878    0.10981  -5.908 3.46e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value    
## s(ridageyr) 2.874  3.594  464.7  <2e-16 ***
## s(bmxbmi)   3.746  4.736  427.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.109   Deviance explained = 17.9%
## UBRE = -0.48363  Scale est. = 1         n = 11335

4. Test if the model in step 3 is fitting the data better than the purely linear model.

anova( nhns_gam2, nhns_gam, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: diab2 ~ s(ridageyr) + s(bmxbmi) + sex + race_eth + educ
## Model 2: diab2 ~ ridageyr + bmxbmi + sex + race_eth + educ
##   Resid. Df Resid. Dev      Df Deviance  Pr(>Chi)    
## 1     11319     5823.8                               
## 2     11325     5942.2 -6.3302  -118.45 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The model with smooth terms fit the data better

5. Produce a plot of the smooth effect from the model in step 3

plot(nhns_gam2)