Splines and Generalized Additive Models

#Packages
library(haven)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(splines)
library(mgcv)

## Loading required package: nlme

## Warning: package 'nlme' was built under R version 4.0.4

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## This is mgcv 1.8-35. For overview type 'help("mgcv-package")'.

#Import Data
BRFSS19<-read_xpt("C:/Users/adolp/Desktop/Statistics 2/BRFSS/LLCP2019.XPT")

#Clean Variables
nams<-names(BRFSS19)
head(BRFSS19, n=10)

## # A tibble: 10 x 342
##    `_STATE` FMONTH IDATE   IMONTH IDAY  IYEAR DISPCODE SEQNO     `_PSU` CTELENM1
##       <dbl>  <dbl> <chr>   <chr>  <chr> <chr>    <dbl> <chr>      <dbl>    <dbl>
##  1        1      1 011820~ 01     18    2019      1100 201900~   2.02e9        1
##  2        1      1 011320~ 01     13    2019      1100 201900~   2.02e9        1
##  3        1      1 011820~ 01     18    2019      1100 201900~   2.02e9        1
##  4        1      1 011820~ 01     18    2019      1200 201900~   2.02e9        1
##  5        1      1 010420~ 01     04    2019      1100 201900~   2.02e9        1
##  6        1      1 011820~ 01     18    2019      1200 201900~   2.02e9        1
##  7        1      1 010420~ 01     04    2019      1100 201900~   2.02e9        1
##  8        1      1 012320~ 01     23    2019      1100 201900~   2.02e9        1
##  9        1      1 012420~ 01     24    2019      1100 201900~   2.02e9        1
## 10        1      1 011320~ 01     13    2019      1100 201900~   2.02e9        1
## # ... with 332 more variables: PVTRESD1 <dbl>, COLGHOUS <dbl>, STATERE1 <dbl>,
## #   CELPHONE <dbl>, LADULT1 <dbl>, COLGSEX <dbl>, NUMADULT <dbl>,
## #   LANDSEX <dbl>, NUMMEN <dbl>, NUMWOMEN <dbl>, RESPSLCT <dbl>,
## #   SAFETIME <dbl>, CTELNUM1 <dbl>, CELLFON5 <dbl>, CADULT1 <dbl>,
## #   CELLSEX <dbl>, PVTRESD3 <dbl>, CCLGHOUS <dbl>, CSTATE1 <dbl>,
## #   LANDLINE <dbl>, HHADULT <dbl>, SEXVAR <dbl>, GENHLTH <dbl>, PHYSHLTH <dbl>,
## #   MENTHLTH <dbl>, POORHLTH <dbl>, HLTHPLN1 <dbl>, PERSDOC2 <dbl>,
## #   MEDCOST <dbl>, CHECKUP1 <dbl>, BPHIGH4 <dbl>, BPMEDS <dbl>, CHOLCHK2 <dbl>,
## #   TOLDHI2 <dbl>, CHOLMED2 <dbl>, CVDINFR4 <dbl>, CVDCRHD4 <dbl>,
## #   CVDSTRK3 <dbl>, ASTHMA3 <dbl>, ASTHNOW <dbl>, CHCSCNCR <dbl>,
## #   CHCOCNCR <dbl>, CHCCOPD2 <dbl>, ADDEPEV3 <dbl>, CHCKDNY2 <dbl>,
## #   DIABETE4 <dbl>, DIABAGE3 <dbl>, HAVARTH4 <dbl>, ARTHEXER <dbl>,
## #   ARTHEDU <dbl>, LMTJOIN3 <dbl>, ARTHDIS2 <dbl>, JOINPAI2 <dbl>,
## #   MARITAL <dbl>, EDUCA <dbl>, RENTHOM1 <dbl>, NUMHHOL3 <dbl>, NUMPHON3 <dbl>,
## #   CPDEMO1B <dbl>, VETERAN3 <dbl>, EMPLOY1 <dbl>, CHILDREN <dbl>,
## #   INCOME2 <dbl>, WEIGHT2 <dbl>, HEIGHT3 <dbl>, PREGNANT <dbl>, DEAF <dbl>,
## #   BLIND <dbl>, DECIDE <dbl>, DIFFWALK <dbl>, DIFFDRES <dbl>, DIFFALON <dbl>,
## #   SMOKE100 <dbl>, SMOKDAY2 <dbl>, STOPSMK2 <dbl>, LASTSMK2 <dbl>,
## #   USENOW3 <dbl>, ALCDAY5 <dbl>, AVEDRNK3 <dbl>, DRNK3GE5 <dbl>,
## #   MAXDRNKS <dbl>, EXERANY2 <dbl>, EXRACT11 <dbl>, EXEROFT1 <dbl>,
## #   EXERHMM1 <dbl>, EXRACT21 <dbl>, EXEROFT2 <dbl>, EXERHMM2 <dbl>,
## #   STRENGTH <dbl>, FRUIT2 <dbl>, FRUITJU2 <dbl>, FVGREEN1 <dbl>,
## #   FRENCHF1 <dbl>, POTATOE1 <dbl>, VEGETAB2 <dbl>, FLUSHOT7 <dbl>,
## #   FLSHTMY3 <dbl>, TETANUS1 <dbl>, PNEUVAC4 <dbl>, HIVTST7 <dbl>, ...

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(BRFSS19)<-newnames

#Recode Variables
#Independent Variables
BRFSS19$MaxDrnk<-Recode(BRFSS19$maxdrnks, recodes = "88=0; 77=NA; 99=NA")
BRFSS19$Alc<-Recode(BRFSS19$acedrink, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Dep<-Recode(BRFSS19$acedeprs, recodes ="7:9=NA; 1=1;2=0") 
BRFSS19$Gen<-Recode(BRFSS19$birthsex, recodes = "7:9=NA; 1=1;2=0")
#Dependent Variable (Mental health days = 1, No mental health days = 0)
BRFSS19$Mhlth<-Recode(BRFSS19$ment14d, recodes ="7:9=NA; 1=0;2=1;3=1")

#Complete Cases
library(dplyr)
BRFSS19sub<-BRFSS19%>%
  select(MaxDrnk,Alc,Dep,Gen,Mhlth,llcpwt, ststr )%>%
  filter(complete.cases(.))

brfgam<-gam(Mhlth~s(MaxDrnk)+ Alc + Dep+ Gen,
            data=BRFSS19,
            weights = ststr/mean(ststr, na.rm=T),
            family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(brfgam)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Mhlth ~ s(MaxDrnk) + Alc + Dep + Gen
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.41435    0.05371  -7.714 1.22e-14 ***
## Alc          0.39901    0.07965   5.010 5.45e-07 ***
## Dep          0.90511    0.08440  10.724  < 2e-16 ***
## Gen         -0.60030    0.06987  -8.592  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df Chi.sq p-value    
## s(MaxDrnk) 2.975   3.69  76.19  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0821   Deviance explained =  6.3%
## UBRE = 0.78377  Scale est. = 1         n = 2969

plot(brfgam)

brfgam2<-gam(Mhlth~ MaxDrnk+ Alc + Dep+ Gen,
            data=BRFSS19,
            weights = ststr/mean(ststr, na.rm=T),
            family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(brfgam2)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Mhlth ~ MaxDrnk + Alc + Dep + Gen
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.67209    0.05782 -11.623  < 2e-16 ***
## MaxDrnk      0.06539    0.01055   6.195 5.81e-10 ***
## Alc          0.41011    0.07921   5.177 2.25e-07 ***
## Dep          0.91171    0.08397  10.858  < 2e-16 ***
## Gen         -0.54286    0.06872  -7.900 2.79e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.0736   Deviance explained = 5.56%
## UBRE = 0.79638  Scale est. = 1         n = 2969

anova( brfgam2, brfgam, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: Mhlth ~ MaxDrnk + Alc + Dep + Gen
## Model 2: Mhlth ~ s(MaxDrnk) + Alc + Dep + Gen
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1    2964.0     5323.5                              
## 2    2961.3     5282.1 2.6897   41.388 3.337e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The continuous independent variable (Most drinks on single occasion past 30 days) on the dichotomous dependent variable (mental health days/no mental health days). There was a smooth non-linear association between the IV and DV with higher probability of having a mental health day at fewer drinks up until around 10 drinks, and then the relationship becomes negative, with the decrease in the probability of having a mental health day at higher drinks. In comparing the linear model and the additive model, the additive model explains the model better than the linear model.

Splines and Generalized Additive Models

Adolph Delgado

4/20/2021