load Packages

library(splines)
library(mgcv)
library(ggplot2)
library(dplyr)
library(car)

BRFSS DATA

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

set.seed(1234)
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]
renam<-names(brfss_17)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  renam))
names(brfss_17)<-newnames

Recode variables

#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")

# sex
brfss_17$male<-ifelse(brfss_17$sex==1,1,0)

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")


brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chrobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")

1) Define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor. My outcome variable for this homework is diabetes and my predictor variables are race, education, age, sex, and smoking

2) Using the gam() function, estimate a model with only linear terms in your model

wadt1<-gam(depdisorder ~ healthdays + bmi + sex + educ + race_eth,
            data=brfss_17,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(wadt1)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## depdisorder ~ healthdays + bmi + sex + educ + race_eth
## 
## Parametric coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.2354726  0.0344544 -93.906  < 2e-16 ***
## healthdays            0.0526890  0.0006126  86.008  < 2e-16 ***
## bmi                   0.0269174  0.0008935  30.127  < 2e-16 ***
## sex                   0.5886372  0.0118743  49.572  < 2e-16 ***
## educ0Prim            -0.0534858  0.0341241  -1.567    0.117    
## educ1somehs           0.3056937  0.0236962  12.901  < 2e-16 ***
## educ3somecol          0.1112310  0.0155881   7.136 9.63e-13 ***
## educ4colgrad         -0.1852941  0.0166096 -11.156  < 2e-16 ***
## race_ethhispanic     -0.4562135  0.0180147 -25.324  < 2e-16 ***
## race_ethnh black     -0.4832453  0.0189477 -25.504  < 2e-16 ***
## race_ethnh multirace  0.2384164  0.0452322   5.271 1.36e-07 ***
## race_ethnh other     -0.4289050  0.0263370 -16.285  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.0774   Deviance explained = 7.37%
## UBRE = -0.11492  Scale est. = 1         n = 202836

3) Repeat Step 2, but include a smooth term for at least one continuous variable

wadt2<-gam(depdisorder~ s(healthdays) + s(bmi) + sex + educ + race_eth,
            data=brfss_17,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(wadt2)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## depdisorder ~ s(healthdays) + s(bmi) + sex + educ + race_eth
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.23401    0.02242 -99.658  < 2e-16 ***
## sex                   0.54446    0.01194  45.608  < 2e-16 ***
## educ0Prim            -0.05467    0.03422  -1.598     0.11    
## educ1somehs           0.28454    0.02380  11.957  < 2e-16 ***
## educ3somecol          0.11114    0.01569   7.085 1.39e-12 ***
## educ4colgrad         -0.18003    0.01675 -10.750  < 2e-16 ***
## race_ethhispanic     -0.44488    0.01813 -24.540  < 2e-16 ***
## race_ethnh black     -0.49131    0.01907 -25.757  < 2e-16 ***
## race_ethnh multirace  0.21826    0.04551   4.796 1.62e-06 ***
## race_ethnh other     -0.43106    0.02657 -16.225  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq p-value    
## s(healthdays) 8.811  8.984 9245.4  <2e-16 ***
## s(bmi)        8.808  8.982  970.3  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0908   Deviance explained = 8.63%
## UBRE = -0.1268  Scale est. = 1         n = 202836

4) Test if the model in step 3 is fitting the data better than the purely linear model.

anova( wadt1, wadt2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: depdisorder ~ healthdays + bmi + sex + educ + race_eth
## Model 2: depdisorder ~ s(healthdays) + s(bmi) + sex + educ + race_eth
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1    202824     179502                              
## 2    202808     177060 15.966     2442 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

In comparing the models, the smooth fits the data better to the purely linear model. This is evident from its very small p-value.

5) Produce a plot of the smooth effect from the model in step 3

plot(wadt2)