load Packages
library(splines)
library(mgcv)
library(ggplot2)
library(dplyr)
library(car)
BRFSS DATA
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
set.seed(1234)
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]
renam<-names(brfss_17)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = renam))
names(brfss_17)<-newnames
Recode variables
#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")
# sex
brfss_17$male<-ifelse(brfss_17$sex==1,1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2=1; 3:4=0; else=NA")
brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chrobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")
1) Define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor. My outcome variable for this homework is diabetes and my predictor variables are race, education, age, sex, and smoking
2) Using the gam() function, estimate a model with only linear terms in your model
wadt1<-gam(depdisorder ~ healthdays + bmi + sex + educ + race_eth,
data=brfss_17,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(wadt1)
##
## Family: binomial
## Link function: logit
##
## Formula:
## depdisorder ~ healthdays + bmi + sex + educ + race_eth
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.2354726 0.0344544 -93.906 < 2e-16 ***
## healthdays 0.0526890 0.0006126 86.008 < 2e-16 ***
## bmi 0.0269174 0.0008935 30.127 < 2e-16 ***
## sex 0.5886372 0.0118743 49.572 < 2e-16 ***
## educ0Prim -0.0534858 0.0341241 -1.567 0.117
## educ1somehs 0.3056937 0.0236962 12.901 < 2e-16 ***
## educ3somecol 0.1112310 0.0155881 7.136 9.63e-13 ***
## educ4colgrad -0.1852941 0.0166096 -11.156 < 2e-16 ***
## race_ethhispanic -0.4562135 0.0180147 -25.324 < 2e-16 ***
## race_ethnh black -0.4832453 0.0189477 -25.504 < 2e-16 ***
## race_ethnh multirace 0.2384164 0.0452322 5.271 1.36e-07 ***
## race_ethnh other -0.4289050 0.0263370 -16.285 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.0774 Deviance explained = 7.37%
## UBRE = -0.11492 Scale est. = 1 n = 202836
3) Repeat Step 2, but include a smooth term for at least one continuous variable
wadt2<-gam(depdisorder~ s(healthdays) + s(bmi) + sex + educ + race_eth,
data=brfss_17,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(wadt2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## depdisorder ~ s(healthdays) + s(bmi) + sex + educ + race_eth
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.23401 0.02242 -99.658 < 2e-16 ***
## sex 0.54446 0.01194 45.608 < 2e-16 ***
## educ0Prim -0.05467 0.03422 -1.598 0.11
## educ1somehs 0.28454 0.02380 11.957 < 2e-16 ***
## educ3somecol 0.11114 0.01569 7.085 1.39e-12 ***
## educ4colgrad -0.18003 0.01675 -10.750 < 2e-16 ***
## race_ethhispanic -0.44488 0.01813 -24.540 < 2e-16 ***
## race_ethnh black -0.49131 0.01907 -25.757 < 2e-16 ***
## race_ethnh multirace 0.21826 0.04551 4.796 1.62e-06 ***
## race_ethnh other -0.43106 0.02657 -16.225 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(healthdays) 8.811 8.984 9245.4 <2e-16 ***
## s(bmi) 8.808 8.982 970.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0908 Deviance explained = 8.63%
## UBRE = -0.1268 Scale est. = 1 n = 202836
4) Test if the model in step 3 is fitting the data better than the purely linear model.
anova( wadt1, wadt2, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: depdisorder ~ healthdays + bmi + sex + educ + race_eth
## Model 2: depdisorder ~ s(healthdays) + s(bmi) + sex + educ + race_eth
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 202824 179502
## 2 202808 177060 15.966 2442 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In comparing the models, the smooth fits the data better to the purely linear model. This is evident from its very small p-value.
5) Produce a plot of the smooth effect from the model in step 3
plot(wadt2)

