load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
#Data Prep
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
##Get TX Cities
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)
##Recodes
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0", as.factor=T)
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
brfss_17$menthlth<-recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
brfss_17$employa <- Recode(brfss_17$employ1, recodes="1:2=1; 2:6=0; else=NA", as.factor=T)
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
names(brfss_17)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2" "medcost" "checkup1"
## [13] "bphigh4" "bpmeds" "cholchk1" "toldhi2" "cholmed1" "cvdinfr4"
## [19] "cvdcrhd4" "cvdstrk3" "asthma3" "asthnow" "chcscncr" "chcocncr"
## [25] "chccopd1" "havarth3" "addepev2" "chckidny" "diabete3" "diabage2"
## [31] "lmtjoin3" "arthdis2" "arthsocl" "joinpai1" "sex" "marital"
## [37] "educa" "renthom1" "numhhol2" "numphon2" "cpdemo1a" "veteran3"
## [43] "employ1" "children" "income2" "internet" "weight2" "height3"
## [49] "pregnant" "deaf" "blind" "decide" "diffwalk" "diffdres"
## [55] "diffalon" "smoke100" "smokday2" "stopsmk2" "lastsmk2" "usenow3"
## [61] "ecigaret" "ecignow" "alcday5" "avedrnk2" "drnk3ge5" "maxdrnks"
## [67] "fruit2" "fruitju2" "fvgreen1" "frenchf1" "potatoe1" "vegetab2"
## [73] "exerany2" "exract11" "exeroft1" "exerhmm1" "exract21" "exeroft2"
## [79] "exerhmm2" "strength" "seatbelt" "flushot6" "flshtmy2" "pneuvac3"
## [85] "shingle2" "hivtst6" "hivtstd3" "hivrisk5" "casthdx2" "casthno2"
## [91] "callbckz" "wdusenow" "wdinftrk" "wdhowoft" "wdshare" "namtribe"
## [97] "namothr" "urbnrrl" "ststr" "impsex" "rfhlth" "phys14d"
## [103] "ment14d" "hcvu651" "rfhype5" "cholch1" "rfchol1" "michd"
## [109] "ltasth1" "casthm1" "asthms1" "drdxar1" "lmtact1" "lmtwrk1"
## [115] "lmtscl1" "prace1" "mrace1" "hispanc" "race" "raceg21"
## [121] "racegr3" "ageg5yr" "age65yr" "age80" "ageg" "wtkg3"
## [127] "bmi5" "bmi5cat" "rfbmi5" "educag" "incomg" "smoker3"
## [133] "rfsmok3" "ecigsts" "curecig" "drnkany5" "rfbing5" "drnkwek"
## [139] "rfdrhv5" "ftjuda2" "frutda2" "grenda1" "frnchda" "potada1"
## [145] "vegeda2" "misfrt1" "misveg1" "frtres1" "vegres1" "frutsu1"
## [151] "vegesu1" "frtlt1a" "veglt1a" "frt16a" "veg23a" "fruite1"
## [157] "vegete1" "totinda" "minac11" "minac21" "pacat1" "paindx1"
## [163] "pa150r2" "pa300r2" "pa30021" "pastrng" "parec1" "pastae1"
## [169] "rfseat2" "rfseat3" "flshot6" "pneumo2" "aidtst3" "mmsa"
## [175] "mmsawt" "seqno" "mmsaname" "tx" "educ" "ins"
## [181] "marst" "badhealth" "employ" "agec" "smoke" "employa"
## [187] "bmi" "black" "white" "other" "hispanic" "race_eth"
View(brfss_17)
# Using a data set of your choice, do the following:
# 1) define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor.
##The outcome variable will be employment status, which is a binomial outcome variable. The predictors will be bmi, sex, and race/ethnicity. BMI will be the continious variable.
library(splines)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
sub<-brfss_17%>%
select(employa,menthlth, sex, ins, agec, mmsawt, ststr, bmi, race_eth, smoke) %>%
filter( complete.cases(.))
#survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
# 2) Using the gam() function, estimate a model with only linear terms in your model
##Linear One
brfgam1<-gam(employa~bmi+sex+race_eth+smoke,
data=sub,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(brfgam1)
##
## Family: binomial
## Link function: logit
##
## Formula:
## employa ~ bmi + sex + race_eth + smoke
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.141020 0.175419 0.804 0.42145
## bmi 0.019383 0.006015 3.222 0.00127 **
## sexMale 1.447758 0.078294 18.491 < 2e-16 ***
## race_ethhispanic -0.462265 0.083497 -5.536 3.09e-08 ***
## race_ethnh black -0.041350 0.123717 -0.334 0.73820
## race_ethnh multirace -0.579233 0.331415 -1.748 0.08051 .
## race_ethnh other -1.283531 0.134459 -9.546 < 2e-16 ***
## smokeCurrent 0.013120 0.102757 0.128 0.89840
## smokeFormer 0.502737 0.112869 4.454 8.42e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.12 Deviance explained = 10.7%
## UBRE = 0.017336 Scale est. = 1 n = 4529
# 3) Repeat Step 2, but include a smooth term for at least one continuous variable
##GAM Fit
brfgam2<-gam(employa~s(bmi)+sex+race_eth+smoke,
data=sub,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(brfgam2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## employa ~ s(bmi) + sex + race_eth + smoke
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.73670 0.06998 10.527 < 2e-16 ***
## sexMale 1.41735 0.07902 17.937 < 2e-16 ***
## race_ethhispanic -0.50388 0.08478 -5.943 2.8e-09 ***
## race_ethnh black -0.06139 0.12423 -0.494 0.6212
## race_ethnh multirace -0.60949 0.32988 -1.848 0.0647 .
## race_ethnh other -1.28422 0.13531 -9.491 < 2e-16 ***
## smokeCurrent 0.03086 0.10373 0.297 0.7661
## smokeFormer 0.47229 0.11357 4.159 3.2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(bmi) 8.096 8.743 37.54 1.32e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.126 Deviance explained = 11.4%
## UBRE = 0.013245 Scale est. = 1 n = 4529
#4) Test if the model in step 3 is fitting the data better than the purely linear model.
anova( brfgam1, brfgam2, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: employa ~ bmi + sex + race_eth + smoke
## Model 2: employa ~ s(bmi) + sex + race_eth + smoke
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 4520.0 4589.5
## 2 4512.3 4556.8 7.7426 32.72 5.611e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The second model is the better model where I used the smooth term for BMI.
#5) Produce a plot of the smooth effect from the model in step 3
plot(brfgam2)
