library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
library(splines)
library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]
#nice MSA name
brfss_17$mmsa_name<-substr(brfss_17$mmsaname, 1,nchar(brfss_17$mmsaname)-31)
#sex
brfss_17$male<-ifelse(brfss_17$sex==1, 1, 0)
#BMI
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)
#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")
#Healthy mental health days
brfss_17$healthmdays<-Recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-Recode(brfss_17$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor = T)
brfss_17$inc<-as.ordered(brfss_17$inc)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='employloyed')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2=1; 3:4=0; else=NA")
#brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
brfss_17$obese<-ifelse(is.na(brfss_17$bmi)==T, NA,
ifelse(brfss_17$bmi>30,1,0))
b17<-brfss_17%>%
select( ststr, agec,bmi,healthmdays, age80, healthdays, ins,educ, black, healthmdays,hispanic, other,smoke,obese, badhealth, male, mmsawt, mmsa_name )%>%
filter(complete.cases(.))
- define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor.
#outcome for this homework is a binomial variable-insurance. continous variable is healthy days.
- Using the gam() function, estimate a model with only linear terms in your model
b17gam<-gam(ins~+bmi+healthmdays+male+badhealth+obese,
data=brfss_17,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(b17gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## ins ~ +bmi + healthmdays + male + badhealth + obese
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1498720 0.0462814 46.452 < 2e-16 ***
## bmi 0.0110202 0.0018313 6.018 1.77e-09 ***
## healthmdays -0.0129460 0.0008356 -15.494 < 2e-16 ***
## male -0.3929575 0.0144399 -27.213 < 2e-16 ***
## badhealth -0.3319876 0.0183536 -18.088 < 2e-16 ***
## obese -0.0984653 0.0248185 -3.967 7.27e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.00677 Deviance explained = 0.987%
## UBRE = -0.32439 Scale est. = 1 n = 207289
- Repeat Step 2, but include a smooth term for at least one continuous variable
b17gam2<-gam(ins~+s(bmi)+s(healthmdays)+male+badhealth+obese,
data=brfss_17,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
summary(b17gam2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## ins ~ +s(bmi) + s(healthmdays) + male + badhealth + obese
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.42471 0.01645 147.38 <2e-16 ***
## male -0.40987 0.01464 -27.99 <2e-16 ***
## badhealth -0.32324 0.01839 -17.58 <2e-16 ***
## obese -0.09839 0.03843 -2.56 0.0105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(bmi) 6.251 7.315 137.0 <2e-16 ***
## s(healthmdays) 8.746 8.972 306.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00784 Deviance explained = 1.12%
## UBRE = -0.32516 Scale est. = 1 n = 207289
plot(b17gam2)


- Test if the model in step 3 is fitting the data better than the purely linear model.
#based on the results, the second model has statistical significance when using the smooth term for bmi.
anova( b17gam, b17gam2, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: ins ~ +bmi + healthmdays + male + badhealth + obese
## Model 2: ins ~ +s(bmi) + s(healthmdays) + male + badhealth + obese
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 207283 140034
## 2 207269 139849 14.287 184.84 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- Produce a plot of the smooth effect from the model in step 3
plot(b17gam2)

