#Load data for analysis#sub-setting and Re-codding variables for analysis purposes
brfss <- readRDS("brfss_177.rds")
# Cleaning the variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = renam))
names(brfss)<-newnames
homewk3 <- brfss %>%
select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,age80,menthlth,smokday2,sleptim1) %>%
mutate( smokers=ifelse(smoke100==1 & smokday2%in%c(1,2),"Current",
ifelse(smoke100==1 & smokday2==3, "Former",
ifelse(smoke100==2, "Never", NA)))) %>%
mutate(bmi = car::Recode(bmi5cat,recodes="1='Underweight';2='Normal';3='Overweight'; 4='Obese';else=NA",as.factor=T ),
educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ),
Physicala = car::Recode(exerany2,recodes="1='Yess';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ),
sleepadq=ifelse(sleptim1>=7 & !sleptim1%in%c(77,99), 0,
ifelse(sleptim1<7,1, NA )),
mentald=ifelse(menthlth<31, "Yess",
ifelse(menthlth==88, "No",NA)))
#Data containing only variables used in the analysis
Final_data <- homewk3 %>%
select(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,age80,Physicala, sleepadq,sex,racegr3,mentald,ststr) %>%
mutate(sleepadq2=ifelse(sleepadq==0, 'Yes',ifelse(sleepadq==1, 'No' , NA))) %>%
# mutate(sex=as.factor(ifelse(sex==1|sex==2, sex, NA))) %>%
filter(complete.cases(.))
brfgam_l<-gam(sleepadq ~ bmi+ racegr3+ age80+ educa+ mentald+ Physicala+ sex+ smokers,
data=Final_data,
weights =llcpwt/mean(llcpwt, na.rm=T),
family=binomial)
summary(brfgam_l)
##
## Family: binomial
## Link function: logit
##
## Formula:
## sleepadq ~ bmi + racegr3 + age80 + educa + mentald + Physicala +
## sex + smokers
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1340231 0.0757882 -1.768 0.076996 .
## bmiObese 0.2018764 0.0262357 7.695 1.42e-14 ***
## bmiOverweight 0.1227373 0.0254387 4.825 1.40e-06 ***
## bmiUnderweight 0.1040214 0.0736250 1.413 0.157699
## racegr3NH-Black 0.4411480 0.0462836 9.531 < 2e-16 ***
## racegr3NH-White -0.1043472 0.0332505 -3.138 0.001700 **
## racegr3Other 0.1664155 0.0475354 3.501 0.000464 ***
## age80 -0.0087235 0.0006308 -13.829 < 2e-16 ***
## educa>HS 0.1960309 0.0628224 3.120 0.001806 **
## educaHS 0.1966306 0.0628201 3.130 0.001748 **
## mentaldYess 0.3720249 0.0217132 17.134 < 2e-16 ***
## PhysicalaYess -0.2389309 0.0235681 -10.138 < 2e-16 ***
## sexMale 0.0814455 0.0209533 3.887 0.000101 ***
## smokersFormer -0.3817093 0.0315687 -12.091 < 2e-16 ***
## smokersNever -0.4927090 0.0277012 -17.787 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.0343 Deviance explained = 2.78%
## UBRE = 0.25067 Scale est. = 1 n = 44489
Here, I use a GAM fit with a smooth term for age
brfgam_2<-gam(sleepadq ~ bmi+ racegr3+ s(age80)+ educa+ mentald+ Physicala+ sex+ smokers,
data=Final_data,
weights =llcpwt/mean(llcpwt, na.rm=T),
family=binomial)
summary(brfgam_2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## sleepadq ~ bmi + racegr3 + s(age80) + educa + mentald + Physicala +
## sex + smokers
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.67603 0.06849 -9.870 < 2e-16 ***
## bmiObese 0.15478 0.02659 5.822 5.81e-09 ***
## bmiOverweight 0.09208 0.02560 3.597 0.000322 ***
## bmiUnderweight 0.17109 0.07395 2.314 0.020690 *
## racegr3NH-Black 0.43851 0.04636 9.459 < 2e-16 ***
## racegr3NH-White -0.09910 0.03331 -2.975 0.002929 **
## racegr3Other 0.16706 0.04762 3.508 0.000451 ***
## educa>HS 0.20637 0.06317 3.267 0.001087 **
## educaHS 0.23626 0.06320 3.738 0.000185 ***
## mentaldYess 0.37501 0.02176 17.238 < 2e-16 ***
## PhysicalaYess -0.24052 0.02365 -10.170 < 2e-16 ***
## sexMale 0.08369 0.02101 3.984 6.77e-05 ***
## smokersFormer -0.33543 0.03184 -10.535 < 2e-16 ***
## smokersNever -0.44067 0.02812 -15.671 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(age80) 6.473 7.609 380.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0388 Deviance explained = 3.14%
## UBRE = 0.24629 Scale est. = 1 n = 44489
anova( brfgam_l, brfgam_2, test="Chisq")
Comparing the two models, the model with a smooth term for the age variable fits the model better. This is evident because of the very small p-value.
plot(brfgam_2)