I will be using the same BRFSS data set that I used in my previous homework. I will using the number of healthy days as my outcome and my predictor variables will be bmi (this will be smoothed), income, smoking, and employment status.
#smaller sample
samps<- sample(1:dim(brfss20)[1], size = 10000, replace = F)
brfss20<- brfss20[samps,]
#smoking
brfss20$smoke <- Recode(brfss20$smoke100, recodes = "2=0; 7=NA; 9=NA", as.factor=T)
#healthy days
brfss20$healthdays<-Recode(brfss20$physhlth, recodes = "88=0; 77=NA; 99=NA", as.factor=T)
#bmi
brfss20$bmi<-brfss20$bmi5/100
#income
brfss20$income <- Recode(brfss20$income2, recodes = "77=NA; 99=NA", as.factor=T)
#employ
brfss20$employ<-Recode(brfss20$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20a <- brfss20 %>% select(healthdays, smoke, bmi, income, employ, mmsawt)
lingam <- gam(healthdays ~ bmi + income + smoke + employ,
data = brfss20a,
weights = mmsawt/mean(mmsawt, na.rm = T),
family = binomial)
summary(lingam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## healthdays ~ bmi + income + smoke + employ
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.266258 0.190285 -6.655 2.84e-11 ***
## bmi 0.018500 0.004286 4.316 1.59e-05 ***
## income2 -0.082452 0.189492 -0.435 0.663475
## income3 0.078961 0.170246 0.464 0.642787
## income4 -0.450095 0.165684 -2.717 0.006596 **
## income5 -0.059937 0.163123 -0.367 0.713296
## income6 -0.542452 0.156856 -3.458 0.000544 ***
## income7 -0.413842 0.154847 -2.673 0.007527 **
## income8 -0.527002 0.144655 -3.643 0.000269 ***
## smoke1 0.211993 0.056590 3.746 0.000180 ***
## employnilf 0.109677 0.079167 1.385 0.165935
## employretired 0.116406 0.074383 1.565 0.117595
## employunable 1.406244 0.121769 11.548 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.0515 Deviance explained = 4.06%
## UBRE = 0.10028 Scale est. = 1 n = 7202
smoothgam <- gam(healthdays ~ s(bmi) + income + smoke + employ,
data = brfss20a,
weights = mmsawt/mean(mmsawt, na.rm = T),
family = binomial)
summary(smoothgam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## healthdays ~ s(bmi) + income + smoke + employ
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.79405 0.14577 -5.447 5.12e-08 ***
## income2 -0.05605 0.19003 -0.295 0.768018
## income3 0.08577 0.17105 0.501 0.616078
## income4 -0.41542 0.16617 -2.500 0.012420 *
## income5 -0.02590 0.16394 -0.158 0.874480
## income6 -0.50487 0.15735 -3.209 0.001334 **
## income7 -0.36699 0.15545 -2.361 0.018229 *
## income8 -0.48311 0.14532 -3.324 0.000886 ***
## smoke1 0.21807 0.05679 3.840 0.000123 ***
## employnilf 0.11562 0.07938 1.457 0.145235
## employretired 0.13242 0.07480 1.770 0.076696 .
## employunable 1.43956 0.12246 11.755 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(bmi) 5.662 6.814 49.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0559 Deviance explained = 4.48%
## UBRE = 0.096732 Scale est. = 1 n = 7202
plot(smoothgam)
anova( lingam,
smoothgam,
test="Chisq")
## Analysis of Deviance Table
##
## Model 1: healthdays ~ bmi + income + smoke + employ
## Model 2: healthdays ~ s(bmi) + income + smoke + employ
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7189.0 7898.2
## 2 7183.2 7863.3 5.8135 34.894 3.744e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The model with the smoothed BMI does better.