load Packages

library(splines)
library(mgcv)
library(ggplot2)
library(dplyr)
library(car)

BRFSS DATA

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

set.seed(1234)
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]

renam<-names(brfss_17)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  renam))
names(brfss_17)<-newnames

Recode variables

#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")

# sex
brfss_17$male<-ifelse(brfss_17$sex==1,1,0)

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")


brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chrobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")

1) Define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor. My outcome variable for this homework is diabetes and my predictor variables are race, education, age, sex, and smoking

2) Using the gam() function, estimate a model with only linear terms in your model

wadt1<-gam(depdisorder ~ healthdays + bmi + sex + educ + race_eth,
            data=brfss_17,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(wadt1)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## depdisorder ~ healthdays + bmi + sex + educ + race_eth
## 
## Parametric coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.2354726  0.0344544 -93.906  < 2e-16 ***
## healthdays            0.0526890  0.0006126  86.008  < 2e-16 ***
## bmi                   0.0269174  0.0008935  30.127  < 2e-16 ***
## sex                   0.5886372  0.0118743  49.572  < 2e-16 ***
## educ0Prim            -0.0534858  0.0341241  -1.567    0.117    
## educ1somehs           0.3056937  0.0236962  12.901  < 2e-16 ***
## educ3somecol          0.1112310  0.0155881   7.136 9.63e-13 ***
## educ4colgrad         -0.1852941  0.0166096 -11.156  < 2e-16 ***
## race_ethhispanic     -0.4562135  0.0180147 -25.324  < 2e-16 ***
## race_ethnh black     -0.4832453  0.0189477 -25.504  < 2e-16 ***
## race_ethnh multirace  0.2384164  0.0452322   5.271 1.36e-07 ***
## race_ethnh other     -0.4289050  0.0263370 -16.285  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.0774   Deviance explained = 7.37%
## UBRE = -0.11492  Scale est. = 1         n = 202836

3) Repeat Step 2, but include a smooth term for at least one continuous variable

wadt2<-gam(depdisorder~ s(healthdays) + s(bmi) + sex + educ + race_eth,
            data=brfss_17,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(wadt2)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## depdisorder ~ s(healthdays) + s(bmi) + sex + educ + race_eth
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.23401    0.02242 -99.658  < 2e-16 ***
## sex                   0.54446    0.01194  45.608  < 2e-16 ***
## educ0Prim            -0.05467    0.03422  -1.598     0.11    
## educ1somehs           0.28454    0.02380  11.957  < 2e-16 ***
## educ3somecol          0.11114    0.01569   7.085 1.39e-12 ***
## educ4colgrad         -0.18003    0.01675 -10.750  < 2e-16 ***
## race_ethhispanic     -0.44488    0.01813 -24.540  < 2e-16 ***
## race_ethnh black     -0.49131    0.01907 -25.757  < 2e-16 ***
## race_ethnh multirace  0.21826    0.04551   4.796 1.62e-06 ***
## race_ethnh other     -0.43106    0.02657 -16.225  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq p-value    
## s(healthdays) 8.811  8.984 9245.4  <2e-16 ***
## s(bmi)        8.808  8.982  970.3  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0908   Deviance explained = 8.63%
## UBRE = -0.1268  Scale est. = 1         n = 202836

4) Test if the model in step 3 is fitting the data better than the purely linear model.

anova( wadt1, wadt2, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: depdisorder ~ healthdays + bmi + sex + educ + race_eth
## Model 2: depdisorder ~ s(healthdays) + s(bmi) + sex + educ + race_eth
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1    202824     179502                              
## 2    202808     177060 15.966     2442 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

In comparing the models, the smooth fits the data better to the purely linear model. This is evident from its very small p-value.

5) Produce a plot of the smooth effect from the model in step 3

plot(wadt2)

DEM 7283 - Splines and Generalized Additive Models

Assogba Albert

4/25/2021

load Packages

BRFSS DATA

Recode variables

1) Define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor. My outcome variable for this homework is diabetes and my predictor variables are race, education, age, sex, and smoking

2) Using the gam() function, estimate a model with only linear terms in your model

3) Repeat Step 2, but include a smooth term for at least one continuous variable

4) Test if the model in step 3 is fitting the data better than the purely linear model.

In comparing the models, the smooth fits the data better to the purely linear model. This is evident from its very small p-value.

5) Produce a plot of the smooth effect from the model in step 3