load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

#Data Prep
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

##Get TX Cities 
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)

##Recodes
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)

brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0", as.factor=T)

brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
                   
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

brfss_17$menthlth<-recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")

brfss_17$employa <- Recode(brfss_17$employ1, recodes="1:2=1; 2:6=0; else=NA", as.factor=T)

brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

names(brfss_17)
##   [1] "dispcode"  "statere1"  "safetime"  "hhadult"   "genhlth"   "physhlth" 
##   [7] "menthlth"  "poorhlth"  "hlthpln1"  "persdoc2"  "medcost"   "checkup1" 
##  [13] "bphigh4"   "bpmeds"    "cholchk1"  "toldhi2"   "cholmed1"  "cvdinfr4" 
##  [19] "cvdcrhd4"  "cvdstrk3"  "asthma3"   "asthnow"   "chcscncr"  "chcocncr" 
##  [25] "chccopd1"  "havarth3"  "addepev2"  "chckidny"  "diabete3"  "diabage2" 
##  [31] "lmtjoin3"  "arthdis2"  "arthsocl"  "joinpai1"  "sex"       "marital"  
##  [37] "educa"     "renthom1"  "numhhol2"  "numphon2"  "cpdemo1a"  "veteran3" 
##  [43] "employ1"   "children"  "income2"   "internet"  "weight2"   "height3"  
##  [49] "pregnant"  "deaf"      "blind"     "decide"    "diffwalk"  "diffdres" 
##  [55] "diffalon"  "smoke100"  "smokday2"  "stopsmk2"  "lastsmk2"  "usenow3"  
##  [61] "ecigaret"  "ecignow"   "alcday5"   "avedrnk2"  "drnk3ge5"  "maxdrnks" 
##  [67] "fruit2"    "fruitju2"  "fvgreen1"  "frenchf1"  "potatoe1"  "vegetab2" 
##  [73] "exerany2"  "exract11"  "exeroft1"  "exerhmm1"  "exract21"  "exeroft2" 
##  [79] "exerhmm2"  "strength"  "seatbelt"  "flushot6"  "flshtmy2"  "pneuvac3" 
##  [85] "shingle2"  "hivtst6"   "hivtstd3"  "hivrisk5"  "casthdx2"  "casthno2" 
##  [91] "callbckz"  "wdusenow"  "wdinftrk"  "wdhowoft"  "wdshare"   "namtribe" 
##  [97] "namothr"   "urbnrrl"   "ststr"     "impsex"    "rfhlth"    "phys14d"  
## [103] "ment14d"   "hcvu651"   "rfhype5"   "cholch1"   "rfchol1"   "michd"    
## [109] "ltasth1"   "casthm1"   "asthms1"   "drdxar1"   "lmtact1"   "lmtwrk1"  
## [115] "lmtscl1"   "prace1"    "mrace1"    "hispanc"   "race"      "raceg21"  
## [121] "racegr3"   "ageg5yr"   "age65yr"   "age80"     "ageg"      "wtkg3"    
## [127] "bmi5"      "bmi5cat"   "rfbmi5"    "educag"    "incomg"    "smoker3"  
## [133] "rfsmok3"   "ecigsts"   "curecig"   "drnkany5"  "rfbing5"   "drnkwek"  
## [139] "rfdrhv5"   "ftjuda2"   "frutda2"   "grenda1"   "frnchda"   "potada1"  
## [145] "vegeda2"   "misfrt1"   "misveg1"   "frtres1"   "vegres1"   "frutsu1"  
## [151] "vegesu1"   "frtlt1a"   "veglt1a"   "frt16a"    "veg23a"    "fruite1"  
## [157] "vegete1"   "totinda"   "minac11"   "minac21"   "pacat1"    "paindx1"  
## [163] "pa150r2"   "pa300r2"   "pa30021"   "pastrng"   "parec1"    "pastae1"  
## [169] "rfseat2"   "rfseat3"   "flshot6"   "pneumo2"   "aidtst3"   "mmsa"     
## [175] "mmsawt"    "seqno"     "mmsaname"  "tx"        "educ"      "ins"      
## [181] "marst"     "badhealth" "employ"    "agec"      "smoke"     "employa"  
## [187] "bmi"       "black"     "white"     "other"     "hispanic"  "race_eth"
View(brfss_17)
# Using a data set of your choice, do the following:

# 1) define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor.
##The outcome variable will be employment status, which is a binomial outcome variable. The predictors will be bmi, sex, and race/ethnicity. BMI will be the continious variable. 


library(splines)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
sub<-brfss_17%>%
  select(employa,menthlth, sex, ins, agec, mmsawt, ststr, bmi, race_eth, smoke) %>%
  filter( complete.cases(.))

#survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )

# 2) Using the gam() function, estimate a model with only linear terms in your model
##Linear One
brfgam1<-gam(employa~bmi+sex+race_eth+smoke,
            data=sub,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(brfgam1)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## employa ~ bmi + sex + race_eth + smoke
## 
## Parametric coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.141020   0.175419   0.804  0.42145    
## bmi                   0.019383   0.006015   3.222  0.00127 ** 
## sexMale               1.447758   0.078294  18.491  < 2e-16 ***
## race_ethhispanic     -0.462265   0.083497  -5.536 3.09e-08 ***
## race_ethnh black     -0.041350   0.123717  -0.334  0.73820    
## race_ethnh multirace -0.579233   0.331415  -1.748  0.08051 .  
## race_ethnh other     -1.283531   0.134459  -9.546  < 2e-16 ***
## smokeCurrent          0.013120   0.102757   0.128  0.89840    
## smokeFormer           0.502737   0.112869   4.454 8.42e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =   0.12   Deviance explained = 10.7%
## UBRE = 0.017336  Scale est. = 1         n = 4529
# 3) Repeat Step 2, but include a smooth term for at least one continuous variable
##GAM Fit
brfgam2<-gam(employa~s(bmi)+sex+race_eth+smoke,
            data=sub,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(brfgam2)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## employa ~ s(bmi) + sex + race_eth + smoke
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.73670    0.06998  10.527  < 2e-16 ***
## sexMale               1.41735    0.07902  17.937  < 2e-16 ***
## race_ethhispanic     -0.50388    0.08478  -5.943  2.8e-09 ***
## race_ethnh black     -0.06139    0.12423  -0.494   0.6212    
## race_ethnh multirace -0.60949    0.32988  -1.848   0.0647 .  
## race_ethnh other     -1.28422    0.13531  -9.491  < 2e-16 ***
## smokeCurrent          0.03086    0.10373   0.297   0.7661    
## smokeFormer           0.47229    0.11357   4.159  3.2e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df Chi.sq  p-value    
## s(bmi) 8.096  8.743  37.54 1.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.126   Deviance explained = 11.4%
## UBRE = 0.013245  Scale est. = 1         n = 4529
#4) Test if the model in step 3 is fitting the data better than the purely linear model.
anova( brfgam1, brfgam2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: employa ~ bmi + sex + race_eth + smoke
## Model 2: employa ~ s(bmi) + sex + race_eth + smoke
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1    4520.0     4589.5                              
## 2    4512.3     4556.8 7.7426    32.72 5.611e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The second model is the better model where I used the smooth term for BMI. 

#5) Produce a plot of the smooth effect from the model in step 3
plot(brfgam2)