library(mgcv) #one library for GAMs
## Loading required package: nlme
## This is mgcv 1.8-35. For overview type 'help("mgcv-package")'.
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(splines)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
#load data
#load("C:/Users/okabe/OneDrive/Pictures/Stats 2/brfss_2017 (1).Rdata")
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

View(brfss_17)



set.seed(1234)
samp<-sample(1:dim(brfss_17)[1], size = 25000) #smaller sample for brevity
brfss_17<-brfss_17[samp,]
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#BMI
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", 
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

##Exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")

#rEcigeret
brfss_17$ecigaret<-Recode(brfss_17$ecigaret, recodes ="7:9=NA; 1=1;2=0")

Using a data set of your choice, do the following: 1) define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor.

##The outcome variable I would like to use is exercise and the continuous variable I would like to use is Body Mass Index(BMI0
  1. Using the gam() function, estimate a model with only linear terms in your model
brfss_17<-brfss_17%>%
  select(  ststr, agec, educ, black,ins,bmi, white, other,smoke, badhealth, exercise, mmsawt )%>%
  filter(complete.cases(.))
brfgam_l<-gam(exercise~bmi+educ+black+white,
              data=brfss_17,
              weights = mmsawt/mean(mmsawt, na.rm=T),
              family=binomial)

summary(brfgam_l)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## exercise ~ bmi + educ + black + white
## 
## Parametric coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.723468   0.080990  21.280  < 2e-16 ***
## bmi          -0.036733   0.002489 -14.757  < 2e-16 ***
## educ0Prim    -0.513129   0.080338  -6.387 1.69e-10 ***
## educ1somehs  -0.161561   0.063599  -2.540   0.0111 *  
## educ3somecol  0.316942   0.041326   7.669 1.73e-14 ***
## educ4colgrad  0.844521   0.045735  18.465  < 2e-16 ***
## black        -0.007585   0.054127  -0.140   0.8886    
## white         0.217145   0.039868   5.447 5.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.0449   Deviance explained = 4.09%
## UBRE = 0.074102  Scale est. = 1         n = 20952
##In this model we see that being white,education and BMI are statistically signifcant.
  1. Repeat Step 2, but include a smooth term for at least one continuous variable
brfgam<-gam(exercise~s(bmi)+educ+black+white,
            data=brfss_17,
            weights = mmsawt/mean(mmsawt, na.rm=T),
            family=binomial)
summary(brfgam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## exercise ~ s(bmi) + educ + black + white
## 
## Parametric coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.702347   0.042348  16.585  < 2e-16 ***
## educ0Prim    -0.525186   0.080629  -6.514 7.34e-11 ***
## educ1somehs  -0.165455   0.063700  -2.597  0.00939 ** 
## educ3somecol  0.325396   0.041429   7.854 4.02e-15 ***
## educ4colgrad  0.833740   0.045817  18.197  < 2e-16 ***
## black        -0.009687   0.054217  -0.179  0.85819    
## white         0.213996   0.039963   5.355 8.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##         edf Ref.df Chi.sq p-value    
## s(bmi) 8.66  8.943  266.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0466   Deviance explained = 4.31%
## UBRE = 0.072419  Scale est. = 1         n = 20952
plot(brfgam)

##Using the smooth term increases the statistical significance of  education and being White(race)
  1. Test if the model in step 3 is fitting the data better than the purely linear model.
anova( brfgam_l, brfgam, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: exercise ~ bmi + educ + black + white
## Model 2: exercise ~ s(bmi) + educ + black + white
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1     20944      22489                              
## 2     20936      22438 7.9429   50.571 2.996e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##The model that fits the data better is the second model that has the smooth term
  1. Produce a plot of the smooth effect from the model in step 3
plot(brfgam)

##This shows that there is a slight change in relationship between BMI  and exercise. However that relationship declines when there is a higher BMI