library(mgcv) #one library for GAMs
## Loading required package: nlme
## This is mgcv 1.8-35. For overview type 'help("mgcv-package")'.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(splines)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
#load data
#load("C:/Users/okabe/OneDrive/Pictures/Stats 2/brfss_2017 (1).Rdata")
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
View(brfss_17)
set.seed(1234)
samp<-sample(1:dim(brfss_17)[1], size = 25000) #smaller sample for brevity
brfss_17<-brfss_17[samp,]
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#BMI
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA",
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
##Exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")
#rEcigeret
brfss_17$ecigaret<-Recode(brfss_17$ecigaret, recodes ="7:9=NA; 1=1;2=0")
Using a data set of your choice, do the following: 1) define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor.
##The outcome variable I would like to use is exercise and the continuous variable I would like to use is Body Mass Index(BMI0
brfss_17<-brfss_17%>%
select( ststr, agec, educ, black,ins,bmi, white, other,smoke, badhealth, exercise, mmsawt )%>%
filter(complete.cases(.))
brfgam_l<-gam(exercise~bmi+educ+black+white,
data=brfss_17,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
summary(brfgam_l)
##
## Family: binomial
## Link function: logit
##
## Formula:
## exercise ~ bmi + educ + black + white
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.723468 0.080990 21.280 < 2e-16 ***
## bmi -0.036733 0.002489 -14.757 < 2e-16 ***
## educ0Prim -0.513129 0.080338 -6.387 1.69e-10 ***
## educ1somehs -0.161561 0.063599 -2.540 0.0111 *
## educ3somecol 0.316942 0.041326 7.669 1.73e-14 ***
## educ4colgrad 0.844521 0.045735 18.465 < 2e-16 ***
## black -0.007585 0.054127 -0.140 0.8886
## white 0.217145 0.039868 5.447 5.14e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.0449 Deviance explained = 4.09%
## UBRE = 0.074102 Scale est. = 1 n = 20952
##In this model we see that being white,education and BMI are statistically signifcant.
brfgam<-gam(exercise~s(bmi)+educ+black+white,
data=brfss_17,
weights = mmsawt/mean(mmsawt, na.rm=T),
family=binomial)
summary(brfgam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## exercise ~ s(bmi) + educ + black + white
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.702347 0.042348 16.585 < 2e-16 ***
## educ0Prim -0.525186 0.080629 -6.514 7.34e-11 ***
## educ1somehs -0.165455 0.063700 -2.597 0.00939 **
## educ3somecol 0.325396 0.041429 7.854 4.02e-15 ***
## educ4colgrad 0.833740 0.045817 18.197 < 2e-16 ***
## black -0.009687 0.054217 -0.179 0.85819
## white 0.213996 0.039963 5.355 8.57e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(bmi) 8.66 8.943 266.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0466 Deviance explained = 4.31%
## UBRE = 0.072419 Scale est. = 1 n = 20952
plot(brfgam)
##Using the smooth term increases the statistical significance of education and being White(race)
anova( brfgam_l, brfgam, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: exercise ~ bmi + educ + black + white
## Model 2: exercise ~ s(bmi) + educ + black + white
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 20944 22489
## 2 20936 22438 7.9429 50.571 2.996e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##The model that fits the data better is the second model that has the smooth term
plot(brfgam)
##This shows that there is a slight change in relationship between BMI and exercise. However that relationship declines when there is a higher BMI