library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.1
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

#Using the Brfss_17 data wetry and use three variables to determine health; race, smoking, obesity to answer does race play a role in health disparities


nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

#Poor or fair self rated health
brfss_17$badhealth<-Recode(brfss_17$genhlth,
                           recodes="4:5=1; 1:3=0; else=NA")


#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3,
                       recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3,
                       recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3,
                       recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3,
                          recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3,
                          recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
                          as.factor = T)




#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

brfss_17$obese<-ifelse(brfss_17$bmi>=30,
                       1,
                       0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
                       recodes="1:2 ='Current'; 3 ='Former';4 ='NeverSmoked'; else = NA",
                       as.factor=T)

brfss_17$smoke<-relevel(brfss_17$smoke,
                        ref = "NeverSmoked")

#smokes
brfss_17$smoking<-Recode(brfss_17$smokday2,
                         recodes= "1:2=1; 3=0; 7:9=NA")
sub<-brfss_17 %>%
  select(badhealth,mmsaname, bmi, ,race_eth,white, black, hispanic, other, smoking, mmsawt, ststr) %>%
  filter( complete.cases( . ))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
               strata= ~ststr,
               weights= ~mmsawt
               , data = sub )


#hispanic
mylogit <- glm(badhealth ~hispanic+ smoking + obese, data = brfss_17, family = "binomial")

summary(mylogit)
## 
## Call:
## glm(formula = badhealth ~ hispanic + smoking + obese, family = "binomial", 
##     data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1524  -0.7462  -0.6036  -0.6036   1.8935  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.61048    0.01269 -126.95   <2e-16 ***
## hispanic     0.43011    0.03006   14.31   <2e-16 ***
## smoking      0.47414    0.01705   27.81   <2e-16 ***
## obese        0.64712    0.01705   37.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92217  on 84854  degrees of freedom
## Residual deviance: 89882  on 84851  degrees of freedom
##   (146020 observations deleted due to missingness)
## AIC: 89890
## 
## Number of Fisher Scoring iterations: 4
confint(mylogit)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) -1.6354064 -1.5856769
## hispanic     0.3710369  0.4888621
## smoking      0.4407173  0.5075432
## obese        0.6136969  0.6805262
exp(coef(mylogit))
## (Intercept)    hispanic     smoking       obese 
##   0.1997909   1.5374292   1.6066353   1.9100324
library(broom)
mylogit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR LowerOR_Ci UpperOR_Ci
(Intercept) -1.610 0.013 -126.948 0 0.200 0.195 0.205
hispanic 0.430 0.030 14.310 0 1.537 1.449 1.631
smoking 0.474 0.017 27.813 0 1.607 1.554 1.661
obese 0.647 0.017 37.958 0 1.910 1.847 1.975
library(sjPlot)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
plot_model(mylogit,
           title = "odds for getting poor health")

newdata <- with(brfss_17, data.frame(hispanic=mean(hispanic,na.rm=TRUE ), smoking=mean(smoking, na.rm=TRUE), obese=mean(obese, na.rm=TRUE)))

predict(mylogit, newdata, type="response")
##         1 
## 0.2275014
#black

mylogit1 <- glm(badhealth ~black+ smoking + obese, data = brfss_17, family = "binomial")

summary(mylogit1)
## 
## Call:
## glm(formula = badhealth ~ black + smoking + obese, family = "binomial", 
##     data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1363  -0.7416  -0.6033  -0.6033   1.8939  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.61154    0.01267 -127.17   <2e-16 ***
## black        0.41559    0.02654   15.66   <2e-16 ***
## smoking      0.46121    0.01711   26.96   <2e-16 ***
## obese        0.63720    0.01708   37.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92217  on 84854  degrees of freedom
## Residual deviance: 89842  on 84851  degrees of freedom
##   (146020 observations deleted due to missingness)
## AIC: 89850
## 
## Number of Fisher Scoring iterations: 4
confint(mylogit1)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) -1.6364374 -1.5867625
## black        0.3634525  0.4675003
## smoking      0.4276694  0.4947277
## obese        0.6037168  0.6706559
exp(coef(mylogit1))
## (Intercept)       black     smoking       obese 
##   0.1995795   1.5152678   1.5859937   1.8911697
library(broom)
mylogit1%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR LowerOR_Ci UpperOR_Ci
(Intercept) -1.612 0.013 -127.171 0 0.200 0.195 0.205
black 0.416 0.027 15.658 0 1.515 1.438 1.596
smoking 0.461 0.017 26.961 0 1.586 1.534 1.640
obese 0.637 0.017 37.314 0 1.891 1.829 1.956
library(sjPlot)
plot_model(mylogit1,
           title = "odds for getting poor health")

newdata1 <- with(brfss_17, data.frame(black=mean(black,na.rm=TRUE ), smoking=mean(smoking, na.rm=TRUE), obese=mean(obese, na.rm=TRUE)))

predict(mylogit1, newdata1, type="response")
##         1 
## 0.2262858
#white



mylogit2 <- glm(badhealth ~white+ smoking + obese, data = brfss_17, family = "binomial")

summary(mylogit2)
## 
## Call:
## glm(formula = badhealth ~ white + smoking + obese, family = "binomial", 
##     data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1169  -0.7808  -0.5891  -0.5891   1.9167  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.21019    0.01986  -60.95   <2e-16 ***
## white       -0.45323    0.01916  -23.66   <2e-16 ***
## smoking      0.43454    0.01720   25.26   <2e-16 ***
## obese        0.63159    0.01710   36.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92217  on 84854  degrees of freedom
## Residual deviance: 89534  on 84851  degrees of freedom
##   (146020 observations deleted due to missingness)
## AIC: 89542
## 
## Number of Fisher Scoring iterations: 4
confint(mylogit2)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) -1.2491870 -1.1713547
## white       -0.4907447 -0.4156417
## smoking      0.4008093  0.4682495
## obese        0.5980542  0.6650983
exp(coef(mylogit2))
## (Intercept)       white     smoking       obese 
##   0.2981392   0.6355696   1.5442573   1.8805899
library(broom)
mylogit2%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR LowerOR_Ci UpperOR_Ci
(Intercept) -1.210 0.020 -60.951 0 0.298 0.287 0.310
white -0.453 0.019 -23.656 0 0.636 0.612 0.660
smoking 0.435 0.017 25.258 0 1.544 1.493 1.597
obese 0.632 0.017 36.928 0 1.881 1.819 1.945
library(sjPlot)
plot_model(mylogit2,
           title = "odds for getting poor health")

newdata2 <- with(brfss_17, data.frame(white=mean(white,na.rm=TRUE ), smoking=mean(smoking, na.rm=TRUE), obese=mean(obese, na.rm=TRUE)))

predict(mylogit2, newdata2, type="response")
##         1 
## 0.2285688
#results: The probability seems to be around 0.22 for each race, the issue of overfitting may have caused this similarity amongst each race