homework3

library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(questionr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.1
## v readr   1.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

#Using the Brfss_17 data wetry and use three variables to determine health; race, smoking, obesity to answer does race play a role in health disparities


nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

#Poor or fair self rated health
brfss_17$badhealth<-Recode(brfss_17$genhlth,
                           recodes="4:5=1; 1:3=0; else=NA")


#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3,
                       recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3,
                       recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3,
                       recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3,
                          recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3,
                          recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
                          as.factor = T)




#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

brfss_17$obese<-ifelse(brfss_17$bmi>=30,
                       1,
                       0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
                       recodes="1:2 ='Current'; 3 ='Former';4 ='NeverSmoked'; else = NA",
                       as.factor=T)

brfss_17$smoke<-relevel(brfss_17$smoke,
                        ref = "NeverSmoked")

#smokes
brfss_17$smoking<-Recode(brfss_17$smokday2,
                         recodes= "1:2=1; 3=0; 7:9=NA")
sub<-brfss_17 %>%
  select(badhealth,mmsaname, bmi, ,race_eth,white, black, hispanic, other, smoking, mmsawt, ststr) %>%
  filter( complete.cases( . ))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
               strata= ~ststr,
               weights= ~mmsawt
               , data = sub )


#hispanic
mylogit <- glm(badhealth ~hispanic+ smoking + obese, data = brfss_17, family = "binomial")

summary(mylogit)

## 
## Call:
## glm(formula = badhealth ~ hispanic + smoking + obese, family = "binomial", 
##     data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1524  -0.7462  -0.6036  -0.6036   1.8935  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.61048    0.01269 -126.95   <2e-16 ***
## hispanic     0.43011    0.03006   14.31   <2e-16 ***
## smoking      0.47414    0.01705   27.81   <2e-16 ***
## obese        0.64712    0.01705   37.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92217  on 84854  degrees of freedom
## Residual deviance: 89882  on 84851  degrees of freedom
##   (146020 observations deleted due to missingness)
## AIC: 89890
## 
## Number of Fisher Scoring iterations: 4

confint(mylogit)

## Waiting for profiling to be done...

##                  2.5 %     97.5 %
## (Intercept) -1.6354064 -1.5856769
## hispanic     0.3710369  0.4888621
## smoking      0.4407173  0.5075432
## obese        0.6136969  0.6805262

exp(coef(mylogit))

## (Intercept)    hispanic     smoking       obese 
##   0.1997909   1.5374292   1.6066353   1.9100324

library(broom)
mylogit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	OR	LowerOR_Ci	UpperOR_Ci
(Intercept)	-1.610	0.013	-126.948	0.200	0.195	0.205
hispanic	0.430	0.030	14.310	1.537	1.449	1.631
smoking	0.474	0.017	27.813	1.607	1.554	1.661
obese	0.647	0.017	37.958	1.910	1.847	1.975

library(sjPlot)

## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car

plot_model(mylogit,
           title = "odds for getting poor health")

newdata <- with(brfss_17, data.frame(hispanic=mean(hispanic,na.rm=TRUE ), smoking=mean(smoking, na.rm=TRUE), obese=mean(obese, na.rm=TRUE)))

predict(mylogit, newdata, type="response")

##         1 
## 0.2275014

#black

mylogit1 <- glm(badhealth ~black+ smoking + obese, data = brfss_17, family = "binomial")

summary(mylogit1)

## 
## Call:
## glm(formula = badhealth ~ black + smoking + obese, family = "binomial", 
##     data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1363  -0.7416  -0.6033  -0.6033   1.8939  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.61154    0.01267 -127.17   <2e-16 ***
## black        0.41559    0.02654   15.66   <2e-16 ***
## smoking      0.46121    0.01711   26.96   <2e-16 ***
## obese        0.63720    0.01708   37.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92217  on 84854  degrees of freedom
## Residual deviance: 89842  on 84851  degrees of freedom
##   (146020 observations deleted due to missingness)
## AIC: 89850
## 
## Number of Fisher Scoring iterations: 4

confint(mylogit1)

## Waiting for profiling to be done...

##                  2.5 %     97.5 %
## (Intercept) -1.6364374 -1.5867625
## black        0.3634525  0.4675003
## smoking      0.4276694  0.4947277
## obese        0.6037168  0.6706559

exp(coef(mylogit1))

## (Intercept)       black     smoking       obese 
##   0.1995795   1.5152678   1.5859937   1.8911697

library(broom)
mylogit1%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	OR	LowerOR_Ci	UpperOR_Ci
(Intercept)	-1.612	0.013	-127.171	0.200	0.195	0.205
black	0.416	0.027	15.658	1.515	1.438	1.596
smoking	0.461	0.017	26.961	1.586	1.534	1.640
obese	0.637	0.017	37.314	1.891	1.829	1.956

library(sjPlot)
plot_model(mylogit1,
           title = "odds for getting poor health")

newdata1 <- with(brfss_17, data.frame(black=mean(black,na.rm=TRUE ), smoking=mean(smoking, na.rm=TRUE), obese=mean(obese, na.rm=TRUE)))

predict(mylogit1, newdata1, type="response")

##         1 
## 0.2262858

#white



mylogit2 <- glm(badhealth ~white+ smoking + obese, data = brfss_17, family = "binomial")

summary(mylogit2)

## 
## Call:
## glm(formula = badhealth ~ white + smoking + obese, family = "binomial", 
##     data = brfss_17)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1169  -0.7808  -0.5891  -0.5891   1.9167  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.21019    0.01986  -60.95   <2e-16 ***
## white       -0.45323    0.01916  -23.66   <2e-16 ***
## smoking      0.43454    0.01720   25.26   <2e-16 ***
## obese        0.63159    0.01710   36.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92217  on 84854  degrees of freedom
## Residual deviance: 89534  on 84851  degrees of freedom
##   (146020 observations deleted due to missingness)
## AIC: 89542
## 
## Number of Fisher Scoring iterations: 4

confint(mylogit2)

## Waiting for profiling to be done...

##                  2.5 %     97.5 %
## (Intercept) -1.2491870 -1.1713547
## white       -0.4907447 -0.4156417
## smoking      0.4008093  0.4682495
## obese        0.5980542  0.6650983

exp(coef(mylogit2))

## (Intercept)       white     smoking       obese 
##   0.2981392   0.6355696   1.5442573   1.8805899

library(broom)
mylogit2%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	OR	LowerOR_Ci	UpperOR_Ci
(Intercept)	-1.210	0.020	-60.951	0.298	0.287	0.310
white	-0.453	0.019	-23.656	0.636	0.612	0.660
smoking	0.435	0.017	25.258	1.544	1.493	1.597
obese	0.632	0.017	36.928	1.881	1.819	1.945

library(sjPlot)
plot_model(mylogit2,
           title = "odds for getting poor health")

newdata2 <- with(brfss_17, data.frame(white=mean(white,na.rm=TRUE ), smoking=mean(smoking, na.rm=TRUE), obese=mean(obese, na.rm=TRUE)))

predict(mylogit2, newdata2, type="response")

##         1 
## 0.2285688

#results: The probability seems to be around 0.22 for each race, the issue of overfitting may have caused this similarity amongst each race

homework3

Steve chavez

2/21/2021