For this homework, you will perform a logistic regression (or probit regression) of a binary outcome, using the dataset of your choice. I ask you specify a research question for your analysis and generate appropriate predictors in order to examine your question.

My research question for this assignment which is same as last weeks assignments is: How does race/ethnicity and age affect the likelihood of a person visiting a doctors office in the last 12 months?

The predictor variables are age, and race/ethnicity. To undertake this assignment, am going to use the BRFSS 2019 data set.

#load libraries

library(car)
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Warning: package 'survey' was built under R version 4.0.3
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.3
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
## Warning: package 'questionr' was built under R version 4.0.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.1
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
library(haven)
library(readr)

#load brfss Data set

brfss2019 <- read_csv("C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   IDATE = col_character(),
##   IMONTH = col_character(),
##   IDAY = col_character(),
##   COLGHOUS = col_logical(),
##   COLGSEX = col_logical(),
##   SAFETIME = col_logical(),
##   CTELNUM1 = col_logical(),
##   CELLFON5 = col_logical(),
##   CADULT1 = col_logical(),
##   CELLSEX = col_logical(),
##   PVTRESD3 = col_logical(),
##   CCLGHOUS = col_logical(),
##   CSTATE1 = col_logical(),
##   LANDLINE = col_logical(),
##   HHADULT = col_logical(),
##   TOLDCFS = col_logical(),
##   HAVECFS = col_logical(),
##   WORKCFS = col_logical(),
##   TOLDHEPC = col_logical(),
##   TRETHEPC = col_logical()
##   # ... with 88 more columns
## )
## i Use `spec()` for the full column specifications.
## Warning: 2665849 parsing failures.
##  row      col           expected actual                                                       file
## 2224 HHADULT  1/0/T/F/TRUE/FALSE      2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2225 LANDLINE 1/0/T/F/TRUE/FALSE      2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2225 HHADULT  1/0/T/F/TRUE/FALSE      2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2226 LANDLINE 1/0/T/F/TRUE/FALSE      2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2227 CELLSEX  1/0/T/F/TRUE/FALSE      2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## .... ........ .................. ...... ..........................................................
## See problems(...) for more details.

Work on names of variables

nams<- names(brfss2019)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss2019)<-newnames

Recoding Variables

# Health care cost
brfss2019$medcost<-Recode(brfss2019$medcost, recodes ="7:9=NA; 1=1;2=0")

#Age cut into intervals
brfss2019$agec<-cut(brfss2019$age80, breaks=c(0,24,39,59,79,99))

# race/ethnicity
brfss2019$black<-Recode(brfss2019$racegr3, recodes="2=1; 9=NA; else=0")
brfss2019$white<-Recode(brfss2019$racegr3, recodes="1=1; 9=NA; else=0")
brfss2019$other<-Recode(brfss2019$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss2019$hispanic<-Recode(brfss2019$racegr3, recodes="5=1; 9=NA; else=0")

brfss2019$race_eth<-Recode(brfss2019$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#education level
brfss2019$educ<-Recode(brfss2019$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
                      as.factor=T)

# brfss2019$educ<-relevel(brfss2019$educ, ref='2hsgrad')

#Age cut into intervals
brfss2019$agec<-cut(brfss2019$age80, breaks=c(0,29,39,59,79,99))

2. Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.

Survey Design

options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1, strata= ~ststr, weights= ~llcpwt, data = brfss2019 )

Fitting the logistic regression model

fit.logit<-svyglm(medcost ~ race_eth + agec, design = des, family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
## 
## Call:
## svyglm(formula = medcost ~ race_eth + agec, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = brfss2019)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.14799    0.03102 -37.007  < 2e-16 ***
## race_ethnh black     -0.28913    0.03826  -7.558 4.11e-14 ***
## race_ethnh multirace -0.17726    0.06013  -2.948   0.0032 ** 
## race_ethnh other     -0.65988    0.05527 -11.939  < 2e-16 ***
## race_ethnhwhite      -0.61975    0.02828 -21.917  < 2e-16 ***
## agec(29,39]          -0.01105    0.03147  -0.351   0.7255    
## agec(39,59]          -0.13835    0.02804  -4.933 8.09e-07 ***
## agec(59,79]          -0.92393    0.03195 -28.920  < 2e-16 ***
## agec(79,99]          -1.54569    0.06069 -25.468  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9976046)
## 
## Number of Fisher Scoring iterations: 5
library(broom)
## Warning: package 'broom' was built under R version 4.0.3
fit.logit%>%
  tidy()%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) -1.148 0.031 -37.007 0.000
race_ethnh black -0.289 0.038 -7.558 0.000
race_ethnh multirace -0.177 0.060 -2.948 0.003
race_ethnh other -0.660 0.055 -11.939 0.000
race_ethnhwhite -0.620 0.028 -21.917 0.000
agec(29,39] -0.011 0.031 -0.351 0.725
agec(39,59] -0.138 0.028 -4.933 0.000
agec(59,79] -0.924 0.032 -28.920 0.000
agec(79,99] -1.546 0.061 -25.468 0.000

Adding odd ratios and confidence intervals to the estimates

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR LowerOR_Ci UpperOR_Ci
(Intercept) -1.148 0.031 -37.007 0.000 0.317 0.299 0.337
race_ethnh black -0.289 0.038 -7.558 0.000 0.749 0.695 0.807
race_ethnh multirace -0.177 0.060 -2.948 0.003 0.838 0.744 0.942
race_ethnh other -0.660 0.055 -11.939 0.000 0.517 0.464 0.576
race_ethnhwhite -0.620 0.028 -21.917 0.000 0.538 0.509 0.569
agec(29,39] -0.011 0.031 -0.351 0.725 0.989 0.930 1.052
agec(39,59] -0.138 0.028 -4.933 0.000 0.871 0.824 0.920
agec(59,79] -0.924 0.032 -28.920 0.000 0.397 0.373 0.423
agec(79,99] -1.546 0.061 -25.468 0.000 0.213 0.189 0.240

3. Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.

#get a series of predicted probabilites for different "types" of people for each model
#ref_grid will generate all possible combinations of predictors from a model

library(emmeans)
## Warning: package 'emmeans' was built under R version 4.0.3
rg<-ref_grid(fit.logit)


marg_logit<-emmeans(object = rg,
              specs = c("race_eth","agec"),
              type="response" )

knitr::kable(marg_logit,  digits = 4)
race_eth agec prob SE df asymp.LCL asymp.UCL
hispanic (0,29] 0.2409 0.0057 Inf 0.2299 0.2521
nh black (0,29] 0.1920 0.0053 Inf 0.1817 0.2027
nh multirace (0,29] 0.2099 0.0094 Inf 0.1920 0.2290
nh other (0,29] 0.1409 0.0063 Inf 0.1290 0.1537
nhwhite (0,29] 0.1458 0.0030 Inf 0.1401 0.1517
hispanic (29,39] 0.2388 0.0056 Inf 0.2280 0.2501
nh black (29,39] 0.1903 0.0054 Inf 0.1799 0.2011
nh multirace (29,39] 0.2081 0.0096 Inf 0.1899 0.2276
nh other (29,39] 0.1396 0.0063 Inf 0.1277 0.1523
nhwhite (29,39] 0.1445 0.0029 Inf 0.1388 0.1503
hispanic (39,59] 0.2165 0.0050 Inf 0.2068 0.2264
nh black (39,59] 0.1714 0.0045 Inf 0.1629 0.1803
nh multirace (39,59] 0.1879 0.0086 Inf 0.1717 0.2053
nh other (39,59] 0.1250 0.0056 Inf 0.1143 0.1365
nhwhite (39,59] 0.1294 0.0019 Inf 0.1257 0.1333
hispanic (59,79] 0.1119 0.0034 Inf 0.1053 0.1188
nh black (59,79] 0.0862 0.0027 Inf 0.0810 0.0917
nh multirace (59,79] 0.0954 0.0051 Inf 0.0859 0.1059
nh other (59,79] 0.0611 0.0032 Inf 0.0552 0.0676
nhwhite (59,79] 0.0635 0.0013 Inf 0.0610 0.0661
hispanic (79,99] 0.0633 0.0036 Inf 0.0566 0.0709
nh black (79,99] 0.0482 0.0029 Inf 0.0428 0.0542
nh multirace (79,99] 0.0536 0.0040 Inf 0.0463 0.0620
nh other (79,99] 0.0338 0.0024 Inf 0.0293 0.0389
nhwhite (79,99] 0.0351 0.0019 Inf 0.0315 0.0391
#probit model
fit.probit<-svyglm(medcost~race_eth+educ+agec,
                   design=des,
                   family=binomial(link= "probit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
rg<-ref_grid(fit.probit)


marg_probit<-emmeans(object = rg,
              specs = c("race_eth","agec"),
              type="response" )

knitr::kable(marg_probit,  digits = 4)
race_eth agec prob SE df asymp.LCL asymp.UCL
hispanic (0,29] 0.2346 0.0056 Inf 0.2239 0.2457
nh black (0,29] 0.2113 0.0058 Inf 0.2001 0.2228
nh multirace (0,29] 0.2355 0.0101 Inf 0.2161 0.2557
nh other (0,29] 0.1753 0.0072 Inf 0.1616 0.1897
nhwhite (0,29] 0.1731 0.0038 Inf 0.1658 0.1806
hispanic (29,39] 0.2404 0.0055 Inf 0.2297 0.2513
nh black (29,39] 0.2167 0.0059 Inf 0.2052 0.2284
nh multirace (29,39] 0.2412 0.0105 Inf 0.2212 0.2621
nh other (29,39] 0.1801 0.0074 Inf 0.1661 0.1949
nhwhite (29,39] 0.1779 0.0038 Inf 0.1705 0.1855
hispanic (39,59] 0.2136 0.0049 Inf 0.2041 0.2233
nh black (39,59] 0.1915 0.0049 Inf 0.1820 0.2012
nh multirace (39,59] 0.2143 0.0094 Inf 0.1964 0.2332
nh other (39,59] 0.1577 0.0065 Inf 0.1453 0.1708
nhwhite (39,59] 0.1557 0.0026 Inf 0.1506 0.1609
hispanic (59,79] 0.1102 0.0035 Inf 0.1036 0.1171
nh black (59,79] 0.0962 0.0032 Inf 0.0900 0.1026
nh multirace (59,79] 0.1107 0.0063 Inf 0.0989 0.1235
nh other (59,79] 0.0756 0.0041 Inf 0.0680 0.0839
nhwhite (59,79] 0.0744 0.0017 Inf 0.0711 0.0779
hispanic (79,99] 0.0600 0.0034 Inf 0.0535 0.0670
nh black (79,99] 0.0512 0.0031 Inf 0.0454 0.0576
nh multirace (79,99] 0.0603 0.0048 Inf 0.0515 0.0703
nh other (79,99] 0.0388 0.0030 Inf 0.0333 0.0451
nhwhite (79,99] 0.0381 0.0021 Inf 0.0341 0.0424
comps<-as.data.frame(marg_logit)

comps[comps$race_eth=="hispanic" & comps$agec == "(0, 29)" , ]
## [1] race_eth  agec      prob      SE        df        asymp.LCL asymp.UCL
## <0 rows> (or 0-length row.names)
comps[comps$race_eth=="nhwhite" & comps$agec == "(0, 29)" , ]
## [1] race_eth  agec      prob      SE        df        asymp.LCL asymp.UCL
## <0 rows> (or 0-length row.names)

The last two tables did not show up well. However, the s cursory look at the previous logit table indicate that nh black individuals under age 29 have an estimated probability of 1.9% of not visiting the doctors office in the last 12 months due to medical cost while a nhwhite individual has about 1.4% probability of not visiting the doctors office in the last 12 months due to medical cost.