For this homework, you will perform a logistic regression (or probit regression) of a binary outcome, using the dataset of your choice. I ask you specify a research question for your analysis and generate appropriate predictors in order to examine your question.
My research question for this assignment which is same as last weeks assignments is: How does race/ethnicity and age affect the likelihood of a person visiting a doctors office in the last 12 months?
The predictor variables are age, and race/ethnicity. To undertake this assignment, am going to use the BRFSS 2019 data set.
#load libraries
library(car)
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Warning: package 'survey' was built under R version 4.0.3
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.3
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
## Warning: package 'questionr' was built under R version 4.0.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
library(haven)
library(readr)
#load brfss Data set
brfss2019 <- read_csv("C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## IDATE = col_character(),
## IMONTH = col_character(),
## IDAY = col_character(),
## COLGHOUS = col_logical(),
## COLGSEX = col_logical(),
## SAFETIME = col_logical(),
## CTELNUM1 = col_logical(),
## CELLFON5 = col_logical(),
## CADULT1 = col_logical(),
## CELLSEX = col_logical(),
## PVTRESD3 = col_logical(),
## CCLGHOUS = col_logical(),
## CSTATE1 = col_logical(),
## LANDLINE = col_logical(),
## HHADULT = col_logical(),
## TOLDCFS = col_logical(),
## HAVECFS = col_logical(),
## WORKCFS = col_logical(),
## TOLDHEPC = col_logical(),
## TRETHEPC = col_logical()
## # ... with 88 more columns
## )
## i Use `spec()` for the full column specifications.
## Warning: 2665849 parsing failures.
## row col expected actual file
## 2224 HHADULT 1/0/T/F/TRUE/FALSE 2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2225 LANDLINE 1/0/T/F/TRUE/FALSE 2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2225 HHADULT 1/0/T/F/TRUE/FALSE 2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2226 LANDLINE 1/0/T/F/TRUE/FALSE 2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## 2227 CELLSEX 1/0/T/F/TRUE/FALSE 2 'C:/Users/chris/OneDrive/Desktop/brfss 2019/brfss2019.csv'
## .... ........ .................. ...... ..........................................................
## See problems(...) for more details.
Work on names of variables
nams<- names(brfss2019)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss2019)<-newnames
Recoding Variables
# Health care cost
brfss2019$medcost<-Recode(brfss2019$medcost, recodes ="7:9=NA; 1=1;2=0")
#Age cut into intervals
brfss2019$agec<-cut(brfss2019$age80, breaks=c(0,24,39,59,79,99))
# race/ethnicity
brfss2019$black<-Recode(brfss2019$racegr3, recodes="2=1; 9=NA; else=0")
brfss2019$white<-Recode(brfss2019$racegr3, recodes="1=1; 9=NA; else=0")
brfss2019$other<-Recode(brfss2019$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss2019$hispanic<-Recode(brfss2019$racegr3, recodes="5=1; 9=NA; else=0")
brfss2019$race_eth<-Recode(brfss2019$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)
#education level
brfss2019$educ<-Recode(brfss2019$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
# brfss2019$educ<-relevel(brfss2019$educ, ref='2hsgrad')
#Age cut into intervals
brfss2019$agec<-cut(brfss2019$age80, breaks=c(0,29,39,59,79,99))
2. Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.
Survey Design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1, strata= ~ststr, weights= ~llcpwt, data = brfss2019 )
Fitting the logistic regression model
fit.logit<-svyglm(medcost ~ race_eth + agec, design = des, family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = medcost ~ race_eth + agec, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = brfss2019)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.14799 0.03102 -37.007 < 2e-16 ***
## race_ethnh black -0.28913 0.03826 -7.558 4.11e-14 ***
## race_ethnh multirace -0.17726 0.06013 -2.948 0.0032 **
## race_ethnh other -0.65988 0.05527 -11.939 < 2e-16 ***
## race_ethnhwhite -0.61975 0.02828 -21.917 < 2e-16 ***
## agec(29,39] -0.01105 0.03147 -0.351 0.7255
## agec(39,59] -0.13835 0.02804 -4.933 8.09e-07 ***
## agec(59,79] -0.92393 0.03195 -28.920 < 2e-16 ***
## agec(79,99] -1.54569 0.06069 -25.468 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9976046)
##
## Number of Fisher Scoring iterations: 5
library(broom)
## Warning: package 'broom' was built under R version 4.0.3
fit.logit%>%
tidy()%>%
knitr::kable(digits = 3)
| (Intercept) |
-1.148 |
0.031 |
-37.007 |
0.000 |
| race_ethnh black |
-0.289 |
0.038 |
-7.558 |
0.000 |
| race_ethnh multirace |
-0.177 |
0.060 |
-2.948 |
0.003 |
| race_ethnh other |
-0.660 |
0.055 |
-11.939 |
0.000 |
| race_ethnhwhite |
-0.620 |
0.028 |
-21.917 |
0.000 |
| agec(29,39] |
-0.011 |
0.031 |
-0.351 |
0.725 |
| agec(39,59] |
-0.138 |
0.028 |
-4.933 |
0.000 |
| agec(59,79] |
-0.924 |
0.032 |
-28.920 |
0.000 |
| agec(79,99] |
-1.546 |
0.061 |
-25.468 |
0.000 |
Adding odd ratios and confidence intervals to the estimates
fit.logit%>%
tidy()%>%
mutate(OR = exp(estimate),
LowerOR_Ci = exp(estimate - 1.96*std.error),
UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
knitr::kable(digits = 3)
| (Intercept) |
-1.148 |
0.031 |
-37.007 |
0.000 |
0.317 |
0.299 |
0.337 |
| race_ethnh black |
-0.289 |
0.038 |
-7.558 |
0.000 |
0.749 |
0.695 |
0.807 |
| race_ethnh multirace |
-0.177 |
0.060 |
-2.948 |
0.003 |
0.838 |
0.744 |
0.942 |
| race_ethnh other |
-0.660 |
0.055 |
-11.939 |
0.000 |
0.517 |
0.464 |
0.576 |
| race_ethnhwhite |
-0.620 |
0.028 |
-21.917 |
0.000 |
0.538 |
0.509 |
0.569 |
| agec(29,39] |
-0.011 |
0.031 |
-0.351 |
0.725 |
0.989 |
0.930 |
1.052 |
| agec(39,59] |
-0.138 |
0.028 |
-4.933 |
0.000 |
0.871 |
0.824 |
0.920 |
| agec(59,79] |
-0.924 |
0.032 |
-28.920 |
0.000 |
0.397 |
0.373 |
0.423 |
| agec(79,99] |
-1.546 |
0.061 |
-25.468 |
0.000 |
0.213 |
0.189 |
0.240 |
3. Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.
#get a series of predicted probabilites for different "types" of people for each model
#ref_grid will generate all possible combinations of predictors from a model
library(emmeans)
## Warning: package 'emmeans' was built under R version 4.0.3
rg<-ref_grid(fit.logit)
marg_logit<-emmeans(object = rg,
specs = c("race_eth","agec"),
type="response" )
knitr::kable(marg_logit, digits = 4)
| hispanic |
(0,29] |
0.2409 |
0.0057 |
Inf |
0.2299 |
0.2521 |
| nh black |
(0,29] |
0.1920 |
0.0053 |
Inf |
0.1817 |
0.2027 |
| nh multirace |
(0,29] |
0.2099 |
0.0094 |
Inf |
0.1920 |
0.2290 |
| nh other |
(0,29] |
0.1409 |
0.0063 |
Inf |
0.1290 |
0.1537 |
| nhwhite |
(0,29] |
0.1458 |
0.0030 |
Inf |
0.1401 |
0.1517 |
| hispanic |
(29,39] |
0.2388 |
0.0056 |
Inf |
0.2280 |
0.2501 |
| nh black |
(29,39] |
0.1903 |
0.0054 |
Inf |
0.1799 |
0.2011 |
| nh multirace |
(29,39] |
0.2081 |
0.0096 |
Inf |
0.1899 |
0.2276 |
| nh other |
(29,39] |
0.1396 |
0.0063 |
Inf |
0.1277 |
0.1523 |
| nhwhite |
(29,39] |
0.1445 |
0.0029 |
Inf |
0.1388 |
0.1503 |
| hispanic |
(39,59] |
0.2165 |
0.0050 |
Inf |
0.2068 |
0.2264 |
| nh black |
(39,59] |
0.1714 |
0.0045 |
Inf |
0.1629 |
0.1803 |
| nh multirace |
(39,59] |
0.1879 |
0.0086 |
Inf |
0.1717 |
0.2053 |
| nh other |
(39,59] |
0.1250 |
0.0056 |
Inf |
0.1143 |
0.1365 |
| nhwhite |
(39,59] |
0.1294 |
0.0019 |
Inf |
0.1257 |
0.1333 |
| hispanic |
(59,79] |
0.1119 |
0.0034 |
Inf |
0.1053 |
0.1188 |
| nh black |
(59,79] |
0.0862 |
0.0027 |
Inf |
0.0810 |
0.0917 |
| nh multirace |
(59,79] |
0.0954 |
0.0051 |
Inf |
0.0859 |
0.1059 |
| nh other |
(59,79] |
0.0611 |
0.0032 |
Inf |
0.0552 |
0.0676 |
| nhwhite |
(59,79] |
0.0635 |
0.0013 |
Inf |
0.0610 |
0.0661 |
| hispanic |
(79,99] |
0.0633 |
0.0036 |
Inf |
0.0566 |
0.0709 |
| nh black |
(79,99] |
0.0482 |
0.0029 |
Inf |
0.0428 |
0.0542 |
| nh multirace |
(79,99] |
0.0536 |
0.0040 |
Inf |
0.0463 |
0.0620 |
| nh other |
(79,99] |
0.0338 |
0.0024 |
Inf |
0.0293 |
0.0389 |
| nhwhite |
(79,99] |
0.0351 |
0.0019 |
Inf |
0.0315 |
0.0391 |
#probit model
fit.probit<-svyglm(medcost~race_eth+educ+agec,
design=des,
family=binomial(link= "probit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
rg<-ref_grid(fit.probit)
marg_probit<-emmeans(object = rg,
specs = c("race_eth","agec"),
type="response" )
knitr::kable(marg_probit, digits = 4)
| hispanic |
(0,29] |
0.2346 |
0.0056 |
Inf |
0.2239 |
0.2457 |
| nh black |
(0,29] |
0.2113 |
0.0058 |
Inf |
0.2001 |
0.2228 |
| nh multirace |
(0,29] |
0.2355 |
0.0101 |
Inf |
0.2161 |
0.2557 |
| nh other |
(0,29] |
0.1753 |
0.0072 |
Inf |
0.1616 |
0.1897 |
| nhwhite |
(0,29] |
0.1731 |
0.0038 |
Inf |
0.1658 |
0.1806 |
| hispanic |
(29,39] |
0.2404 |
0.0055 |
Inf |
0.2297 |
0.2513 |
| nh black |
(29,39] |
0.2167 |
0.0059 |
Inf |
0.2052 |
0.2284 |
| nh multirace |
(29,39] |
0.2412 |
0.0105 |
Inf |
0.2212 |
0.2621 |
| nh other |
(29,39] |
0.1801 |
0.0074 |
Inf |
0.1661 |
0.1949 |
| nhwhite |
(29,39] |
0.1779 |
0.0038 |
Inf |
0.1705 |
0.1855 |
| hispanic |
(39,59] |
0.2136 |
0.0049 |
Inf |
0.2041 |
0.2233 |
| nh black |
(39,59] |
0.1915 |
0.0049 |
Inf |
0.1820 |
0.2012 |
| nh multirace |
(39,59] |
0.2143 |
0.0094 |
Inf |
0.1964 |
0.2332 |
| nh other |
(39,59] |
0.1577 |
0.0065 |
Inf |
0.1453 |
0.1708 |
| nhwhite |
(39,59] |
0.1557 |
0.0026 |
Inf |
0.1506 |
0.1609 |
| hispanic |
(59,79] |
0.1102 |
0.0035 |
Inf |
0.1036 |
0.1171 |
| nh black |
(59,79] |
0.0962 |
0.0032 |
Inf |
0.0900 |
0.1026 |
| nh multirace |
(59,79] |
0.1107 |
0.0063 |
Inf |
0.0989 |
0.1235 |
| nh other |
(59,79] |
0.0756 |
0.0041 |
Inf |
0.0680 |
0.0839 |
| nhwhite |
(59,79] |
0.0744 |
0.0017 |
Inf |
0.0711 |
0.0779 |
| hispanic |
(79,99] |
0.0600 |
0.0034 |
Inf |
0.0535 |
0.0670 |
| nh black |
(79,99] |
0.0512 |
0.0031 |
Inf |
0.0454 |
0.0576 |
| nh multirace |
(79,99] |
0.0603 |
0.0048 |
Inf |
0.0515 |
0.0703 |
| nh other |
(79,99] |
0.0388 |
0.0030 |
Inf |
0.0333 |
0.0451 |
| nhwhite |
(79,99] |
0.0381 |
0.0021 |
Inf |
0.0341 |
0.0424 |
comps<-as.data.frame(marg_logit)
comps[comps$race_eth=="hispanic" & comps$agec == "(0, 29)" , ]
## [1] race_eth agec prob SE df asymp.LCL asymp.UCL
## <0 rows> (or 0-length row.names)
comps[comps$race_eth=="nhwhite" & comps$agec == "(0, 29)" , ]
## [1] race_eth agec prob SE df asymp.LCL asymp.UCL
## <0 rows> (or 0-length row.names)
The last two tables did not show up well. However, the s cursory look at the previous logit table indicate that nh black individuals under age 29 have an estimated probability of 1.9% of not visiting the doctors office in the last 12 months due to medical cost while a nhwhite individual has about 1.4% probability of not visiting the doctors office in the last 12 months due to medical cost.