The exercise uses logistic regression to model a dichotomous outcome variable, i.e., health status, for Texas’s five levels of race-ethnicity categories (predictor variable). Health status is categorized as good or better health and fair and poor health. Also, the age of respondents is added to the model. The data source is 2021 BRFSS survey data.
library(viridis)
## Loading required package: viridisLite
library(purrr)
library(haven)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:viridis':
##
## viridis_pal
library(summarytools)
library(Rmisc)
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(usmap)
library(ggthemes)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ plyr::compact() masks purrr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ car::recode() masks dplyr::recode()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ car::some() masks purrr::some()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ tibble::view() masks summarytools::view()
library(tidycensus)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library("ggpubr")
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:plyr':
##
## mutate
library(readxl)
library("tinytex")
library("lubridate")
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("splitstackshape")
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following objects are masked from 'package:scales':
##
## alpha, rescale
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(glmmTMB)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.5.0
## Current Matrix version is 1.5.1
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
library(haven)
library(foreign)
library(WriteXLS)
brfss21 = read.xport("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Assignment 4\\LLCP2021.XPT_")
head(brfss21)
names(brfss21)
names(brfss21)<-tolower(names(brfss21))
data1<-subset(brfss21,x_state==48,select=c(x_ageg5yr,x_rfhlth,x_racegr3,iyear,x_state))
summary(data1)
## x_ageg5yr x_rfhlth x_racegr3 iyear
## Min. : 1.000 Min. :1.000 Min. :1.000 Length:10815
## 1st Qu.: 4.000 1st Qu.:1.000 1st Qu.:1.000 Class :character
## Median : 7.000 Median :1.000 Median :1.000 Mode :character
## Mean : 7.202 Mean :1.223 Mean :2.627
## 3rd Qu.:10.000 3rd Qu.:1.000 3rd Qu.:5.000
## Max. :14.000 Max. :9.000 Max. :9.000
## x_state
## Min. :48
## 1st Qu.:48
## Median :48
## Mean :48
## 3rd Qu.:48
## Max. :48
library(dplyr)
data2<-data1%>%
mutate(health=car::Recode(x_rfhlth, recodes = "1=1 ; 2=0; else=NA" , as.factor = T))%>%
mutate(race_ethn=ifelse(x_racegr3==1,"white_nonhispan",ifelse(x_racegr3==2,"black_nonhispan",ifelse(x_racegr3==3,"others_nonhispan",ifelse(x_racegr3==4,"multirace_nonhispan",ifelse(x_racegr3==5,"hispan",NA))))))%>%
filter(!is.na(health),!is.na(race_ethn))
summary(data2)
## x_ageg5yr x_rfhlth x_racegr3 iyear
## Min. : 1.000 Min. :1.000 Min. :1.000 Length:10426
## 1st Qu.: 4.000 1st Qu.:1.000 1st Qu.:1.000 Class :character
## Median : 7.000 Median :1.000 Median :1.000 Mode :character
## Mean : 7.157 Mean :1.199 Mean :2.405
## 3rd Qu.:10.000 3rd Qu.:1.000 3rd Qu.:5.000
## Max. :14.000 Max. :2.000 Max. :5.000
## x_state health race_ethn
## Min. :48 0:2072 Length:10426
## 1st Qu.:48 1:8354 Class :character
## Median :48 Mode :character
## Mean :48
## 3rd Qu.:48
## Max. :48
data2$age<- recode(data2$x_ageg5yr, "1=21;2=27;3=32;4=37;5=42;6=47;7=52;8=57;9=62;10=67;11=72;12=77;13=82")
data2 <-filter(data2, age != 14)
is.numeric(data2$age)
## [1] TRUE
table1::table1(health~race_ethn+age, data = data2, na.rm = TRUE, digits = 1, format.number = TRUE)
## Warning in table1.formula(health ~ race_ethn + age, data = data2, na.rm =
## TRUE, : Unexpected LHS in formula ignored (table1 expects a 1-sided formula)
Overall (N=10268) |
|
---|---|
health | |
0 | 2049 (20.0%) |
1 | 8219 (80.0%) |
race_ethn | |
black_nonhispan | 956 (9.3%) |
hispan | 2991 (29.1%) |
multirace_nonhispan | 197 (1.9%) |
others_nonhispan | 490 (4.8%) |
white_nonhispan | 5634 (54.9%) |
age | |
Mean (SD) | 50 (20) |
Median [Min, Max] | 50 [20, 80] |
nrow(data2)
## [1] 10268
tabyl(data2$health)
## data2$health n percent
## 0 2049 0.199552
## 1 8219 0.800448
tabyl(data2$race_ethn)
## data2$race_ethn n percent
## black_nonhispan 956 0.09310479
## hispan 2991 0.29129334
## multirace_nonhispan 197 0.01918582
## others_nonhispan 490 0.04772108
## white_nonhispan 5634 0.54869497
tabyl(data2$age)
## data2$age n percent
## 21 732 0.07128944
## 27 653 0.06359564
## 32 787 0.07664589
## 37 812 0.07908064
## 42 857 0.08346319
## 47 735 0.07158161
## 52 870 0.08472926
## 57 818 0.07966498
## 62 906 0.08823529
## 67 921 0.08969614
## 72 821 0.07995715
## 77 587 0.05716790
## 82 769 0.07489287
xtabs(~health + race_ethn, data = data2)
## race_ethn
## health black_nonhispan hispan multirace_nonhispan others_nonhispan
## 0 219 821 33 74
## 1 737 2170 164 416
## race_ethn
## health white_nonhispan
## 0 902
## 1 4732
model1<- glm(health ~ race_ethn, data = data2, family = "binomial")
summary(model1)
##
## Call:
## glm(formula = health ~ race_ethn, family = "binomial", data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9444 0.5907 0.5907 0.7214 0.8011
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.21352 0.07696 15.768 < 2e-16 ***
## race_ethnhispan -0.24156 0.08719 -2.770 0.005597 **
## race_ethnmultirace_nonhispan 0.38984 0.20573 1.895 0.058099 .
## race_ethnothers_nonhispan 0.51310 0.14779 3.472 0.000517 ***
## race_ethnwhite_nonhispan 0.44397 0.08511 5.217 1.82e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10263 on 10267 degrees of freedom
## Residual deviance: 10095 on 10263 degrees of freedom
## AIC: 10105
##
## Number of Fisher Scoring iterations: 4
fullModel <- glm(health ~ race_ethn+age, data = data2, family = "binomial")
summary(fullModel)
##
## Call:
## glm(formula = health ~ race_ethn + age, family = "binomial",
## data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3743 0.3814 0.5739 0.6979 1.1772
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.769582 0.116063 23.863 < 2e-16 ***
## race_ethnhispan -0.406965 0.089521 -4.546 5.47e-06 ***
## race_ethnmultirace_nonhispan 0.282378 0.209788 1.346 0.1783
## race_ethnothers_nonhispan 0.336736 0.150595 2.236 0.0253 *
## race_ethnwhite_nonhispan 0.592526 0.087221 6.793 1.10e-11 ***
## age -0.028805 0.001524 -18.906 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10263.5 on 10267 degrees of freedom
## Residual deviance: 9711.3 on 10262 degrees of freedom
## AIC: 9723.3
##
## Number of Fisher Scoring iterations: 4
Being in the Hispanic category ( versus the black non-Hispanic race ethnicity group) decreases the log odds of having good health by (.41). This estimate is statistically highly significant (p<0).
Being in the Others non-Hispanic category ( versus the black non-Hispanic race ethnicity group) increases the log odds of having good health by (0.34). This estimate is statistically significant (p<.01).
Being in the White non-Hispanic category ( versus the black non-Hispanic race ethnicity group) increases the log odds of having good health by (0.59). This estimate is statistically highly significant (p<0).
While The multirace non-Hispanic group is not statistically significant in the model.
For every year of increase in age, the log odds of having good health decreases by (.03). This estimate is statistically highly significant (p<0).
confint(fullModel)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.54400895 2.99904863
## race_ethnhispan -0.58398475 -0.23293947
## race_ethnmultirace_nonhispan -0.11735420 0.70731749
## race_ethnothers_nonhispan 0.04565759 0.63667600
## race_ethnwhite_nonhispan 0.42006861 0.76211548
## age -0.03180521 -0.02583195
confint.default(fullModel)
## 2.5 % 97.5 %
## (Intercept) 2.54210295 2.99706179
## race_ethnhispan -0.58242228 -0.23150686
## race_ethnmultirace_nonhispan -0.12879806 0.69355421
## race_ethnothers_nonhispan 0.04157479 0.63189622
## race_ethnwhite_nonhispan 0.42157560 0.76347665
## age -0.03179148 -0.02581893
exp(cbind(OR = coef(fullModel), confint(fullModel)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 15.9519706 12.7306052 20.0664372
## race_ethnhispan 0.6656678 0.5576718 0.7922015
## race_ethnmultirace_nonhispan 1.3262801 0.8892702 2.0285424
## race_ethnothers_nonhispan 1.4003686 1.0467159 1.8901874
## race_ethnwhite_nonhispan 1.8085513 1.5220660 2.1428045
## age 0.9716057 0.9686953 0.9744988
Hispanic- change in Odds %: (0.67 – 1) * 100 =-33%
This means that a Hispanic person experiences a reduction of 33% in the odds of having good health compared to a Black non-Hispanic.
Other races- change in Odds %: (1.40 – 1) * 100 = 40%
Whites- change in Odds %: (1.81 – 1) * 100 = 81%
At the same time, other races and whites experience an increase of 40% and 81%, respectively, in the odds of having good health compared to Black non-Hispanics.
Age- change in Odds %: (0.97 – 1) * 100 = -3%
This means that each additional increase of one year in age is associated with a 3% decrease in the odds of having good health.
library(aod)
wald.test(b = coef(fullModel), Sigma = vcov(fullModel), Terms = 2:5)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 295.9, df = 4, P(> X2) = 0.0