Objective:

The exercise uses logistic regression to model a dichotomous outcome variable, i.e., health status, for Texas’s five levels of race-ethnicity categories (predictor variable). Health status is categorized as good or better health and fair and poor health. Also, the age of respondents is added to the model. The data source is 2021 BRFSS survey data.

Loading libraries

library(viridis)
## Loading required package: viridisLite
library(purrr)
library(haven)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
library(summarytools)
library(Rmisc)
## Loading required package: lattice
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(usmap)
library(ggthemes)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()    masks plyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ plyr::compact()     masks purrr::compact()
## ✖ dplyr::count()      masks plyr::count()
## ✖ scales::discard()   masks purrr::discard()
## ✖ dplyr::failwith()   masks plyr::failwith()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::id()         masks plyr::id()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::mutate()     masks plyr::mutate()
## ✖ car::recode()       masks dplyr::recode()
## ✖ dplyr::rename()     masks plyr::rename()
## ✖ car::some()         masks purrr::some()
## ✖ dplyr::summarise()  masks plyr::summarise()
## ✖ dplyr::summarize()  masks plyr::summarize()
## ✖ tibble::view()      masks summarytools::view()
library(tidycensus)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library("ggpubr")
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:plyr':
## 
##     mutate
library(readxl)
library("tinytex")
library("lubridate")
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library("splitstackshape")
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(glmmTMB)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.5.0
## Current Matrix version is 1.5.1
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
library(haven)
library(foreign)
library(WriteXLS)

Loading the dataset and selecting required variables

brfss21 = read.xport("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Assignment 4\\LLCP2021.XPT_")
head(brfss21)
names(brfss21)
names(brfss21)<-tolower(names(brfss21))
data1<-subset(brfss21,x_state==48,select=c(x_ageg5yr,x_rfhlth,x_racegr3,iyear,x_state)) 
summary(data1)
##    x_ageg5yr         x_rfhlth       x_racegr3        iyear          
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000   Length:10815      
##  1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:1.000   Class :character  
##  Median : 7.000   Median :1.000   Median :1.000   Mode  :character  
##  Mean   : 7.202   Mean   :1.223   Mean   :2.627                     
##  3rd Qu.:10.000   3rd Qu.:1.000   3rd Qu.:5.000                     
##  Max.   :14.000   Max.   :9.000   Max.   :9.000                     
##     x_state  
##  Min.   :48  
##  1st Qu.:48  
##  Median :48  
##  Mean   :48  
##  3rd Qu.:48  
##  Max.   :48

Recoding variables

library(dplyr)
data2<-data1%>%
mutate(health=car::Recode(x_rfhlth, recodes = "1=1 ; 2=0; else=NA" , as.factor = T))%>%
mutate(race_ethn=ifelse(x_racegr3==1,"white_nonhispan",ifelse(x_racegr3==2,"black_nonhispan",ifelse(x_racegr3==3,"others_nonhispan",ifelse(x_racegr3==4,"multirace_nonhispan",ifelse(x_racegr3==5,"hispan",NA))))))%>%
filter(!is.na(health),!is.na(race_ethn))
summary(data2)
##    x_ageg5yr         x_rfhlth       x_racegr3        iyear          
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000   Length:10426      
##  1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:1.000   Class :character  
##  Median : 7.000   Median :1.000   Median :1.000   Mode  :character  
##  Mean   : 7.157   Mean   :1.199   Mean   :2.405                     
##  3rd Qu.:10.000   3rd Qu.:1.000   3rd Qu.:5.000                     
##  Max.   :14.000   Max.   :2.000   Max.   :5.000                     
##     x_state   health    race_ethn        
##  Min.   :48   0:2072   Length:10426      
##  1st Qu.:48   1:8354   Class :character  
##  Median :48            Mode  :character  
##  Mean   :48                              
##  3rd Qu.:48                              
##  Max.   :48

Recoding to midpoint of interval

data2$age<- recode(data2$x_ageg5yr, "1=21;2=27;3=32;4=37;5=42;6=47;7=52;8=57;9=62;10=67;11=72;12=77;13=82")
data2 <-filter(data2, age != 14)
is.numeric(data2$age)
## [1] TRUE

Description of data

table1::table1(health~race_ethn+age, data = data2, na.rm = TRUE, digits = 1, format.number = TRUE)
## Warning in table1.formula(health ~ race_ethn + age, data = data2, na.rm =
## TRUE, : Unexpected LHS in formula ignored (table1 expects a 1-sided formula)
Overall
(N=10268)
health
0 2049 (20.0%)
1 8219 (80.0%)
race_ethn
black_nonhispan 956 (9.3%)
hispan 2991 (29.1%)
multirace_nonhispan 197 (1.9%)
others_nonhispan 490 (4.8%)
white_nonhispan 5634 (54.9%)
age
Mean (SD) 50 (20)
Median [Min, Max] 50 [20, 80]
nrow(data2)
## [1] 10268
tabyl(data2$health)
##  data2$health    n  percent
##             0 2049 0.199552
##             1 8219 0.800448
tabyl(data2$race_ethn)
##      data2$race_ethn    n    percent
##      black_nonhispan  956 0.09310479
##               hispan 2991 0.29129334
##  multirace_nonhispan  197 0.01918582
##     others_nonhispan  490 0.04772108
##      white_nonhispan 5634 0.54869497
tabyl(data2$age)
##  data2$age   n    percent
##         21 732 0.07128944
##         27 653 0.06359564
##         32 787 0.07664589
##         37 812 0.07908064
##         42 857 0.08346319
##         47 735 0.07158161
##         52 870 0.08472926
##         57 818 0.07966498
##         62 906 0.08823529
##         67 921 0.08969614
##         72 821 0.07995715
##         77 587 0.05716790
##         82 769 0.07489287

two-way contingency table of categorical outcome and predictors to make sure there are not 0 cells

xtabs(~health + race_ethn, data = data2)
##       race_ethn
## health black_nonhispan hispan multirace_nonhispan others_nonhispan
##      0             219    821                  33               74
##      1             737   2170                 164              416
##       race_ethn
## health white_nonhispan
##      0             902
##      1            4732

Model: health by race_ethnicity —-

model1<- glm(health ~ race_ethn, data = data2, family = "binomial")
summary(model1)
## 
## Call:
## glm(formula = health ~ race_ethn, family = "binomial", data = data2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9444   0.5907   0.5907   0.7214   0.8011  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   1.21352    0.07696  15.768  < 2e-16 ***
## race_ethnhispan              -0.24156    0.08719  -2.770 0.005597 ** 
## race_ethnmultirace_nonhispan  0.38984    0.20573   1.895 0.058099 .  
## race_ethnothers_nonhispan     0.51310    0.14779   3.472 0.000517 ***
## race_ethnwhite_nonhispan      0.44397    0.08511   5.217 1.82e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10263  on 10267  degrees of freedom
## Residual deviance: 10095  on 10263  degrees of freedom
## AIC: 10105
## 
## Number of Fisher Scoring iterations: 4

Final Model: health by race_ethnicity and age —-

fullModel <- glm(health ~ race_ethn+age, data = data2, family = "binomial")
summary(fullModel)
## 
## Call:
## glm(formula = health ~ race_ethn + age, family = "binomial", 
##     data = data2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3743   0.3814   0.5739   0.6979   1.1772  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   2.769582   0.116063  23.863  < 2e-16 ***
## race_ethnhispan              -0.406965   0.089521  -4.546 5.47e-06 ***
## race_ethnmultirace_nonhispan  0.282378   0.209788   1.346   0.1783    
## race_ethnothers_nonhispan     0.336736   0.150595   2.236   0.0253 *  
## race_ethnwhite_nonhispan      0.592526   0.087221   6.793 1.10e-11 ***
## age                          -0.028805   0.001524 -18.906  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10263.5  on 10267  degrees of freedom
## Residual deviance:  9711.3  on 10262  degrees of freedom
## AIC: 9723.3
## 
## Number of Fisher Scoring iterations: 4

Interpretation

confint(fullModel)
## Waiting for profiling to be done...
##                                    2.5 %      97.5 %
## (Intercept)                   2.54400895  2.99904863
## race_ethnhispan              -0.58398475 -0.23293947
## race_ethnmultirace_nonhispan -0.11735420  0.70731749
## race_ethnothers_nonhispan     0.04565759  0.63667600
## race_ethnwhite_nonhispan      0.42006861  0.76211548
## age                          -0.03180521 -0.02583195
confint.default(fullModel)
##                                    2.5 %      97.5 %
## (Intercept)                   2.54210295  2.99706179
## race_ethnhispan              -0.58242228 -0.23150686
## race_ethnmultirace_nonhispan -0.12879806  0.69355421
## race_ethnothers_nonhispan     0.04157479  0.63189622
## race_ethnwhite_nonhispan      0.42157560  0.76347665
## age                          -0.03179148 -0.02581893

Odds ratios and 95% CI

exp(cbind(OR = coef(fullModel), confint(fullModel)))
## Waiting for profiling to be done...
##                                      OR      2.5 %     97.5 %
## (Intercept)                  15.9519706 12.7306052 20.0664372
## race_ethnhispan               0.6656678  0.5576718  0.7922015
## race_ethnmultirace_nonhispan  1.3262801  0.8892702  2.0285424
## race_ethnothers_nonhispan     1.4003686  1.0467159  1.8901874
## race_ethnwhite_nonhispan      1.8085513  1.5220660  2.1428045
## age                           0.9716057  0.9686953  0.9744988

Interpretation

Wald test: testing overall contribution of race_ethnicity

library(aod)
wald.test(b = coef(fullModel), Sigma = vcov(fullModel), Terms = 2:5)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 295.9, df = 4, P(> X2) = 0.0

Interpretation