Homework 4a

Objective:

The exercise uses logistic regression to model a dichotomous outcome variable, i.e., health status, for Texas’s five levels of race-ethnicity categories (predictor variable). Health status is categorized as good or better health and fair and poor health. Also, the age of respondents is added to the model. The data source is 2021 BRFSS survey data.

Loading libraries

library(viridis)

## Loading required package: viridisLite

library(purrr)
library(haven)
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(plyr)

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:purrr':
## 
##     compact

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

library(summarytools)
library(Rmisc)

## Loading required package: lattice

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(usmap)
library(ggthemes)
library(tigris)

## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()    masks plyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ plyr::compact()     masks purrr::compact()
## ✖ dplyr::count()      masks plyr::count()
## ✖ scales::discard()   masks purrr::discard()
## ✖ dplyr::failwith()   masks plyr::failwith()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::id()         masks plyr::id()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::mutate()     masks plyr::mutate()
## ✖ car::recode()       masks dplyr::recode()
## ✖ dplyr::rename()     masks plyr::rename()
## ✖ car::some()         masks purrr::some()
## ✖ dplyr::summarise()  masks plyr::summarise()
## ✖ dplyr::summarize()  masks plyr::summarize()
## ✖ tibble::view()      masks summarytools::view()

library(tidycensus)
library(lme4)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library("ggpubr")

## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:plyr':
## 
##     mutate

library(readxl)
library("tinytex")
library("lubridate")

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library("splitstackshape")
library(psych)

## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(glmmTMB)

## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.5.0
## Current Matrix version is 1.5.1
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package

library(haven)
library(foreign)
library(WriteXLS)

Loading the dataset and selecting required variables

brfss21 = read.xport("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Assignment 4\\LLCP2021.XPT_")
head(brfss21)
names(brfss21)
names(brfss21)<-tolower(names(brfss21))

data1<-subset(brfss21,x_state==48,select=c(x_ageg5yr,x_rfhlth,x_racegr3,iyear,x_state)) 
summary(data1)

##    x_ageg5yr         x_rfhlth       x_racegr3        iyear          
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000   Length:10815      
##  1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:1.000   Class :character  
##  Median : 7.000   Median :1.000   Median :1.000   Mode  :character  
##  Mean   : 7.202   Mean   :1.223   Mean   :2.627                     
##  3rd Qu.:10.000   3rd Qu.:1.000   3rd Qu.:5.000                     
##  Max.   :14.000   Max.   :9.000   Max.   :9.000                     
##     x_state  
##  Min.   :48  
##  1st Qu.:48  
##  Median :48  
##  Mean   :48  
##  3rd Qu.:48  
##  Max.   :48

Recoding variables

library(dplyr)
data2<-data1%>%
mutate(health=car::Recode(x_rfhlth, recodes = "1=1 ; 2=0; else=NA" , as.factor = T))%>%
mutate(race_ethn=ifelse(x_racegr3==1,"white_nonhispan",ifelse(x_racegr3==2,"black_nonhispan",ifelse(x_racegr3==3,"others_nonhispan",ifelse(x_racegr3==4,"multirace_nonhispan",ifelse(x_racegr3==5,"hispan",NA))))))%>%
filter(!is.na(health),!is.na(race_ethn))
summary(data2)

##    x_ageg5yr         x_rfhlth       x_racegr3        iyear          
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000   Length:10426      
##  1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:1.000   Class :character  
##  Median : 7.000   Median :1.000   Median :1.000   Mode  :character  
##  Mean   : 7.157   Mean   :1.199   Mean   :2.405                     
##  3rd Qu.:10.000   3rd Qu.:1.000   3rd Qu.:5.000                     
##  Max.   :14.000   Max.   :2.000   Max.   :5.000                     
##     x_state   health    race_ethn        
##  Min.   :48   0:2072   Length:10426      
##  1st Qu.:48   1:8354   Class :character  
##  Median :48            Mode  :character  
##  Mean   :48                              
##  3rd Qu.:48                              
##  Max.   :48

Recoding to midpoint of interval

data2$age<- recode(data2$x_ageg5yr, "1=21;2=27;3=32;4=37;5=42;6=47;7=52;8=57;9=62;10=67;11=72;12=77;13=82")
data2 <-filter(data2, age != 14)
is.numeric(data2$age)

## [1] TRUE

Description of data

table1::table1(health~race_ethn+age, data = data2, na.rm = TRUE, digits = 1, format.number = TRUE)

## Warning in table1.formula(health ~ race_ethn + age, data = data2, na.rm =
## TRUE, : Unexpected LHS in formula ignored (table1 expects a 1-sided formula)

	Overall (N=10268)
health
0	2049 (20.0%)
1	8219 (80.0%)
race_ethn
black_nonhispan	956 (9.3%)
hispan	2991 (29.1%)
multirace_nonhispan	197 (1.9%)
others_nonhispan	490 (4.8%)
white_nonhispan	5634 (54.9%)
age
Mean (SD)	50 (20)
Median [Min, Max]	50 [20, 80]

nrow(data2)

## [1] 10268

tabyl(data2$health)

##  data2$health    n  percent
##             0 2049 0.199552
##             1 8219 0.800448

tabyl(data2$race_ethn)

##      data2$race_ethn    n    percent
##      black_nonhispan  956 0.09310479
##               hispan 2991 0.29129334
##  multirace_nonhispan  197 0.01918582
##     others_nonhispan  490 0.04772108
##      white_nonhispan 5634 0.54869497

tabyl(data2$age)

##  data2$age   n    percent
##         21 732 0.07128944
##         27 653 0.06359564
##         32 787 0.07664589
##         37 812 0.07908064
##         42 857 0.08346319
##         47 735 0.07158161
##         52 870 0.08472926
##         57 818 0.07966498
##         62 906 0.08823529
##         67 921 0.08969614
##         72 821 0.07995715
##         77 587 0.05716790
##         82 769 0.07489287

two-way contingency table of categorical outcome and predictors to make sure there are not 0 cells

xtabs(~health + race_ethn, data = data2)

##       race_ethn
## health black_nonhispan hispan multirace_nonhispan others_nonhispan
##      0             219    821                  33               74
##      1             737   2170                 164              416
##       race_ethn
## health white_nonhispan
##      0             902
##      1            4732

Model: health by race_ethnicity —-

model1<- glm(health ~ race_ethn, data = data2, family = "binomial")
summary(model1)

## 
## Call:
## glm(formula = health ~ race_ethn, family = "binomial", data = data2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9444   0.5907   0.5907   0.7214   0.8011  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   1.21352    0.07696  15.768  < 2e-16 ***
## race_ethnhispan              -0.24156    0.08719  -2.770 0.005597 ** 
## race_ethnmultirace_nonhispan  0.38984    0.20573   1.895 0.058099 .  
## race_ethnothers_nonhispan     0.51310    0.14779   3.472 0.000517 ***
## race_ethnwhite_nonhispan      0.44397    0.08511   5.217 1.82e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10263  on 10267  degrees of freedom
## Residual deviance: 10095  on 10263  degrees of freedom
## AIC: 10105
## 
## Number of Fisher Scoring iterations: 4

Final Model: health by race_ethnicity and age —-

fullModel <- glm(health ~ race_ethn+age, data = data2, family = "binomial")
summary(fullModel)

## 
## Call:
## glm(formula = health ~ race_ethn + age, family = "binomial", 
##     data = data2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3743   0.3814   0.5739   0.6979   1.1772  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   2.769582   0.116063  23.863  < 2e-16 ***
## race_ethnhispan              -0.406965   0.089521  -4.546 5.47e-06 ***
## race_ethnmultirace_nonhispan  0.282378   0.209788   1.346   0.1783    
## race_ethnothers_nonhispan     0.336736   0.150595   2.236   0.0253 *  
## race_ethnwhite_nonhispan      0.592526   0.087221   6.793 1.10e-11 ***
## age                          -0.028805   0.001524 -18.906  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10263.5  on 10267  degrees of freedom
## Residual deviance:  9711.3  on 10262  degrees of freedom
## AIC: 9723.3
## 
## Number of Fisher Scoring iterations: 4

Interpretation

Being in the Hispanic category ( versus the black non-Hispanic race ethnicity group) decreases the log odds of having good health by (.41). This estimate is statistically highly significant (p<0).
Being in the Others non-Hispanic category ( versus the black non-Hispanic race ethnicity group) increases the log odds of having good health by (0.34). This estimate is statistically significant (p<.01).
Being in the White non-Hispanic category ( versus the black non-Hispanic race ethnicity group) increases the log odds of having good health by (0.59). This estimate is statistically highly significant (p<0).
While The multirace non-Hispanic group is not statistically significant in the model.
For every year of increase in age, the log odds of having good health decreases by (.03). This estimate is statistically highly significant (p<0).

confint(fullModel)

## Waiting for profiling to be done...

##                                    2.5 %      97.5 %
## (Intercept)                   2.54400895  2.99904863
## race_ethnhispan              -0.58398475 -0.23293947
## race_ethnmultirace_nonhispan -0.11735420  0.70731749
## race_ethnothers_nonhispan     0.04565759  0.63667600
## race_ethnwhite_nonhispan      0.42006861  0.76211548
## age                          -0.03180521 -0.02583195

confint.default(fullModel)

##                                    2.5 %      97.5 %
## (Intercept)                   2.54210295  2.99706179
## race_ethnhispan              -0.58242228 -0.23150686
## race_ethnmultirace_nonhispan -0.12879806  0.69355421
## race_ethnothers_nonhispan     0.04157479  0.63189622
## race_ethnwhite_nonhispan      0.42157560  0.76347665
## age                          -0.03179148 -0.02581893

Odds ratios and 95% CI

exp(cbind(OR = coef(fullModel), confint(fullModel)))

## Waiting for profiling to be done...

##                                      OR      2.5 %     97.5 %
## (Intercept)                  15.9519706 12.7306052 20.0664372
## race_ethnhispan               0.6656678  0.5576718  0.7922015
## race_ethnmultirace_nonhispan  1.3262801  0.8892702  2.0285424
## race_ethnothers_nonhispan     1.4003686  1.0467159  1.8901874
## race_ethnwhite_nonhispan      1.8085513  1.5220660  2.1428045
## age                           0.9716057  0.9686953  0.9744988

Interpretation

Hispanic- change in Odds %: (0.67 – 1) * 100 =-33%
This means that a Hispanic person experiences a reduction of 33% in the odds of having good health compared to a Black non-Hispanic.
Other races- change in Odds %: (1.40 – 1) * 100 = 40%
Whites- change in Odds %: (1.81 – 1) * 100 = 81%
At the same time, other races and whites experience an increase of 40% and 81%, respectively, in the odds of having good health compared to Black non-Hispanics.
Age- change in Odds %: (0.97 – 1) * 100 = -3%
This means that each additional increase of one year in age is associated with a 3% decrease in the odds of having good health.

Wald test: testing overall contribution of race_ethnicity

library(aod)
wald.test(b = coef(fullModel), Sigma = vcov(fullModel), Terms = 2:5)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 295.9, df = 4, P(> X2) = 0.0

Interpretation

The chi-squared test statistic of 295.9, with four degrees of freedom is associated with a p-value of 0.0 indicating that the overall effect of race ethnicity variable is statistically significant.

Homework 4a

Anamika Kumar

2023-02-19

Objective:

Loading libraries

Loading the dataset and selecting required variables

Recoding variables

Recoding to midpoint of interval

Description of data

two-way contingency table of categorical outcome and predictors to make sure there are not 0 cells

Model: health by race_ethnicity —-

Final Model: health by race_ethnicity and age —-

Interpretation

Odds ratios and 95% CI

Interpretation

Wald test: testing overall contribution of race_ethnicity

Interpretation