For this short report, I use 2020 Behavioral Risk Factor Surveillance System data on chronic health conditions. I use the outcome of depressive disorder diagnosis as my outcome.
### Load libraries
library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v stringr 1.4.0
## v tidyr 1.1.4 v forcats 0.5.1
## v readr 2.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
### Load data
brfss20 = readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))
### Fix variable names
names(brfss20) = tolower(gsub(pattern = "_",
replacement = "",
x = names(brfss20)))
Here I recode some of my independent variables and exclude missing data:
general health = genhtlh sex = sex age = age race/ethnicity education = educa income = incg
#Depressive disorder
brfss20$depression = as.factor(Recode(brfss20$addepev3,
recodes="1=1; 2=0; else=NA"))
#Good versus poor health
brfss20$poorhlth = Recode(brfss20$genhlth,
recodes="4:5=1; 1:3=0; else=NA")
#Sex
brfss20$male = as.factor(ifelse(brfss20$sex==1,
"Male",
"Female"))
#Race/ethnicity
brfss20$black = Recode(brfss20$racegr3,
recodes="2=1; 9=NA; else=0")
brfss20$white = Recode(brfss20$racegr3,
recodes="1=1; 9=NA; else=0")
brfss20$other = Recode(brfss20$racegr3,
recodes="3:4=1; 9=NA; else=0")
brfss20$hispanic = Recode(brfss20$racegr3,
recodes="5=1; 9=NA; else=0")
brfss20$race_eth = Recode(brfss20$racegr3,
recodes="1='nh white'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss20$race_eth = relevel(brfss20$race_eth,
ref = "nh white")
#Education level
brfss20$educ = Recode(brfss20$educa,
recodes="1:2='Less than HS';
3='Some HS';
4='HS grad';
5='Some college';
6='College grad';9=NA",
as.factor=T)
brfss20$educ = fct_relevel(brfss20$educ,'Less than HS','Some HS','HS grad','Some college','College grad' )
#Age cut into intervals
brfss20$agec = cut(brfss20$age80,
breaks=c(0,24,39,59,79,99))
#Income grouping
brfss20$inc = ifelse(brfss20$incomg==9,
NA,
brfss20$incomg)
#Filter cases
brfss20 = brfss20 %>%
filter(is.na(depression)==F,
is.na(poorhlth)==F,
is.na(male)==F,
is.na(race_eth)==F,
is.na(educ)==F,
is.na(agec)==F,
is.na(inc)==F)
model.dat = brfss20 %>%
dplyr::select(depression,poorhlth, male, race_eth,educ, agec,inc)
knitr::kable(head(model.dat))
| depression | poorhlth | male | race_eth | educ | agec | inc |
|---|---|---|---|---|---|---|
| 1 | 0 | Female | nh white | HS grad | (0,24] | 1 |
| 0 | 0 | Male | nh black | HS grad | (39,59] | 5 |
| 0 | 0 | Female | nh white | Some college | (39,59] | 5 |
| 0 | 0 | Male | nh white | Some college | (59,79] | 5 |
| 0 | 0 | Male | hispanic | College grad | (59,79] | 5 |
| 0 | 0 | Male | nh white | HS grad | (39,59] | 2 |
For this predictive model, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.
We use an 80% training fraction, which is standard.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1016)
train = createDataPartition(y = model.dat$depression,
p = .80,
list=F)
model.dat_train = model.dat[train,]
model.dat_test = model.dat[-train,]
table(model.dat_train$depression)
##
## 0 1
## 98035 23948
prop.table(table(model.dat_train$depression))
##
## 0 1
## 0.8036776 0.1963224
summary(model.dat_train)
## depression poorhlth male race_eth
## 0:98035 Min. :0.0000 Female:64225 nh white :89299
## 1:23948 1st Qu.:0.0000 Male :57758 hispanic :12756
## Median :0.0000 nh black :12285
## Mean :0.1329 nh multirace: 2219
## 3rd Qu.:0.0000 nh other : 5424
## Max. :1.0000
## educ agec inc
## Less than HS: 2044 (0,24] : 7461 Min. :1.000
## Some HS : 4202 (24,39]:24995 1st Qu.:3.000
## HS grad :27642 (39,59]:40898 Median :5.000
## Some college:32423 (59,79]:41277 Mean :4.023
## College grad:55672 (79,99]: 7352 3rd Qu.:5.000
## Max. :5.000
Here I use a basic binomial GLM to estimate the probability of an individual having depressive disorder on the basis of general health status, sex, race/ethnicity, level of education, age, and level of income.
This model can be written: \[ln \left ( \frac{Pr(\text{Depressive Disorer})}{1-Pr(\text{Depressive Disorer})} \right ) = X' \beta\]
Which can be converted to the probability scale via the inverse logit transform:
\[Pr(\text{Depressive Disorer}) = \frac{1}{1+exp (-X' \beta)}\]
glm1 = glm(depression ~
factor(poorhlth)+
factor(male)+
factor(race_eth)+
factor(educ)+
factor(agec)+
factor(inc),
data=model.dat_train,
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = depression ~ factor(poorhlth) + factor(male) +
## factor(race_eth) + factor(educ) + factor(agec) + factor(inc),
## family = binomial, data = model.dat_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6795 -0.6995 -0.5330 -0.3797 2.8573
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.561257 0.073392 -7.647 2.05e-14 ***
## factor(poorhlth)1 1.104934 0.020323 54.370 < 2e-16 ***
## factor(male)Male -0.711068 0.015741 -45.173 < 2e-16 ***
## factor(race_eth)hispanic -0.754333 0.028167 -26.781 < 2e-16 ***
## factor(race_eth)nh black -0.721564 0.027708 -26.042 < 2e-16 ***
## factor(race_eth)nh multirace 0.030296 0.051645 0.587 0.557
## factor(race_eth)nh other -0.597714 0.040985 -14.584 < 2e-16 ***
## factor(educ)Some HS 0.423706 0.073029 5.802 6.56e-09 ***
## factor(educ)HS grad 0.359163 0.065395 5.492 3.97e-08 ***
## factor(educ)Some college 0.593256 0.065463 9.062 < 2e-16 ***
## factor(educ)College grad 0.471220 0.065785 7.163 7.89e-13 ***
## factor(agec)(24,39] -0.006321 0.032822 -0.193 0.847
## factor(agec)(39,59] -0.232652 0.031769 -7.323 2.42e-13 ***
## factor(agec)(59,79] -0.601366 0.032114 -18.726 < 2e-16 ***
## factor(agec)(79,99] -1.581856 0.050219 -31.499 < 2e-16 ***
## factor(inc)2 -0.310691 0.030996 -10.024 < 2e-16 ***
## factor(inc)3 -0.469431 0.034933 -13.438 < 2e-16 ***
## factor(inc)4 -0.603277 0.033096 -18.228 < 2e-16 ***
## factor(inc)5 -0.960438 0.029386 -32.683 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 120827 on 121982 degrees of freedom
## Residual deviance: 111017 on 121964 degrees of freedom
## AIC: 111055
##
## Number of Fisher Scoring iterations: 5
We see that almost all predictors are significantly related to our outcome. Next we see how the model performs in terms of accuracy of prediction.
tr_pred = predict(glm1,
newdata = model.dat_train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.44964776 0.09613964 0.04240160 0.18900246 0.28374365 0.21701820
These are the estimated probability that each individual has depressive disorder, based on the model.
In order to create classes (has depressive disorder vs does not have depressive disorder) we have to use a decision rule:
\(Pr(y=\text{Depressive Disorder} |X) >.5\) Then classify the observation as an individaul with depressive disorder, and otherwise not.
tr_predcl = factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1 = data.frame(pr=tr_pred,
gr=tr_predcl,
depression=model.dat_train$depression)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=depression, fill=depression))+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the confusion matrix.
table(tr_predcl,
model.dat_train$depression)
##
## tr_predcl 0 1
## 0 96573 21789
## 1 1462 2159
cm1<-confusionMatrix(data = tr_predcl,
reference = model.dat_train$depression )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 96573 21789
## 1 1462 2159
##
## Accuracy : 0.8094
## 95% CI : (0.8072, 0.8116)
## No Information Rate : 0.8037
## P-Value [Acc > NIR] : 2.347e-07
##
## Kappa : 0.1108
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.98509
## Specificity : 0.09015
## Pos Pred Value : 0.81591
## Neg Pred Value : 0.59624
## Prevalence : 0.80368
## Detection Rate : 0.79169
## Detection Prevalence : 0.97032
## Balanced Accuracy : 0.53762
##
## 'Positive' Class : 0
##
Overall the model has an approximately 81% accuracy, however the sensitivity is almost 99% which is very high. The specificity is only around 9% indicating 9% of individuals with depressive disorder were predicted as having depressive disorder. In other words, the model can predict well if you don’t have depressive disorder, but not at predicting if you do.
Next, I use the mean of the response as the cutoff value.
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat_train$depression==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred,
gr=tr_predcl,
modcon=model.dat_train$depression)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(model.dat_train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat_train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
confusionMatrix(data = tr_predcl,
model.dat_train$depression,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 61990 8547
## 1 36045 15401
##
## Accuracy : 0.6344
## 95% CI : (0.6317, 0.6371)
## No Information Rate : 0.8037
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1921
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6431
## Specificity : 0.6323
## Pos Pred Value : 0.2994
## Neg Pred Value : 0.8788
## Prevalence : 0.1963
## Detection Rate : 0.1263
## Detection Prevalence : 0.4217
## Balanced Accuracy : 0.6377
##
## 'Positive' Class : 1
##
This mean cutoff produces a reduced accuracy, but decreases the sensitivity at the cost of highly increased specificity.
Next I perform confusion matrices on the test set to evaluate model performance outside of the training data.
pred_test<-predict(glm1,
newdata=model.dat_test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean(I(model.dat_test$depression==1)), 1, 0))
table(model.dat_test$depression,pred_cl)
## pred_cl
## 0 1
## 0 15445 9063
## 1 2147 3839
confusionMatrix(data = pred_cl,model.dat_test$depression)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15445 2147
## 1 9063 3839
##
## Accuracy : 0.6324
## 95% CI : (0.6269, 0.6378)
## No Information Rate : 0.8037
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.189
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6302
## Specificity : 0.6413
## Pos Pred Value : 0.8780
## Neg Pred Value : 0.2976
## Prevalence : 0.8037
## Detection Rate : 0.5065
## Detection Prevalence : 0.5769
## Balanced Accuracy : 0.6358
##
## 'Positive' Class : 0
##
In the test data, the model performs approximately as it did on the training data.