#load libraries
library(car, quietly = T)
library(stargazer, quietly = T)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")
names(brfss20sm) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20sm)))
#My binary outcome variable is Depressive disorder (ADDEPEV3).
brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")
#sex
brfss20sm$male<-as.factor(ifelse(brfss20sm$sex==1,
"Male",
"Female"))
#Age cut into intervals
brfss20sm$agec<-cut(brfss20sm$age80,
breaks=c(0,24,39,59,79,99))
#race/ethnicity
brfss20sm$black<-Recode(brfss20sm$racegr3,
recodes="2=1; 9=NA; else=0")
brfss20sm$white<-Recode(brfss20sm$racegr3,
recodes="1=1; 9=NA; else=0")
brfss20sm$other<-Recode(brfss20sm$racegr3,
recodes="3:4=1; 9=NA; else=0")
brfss20sm$hispanic<-Recode(brfss20sm$racegr3,
recodes="5=1; 9=NA; else=0")
brfss20sm$race_eth<-Recode(brfss20sm$racegr3,
recodes="1='nhwhite'; 2='nh_black'; 3='nh_other';4='nh_multirace'; 5='hispanic'; else=NA",
as.factor = T)
#insurance
brfss20sm$ins<-Recode(brfss20sm$hlthpln1,
recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss20sm$inc<-ifelse(brfss20sm$incomg==9,
NA,
brfss20sm$incomg)
#education level
brfss20sm$educ<-Recode(brfss20sm$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
#brfss20sm$educ<-relevel(brfss20sm$educ, ref='2hsgrad')
#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20sm$employ<-relevel(brfss20sm$employ,
ref='Employed')
#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss20sm$marst<-relevel(brfss20sm$marst,
ref='married')
#Age cut into intervals
brfss20sm$agec<-cut(brfss20sm$age80,
breaks=c(0,29,39,59,79,99))
brfss20sm$ageg<-factor(brfss20sm$ageg)
#BMI, in the brfss20a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's
brfss20sm$bmi<-brfss20sm$bmi5/100
brfss20sm$obese<-ifelse(brfss20sm$bmi>=30,
1,
0)
brfss20sm<-brfss20sm%>%
filter(is.na(marst)==F,
is.na(employ)==F,
is.na(depression)==F)
##Fit a predictive logistic regression model using as many predictor variables as you think you need:
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
strata= ~ststr,
weights= ~mmsawt,
data = brfss20sm )
First make our analytical dataset and our survey design information
library(dplyr)
sub1<-brfss20sm%>%
dplyr::select(sex, agec,marst, employ,depression)
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
knitr::kable(head(sub1))
| sex | agec | marst | employ | depression |
|---|---|---|---|---|
| 1 | (29,39] | married | Employed | 0 |
| 2 | (0,29] | cohab | nilf | 1 |
| 1 | (39,59] | married | Employed | 0 |
| 2 | (79,99] | widowed | retired | 0 |
| 2 | (39,59] | married | Employed | 0 |
| 2 | (59,79] | married | retired | 0 |
In predictive models, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.
We use an 80% training fraction, which is standard.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1115)
train<- createDataPartition(y = sub1$depression,
p = .80,
list=F)
sub1train<-sub1[train,]
sub1test<-sub1[-train,]
table(sub1train$depression)
##
## 0 1
## 122537 28521
prop.table(table(sub1train$depression))
##
## 0 1
## 0.8111917 0.1888083
summary(sub1train)
## sex agec marst employ
## Min. :1.000 (0,29] :19418 married :77608 Employed:81160
## 1st Qu.:1.000 (29,39]:20237 cohab : 6323 nilf :20711
## Median :2.000 (39,59]:48377 divorced :19059 retired :40911
## Mean :1.538 (59,79]:52240 nm :30521 unable : 8276
## 3rd Qu.:2.000 (79,99]:10786 separated: 3093
## Max. :2.000 widowed :14454
## depression
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1888
## 3rd Qu.:0.0000
## Max. :1.0000
glm1<-glm(depression~factor(agec)+factor(sex)+factor(marst)+factor(employ),
data=sub1train,
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = depression ~ factor(agec) + factor(sex) + factor(marst) +
## factor(employ), family = binomial, data = sub1train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5467 -0.6710 -0.5410 -0.4288 2.5740
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.08884 0.02588 -80.715 < 2e-16 ***
## factor(agec)(29,39] 0.06207 0.02621 2.368 0.0179 *
## factor(agec)(39,59] -0.14303 0.02456 -5.823 5.79e-09 ***
## factor(agec)(59,79] -0.41674 0.02852 -14.613 < 2e-16 ***
## factor(agec)(79,99] -1.18672 0.04574 -25.944 < 2e-16 ***
## factor(sex)2 0.65042 0.01440 45.180 < 2e-16 ***
## factor(marst)cohab 0.49529 0.03271 15.142 < 2e-16 ***
## factor(marst)divorced 0.65792 0.02007 32.782 < 2e-16 ***
## factor(marst)nm 0.39434 0.01993 19.784 < 2e-16 ***
## factor(marst)separated 0.57912 0.04307 13.447 < 2e-16 ***
## factor(marst)widowed 0.27210 0.02743 9.920 < 2e-16 ***
## factor(employ)nilf 0.35600 0.01955 18.214 < 2e-16 ***
## factor(employ)retired 0.28479 0.02311 12.323 < 2e-16 ***
## factor(employ)unable 1.55453 0.02542 61.165 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146372 on 151057 degrees of freedom
## Residual deviance: 136848 on 151044 degrees of freedom
## AIC: 136876
##
## Number of Fisher Scoring iterations: 4
We see that all the predictors are significantly related to our outcome. Next we see how the model performs in terms of accuracy of prediction. This is new comparison to how we typically use logistic regression.
We use the predict() function to get the estimated class probabilities for each case.
tr_pred<- predict(glm1,
newdata = sub1train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.11642162 0.35729558 0.09692512 0.17216369 0.09789900 0.09789900
These are the estimated probability that each of these people suffered from depression, based on the model.
In order to create classes (having depression vs doesn’t have depression) we have to use a decision rule. A decision rule is when we choose a cut off point, or threshold value of the probability to classify each observation as belonging to one class or the other.
A basic decision rule is if \(Pr(y=\text{Modern Contraception} |X) >.5\) Then classify the observation as a depressive patient, and otherwise not. This is what we will use here.
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred,
gr=tr_predcl,
depression=sub1train$depression)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Depression",
subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=depression, fill=depression))+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the confusion matrix.
table( tr_predcl,
sub1train$depression)
##
## tr_predcl 0 1
## 0 121192 26676
## 1 1345 1845
cm1<-confusionMatrix(data = tr_predcl,
as.factor(sub1train$depression) )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 121192 26676
## 1 1345 1845
##
## Accuracy : 0.8145
## 95% CI : (0.8125, 0.8165)
## No Information Rate : 0.8112
## P-Value [Acc > NIR] : 0.0004998
##
## Kappa : 0.0815
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.98902
## Specificity : 0.06469
## Pos Pred Value : 0.81960
## Neg Pred Value : 0.57837
## Prevalence : 0.81119
## Detection Rate : 0.80229
## Detection Prevalence : 0.97888
## Balanced Accuracy : 0.52686
##
## 'Positive' Class : 0
##
Overall, the model has a 81.45%% accuracy, which isn’t bad.The sensitivity is 98%% and specificity is 6%. In other word, the model is pretty good at people without depression than the people have depressive disorder.
We could try a different decision rule, in this case, I use the mean of the response as the cutoff value.
tr_predcl<-factor(ifelse(tr_pred>mean(I(sub1train$depression==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred,
gr=tr_predcl,
depression=sub1train$depression)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(sub1train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=depression))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Depressive Disorder",
subtitle = "Truth")+
geom_vline(xintercept=mean(I(sub1train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
confusionMatrix(data = tr_predcl,
as.factor(sub1train$depression),
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 82799 12293
## 1 39738 16228
##
## Accuracy : 0.6556
## 95% CI : (0.6532, 0.658)
## No Information Rate : 0.8112
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1787
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5690
## Specificity : 0.6757
## Pos Pred Value : 0.2900
## Neg Pred Value : 0.8707
## Prevalence : 0.1888
## Detection Rate : 0.1074
## Detection Prevalence : 0.3705
## Balanced Accuracy : 0.6223
##
## 'Positive' Class : 1
##
It produces the almost same accuracy, but decreases the sensitivity at the cost of increased specificity.
Testing data using mean
pred_test<-predict(glm1,
newdata=sub1test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(sub1test$depression==1)), 1, 0))
table(sub1test$depression,pred_cl)
## pred_cl
## 0 1
## 0 20733 9737
## 1 3213 4081
confusionMatrix(data = pred_cl,as.factor(sub1test$depression) )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 20733 3213
## 1 9737 4081
##
## Accuracy : 0.6571
## 95% CI : (0.6523, 0.6619)
## No Information Rate : 0.8069
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.179
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6804
## Specificity : 0.5595
## Pos Pred Value : 0.8658
## Neg Pred Value : 0.2953
## Prevalence : 0.8069
## Detection Rate : 0.5490
## Detection Prevalence : 0.6341
## Balanced Accuracy : 0.6200
##
## 'Positive' Class : 0
##
In the test data, the model does almost same as it did on the training data using mean, which is ideal.
Testing data using 0.5 decision rule
pred_test2<- predict(glm1,
newdata=sub1test,
type="response")
pred_cl<-factor(ifelse(pred_test >.5,1,0))
table(sub1test$depression,pred_cl)
## pred_cl
## 0 1
## 0 30115 355
## 1 6830 464
confusionMatrix(data = pred_cl,as.factor(sub1test$depression) )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 30115 6830
## 1 355 464
##
## Accuracy : 0.8097
## 95% CI : (0.8057, 0.8137)
## No Information Rate : 0.8069
## P-Value [Acc > NIR] : 0.07844
##
## Kappa : 0.0784
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.98835
## Specificity : 0.06361
## Pos Pred Value : 0.81513
## Neg Pred Value : 0.56654
## Prevalence : 0.80685
## Detection Rate : 0.79745
## Detection Prevalence : 0.97831
## Balanced Accuracy : 0.52598
##
## 'Positive' Class : 0
##
The % correct classification is 80.97% from the test data using the .5 decision rule.