#load libraries
library(car, quietly = T)
library(stargazer, quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")

names(brfss20sm) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(brfss20sm)))

#My binary outcome variable is Depressive disorder (ADDEPEV3).

brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")
#sex
brfss20sm$male<-as.factor(ifelse(brfss20sm$sex==1,
                                "Male",
                                "Female"))

#Age cut into intervals
brfss20sm$agec<-cut(brfss20sm$age80,
                   breaks=c(0,24,39,59,79,99))

#race/ethnicity
brfss20sm$black<-Recode(brfss20sm$racegr3,
                       recodes="2=1; 9=NA; else=0")
brfss20sm$white<-Recode(brfss20sm$racegr3,
                       recodes="1=1; 9=NA; else=0")
brfss20sm$other<-Recode(brfss20sm$racegr3,
                       recodes="3:4=1; 9=NA; else=0")
brfss20sm$hispanic<-Recode(brfss20sm$racegr3,
                          recodes="5=1; 9=NA; else=0")

brfss20sm$race_eth<-Recode(brfss20sm$racegr3,
                          recodes="1='nhwhite'; 2='nh_black'; 3='nh_other';4='nh_multirace'; 5='hispanic'; else=NA",
                          as.factor = T)

#insurance
brfss20sm$ins<-Recode(brfss20sm$hlthpln1,
                     recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss20sm$inc<-ifelse(brfss20sm$incomg==9,
                     NA,
                     brfss20sm$incomg)

#education level
brfss20sm$educ<-Recode(brfss20sm$educa,
                      recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
                      as.factor=T)

#brfss20sm$educ<-relevel(brfss20sm$educ, ref='2hsgrad')

#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
                        recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
                        as.factor=T)

brfss20sm$employ<-relevel(brfss20sm$employ,
                         ref='Employed')

#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
                       recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
                       as.factor=T)

brfss20sm$marst<-relevel(brfss20sm$marst,
                        ref='married')

#Age cut into intervals
brfss20sm$agec<-cut(brfss20sm$age80,
                   breaks=c(0,29,39,59,79,99))

brfss20sm$ageg<-factor(brfss20sm$ageg)

#BMI, in the brfss20a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's

brfss20sm$bmi<-brfss20sm$bmi5/100

brfss20sm$obese<-ifelse(brfss20sm$bmi>=30,
                       1,
                       0)
brfss20sm<-brfss20sm%>%
  filter(is.na(marst)==F,
         is.na(employ)==F,
         is.na(depression)==F)

##Fit a predictive logistic regression model using as many predictor variables as you think you need:

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
               strata= ~ststr,
               weights= ~mmsawt,
               data = brfss20sm )

Analysis:

First make our analytical dataset and our survey design information

library(dplyr)
sub1<-brfss20sm%>%
  dplyr::select(sex, agec,marst, employ,depression) 
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
knitr::kable(head(sub1))
sex agec marst employ depression
1 (29,39] married Employed 0
2 (0,29] cohab nilf 1
1 (39,59] married Employed 0
2 (79,99] widowed retired 0
2 (39,59] married Employed 0
2 (59,79] married retired 0

3) using caret to create training and test sets.

In predictive models, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.

We use an 80% training fraction, which is standard.

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1115)
train<- createDataPartition(y = sub1$depression,
                            p = .80,
                            list=F)

sub1train<-sub1[train,]
sub1test<-sub1[-train,]

table(sub1train$depression)
## 
##      0      1 
## 122537  28521
prop.table(table(sub1train$depression))
## 
##         0         1 
## 0.8111917 0.1888083
summary(sub1train)
##       sex             agec             marst            employ     
##  Min.   :1.000   (0,29] :19418   married  :77608   Employed:81160  
##  1st Qu.:1.000   (29,39]:20237   cohab    : 6323   nilf    :20711  
##  Median :2.000   (39,59]:48377   divorced :19059   retired :40911  
##  Mean   :1.538   (59,79]:52240   nm       :30521   unable  : 8276  
##  3rd Qu.:2.000   (79,99]:10786   separated: 3093                   
##  Max.   :2.000                   widowed  :14454                   
##    depression    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1888  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
glm1<-glm(depression~factor(agec)+factor(sex)+factor(marst)+factor(employ),
          data=sub1train,
          family = binomial)
summary(glm1)
## 
## Call:
## glm(formula = depression ~ factor(agec) + factor(sex) + factor(marst) + 
##     factor(employ), family = binomial, data = sub1train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5467  -0.6710  -0.5410  -0.4288   2.5740  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.08884    0.02588 -80.715  < 2e-16 ***
## factor(agec)(29,39]     0.06207    0.02621   2.368   0.0179 *  
## factor(agec)(39,59]    -0.14303    0.02456  -5.823 5.79e-09 ***
## factor(agec)(59,79]    -0.41674    0.02852 -14.613  < 2e-16 ***
## factor(agec)(79,99]    -1.18672    0.04574 -25.944  < 2e-16 ***
## factor(sex)2            0.65042    0.01440  45.180  < 2e-16 ***
## factor(marst)cohab      0.49529    0.03271  15.142  < 2e-16 ***
## factor(marst)divorced   0.65792    0.02007  32.782  < 2e-16 ***
## factor(marst)nm         0.39434    0.01993  19.784  < 2e-16 ***
## factor(marst)separated  0.57912    0.04307  13.447  < 2e-16 ***
## factor(marst)widowed    0.27210    0.02743   9.920  < 2e-16 ***
## factor(employ)nilf      0.35600    0.01955  18.214  < 2e-16 ***
## factor(employ)retired   0.28479    0.02311  12.323  < 2e-16 ***
## factor(employ)unable    1.55453    0.02542  61.165  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146372  on 151057  degrees of freedom
## Residual deviance: 136848  on 151044  degrees of freedom
## AIC: 136876
## 
## Number of Fisher Scoring iterations: 4

We see that all the predictors are significantly related to our outcome. Next we see how the model performs in terms of accuracy of prediction. This is new comparison to how we typically use logistic regression.

We use the predict() function to get the estimated class probabilities for each case.

tr_pred<- predict(glm1,
                  newdata = sub1train,
                  type = "response")

head(tr_pred)
##          1          2          3          4          5          6 
## 0.11642162 0.35729558 0.09692512 0.17216369 0.09789900 0.09789900

These are the estimated probability that each of these people suffered from depression, based on the model.

3a) Does changing the decision rule threshold affect your classification accuracy?

In order to create classes (having depression vs doesn’t have depression) we have to use a decision rule. A decision rule is when we choose a cut off point, or threshold value of the probability to classify each observation as belonging to one class or the other.

A basic decision rule is if \(Pr(y=\text{Modern Contraception} |X) >.5\) Then classify the observation as a depressive patient, and otherwise not. This is what we will use here.

tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  depression=sub1train$depression)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Depression",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=depression, fill=depression))+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the confusion matrix.

table( tr_predcl,
       sub1train$depression)
##          
## tr_predcl      0      1
##         0 121192  26676
##         1   1345   1845
cm1<-confusionMatrix(data = tr_predcl,
                    as.factor(sub1train$depression) ) 
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 121192  26676
##          1   1345   1845
##                                           
##                Accuracy : 0.8145          
##                  95% CI : (0.8125, 0.8165)
##     No Information Rate : 0.8112          
##     P-Value [Acc > NIR] : 0.0004998       
##                                           
##                   Kappa : 0.0815          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.98902         
##             Specificity : 0.06469         
##          Pos Pred Value : 0.81960         
##          Neg Pred Value : 0.57837         
##              Prevalence : 0.81119         
##          Detection Rate : 0.80229         
##    Detection Prevalence : 0.97888         
##       Balanced Accuracy : 0.52686         
##                                           
##        'Positive' Class : 0               
## 

Overall, the model has a 81.45%% accuracy, which isn’t bad.The sensitivity is 98%% and specificity is 6%. In other word, the model is pretty good at people without depression than the people have depressive disorder.

We could try a different decision rule, in this case, I use the mean of the response as the cutoff value.

tr_predcl<-factor(ifelse(tr_pred>mean(I(sub1train$depression==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  depression=sub1train$depression)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(sub1train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=depression))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Truth")+
  geom_vline(xintercept=mean(I(sub1train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = tr_predcl,
                as.factor(sub1train$depression),
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 82799 12293
##          1 39738 16228
##                                          
##                Accuracy : 0.6556         
##                  95% CI : (0.6532, 0.658)
##     No Information Rate : 0.8112         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1787         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.5690         
##             Specificity : 0.6757         
##          Pos Pred Value : 0.2900         
##          Neg Pred Value : 0.8707         
##              Prevalence : 0.1888         
##          Detection Rate : 0.1074         
##    Detection Prevalence : 0.3705         
##       Balanced Accuracy : 0.6223         
##                                          
##        'Positive' Class : 1              
## 

It produces the almost same accuracy, but decreases the sensitivity at the cost of increased specificity.

4)Report the % correct classification from the test data using the 0.5 decision rule and again using the mean of the outcome decision rule:

Testing data using mean

pred_test<-predict(glm1,
                   newdata=sub1test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(sub1test$depression==1)), 1, 0))

table(sub1test$depression,pred_cl)
##    pred_cl
##         0     1
##   0 20733  9737
##   1  3213  4081
confusionMatrix(data = pred_cl,as.factor(sub1test$depression) )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 20733  3213
##          1  9737  4081
##                                           
##                Accuracy : 0.6571          
##                  95% CI : (0.6523, 0.6619)
##     No Information Rate : 0.8069          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.179           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6804          
##             Specificity : 0.5595          
##          Pos Pred Value : 0.8658          
##          Neg Pred Value : 0.2953          
##              Prevalence : 0.8069          
##          Detection Rate : 0.5490          
##    Detection Prevalence : 0.6341          
##       Balanced Accuracy : 0.6200          
##                                           
##        'Positive' Class : 0               
## 

In the test data, the model does almost same as it did on the training data using mean, which is ideal.

Testing data using 0.5 decision rule

pred_test2<- predict(glm1,
                     newdata=sub1test,
                     type="response")

pred_cl<-factor(ifelse(pred_test >.5,1,0))

table(sub1test$depression,pred_cl)
##    pred_cl
##         0     1
##   0 30115   355
##   1  6830   464
confusionMatrix(data = pred_cl,as.factor(sub1test$depression) )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 30115  6830
##          1   355   464
##                                           
##                Accuracy : 0.8097          
##                  95% CI : (0.8057, 0.8137)
##     No Information Rate : 0.8069          
##     P-Value [Acc > NIR] : 0.07844         
##                                           
##                   Kappa : 0.0784          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.98835         
##             Specificity : 0.06361         
##          Pos Pred Value : 0.81513         
##          Neg Pred Value : 0.56654         
##              Prevalence : 0.80685         
##          Detection Rate : 0.79745         
##    Detection Prevalence : 0.97831         
##       Balanced Accuracy : 0.52598         
##                                           
##        'Positive' Class : 0               
## 

The % correct classification is 80.97% from the test data using the .5 decision rule.