Logistic Regression as a Predictive Model

For this short report, I use 2020 Behavioral Risk Factor Surveillance System (BRFSS) data on chronic health conditions. The outcome is a diagnosis of depressive disorder.

### Load libraries
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v stringr 1.4.0
## v tidyr   1.1.4     v forcats 0.5.1
## v readr   2.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
### Load data

brfss20 = readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))

### Fix variable names
names(brfss20) = tolower(gsub(pattern = "_",
                              replacement =  "",
                              x =  names(brfss20)))

Here I recode some of my independent variables and exclude missing data:

general health = genhlth; sex = sex; age = age80; race/ethnicity = racegr3; education = educa; income = incomg

#Depressive disorder
brfss20$depression = as.factor(Recode(brfss20$addepev3,
                    recodes="1=1; 2=0; else=NA"))

#Good versus poor health
brfss20$poorhlth = Recode(brfss20$genhlth,
                           recodes="4:5=1; 1:3=0; else=NA")

#Sex
brfss20$male = as.factor(ifelse(brfss20$sex==1,
                                "Male",
                                "Female"))
#Race/ethnicity
brfss20$black = Recode(brfss20$racegr3,
                       recodes="2=1; 9=NA; else=0")

brfss20$white = Recode(brfss20$racegr3,
                       recodes="1=1; 9=NA; else=0")

brfss20$other = Recode(brfss20$racegr3,
                       recodes="3:4=1; 9=NA; else=0")

brfss20$hispanic = Recode(brfss20$racegr3,
                          recodes="5=1; 9=NA; else=0")

brfss20$race_eth = Recode(brfss20$racegr3,
                          recodes="1='nh white'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
                          as.factor = T)

brfss20$race_eth = relevel(brfss20$race_eth,
                           ref = "nh white")

#Education level
brfss20$educ = Recode(brfss20$educa,
                     recodes="1:2='Less than HS'; 
                     3='Some HS'; 
                     4='HS grad'; 
                     5='Some college'; 
                     6='College grad';9=NA",
                     as.factor=T)

brfss20$educ = fct_relevel(brfss20$educ,'Less than HS','Some HS','HS grad','Some college','College grad' ) 

#Age cut into intervals
brfss20$agec = cut(brfss20$age80, 
                  breaks=c(0,24,39,59,79,99))

#Income grouping
brfss20$inc = ifelse(brfss20$incomg==9,
                     NA,
                     brfss20$incomg)

#Filter cases
brfss20 = brfss20 %>% 
  filter(!is.na(depression),
         !is.na(poorhlth),
         !is.na(male),
         !is.na(race_eth),
         !is.na(educ),
         !is.na(agec),
         !is.na(inc))


model.dat = brfss20 %>%
  dplyr::select(depression,poorhlth, male, race_eth,educ, agec,inc)
knitr::kable(head(model.dat))
| depression | poorhlth | male   | race_eth | educ         | agec    | inc |
|:-----------|---------:|:-------|:---------|:-------------|:--------|----:|
| 1          |        0 | Female | nh white | HS grad      | (0,24]  |   1 |
| 0          |        0 | Male   | nh black | HS grad      | (39,59] |   5 |
| 0          |        0 | Female | nh white | Some college | (39,59] |   5 |
| 0          |        0 | Male   | nh white | Some college | (59,79] |   5 |
| 0          |        0 | Male   | hispanic | College grad | (59,79] |   5 |
| 0          |        0 | Male   | nh white | HS grad      | (39,59] |   2 |

Using caret to create training and test sets

For this predictive model, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.

We use an 80% training fraction, a common default choice.

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1016)
train = createDataPartition(y = model.dat$depression,
                            p = .80,
                            list=F)

model.dat_train = model.dat[train,]
model.dat_test = model.dat[-train,]

table(model.dat_train$depression)
## 
##     0     1 
## 98035 23948
prop.table(table(model.dat_train$depression))
## 
##         0         1 
## 0.8036776 0.1963224
summary(model.dat_train)
##  depression    poorhlth          male               race_eth    
##  0:98035    Min.   :0.0000   Female:64225   nh white    :89299  
##  1:23948    1st Qu.:0.0000   Male  :57758   hispanic    :12756  
##             Median :0.0000                  nh black    :12285  
##             Mean   :0.1329                  nh multirace: 2219  
##             3rd Qu.:0.0000                  nh other    : 5424  
##             Max.   :1.0000                                      
##            educ            agec            inc       
##  Less than HS: 2044   (0,24] : 7461   Min.   :1.000  
##  Some HS     : 4202   (24,39]:24995   1st Qu.:3.000  
##  HS grad     :27642   (39,59]:40898   Median :5.000  
##  Some college:32423   (59,79]:41277   Mean   :4.023  
##  College grad:55672   (79,99]: 7352   3rd Qu.:5.000  
##                                       Max.   :5.000

Logistic regression for classification

Here I use a basic binomial GLM to estimate the probability of an individual having depressive disorder on the basis of general health status, sex, race/ethnicity, level of education, age, and level of income.

This model can be written: \[ln \left ( \frac{Pr(\text{Depressive Disorder})}{1-Pr(\text{Depressive Disorder})} \right ) = X' \beta\]

Which can be converted to the probability scale via the inverse logit transform:

\[Pr(\text{Depressive Disorder}) = \frac{1}{1+exp (-X' \beta)}\]
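As a quick check of this transform (a minimal sketch; eta here is just a vector of arbitrary example values, not model output), base R's plogis() computes the inverse logit:

eta = seq(-4, 4, by = 2)            # arbitrary example linear predictor values
cbind(eta,
      p = plogis(eta),              # base R inverse logit
      check = 1/(1 + exp(-eta)))    # matches the formula above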

glm1 = glm(depression ~ 
             factor(poorhlth)+
             factor(male)+
             factor(race_eth)+
             factor(educ)+
             factor(agec)+
             factor(inc),
          data=model.dat_train,
          family = binomial)

summary(glm1)
## 
## Call:
## glm(formula = depression ~ factor(poorhlth) + factor(male) + 
##     factor(race_eth) + factor(educ) + factor(agec) + factor(inc), 
##     family = binomial, data = model.dat_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6795  -0.6995  -0.5330  -0.3797   2.8573  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -0.561257   0.073392  -7.647 2.05e-14 ***
## factor(poorhlth)1             1.104934   0.020323  54.370  < 2e-16 ***
## factor(male)Male             -0.711068   0.015741 -45.173  < 2e-16 ***
## factor(race_eth)hispanic     -0.754333   0.028167 -26.781  < 2e-16 ***
## factor(race_eth)nh black     -0.721564   0.027708 -26.042  < 2e-16 ***
## factor(race_eth)nh multirace  0.030296   0.051645   0.587    0.557    
## factor(race_eth)nh other     -0.597714   0.040985 -14.584  < 2e-16 ***
## factor(educ)Some HS           0.423706   0.073029   5.802 6.56e-09 ***
## factor(educ)HS grad           0.359163   0.065395   5.492 3.97e-08 ***
## factor(educ)Some college      0.593256   0.065463   9.062  < 2e-16 ***
## factor(educ)College grad      0.471220   0.065785   7.163 7.89e-13 ***
## factor(agec)(24,39]          -0.006321   0.032822  -0.193    0.847    
## factor(agec)(39,59]          -0.232652   0.031769  -7.323 2.42e-13 ***
## factor(agec)(59,79]          -0.601366   0.032114 -18.726  < 2e-16 ***
## factor(agec)(79,99]          -1.581856   0.050219 -31.499  < 2e-16 ***
## factor(inc)2                 -0.310691   0.030996 -10.024  < 2e-16 ***
## factor(inc)3                 -0.469431   0.034933 -13.438  < 2e-16 ***
## factor(inc)4                 -0.603277   0.033096 -18.228  < 2e-16 ***
## factor(inc)5                 -0.960438   0.029386 -32.683  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 120827  on 121982  degrees of freedom
## Residual deviance: 111017  on 121964  degrees of freedom
## AIC: 111055
## 
## Number of Fisher Scoring iterations: 5

We see that almost all predictors are significantly related to our outcome. Next, we see how the model performs in terms of predictive accuracy.

tr_pred = predict(glm1,
                  newdata = model.dat_train,
                  type = "response")

head(tr_pred)
##          1          2          3          4          5          6 
## 0.44964776 0.09613964 0.04240160 0.18900246 0.28374365 0.21701820

These are the estimated probabilities that each individual has depressive disorder, based on the model.

In order to create classes (has depressive disorder vs does not have depressive disorder) we have to use a decision rule:

If \(Pr(y=\text{Depressive Disorder} |X) >.5\), then classify the observation as an individual with depressive disorder, and otherwise not.

tr_predcl = factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1 = data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  depression=model.dat_train$depression)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=depression, fill=depression))+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the confusion matrix.

table(tr_predcl,
       model.dat_train$depression)
##          
## tr_predcl     0     1
##         0 96573 21789
##         1  1462  2159
cm1<-confusionMatrix(data = tr_predcl,
                     reference = model.dat_train$depression )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 96573 21789
##          1  1462  2159
##                                           
##                Accuracy : 0.8094          
##                  95% CI : (0.8072, 0.8116)
##     No Information Rate : 0.8037          
##     P-Value [Acc > NIR] : 2.347e-07       
##                                           
##                   Kappa : 0.1108          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.98509         
##             Specificity : 0.09015         
##          Pos Pred Value : 0.81591         
##          Neg Pred Value : 0.59624         
##              Prevalence : 0.80368         
##          Detection Rate : 0.79169         
##    Detection Prevalence : 0.97032         
##       Balanced Accuracy : 0.53762         
##                                           
##        'Positive' Class : 0               
## 

Overall, the model has approximately 81% accuracy. Note, however, that caret treats 0 (no depressive disorder) as the 'positive' class here: the sensitivity is almost 99%, which is very high, but the specificity is only around 9%, indicating that only about 9% of individuals with depressive disorder were correctly predicted as having it. In other words, the model predicts well when someone does not have depressive disorder, but poorly when they do.
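As a sanity check (a minimal sketch; tab, sens, and spec are my own names), these two rates can be computed directly from the confusion matrix, keeping in mind that 0 is the positive class here:

tab = table(tr_predcl, model.dat_train$depression)
sens = tab["0", "0"] / sum(tab[, "0"])   # P(predict 0 | truly 0)
spec = tab["1", "1"] / sum(tab[, "1"])   # P(predict 1 | truly 1)
c(sensitivity = sens, specificity = spec)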

Next, I use the mean of the response as the cutoff value.

tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat_train$depression==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  modcon=model.dat_train$depression)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat_train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat_train$depression==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = tr_predcl,
                model.dat_train$depression,
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 61990  8547
##          1 36045 15401
##                                           
##                Accuracy : 0.6344          
##                  95% CI : (0.6317, 0.6371)
##     No Information Rate : 0.8037          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1921          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6431          
##             Specificity : 0.6323          
##          Pos Pred Value : 0.2994          
##          Neg Pred Value : 0.8788          
##              Prevalence : 0.1963          
##          Detection Rate : 0.1263          
##    Detection Prevalence : 0.4217          
##       Balanced Accuracy : 0.6377          
##                                           
##        'Positive' Class : 1               
## 

This mean cutoff reduces overall accuracy, but greatly increases sensitivity for detecting depressive disorder (now the 'positive' class) at the cost of decreased specificity.
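To see this tradeoff more generally (a hedged sketch, not part of the original analysis; cuts and tradeoff are my own names), we can trace sensitivity and specificity for the depressive disorder class over a grid of cutoffs:

truth = model.dat_train$depression
cuts = seq(0.1, 0.5, by = 0.1)
tradeoff = sapply(cuts, function(ct){
  cl = ifelse(tr_pred > ct, 1, 0)
  c(cutoff = ct,
    sensitivity = mean(cl[truth == "1"] == 1),   # flag true cases
    specificity = mean(cl[truth == "0"] == 0))   # clear true non-cases
})
round(t(tradeoff), 3)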

Next, I compute the confusion matrix on the test set to evaluate model performance outside of the training data.

pred_test<-predict(glm1,
                   newdata=model.dat_test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean(I(model.dat_test$depression==1)), 1, 0))

table(model.dat_test$depression,pred_cl)
##    pred_cl
##         0     1
##   0 15445  9063
##   1  2147  3839
confusionMatrix(data = pred_cl,model.dat_test$depression)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 15445  2147
##          1  9063  3839
##                                           
##                Accuracy : 0.6324          
##                  95% CI : (0.6269, 0.6378)
##     No Information Rate : 0.8037          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.189           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6302          
##             Specificity : 0.6413          
##          Pos Pred Value : 0.8780          
##          Neg Pred Value : 0.2976          
##              Prevalence : 0.8037          
##          Detection Rate : 0.5065          
##    Detection Prevalence : 0.5769          
##       Balanced Accuracy : 0.6358          
##                                           
##        'Positive' Class : 0               
## 

In the test data, the model performs about as well as it did on the training data.
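For a like-for-like comparison with the mean-cutoff training results above, the test-set confusion matrix can also be recomputed with depressive disorder (1) as the positive class:

confusionMatrix(data = pred_cl,
                reference = model.dat_test$depression,
                positive = "1")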

---
title: "Homework 4: Logistic Regression as a Predictive Model"
author: "Christina Quintanilla-Muñoz"
date:  "`r format(Sys.time(), '%d %B, %Y')`"
output:
   html_document:
    df_print: paged
    fig_height: 7
    fig_width: 7
    toc: yes
    toc_float: yes
    code_download: true
---

## Logistic Regression as a Predictive Model

For this short report, I use 2020 Behavioral Risk Factor Surveillance System (BRFSS) data on chronic health conditions. The outcome is a diagnosis of depressive disorder.


```{r, warning=TRUE}
### Load libraries
library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(tidyverse)

### Load data

brfss20 = readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))

### Fix variable names
names(brfss20) = tolower(gsub(pattern = "_",
                              replacement =  "",
                              x =  names(brfss20)))
```

Here I recode some of my independent variables and exclude missing data:

general health = genhlth
sex = sex
age = age80
race/ethnicity = racegr3
education = educa
income = incomg


```{r}
#Depressive disorder
brfss20$depression = as.factor(Recode(brfss20$addepev3,
                    recodes="1=1; 2=0; else=NA"))

#Good versus poor health
brfss20$poorhlth = Recode(brfss20$genhlth,
                           recodes="4:5=1; 1:3=0; else=NA")

#Sex
brfss20$male = as.factor(ifelse(brfss20$sex==1,
                                "Male",
                                "Female"))
#Race/ethnicity
brfss20$black = Recode(brfss20$racegr3,
                       recodes="2=1; 9=NA; else=0")

brfss20$white = Recode(brfss20$racegr3,
                       recodes="1=1; 9=NA; else=0")

brfss20$other = Recode(brfss20$racegr3,
                       recodes="3:4=1; 9=NA; else=0")

brfss20$hispanic = Recode(brfss20$racegr3,
                          recodes="5=1; 9=NA; else=0")

brfss20$race_eth = Recode(brfss20$racegr3,
                          recodes="1='nh white'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
                          as.factor = T)

brfss20$race_eth = relevel(brfss20$race_eth,
                           ref = "nh white")

#Education level
brfss20$educ = Recode(brfss20$educa,
                     recodes="1:2='Less than HS'; 
                     3='Some HS'; 
                     4='HS grad'; 
                     5='Some college'; 
                     6='College grad';9=NA",
                     as.factor=T)

brfss20$educ = fct_relevel(brfss20$educ,'Less than HS','Some HS','HS grad','Some college','College grad' ) 

#Age cut into intervals
brfss20$agec = cut(brfss20$age80, 
                  breaks=c(0,24,39,59,79,99))

#Income grouping
brfss20$inc = ifelse(brfss20$incomg==9,
                     NA,
                     brfss20$incomg)

#Filter cases
brfss20 = brfss20 %>% 
  filter(!is.na(depression),
         !is.na(poorhlth),
         !is.na(male),
         !is.na(race_eth),
         !is.na(educ),
         !is.na(agec),
         !is.na(inc))


model.dat = brfss20 %>%
  dplyr::select(depression,poorhlth, male, race_eth,educ, agec,inc)

```

```{r, results='asis'}

knitr::kable(head(model.dat))

```

### Using caret to create training and test sets
For this predictive model, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model's predictive ability. 

We use an 80% training fraction, a common default choice.

```{r}
library(caret)
set.seed(1016)
train = createDataPartition(y = model.dat$depression,
                            p = .80,
                            list=F)

model.dat_train = model.dat[train,]
model.dat_test = model.dat[-train,]

table(model.dat_train$depression)
prop.table(table(model.dat_train$depression))
```

```{r}
summary(model.dat_train)
```

## Logistic regression for classification
Here I use a basic binomial GLM to estimate the probability of an individual having depressive disorder on the basis of general health status, sex, race/ethnicity, level of education, age, and level of income.

This model can be written: 
$$ln \left ( \frac{Pr(\text{Depressive Disorder})}{1-Pr(\text{Depressive Disorder})} \right ) = X' \beta$$

Which can be converted to the probability scale via the inverse logit transform:

$$Pr(\text{Depressive Disorder}) = \frac{1}{1+exp (-X' \beta)}$$
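As a quick check of this transform (a minimal sketch; `eta` is just a vector of arbitrary example values, not model output), base R's `plogis()` computes the inverse logit:

```{r}
eta = seq(-4, 4, by = 2)            # arbitrary example linear predictor values
cbind(eta,
      p = plogis(eta),              # base R inverse logit
      check = 1/(1 + exp(-eta)))    # matches the formula above
```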


```{r}
glm1 = glm(depression ~ 
             factor(poorhlth)+
             factor(male)+
             factor(race_eth)+
             factor(educ)+
             factor(agec)+
             factor(inc),
          data=model.dat_train,
          family = binomial)

summary(glm1)
```

We see that almost all predictors are significantly related to our outcome. Next, we see how the model performs in terms of predictive accuracy.

```{r}
tr_pred = predict(glm1,
                  newdata = model.dat_train,
                  type = "response")

head(tr_pred)

```

These are the estimated probabilities that each individual has depressive disorder, based on the model. 

In order to create classes (has depressive disorder vs does not have depressive disorder) we have to use a **decision rule:**

If $Pr(y=\text{Depressive Disorder} |X) >.5$, then classify the observation as an individual with depressive disorder, and otherwise not.


```{r}

tr_predcl = factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1 = data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  depression=model.dat_train$depression)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)


pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=depression, fill=depression))+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Truth")+
  geom_vline(xintercept=.5)

```


Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the **confusion matrix**. 

```{r}
table(tr_predcl,
       model.dat_train$depression)
```

```{r}
cm1<-confusionMatrix(data = tr_predcl,
                     reference = model.dat_train$depression )
cm1
```

Overall, the model has approximately 81% accuracy. Note, however, that caret treats 0 (no depressive disorder) as the 'positive' class here: the sensitivity is almost 99%, which is very high, but the specificity is only around 9%, indicating that only about 9% of individuals with depressive disorder were correctly predicted as having it. In other words, the model predicts well when someone does not have depressive disorder, but poorly when they do.
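As a sanity check (a minimal sketch; `tab`, `sens`, and `spec` are my own names), these two rates can be computed directly from the confusion matrix, keeping in mind that 0 is the positive class here:

```{r}
tab = table(tr_predcl, model.dat_train$depression)
sens = tab["0", "0"] / sum(tab[, "0"])   # P(predict 0 | truly 0)
spec = tab["1", "1"] / sum(tab[, "1"])   # P(predict 1 | truly 1)
c(sensitivity = sens, specificity = spec)
```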

Next, I use the mean of the response as the cutoff value. 

```{r}
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat_train$depression==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  modcon=model.dat_train$depression)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat_train$depression==1)))


pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Depressive Disorder",
          subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat_train$depression==1)))



```


```{r}
confusionMatrix(data = tr_predcl,
                model.dat_train$depression,
                positive = "1" )

```
This mean cutoff reduces overall accuracy, but greatly increases sensitivity for detecting depressive disorder (now the 'positive' class) at the cost of decreased specificity.
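To see this tradeoff more generally (a hedged sketch, not part of the original analysis; `cuts` and `tradeoff` are my own names), we can trace sensitivity and specificity for the depressive disorder class over a grid of cutoffs:

```{r}
truth = model.dat_train$depression
cuts = seq(0.1, 0.5, by = 0.1)
tradeoff = sapply(cuts, function(ct){
  cl = ifelse(tr_pred > ct, 1, 0)
  c(cutoff = ct,
    sensitivity = mean(cl[truth == "1"] == 1),   # flag true cases
    specificity = mean(cl[truth == "0"] == 0))   # clear true non-cases
})
round(t(tradeoff), 3)
```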

Next, I compute the confusion matrix on the test set to evaluate model performance outside of the training data.

```{r}
pred_test<-predict(glm1,
                   newdata=model.dat_test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean(I(model.dat_test$depression==1)), 1, 0))

table(model.dat_test$depression,pred_cl)

confusionMatrix(data = pred_cl,model.dat_test$depression)

```

In the test data, the model performs about as well as it did on the training data.
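For a like-for-like comparison with the mean-cutoff training results above, the test-set confusion matrix can also be recomputed with depressive disorder (1) as the positive class:

```{r}
confusionMatrix(data = pred_cl,
                reference = model.dat_test$depression,
                positive = "1")
```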