The outcome of interest is whether or not there were two or more substances (drugs or alcohol) present in the case. The Predictive model will predict whether or not there are two or more substances in the case.

library(car, quietly = T)
library(stargazer, quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
load("~/Downloads/Stats2-Dem7283 /Stats2/ICPSR_34565-2/DS0001/34565-0001-Data.rda")
dawn2011 <- da34565.0001

#number of substances
dawn2011$morethanonesub <- ifelse(dawn2011$NUMSUBS==1,0,1)

#age
dawn2011$age <- Recode(dawn2011$AGECAT, recodes= "-8:NA")

dawn2011 <- dawn2011 %>%
  filter(is.na(age)==F,
         SEX!= -8,
         is.na(morethanonesub)==F)

modeldawn2011 <- dawn2011 %>%
  select(morethanonesub, age, SEX, METRO)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1234)
train<- createDataPartition(y = modeldawn2011$morethanonesub,
                            p= .80,
                            list=F)

modeldawn2011train <- modeldawn2011[train,]
modeldawn2011test <- modeldawn2011[-train,]

table(modeldawn2011$morethanonesub)
prop.table(table(modeldawn2011$morethanonesub))

summary(modeldawn2011train)

modeldawn2011$morethanonesub <- as.factor(modeldawn2011$morethanonesub)
modeldawn2011test$morethanonesub <- as.factor(modeldawn2011test$morethanonesub)
modeldawn2011train$morethanonesub <- as.factor(modeldawn2011train$morethanonesub)
glm1 <- glm(modeldawn2011train$morethanonesub~factor(SEX)+factor(age)+factor(METRO),
            data=modeldawn2011train[,-1],
            family=binomial)

summary(glm1)
tr_pred <- predict(glm1,
                   newdata = modeldawn2011train,
                   type = "response")

head(tr_pred)
##         1         2         3         6         7         8 
## 0.3727795 0.2896783 0.1770457 0.2211185 0.4467416 0.4657641
tr_predcl <- factor(ifelse(tr_pred>.5,1,0))

library(ggplot2)

pred1 <- data.frame(pr=tr_pred,
                    gr=tr_predcl,
                    modcon=modeldawn2011train$morethanonesub)

pred1 %>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of more than one substance used",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept = .5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1 %>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of more than one substance used",
          subtitle = "Truth")+
  geom_vline(xintercept = .5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(tr_predcl,modeldawn2011train$morethanonesub)
##          
## tr_predcl      0      1
##         0 111020  56532
##         1   6798   8912
confusionMatrix(data=tr_predcl,
                modeldawn2011train$morethanonesub,
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 111020  56532
##          1   6798   8912
##                                           
##                Accuracy : 0.6544          
##                  95% CI : (0.6522, 0.6566)
##     No Information Rate : 0.6429          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.0944          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.13618         
##             Specificity : 0.94230         
##          Pos Pred Value : 0.56728         
##          Neg Pred Value : 0.66260         
##              Prevalence : 0.35711         
##          Detection Rate : 0.04863         
##    Detection Prevalence : 0.08572         
##       Balanced Accuracy : 0.53924         
##                                           
##        'Positive' Class : 1               
## 
tr_predcl<-factor(ifelse(tr_pred>mean(I(modeldawn2011train$morethanonesub==1)),1,0))

pred2 <- data.frame(pr=tr_pred,
                    gr=tr_predcl,
                    modcon=modeldawn2011train$morethanonesub)

pred2 %>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position = "identity",
                 alpha=.358)+
  ggtitle(label= "Probability of more than one substance",
          subtitle="Threshold = mean")+
  geom_vline(xintercept = mean(I(modeldawn2011train$morethanonesub==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2 %>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity",
                 alpha=.358)+
  ggtitle(label= "Probability of more than one substance",
          subtitle="Truth")+
  geom_vline(xintercept = mean(I(modeldawn2011train$morethanonesub==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data=tr_predcl,
                modeldawn2011train$morethanonesub,
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 59946 19957
##          1 57872 45487
##                                          
##                Accuracy : 0.5753         
##                  95% CI : (0.573, 0.5776)
##     No Information Rate : 0.6429         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1806         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.6951         
##             Specificity : 0.5088         
##          Pos Pred Value : 0.4401         
##          Neg Pred Value : 0.7502         
##              Prevalence : 0.3571         
##          Detection Rate : 0.2482         
##    Detection Prevalence : 0.5640         
##       Balanced Accuracy : 0.6019         
##                                          
##        'Positive' Class : 1              
## 
pred_test<- predict(glm1,
                    newdata=modeldawn2011test,
                    type="response")

pred_cl <- factor(ifelse(pred_test > mean(I(modeldawn2011test==1)),1,0))
modeldawn2011test$morethanonesub <- as.factor(modeldawn2011test$morethanonesub)

table(modeldawn2011test$morethanonesub, pred_cl)
##    pred_cl
##         0     1
##   0   593 28644
##   1    54 16524
confusionMatrix(data=pred_cl, modeldawn2011test$morethanonesub)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0   593    54
##          1 28644 16524
##                                           
##                Accuracy : 0.3736          
##                  95% CI : (0.3692, 0.3781)
##     No Information Rate : 0.6382          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0124          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.02028         
##             Specificity : 0.99674         
##          Pos Pred Value : 0.91654         
##          Neg Pred Value : 0.36583         
##              Prevalence : 0.63815         
##          Detection Rate : 0.01294         
##    Detection Prevalence : 0.01412         
##       Balanced Accuracy : 0.50851         
##                                           
##        'Positive' Class : 0               
## 

When using the 50% threshold the correct classification is 65%%. When using the mean as the threshold the correct classification is 57%%. The balanced accuracy for the 50% threshold is 53%. While the balanced accuracy for the mean threshold is 60% So, the balanced accurace is actually better for the mean threshold. When using the test data, the correct classification is 37% and the balanced accuracy is 51%