#load libraries
library(car, quietly = T)
library(stargazer, quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
BRFSS19<-read_xpt("C:/Users/adolp/Desktop/Statistics 2/BRFSS/LLCP2019.XPT")
nams<-names(BRFSS19)
head(BRFSS19, n=10)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(BRFSS19)<-newnames
BRFSS19$Alc<-Recode(BRFSS19$acedrink, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Dep<-Recode(BRFSS19$acedeprs, recodes ="7:9=NA; 1=1;2=0") 
BRFSS19$Gen<-Recode(BRFSS19$birthsex, recodes = "7:9=NA; 1=1;2=0")
BRFSS19$Mhlth<-Recode(BRFSS19$ment14d, recodes ="7:9=NA; 1=0;2=1;3=1")
sub<-BRFSS19 %>%
  select(Alc,Dep,Gen,Mhlth,llcpwt, ststr) %>%
  filter( complete.cases( . ))
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data = sub)
#Logit model
fit.logit<-svyglm(Mhlth ~ Alc + Dep + Gen,
                  design = des,
                  family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
## 
## Call:
## svyglm(formula = Mhlth ~ Alc + Dep + Gen, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.63075    0.05942 -10.616  < 2e-16 ***
## Alc          0.55473    0.08934   6.209 5.72e-10 ***
## Dep          1.03377    0.09724  10.631  < 2e-16 ***
## Gen         -0.42431    0.07676  -5.528 3.40e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00086)
## 
## Number of Fisher Scoring iterations: 4
fit.logit1<-svyglm(Mhlth ~ Alc,design= des, family=binomial) #Alc only
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit2<-svyglm(Mhlth~Alc+Dep,design= des, family=binomial) #Alc+Dep
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit3<-svyglm(Mhlth~Alc+Dep+Gen,design= des, family=binomial)#Alc+Dep+Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit1)
## 
## Call:
## svyglm(formula = Mhlth ~ Alc, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.70185    0.04391  -15.98   <2e-16 ***
## Alc          0.87789    0.08397   10.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000187)
## 
## Number of Fisher Scoring iterations: 4
summary(fit.logit2)
## 
## Call:
## svyglm(formula = Mhlth ~ Alc + Dep, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.83874    0.04642 -18.067  < 2e-16 ***
## Alc          0.56946    0.08927   6.379 1.94e-10 ***
## Dep          1.06084    0.09786  10.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000171)
## 
## Number of Fisher Scoring iterations: 4
regTermTest(fit.logit2, test.terms = ~Dep, method="Wald", df = NULL)
## Wald test for Dep
##  in svyglm(formula = Mhlth ~ Alc + Dep, design = des, family = binomial)
## F =  117.5029  on  1  and  5334  df: p= < 2.22e-16
summary(fit.logit3)
## 
## Call:
## svyglm(formula = Mhlth ~ Alc + Dep + Gen, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.63075    0.05942 -10.616  < 2e-16 ***
## Alc          0.55473    0.08934   6.209 5.72e-10 ***
## Dep          1.03377    0.09724  10.631  < 2e-16 ***
## Gen         -0.42431    0.07676  -5.528 3.40e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00086)
## 
## Number of Fisher Scoring iterations: 4
regTermTest(fit.logit3, test.terms=~Gen, method="Wald", df = NULL)
## Wald test for Gen
##  in svyglm(formula = Mhlth ~ Alc + Dep + Gen, design = des, family = binomial)
## F =  30.5551  on  1  and  5333  df: p= 3.3994e-08
myexp<-function(x){exp(x)}
stargazer(fit.logit1, fit.logit2, fit.logit3,type = "html",
          style="demography",
          covariate.labels =c("Alc", "Dep", "Gen", "Mhlth"),
          ci = T,
          apply.coef = myexp ,
          t.auto = F)
Mhlth
Model 1 Model 2 Model 3
Alc 2.406*** 1.767*** 1.741***
(2.241, 2.570) (1.592, 1.942) (1.566, 1.917)
Dep 2.889*** 2.812***
(2.697, 3.081) (2.621, 3.002)
Gen 0.654***
(0.504, 0.805)
Mhlth 0.496*** 0.432*** 0.532***
(0.410, 0.582) (0.341, 0.523) (0.416, 0.649)
N 5,360 5,360 5,360
Log Likelihood -3,437.175 -3,347.616 -3,322.921
AIC 6,878.351 6,701.231 6,653.841
p < .05; p < .01; p < .001

Stratified Models

fit.logitint<-svyglm(Mhlth~Alc*Dep+Gen,design= des, family=binomial)#Mhlth~Alc*Dep+Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~Alc:Dep, method = "Wald", df=NULL)
## Wald test for Alc:Dep
##  in svyglm(formula = Mhlth ~ Alc * Dep + Gen, design = des, family = binomial)
## F =  0.09266719  on  1  and  5332  df: p= 0.76083
fit.logitint<-svyglm(Mhlth~Alc+Dep*Gen,design= des, family=binomial)#Mhlth~Alc+Dep*Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~Dep:Gen, method = "Wald", df=NULL)
## Wald test for Dep:Gen
##  in svyglm(formula = Mhlth ~ Alc + Dep * Gen, design = des, family = binomial)
## F =  3.54029  on  1  and  5332  df: p= 0.059949
fit.logitint<-svyglm(Mhlth~Alc*Gen+Dep,design= des, family=binomial)#Mhlth~Alc+Dep*Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~Alc:Gen, method = "Wald", df=NULL)
## Wald test for Alc:Gen
##  in svyglm(formula = Mhlth ~ Alc * Gen + Dep, design = des, family = binomial)
## F =  1.490828  on  1  and  5332  df: p= 0.22214

Logistic Regression as a Predictive Model Classification methods and models

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster

Creating training and test datasets

set.seed(1115)
train<- createDataPartition(y = sub$Mhlth,
                            p = .80,
                            list=F)

model.dat2train<-sub[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
model.dat2test<-sub[-train,]

table(model.dat2train$Mhlth)
## 
##    0    1 
## 2674 1614
prop.table(table(model.dat2train$Mhlth))
## 
##         0         1 
## 0.6236007 0.3763993
summary(model.dat2train)
##       Alc              Dep             Gen             Mhlth       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.2397   Mean   :0.194   Mean   :0.4764   Mean   :0.3764  
##  3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##      llcpwt             ststr       
##  Min.   :   56.18   Min.   :421011  
##  1st Qu.:  723.58   1st Qu.:421081  
##  Median : 1158.62   Median :422049  
##  Mean   : 1518.73   Mean   :421803  
##  3rd Qu.: 1906.32   3rd Qu.:422079  
##  Max.   :17163.45   Max.   :422089
summary(model.dat2test)
##       Alc              Dep            Gen             Mhlth       
##  Min.   :0.0000   Min.   :0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00   Median :0.0000   Median :0.0000  
##  Mean   :0.2285   Mean   :0.18   Mean   :0.4981   Mean   :0.3722  
##  3rd Qu.:0.0000   3rd Qu.:0.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.00   Max.   :1.0000   Max.   :1.0000  
##      llcpwt             ststr       
##  Min.   :   88.88   Min.   :421011  
##  1st Qu.:  750.14   1st Qu.:421082  
##  Median : 1167.00   Median :422049  
##  Mean   : 1551.03   Mean   :421802  
##  3rd Qu.: 1891.63   3rd Qu.:422079  
##  Max.   :12885.39   Max.   :422089

Logistic regression for classification for training models

attach(model.dat2train)
glm1<-glm(Mhlth~factor(Alc)+factor(Dep)+factor(Gen),
          data=model.dat2train[,-1],
          family = binomial)
summary(glm1)
## 
## Call:
## glm(formula = Mhlth ~ factor(Alc) + factor(Dep) + factor(Gen), 
##     family = binomial, data = model.dat2train[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5709  -0.9109  -0.7717   1.2544   1.6472  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.66524    0.05023 -13.244  < 2e-16 ***
## factor(Alc)1  0.48615    0.07793   6.238 4.42e-10 ***
## factor(Dep)1  1.06891    0.08399  12.726  < 2e-16 ***
## factor(Gen)1 -0.39353    0.06600  -5.963 2.48e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5679.7  on 4287  degrees of freedom
## Residual deviance: 5353.1  on 4284  degrees of freedom
## AIC: 5361.1
## 
## Number of Fisher Scoring iterations: 4
tr_pred<- predict(glm1,
                  newdata = model.dat2train,
                  type = "response")

head(tr_pred)
##         1         2         3         4         5         6 
## 0.3395627 0.3395627 0.2575436 0.2575436 0.3395627 0.3395627

Using .5 decision rule

tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, Mhlth=model.dat2train$Mhlth)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Mental Health", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=Mhlth, fill=Mhlth))+
  ggtitle(label = "Probability of Mental Health", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Rows = Predicted, Columns = Observed.

table( tr_predcl,model.dat2train$Mhlth)
##          
## tr_predcl    0    1
##         0 2357 1099
##         1  317  515
cm1<-confusionMatrix(data = tr_predcl,
                     reference = as.factor(model.dat2train$Mhlth))
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2357 1099
##          1  317  515
##                                           
##                Accuracy : 0.6698          
##                  95% CI : (0.6555, 0.6838)
##     No Information Rate : 0.6236          
##     P-Value [Acc > NIR] : 1.691e-10       
##                                           
##                   Kappa : 0.2218          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8815          
##             Specificity : 0.3191          
##          Pos Pred Value : 0.6820          
##          Neg Pred Value : 0.6190          
##              Prevalence : 0.6236          
##          Detection Rate : 0.5497          
##    Detection Prevalence : 0.8060          
##       Balanced Accuracy : 0.6003          
##                                           
##        'Positive' Class : 0               
## 

Using mean as the decision rule

tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$Mhlth==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred, gr=tr_predcl, Mhlth=model.dat2train$Mhlth)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Mental Health", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2train$Mhlth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=Mhlth))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Mental Health", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2train$Mhlth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = tr_predcl,
                reference = as.factor (model.dat2train$Mhlth),
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2187  958
##          1  487  656
##                                           
##                Accuracy : 0.663           
##                  95% CI : (0.6486, 0.6772)
##     No Information Rate : 0.6236          
##     P-Value [Acc > NIR] : 4.394e-08       
##                                           
##                   Kappa : 0.2381          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4064          
##             Specificity : 0.8179          
##          Pos Pred Value : 0.5739          
##          Neg Pred Value : 0.6954          
##              Prevalence : 0.3764          
##          Detection Rate : 0.1530          
##    Detection Prevalence : 0.2666          
##       Balanced Accuracy : 0.6122          
##                                           
##        'Positive' Class : 1               
## 

Logistic regression for classification for testing models

attach(model.dat2test)
## The following objects are masked from model.dat2train:
## 
##     Alc, Dep, Gen, llcpwt, Mhlth, ststr
glm2<-glm(Mhlth~factor(Alc)+factor(Dep)+factor(Gen),
          data=model.dat2test[,-1],
          family = binomial)
summary(glm2)
## 
## Call:
## glm(formula = Mhlth ~ factor(Alc) + factor(Dep) + factor(Gen), 
##     family = binomial, data = model.dat2test[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4943  -0.9237  -0.7942   1.2299   1.6171  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.6311     0.1004  -6.289 3.20e-10 ***
## factor(Alc)1   0.5086     0.1577   3.224  0.00126 ** 
## factor(Dep)1   0.8423     0.1715   4.910 9.09e-07 ***
## factor(Gen)1  -0.3610     0.1303  -2.770  0.00560 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1415.3  on 1071  degrees of freedom
## Residual deviance: 1356.2  on 1068  degrees of freedom
## AIC: 1364.2
## 
## Number of Fisher Scoring iterations: 4
tr_pred2<- predict(glm2,
                  newdata = model.dat2test,
                  type = "response")

head(tr_pred2)
##         1         2         3         4         5         6 
## 0.3814162 0.2704973 0.3472653 0.3472653 0.2704973 0.2704973

Using .5 decision rule

tr_predcl2<-factor(ifelse(tr_pred2>.5, 1, 0))

library(ggplot2)

pred2<-data.frame(pr=tr_pred2, gr=tr_predcl2, Mhlth=model.dat2test$Mhlth)

pred2%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Mental Health", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=Mhlth, fill=Mhlth))+
  ggtitle(label = "Probability of Mental Health", subtitle = "Pseudo Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl2,model.dat2test$Mhlth)
##           
## tr_predcl2   0   1
##          0 614 309
##          1  59  90
cm2<-confusionMatrix(data = tr_predcl2,
                     reference = as.factor(model.dat2test$Mhlth))
cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 614 309
##          1  59  90
##                                           
##                Accuracy : 0.6567          
##                  95% CI : (0.6274, 0.6851)
##     No Information Rate : 0.6278          
##     P-Value [Acc > NIR] : 0.0265          
##                                           
##                   Kappa : 0.1581          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9123          
##             Specificity : 0.2256          
##          Pos Pred Value : 0.6652          
##          Neg Pred Value : 0.6040          
##              Prevalence : 0.6278          
##          Detection Rate : 0.5728          
##    Detection Prevalence : 0.8610          
##       Balanced Accuracy : 0.5689          
##                                           
##        'Positive' Class : 0               
## 

Using mean as decision rule

pred_test2<-predict(glm2,
                   newdata=model.dat2test,
                   type="response")

pred_cl2<-factor(ifelse(pred_test2 > mean( I(model.dat2test$Mhlth==1)), 1, 0))

table(model.dat2test$Mhlth,pred_cl2)
##    pred_cl2
##       0   1
##   0 509 164
##   1 225 174
confusionMatrix(data = pred_cl2,
                reference = as.factor (model.dat2test$Mhlth ))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 509 225
##          1 164 174
##                                          
##                Accuracy : 0.6371         
##                  95% CI : (0.6075, 0.666)
##     No Information Rate : 0.6278         
##     P-Value [Acc > NIR] : 0.274757       
##                                          
##                   Kappa : 0.1986         
##                                          
##  Mcnemar's Test P-Value : 0.002349       
##                                          
##             Sensitivity : 0.7563         
##             Specificity : 0.4361         
##          Pos Pred Value : 0.6935         
##          Neg Pred Value : 0.5148         
##              Prevalence : 0.6278         
##          Detection Rate : 0.4748         
##    Detection Prevalence : 0.6847         
##       Balanced Accuracy : 0.5962         
##                                          
##        'Positive' Class : 0              
## 

Results

A 80% training/20% test split for your data. Results from the training data using the .5 decision rule and again using the mean of the outcome decision rule are reported below. The model has a 67% accuracy, with a sensitivity of 88% and a specificity of 32%. In other words, the model is pretty good at predicting if you are not experiencing mental health days, but not at predicting if you are experiencing mental health days. In comparing the two model, the overall accuracy slightly changed, but sensitivity decreased and specificity increased. The model has a 66% accuracy, with a sensitivity of 41% and a specificity of 82%. In other words, the model is somewhat good at predicting if you are not experiencing mental health days, and pretty good if you are experiencing mental health days. Results from the test data using the .5 decision rule and again using the mean of the outcome decision rule below. The model has a 66% accuracy, with a sensitivity of 91% and a specificity of 23%. In other words, the model is pretty good at predicting if you are not experiencing mental health days, but not at predicting if you are experiencing mental health days. In comparing the two model, the overall accuracy slightly changed, but sensitivity decreased and specificity increased. The model has a 64% accuracy, with a sensitivity of 76% and a specificity of 44%. In other words, the model is somewhat good at predicting if you are not experiencing mental health days, and not good if you are experiencing mental health days.