#load libraries
library(car, quietly = T)
library(stargazer, quietly = T)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
BRFSS19<-read_xpt("C:/Users/adolp/Desktop/Statistics 2/BRFSS/LLCP2019.XPT")
nams<-names(BRFSS19)
head(BRFSS19, n=10)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(BRFSS19)<-newnames
BRFSS19$Alc<-Recode(BRFSS19$acedrink, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Dep<-Recode(BRFSS19$acedeprs, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Gen<-Recode(BRFSS19$birthsex, recodes = "7:9=NA; 1=1;2=0")
BRFSS19$Mhlth<-Recode(BRFSS19$ment14d, recodes ="7:9=NA; 1=0;2=1;3=1")
sub<-BRFSS19 %>%
select(Alc,Dep,Gen,Mhlth,llcpwt, ststr) %>%
filter( complete.cases( . ))
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data = sub)
#Logit model
fit.logit<-svyglm(Mhlth ~ Alc + Dep + Gen,
design = des,
family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = Mhlth ~ Alc + Dep + Gen, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.63075 0.05942 -10.616 < 2e-16 ***
## Alc 0.55473 0.08934 6.209 5.72e-10 ***
## Dep 1.03377 0.09724 10.631 < 2e-16 ***
## Gen -0.42431 0.07676 -5.528 3.40e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.00086)
##
## Number of Fisher Scoring iterations: 4
fit.logit1<-svyglm(Mhlth ~ Alc,design= des, family=binomial) #Alc only
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit2<-svyglm(Mhlth~Alc+Dep,design= des, family=binomial) #Alc+Dep
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit3<-svyglm(Mhlth~Alc+Dep+Gen,design= des, family=binomial)#Alc+Dep+Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit1)
##
## Call:
## svyglm(formula = Mhlth ~ Alc, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.70185 0.04391 -15.98 <2e-16 ***
## Alc 0.87789 0.08397 10.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000187)
##
## Number of Fisher Scoring iterations: 4
summary(fit.logit2)
##
## Call:
## svyglm(formula = Mhlth ~ Alc + Dep, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.83874 0.04642 -18.067 < 2e-16 ***
## Alc 0.56946 0.08927 6.379 1.94e-10 ***
## Dep 1.06084 0.09786 10.840 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000171)
##
## Number of Fisher Scoring iterations: 4
regTermTest(fit.logit2, test.terms = ~Dep, method="Wald", df = NULL)
## Wald test for Dep
## in svyglm(formula = Mhlth ~ Alc + Dep, design = des, family = binomial)
## F = 117.5029 on 1 and 5334 df: p= < 2.22e-16
summary(fit.logit3)
##
## Call:
## svyglm(formula = Mhlth ~ Alc + Dep + Gen, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.63075 0.05942 -10.616 < 2e-16 ***
## Alc 0.55473 0.08934 6.209 5.72e-10 ***
## Dep 1.03377 0.09724 10.631 < 2e-16 ***
## Gen -0.42431 0.07676 -5.528 3.40e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.00086)
##
## Number of Fisher Scoring iterations: 4
regTermTest(fit.logit3, test.terms=~Gen, method="Wald", df = NULL)
## Wald test for Gen
## in svyglm(formula = Mhlth ~ Alc + Dep + Gen, design = des, family = binomial)
## F = 30.5551 on 1 and 5333 df: p= 3.3994e-08
myexp<-function(x){exp(x)}
stargazer(fit.logit1, fit.logit2, fit.logit3,type = "html",
style="demography",
covariate.labels =c("Alc", "Dep", "Gen", "Mhlth"),
ci = T,
apply.coef = myexp ,
t.auto = F)
| Mhlth | |||
| Model 1 | Model 2 | Model 3 | |
| Alc | 2.406*** | 1.767*** | 1.741*** |
| (2.241, 2.570) | (1.592, 1.942) | (1.566, 1.917) | |
| Dep | 2.889*** | 2.812*** | |
| (2.697, 3.081) | (2.621, 3.002) | ||
| Gen | 0.654*** | ||
| (0.504, 0.805) | |||
| Mhlth | 0.496*** | 0.432*** | 0.532*** |
| (0.410, 0.582) | (0.341, 0.523) | (0.416, 0.649) | |
| N | 5,360 | 5,360 | 5,360 |
| Log Likelihood | -3,437.175 | -3,347.616 | -3,322.921 |
| AIC | 6,878.351 | 6,701.231 | 6,653.841 |
| p < .05; p < .01; p < .001 | |||
Stratified Models
fit.logitint<-svyglm(Mhlth~Alc*Dep+Gen,design= des, family=binomial)#Mhlth~Alc*Dep+Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~Alc:Dep, method = "Wald", df=NULL)
## Wald test for Alc:Dep
## in svyglm(formula = Mhlth ~ Alc * Dep + Gen, design = des, family = binomial)
## F = 0.09266719 on 1 and 5332 df: p= 0.76083
fit.logitint<-svyglm(Mhlth~Alc+Dep*Gen,design= des, family=binomial)#Mhlth~Alc+Dep*Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~Dep:Gen, method = "Wald", df=NULL)
## Wald test for Dep:Gen
## in svyglm(formula = Mhlth ~ Alc + Dep * Gen, design = des, family = binomial)
## F = 3.54029 on 1 and 5332 df: p= 0.059949
fit.logitint<-svyglm(Mhlth~Alc*Gen+Dep,design= des, family=binomial)#Mhlth~Alc+Dep*Gen
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~Alc:Gen, method = "Wald", df=NULL)
## Wald test for Alc:Gen
## in svyglm(formula = Mhlth ~ Alc * Gen + Dep, design = des, family = binomial)
## F = 1.490828 on 1 and 5332 df: p= 0.22214
Logistic Regression as a Predictive Model Classification methods and models
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
Creating training and test datasets
set.seed(1115)
train<- createDataPartition(y = sub$Mhlth,
p = .80,
list=F)
model.dat2train<-sub[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
model.dat2test<-sub[-train,]
table(model.dat2train$Mhlth)
##
## 0 1
## 2674 1614
prop.table(table(model.dat2train$Mhlth))
##
## 0 1
## 0.6236007 0.3763993
summary(model.dat2train)
## Alc Dep Gen Mhlth
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.2397 Mean :0.194 Mean :0.4764 Mean :0.3764
## 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## llcpwt ststr
## Min. : 56.18 Min. :421011
## 1st Qu.: 723.58 1st Qu.:421081
## Median : 1158.62 Median :422049
## Mean : 1518.73 Mean :421803
## 3rd Qu.: 1906.32 3rd Qu.:422079
## Max. :17163.45 Max. :422089
summary(model.dat2test)
## Alc Dep Gen Mhlth
## Min. :0.0000 Min. :0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00 Median :0.0000 Median :0.0000
## Mean :0.2285 Mean :0.18 Mean :0.4981 Mean :0.3722
## 3rd Qu.:0.0000 3rd Qu.:0.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00 Max. :1.0000 Max. :1.0000
## llcpwt ststr
## Min. : 88.88 Min. :421011
## 1st Qu.: 750.14 1st Qu.:421082
## Median : 1167.00 Median :422049
## Mean : 1551.03 Mean :421802
## 3rd Qu.: 1891.63 3rd Qu.:422079
## Max. :12885.39 Max. :422089
Logistic regression for classification for training models
attach(model.dat2train)
glm1<-glm(Mhlth~factor(Alc)+factor(Dep)+factor(Gen),
data=model.dat2train[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = Mhlth ~ factor(Alc) + factor(Dep) + factor(Gen),
## family = binomial, data = model.dat2train[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5709 -0.9109 -0.7717 1.2544 1.6472
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.66524 0.05023 -13.244 < 2e-16 ***
## factor(Alc)1 0.48615 0.07793 6.238 4.42e-10 ***
## factor(Dep)1 1.06891 0.08399 12.726 < 2e-16 ***
## factor(Gen)1 -0.39353 0.06600 -5.963 2.48e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5679.7 on 4287 degrees of freedom
## Residual deviance: 5353.1 on 4284 degrees of freedom
## AIC: 5361.1
##
## Number of Fisher Scoring iterations: 4
tr_pred<- predict(glm1,
newdata = model.dat2train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.3395627 0.3395627 0.2575436 0.2575436 0.3395627 0.3395627
Using .5 decision rule
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred, gr=tr_predcl, Mhlth=model.dat2train$Mhlth)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Mental Health", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=Mhlth, fill=Mhlth))+
ggtitle(label = "Probability of Mental Health", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Rows = Predicted, Columns = Observed.
table( tr_predcl,model.dat2train$Mhlth)
##
## tr_predcl 0 1
## 0 2357 1099
## 1 317 515
cm1<-confusionMatrix(data = tr_predcl,
reference = as.factor(model.dat2train$Mhlth))
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2357 1099
## 1 317 515
##
## Accuracy : 0.6698
## 95% CI : (0.6555, 0.6838)
## No Information Rate : 0.6236
## P-Value [Acc > NIR] : 1.691e-10
##
## Kappa : 0.2218
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8815
## Specificity : 0.3191
## Pos Pred Value : 0.6820
## Neg Pred Value : 0.6190
## Prevalence : 0.6236
## Detection Rate : 0.5497
## Detection Prevalence : 0.8060
## Balanced Accuracy : 0.6003
##
## 'Positive' Class : 0
##
Using mean as the decision rule
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$Mhlth==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred, gr=tr_predcl, Mhlth=model.dat2train$Mhlth)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Mental Health", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(model.dat2train$Mhlth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=Mhlth))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Mental Health", subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2train$Mhlth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
confusionMatrix(data = tr_predcl,
reference = as.factor (model.dat2train$Mhlth),
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2187 958
## 1 487 656
##
## Accuracy : 0.663
## 95% CI : (0.6486, 0.6772)
## No Information Rate : 0.6236
## P-Value [Acc > NIR] : 4.394e-08
##
## Kappa : 0.2381
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4064
## Specificity : 0.8179
## Pos Pred Value : 0.5739
## Neg Pred Value : 0.6954
## Prevalence : 0.3764
## Detection Rate : 0.1530
## Detection Prevalence : 0.2666
## Balanced Accuracy : 0.6122
##
## 'Positive' Class : 1
##
Logistic regression for classification for testing models
attach(model.dat2test)
## The following objects are masked from model.dat2train:
##
## Alc, Dep, Gen, llcpwt, Mhlth, ststr
glm2<-glm(Mhlth~factor(Alc)+factor(Dep)+factor(Gen),
data=model.dat2test[,-1],
family = binomial)
summary(glm2)
##
## Call:
## glm(formula = Mhlth ~ factor(Alc) + factor(Dep) + factor(Gen),
## family = binomial, data = model.dat2test[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4943 -0.9237 -0.7942 1.2299 1.6171
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6311 0.1004 -6.289 3.20e-10 ***
## factor(Alc)1 0.5086 0.1577 3.224 0.00126 **
## factor(Dep)1 0.8423 0.1715 4.910 9.09e-07 ***
## factor(Gen)1 -0.3610 0.1303 -2.770 0.00560 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1415.3 on 1071 degrees of freedom
## Residual deviance: 1356.2 on 1068 degrees of freedom
## AIC: 1364.2
##
## Number of Fisher Scoring iterations: 4
tr_pred2<- predict(glm2,
newdata = model.dat2test,
type = "response")
head(tr_pred2)
## 1 2 3 4 5 6
## 0.3814162 0.2704973 0.3472653 0.3472653 0.2704973 0.2704973
Using .5 decision rule
tr_predcl2<-factor(ifelse(tr_pred2>.5, 1, 0))
library(ggplot2)
pred2<-data.frame(pr=tr_pred2, gr=tr_predcl2, Mhlth=model.dat2test$Mhlth)
pred2%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Mental Health", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot()+
geom_histogram(aes(x=pr, color=Mhlth, fill=Mhlth))+
ggtitle(label = "Probability of Mental Health", subtitle = "Pseudo Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table( tr_predcl2,model.dat2test$Mhlth)
##
## tr_predcl2 0 1
## 0 614 309
## 1 59 90
cm2<-confusionMatrix(data = tr_predcl2,
reference = as.factor(model.dat2test$Mhlth))
cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 614 309
## 1 59 90
##
## Accuracy : 0.6567
## 95% CI : (0.6274, 0.6851)
## No Information Rate : 0.6278
## P-Value [Acc > NIR] : 0.0265
##
## Kappa : 0.1581
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9123
## Specificity : 0.2256
## Pos Pred Value : 0.6652
## Neg Pred Value : 0.6040
## Prevalence : 0.6278
## Detection Rate : 0.5728
## Detection Prevalence : 0.8610
## Balanced Accuracy : 0.5689
##
## 'Positive' Class : 0
##
Using mean as decision rule
pred_test2<-predict(glm2,
newdata=model.dat2test,
type="response")
pred_cl2<-factor(ifelse(pred_test2 > mean( I(model.dat2test$Mhlth==1)), 1, 0))
table(model.dat2test$Mhlth,pred_cl2)
## pred_cl2
## 0 1
## 0 509 164
## 1 225 174
confusionMatrix(data = pred_cl2,
reference = as.factor (model.dat2test$Mhlth ))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 509 225
## 1 164 174
##
## Accuracy : 0.6371
## 95% CI : (0.6075, 0.666)
## No Information Rate : 0.6278
## P-Value [Acc > NIR] : 0.274757
##
## Kappa : 0.1986
##
## Mcnemar's Test P-Value : 0.002349
##
## Sensitivity : 0.7563
## Specificity : 0.4361
## Pos Pred Value : 0.6935
## Neg Pred Value : 0.5148
## Prevalence : 0.6278
## Detection Rate : 0.4748
## Detection Prevalence : 0.6847
## Balanced Accuracy : 0.5962
##
## 'Positive' Class : 0
##
Results
A 80% training/20% test split for your data. Results from the training data using the .5 decision rule and again using the mean of the outcome decision rule are reported below. The model has a 67% accuracy, with a sensitivity of 88% and a specificity of 32%. In other words, the model is pretty good at predicting if you are not experiencing mental health days, but not at predicting if you are experiencing mental health days. In comparing the two model, the overall accuracy slightly changed, but sensitivity decreased and specificity increased. The model has a 66% accuracy, with a sensitivity of 41% and a specificity of 82%. In other words, the model is somewhat good at predicting if you are not experiencing mental health days, and pretty good if you are experiencing mental health days. Results from the test data using the .5 decision rule and again using the mean of the outcome decision rule below. The model has a 66% accuracy, with a sensitivity of 91% and a specificity of 23%. In other words, the model is pretty good at predicting if you are not experiencing mental health days, but not at predicting if you are experiencing mental health days. In comparing the two model, the overall accuracy slightly changed, but sensitivity decreased and specificity increased. The model has a 64% accuracy, with a sensitivity of 76% and a specificity of 44%. In other words, the model is somewhat good at predicting if you are not experiencing mental health days, and not good if you are experiencing mental health days.