# Loading data from BRFSS 2020
brfss20<- readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))
### Fix variable names
names(brfss20) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20)))
#Alcohol Intake within past 30 days
brfss20$alcohol<-car::Recode(brfss20$drnkany5, recodes="1=1; 2=0; else=NA")
#brfss20$alcohol<-brfss20$drnkany5
#brfss20$alcohol <-as.factor(ifelse(brfss20$alcohol == 1,1,0))
#sex
brfss20$male<-as.factor(ifelse(brfss20$sex==1,
"Male",
"Female"))
#Age cut into intervals
brfss20$agec<-cut(brfss20$age80,
breaks=c(0,24,39,59,79,99))
brfss20$educ<-Recode(brfss20$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
#Racial/Ethnic Background
brfss20$black<-Recode(brfss20$racegr3,
recodes="2=1; 9=NA; else=0")
brfss20$white<-Recode(brfss20$racegr3,
recodes="1=1; 9=NA; else=0")
brfss20$other<-Recode(brfss20$racegr3,
recodes="3:4=1; 9=NA; else=0")
brfss20$hispanic<-Recode(brfss20$racegr3,
recodes="5=1; 9=NA; else=0")
brfss20$race_eth<-Recode(brfss20$racegr3,
recodes="1='nhwhite'; 2='nh_black'; 3='nh_other';4='nh_multirace'; 5='hispanic'; else=NA",
as.factor = T)
#insurance
brfss20$ins<-Recode(brfss20$hlthpln1,
recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss20$inc<-ifelse(brfss20$incomg==9,
NA,
brfss20$incomg)
#employment
brfss20$employ<-Recode(brfss20$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20$employ<-relevel(brfss20$employ,
ref='Employed')
#marital status
brfss20$marst<-Recode(brfss20$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss20$marst<-relevel(brfss20$marst,
ref='married')
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
strata= ~ststr,
weights= ~mmsawt,
data = brfss20 )
Alcohol consumption
library(dplyr)
model.data<- brfss20 %>%
select(seqno, alcohol,ins,educ, marst,employ,racegr3)
knitr::kable(head(model.data))
| seqno | alcohol | ins | educ | marst | employ | racegr3 |
|---|---|---|---|---|---|---|
| 2.02e+09 | 1 | 0 | 4colgrad | married | Employed | 9 |
| 2.02e+09 | 1 | 1 | 2hsgrad | cohab | nilf | 1 |
| 2.02e+09 | 1 | 0 | 2hsgrad | married | Employed | 2 |
| 2.02e+09 | 1 | 1 | 2hsgrad | widowed | retired | 1 |
| 2.02e+09 | 0 | 1 | 3somecol | married | Employed | 1 |
| 2.02e+09 | 1 | 1 | 4colgrad | married | retired | 1 |
set.seed(1115)
train<- createDataPartition(y = model.data$alcohol,
p = .80,
list=F)
model.dat2train<-model.data[train,]
model.dat2test<-model.data[-train,]
table(model.dat2train$alcohol)
##
## 0 1
## 64438 77014
prop.table(table(model.dat2train$alcohol))
##
## 0 1
## 0.4555468 0.5444532
summary(model.dat2train)
## seqno alcohol ins educ
## Min. :2.02e+09 Min. :0.0000 Min. :0.0000 0Prim : 2620
## 1st Qu.:2.02e+09 1st Qu.:0.0000 1st Qu.:1.0000 1somehs : 5153
## Median :2.02e+09 Median :1.0000 Median :1.0000 2hsgrad :33086
## Mean :2.02e+09 Mean :0.5445 Mean :0.9209 3somecol:37729
## 3rd Qu.:2.02e+09 3rd Qu.:1.0000 3rd Qu.:1.0000 4colgrad:62864
## Max. :2.02e+09 Max. :1.0000 Max. :1.0000
## marst employ racegr3
## married :72523 Employed:75729 Min. :1.000
## cohab : 6002 nilf :19328 1st Qu.:1.000
## divorced :17946 retired :38656 Median :1.000
## nm :28461 unable : 7739 Mean :1.795
## separated: 2833 3rd Qu.:2.000
## widowed :13687 Max. :9.000
model.dat2train$alcohol<- as.factor(model.dat2train$alcohol)
glm1<-glm(alcohol~factor(ins)+factor(marst)+factor(educ)+factor(employ)+factor(racegr3),
data=brfss20[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = alcohol ~ factor(ins) + factor(marst) + factor(educ) +
## factor(employ) + factor(racegr3), family = binomial, data = brfss20[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7761 -1.1619 0.8194 1.0383 2.2286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.33802 0.04482 -7.543 4.61e-14 ***
## factor(ins)1 0.04023 0.01926 2.089 0.036672 *
## factor(marst)cohab 0.42684 0.02653 16.088 < 2e-16 ***
## factor(marst)divorced -0.10533 0.01566 -6.725 1.75e-11 ***
## factor(marst)nm 0.02730 0.01357 2.011 0.044278 *
## factor(marst)separated -0.05810 0.03637 -1.598 0.110151
## factor(marst)widowed -0.41666 0.01834 -22.721 < 2e-16 ***
## factor(educ)1somehs 0.35315 0.04887 7.227 4.94e-13 ***
## factor(educ)2hsgrad 0.57768 0.04279 13.501 < 2e-16 ***
## factor(educ)3somecol 0.84366 0.04280 19.714 < 2e-16 ***
## factor(educ)4colgrad 1.21685 0.04268 28.511 < 2e-16 ***
## factor(employ)nilf -0.48928 0.01513 -32.348 < 2e-16 ***
## factor(employ)retired -0.58260 0.01244 -46.846 < 2e-16 ***
## factor(employ)unable -1.05461 0.02406 -43.828 < 2e-16 ***
## factor(racegr3)2 -0.36420 0.01738 -20.961 < 2e-16 ***
## factor(racegr3)3 -0.62716 0.02442 -25.687 < 2e-16 ***
## factor(racegr3)4 -0.13244 0.03762 -3.520 0.000431 ***
## factor(racegr3)5 -0.40000 0.01748 -22.884 < 2e-16 ***
## factor(racegr3)9 -0.32942 0.03754 -8.774 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 243712 on 176814 degrees of freedom
## Residual deviance: 229984 on 176796 degrees of freedom
## AIC: 230022
##
## Number of Fisher Scoring iterations: 4
tr_pred<- predict(glm1,
newdata = model.dat2train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.6339983 0.5541491 0.4689063 0.5833310 0.4908186 0.4841204
Using 50% as the predictor
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred,
gr=tr_predcl,
alco=model.dat2train$alcohol)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Alcohol consumption",
subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=alco, fill=alco))+
ggtitle(label = "Probability of Alcohol consumption",
subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table( tr_predcl,
model.dat2train$alcohol)
##
## tr_predcl 0 1
## 0 31938 20757
## 1 32500 56257
model.dat2train$alcohol<- as.factor(model.dat2train$alcohol)
cm1<-confusionMatrix(data = tr_predcl,
reference = model.dat2train$alcohol)
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31938 20757
## 1 32500 56257
##
## Accuracy : 0.6235
## 95% CI : (0.621, 0.626)
## No Information Rate : 0.5445
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2295
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4956
## Specificity : 0.7305
## Pos Pred Value : 0.6061
## Neg Pred Value : 0.6338
## Prevalence : 0.4555
## Detection Rate : 0.2258
## Detection Prevalence : 0.3725
## Balanced Accuracy : 0.6131
##
## 'Positive' Class : 0
##
Overall the model has a 62.35% accuracy.
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$alcohol==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred,
gr=tr_predcl,
alco=model.dat2train$alcohol)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Alcohol consumption",
subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(model.dat2train$alcohol==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=alco))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Alcohol consumption",
subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2train$alcohol==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table( tr_predcl,
model.dat2train$alcohol)
##
## tr_predcl 0 1
## 0 36514 26147
## 1 27924 50867
model.dat2train$alcohol<- as.factor(model.dat2train$alcohol)
confusionMatrix(data = tr_predcl,
model.dat2train$alcohol,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36514 26147
## 1 27924 50867
##
## Accuracy : 0.6177
## 95% CI : (0.6152, 0.6203)
## No Information Rate : 0.5445
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2277
##
## Mcnemar's Test P-Value : 2.212e-14
##
## Sensitivity : 0.6605
## Specificity : 0.5667
## Pos Pred Value : 0.6456
## Neg Pred Value : 0.5827
## Prevalence : 0.5445
## Detection Rate : 0.3596
## Detection Prevalence : 0.5570
## Balanced Accuracy : 0.6136
##
## 'Positive' Class : 1
##
From the training data, the model has a 61.77% accuracy.
Yes, using the .5 the classification accuracy is 62.35%, while using the mean the accuracy reduced to 61.77%. There are no much difference in the accuracy, but the .5 seems to be a little higher.
pred_test<-predict(glm1,
newdata=model.dat2test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$alcohol==1)), 1, 0))
table(model.dat2test$alcohol,pred_cl)
## pred_cl
## 0 1
## 0 9184 6908
## 1 6459 12812
model.dat2test$alcohol<- as.factor(model.dat2test$alcohol)
confusionMatrix(data = pred_cl,model.dat2test$alcohol)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9184 6459
## 1 6908 12812
##
## Accuracy : 0.622
## 95% CI : (0.6169, 0.6271)
## No Information Rate : 0.5449
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2361
##
## Mcnemar's Test P-Value : 0.0001067
##
## Sensitivity : 0.5707
## Specificity : 0.6648
## Pos Pred Value : 0.5871
## Neg Pred Value : 0.6497
## Prevalence : 0.4551
## Detection Rate : 0.2597
## Detection Prevalence : 0.4424
## Balanced Accuracy : 0.6178
##
## 'Positive' Class : 0
##
From the testing data, the model has a 62.2% accuracy.
pred_test2<-predict(glm1,
newdata=model.dat2test,
type="response")
pred_cl<-factor(ifelse(pred_test >.5, 1, 0))
table(model.dat2test$alcohol,pred_cl)
## pred_cl
## 0 1
## 0 8026 8066
## 1 5152 14119
model.dat2test$alcohol<- as.factor(model.dat2test$alcohol)
confusionMatrix(data = pred_cl,model.dat2test$alcohol)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8026 5152
## 1 8066 14119
##
## Accuracy : 0.6262
## 95% CI : (0.6212, 0.6313)
## No Information Rate : 0.5449
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2349
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4988
## Specificity : 0.7327
## Pos Pred Value : 0.6090
## Neg Pred Value : 0.6364
## Prevalence : 0.4551
## Detection Rate : 0.2270
## Detection Prevalence : 0.3726
## Balanced Accuracy : 0.6157
##
## 'Positive' Class : 0
##
The percentage correct classification is 62.62 % accurate.