library(car, quietly = T)
library(stargazer, quietly = T)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(haven)
NSDUH_2019 <- read_sav("NSDUH_2019.SAV")
View(NSDUH_2019)
nams<-names(NSDUH_2019)
head(nams, n=10)
## [1] "QUESTID2" "FILEDATE" "CIGEVER" "CIGOFRSM" "CIGWILYR" "CIGTRY"
## [7] "CIGYFU" "CIGMFU" "CIGREC" "CIG30USE"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(NSDUH_2019)<-newnames
## attempted suicide
NSDUH_2019$attempt_suicide<-Recode(NSDUH_2019$adwrsatp, recodes="1=1; 2=0;else=NA", as.factor=T)
summary(NSDUH_2019$attempt_suicide, na.rm = TRUE)
## 0 1 NA's
## 2650 887 52599
## marital status
NSDUH_2019$marst<-Recode(NSDUH_2019$irmarit, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; else=NA", as.factor=T)
NSDUH_2019$marst<-relevel(NSDUH_2019$marst, ref='married')
## education recodes
NSDUH_2019$educ<-Recode(NSDUH_2019$ireduhighst2, recodes="1:7='LssThnHgh'; 8='highschool'; 9='someCollege'; 10='associates'; 11='colgrad';else=NA", as.factor=T)
NSDUH_2019$educ<-relevel(NSDUH_2019$educ, ref='colgrad')
## sexuality recodes
NSDUH_2019$sexuality<-Recode(NSDUH_2019$sexident, recodes="1='Heterosexual'; 2='Les/Gay'; 3='Bisexual';else=NA", as.factor=T)
NSDUH_2019$sexuality<-relevel(NSDUH_2019$sexuality, ref='Heterosexual')
## gender recodes
NSDUH_2019$male<-as.factor(ifelse(NSDUH_2019$irsex==1, "Male", "Female"))
## Race recoded items
NSDUH_2019$black<-Recode(NSDUH_2019$newrace2, recodes="2=1; 9=NA; else=0")
NSDUH_2019$white<-Recode(NSDUH_2019$newrace2, recodes="1=1; 9=NA; else=0")
NSDUH_2019$other<-Recode(NSDUH_2019$newrace2, recodes="3:4=1; 9=NA; else=0")
NSDUH_2019$mult_race<-Recode(NSDUH_2019$newrace2, recodes="6=1; 9=NA; else=0")
NSDUH_2019$asian<-Recode(NSDUH_2019$newrace2, recodes="5=1; 9=NA; else=0")
NSDUH_2019$hispanic<-Recode(NSDUH_2019$newrace2, recodes="7=1; 9=NA; else=0")
NSDUH_2019$race_eth<-Recode(NSDUH_2019$newrace2,
recodes="1='white'; 2='black'; 3='other'; 4='asian'; 5='mult_race'; 6='hispanic'; else=NA",
as.factor = T)
NSDUH_2019$lst_alc_use2<-Recode(NSDUH_2019$iralcrc, recodes="1='last 30days'; 2='12>1month'; 3='>12months'; else=NA", as.factor=T)
NSDUH_2019$dep_year2<-Recode(NSDUH_2019$amdeyr, recodes="1=1; 2=0;else=NA")
NSDUH_2019$age_cat<-Recode(NSDUH_2019$age2, recodes="7:8='18-19'; 9:10='20-21'; 11='22-23'; 12='24-25'; 13='26-29'; 14='30-34'; 15='35-49'; 16='50-64'; 17='65+'; else=NA", as.factor=T)
## Step 1 recoding variables into smaller data set
nsduh<-NSDUH_2019%>%
mutate(race = race_eth,
marital = marst,
suicide_attempt = attempt_suicide,
educ_attain = educ,
sex = male,
sexiden = sexuality,
age = age_cat,
fam_income = income,
last_alc_use = lst_alc_use2,
dep_year = dep_year2)%>%
filter(dep_year==1)%>%
dplyr::select(race, marital, suicide_attempt, educ_attain, sexiden, age, fam_income, last_alc_use, dep_year, sex)
knitr::kable(head(nsduh))
| race | marital | suicide_attempt | educ_attain | sexiden | age | fam_income | last_alc_use | dep_year | sex |
|---|---|---|---|---|---|---|---|---|---|
| white | married | 0 | colgrad | Heterosexual | 35-49 | 4 | last 30days | 1 | Male |
| black | widowed | NA | someCollege | Heterosexual | 50-64 | 1 | 12>1month | 1 | Male |
| white | widowed | 0 | someCollege | Heterosexual | 26-29 | 1 | last 30days | 1 | Female |
| hispanic | separated | 1 | someCollege | Heterosexual | 18-19 | 2 | NA | 1 | Male |
| white | widowed | 0 | highschool | Les/Gay | 35-49 | 1 | >12months | 1 | Female |
| white | widowed | NA | colgrad | Heterosexual | 30-34 | 4 | last 30days | 1 | Female |
##3) Use a 80% training/20% test split for your data.
nsduh_new <- na.omit(nsduh)
set.seed(500)
train<- createDataPartition(y = nsduh_new$suicide_attempt,
p = .80,
list=F)
nsduh_train<-nsduh_new[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
nsduh_test<-nsduh_new[-train,]
table(nsduh_train$suicide_attempt)
##
## 0 1
## 983 347
prop.table(table(nsduh_train$suicide_attempt))
##
## 0 1
## 0.7390977 0.2609023
summary(nsduh_train)
## race marital suicide_attempt educ_attain
## asian : 4 married :271 0:983 colgrad :291
## black : 117 divorced : 19 1:347 associates :135
## hispanic : 81 separated:888 highschool :352
## mult_race: 47 widowed :152 LssThnHgh : 94
## other : 26 someCollege:458
## white :1055
##
## sexiden age fam_income last_alc_use dep_year
## Heterosexual:951 35-49 :245 Min. :1.000 >12months :155 Min. :1
## Bisexual :327 20-21 :192 1st Qu.:1.000 12>1month :286 1st Qu.:1
## Les/Gay : 52 22-23 :191 Median :2.000 last 30days:889 Median :1
## 24-25 :190 Mean :2.398 Mean :1
## 18-19 :142 3rd Qu.:4.000 3rd Qu.:1
## 26-29 :141 Max. :4.000 Max. :1
## (Other):229
## sex
## Female:848
## Male :482
##
##
##
##
##
attach(nsduh_train)
names(nsduh_train)
## [1] "race" "marital" "suicide_attempt" "educ_attain"
## [5] "sexiden" "age" "fam_income" "last_alc_use"
## [9] "dep_year" "sex"
glm1<-glm(suicide_attempt~scale(fam_income)+factor(age)+factor(race)+factor(educ_attain)+factor(sexiden)+factor(last_alc_use)+factor(sex)+factor(marital),
data=nsduh_train[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = suicide_attempt ~ scale(fam_income) + factor(age) +
## factor(race) + factor(educ_attain) + factor(sexiden) + factor(last_alc_use) +
## factor(sex) + factor(marital), family = binomial, data = nsduh_train[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5253 -0.8025 -0.6348 1.0269 2.2577
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.19909 1.28054 -0.155 0.876450
## scale(fam_income) -0.15879 0.07208 -2.203 0.027599 *
## factor(age)20-21 0.17196 0.25838 0.666 0.505711
## factor(age)22-23 -0.15464 0.27677 -0.559 0.576341
## factor(age)24-25 0.28016 0.27139 1.032 0.301935
## factor(age)26-29 0.46194 0.29011 1.592 0.111320
## factor(age)30-34 0.45565 0.30282 1.505 0.132400
## factor(age)35-49 0.15125 0.29720 0.509 0.610808
## factor(age)50-64 0.10565 0.39925 0.265 0.791311
## factor(age)65+ -0.26926 0.70929 -0.380 0.704228
## factor(race)black -1.62435 1.22940 -1.321 0.186419
## factor(race)hispanic -1.19702 1.23412 -0.970 0.332076
## factor(race)mult_race -1.73863 1.26686 -1.372 0.169942
## factor(race)other -1.43439 1.28796 -1.114 0.265412
## factor(race)white -1.65990 1.21439 -1.367 0.171668
## factor(educ_attain)associates 0.66456 0.26918 2.469 0.013555 *
## factor(educ_attain)highschool 0.92322 0.22720 4.064 4.83e-05 ***
## factor(educ_attain)LssThnHgh 1.73318 0.29051 5.966 2.43e-09 ***
## factor(educ_attain)someCollege 0.69845 0.21239 3.289 0.001007 **
## factor(sexiden)Bisexual 0.58082 0.15467 3.755 0.000173 ***
## factor(sexiden)Les/Gay 0.79058 0.31425 2.516 0.011877 *
## factor(last_alc_use)12>1month -0.22477 0.24187 -0.929 0.352747
## factor(last_alc_use)last 30days -0.06853 0.21391 -0.320 0.748706
## factor(sex)Male -0.20105 0.14422 -1.394 0.163302
## factor(marital)divorced -0.10994 0.57748 -0.190 0.849010
## factor(marital)separated -0.24021 0.20306 -1.183 0.236829
## factor(marital)widowed 0.03154 0.24360 0.129 0.896970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1526.8 on 1329 degrees of freedom
## Residual deviance: 1428.5 on 1303 degrees of freedom
## AIC: 1482.5
##
## Number of Fisher Scoring iterations: 4
##3) Report the % correct classification from the training data using the .5 decision rule and again useing the mean of the outcome decision rule
tr_pred<- predict(glm1,
newdata = nsduh_train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.09940754 0.36866059 0.55853517 0.19075639 0.19497092 0.46022449
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=nsduh_train$suicide_attempt)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Attempting Suicide", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of Attempting Suicide", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table( tr_predcl,nsduh_train$suicide_attempt)
##
## tr_predcl 0 1
## 0 958 308
## 1 25 39
cm1<-confusionMatrix(data = tr_predcl,
reference = nsduh_train$suicide_attempt )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 958 308
## 1 25 39
##
## Accuracy : 0.7496
## 95% CI : (0.7254, 0.7727)
## No Information Rate : 0.7391
## P-Value [Acc > NIR] : 0.2
##
## Kappa : 0.1181
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9746
## Specificity : 0.1124
## Pos Pred Value : 0.7567
## Neg Pred Value : 0.6094
## Prevalence : 0.7391
## Detection Rate : 0.7203
## Detection Prevalence : 0.9519
## Balanced Accuracy : 0.5435
##
## 'Positive' Class : 0
##
tr_predcl<-factor(ifelse(tr_pred>mean(I(nsduh_train$suicide_attempt==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=nsduh_train$suicide_attempt)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Attempting Suicide", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(nsduh_train$suicide_attempt==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Attempting Suicide", subtitle = "Truth")+
geom_vline(xintercept=mean(I(nsduh_train$suicide_attempt==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
confusionMatrix(data = tr_predcl,
nsduh_train$suicide_attempt,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 623 139
## 1 360 208
##
## Accuracy : 0.6248
## 95% CI : (0.5982, 0.6509)
## No Information Rate : 0.7391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1934
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5994
## Specificity : 0.6338
## Pos Pred Value : 0.3662
## Neg Pred Value : 0.8176
## Prevalence : 0.2609
## Detection Rate : 0.1564
## Detection Prevalence : 0.4271
## Balanced Accuracy : 0.6166
##
## 'Positive' Class : 1
##
pred_test<-predict(glm1,
newdata=nsduh_test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(nsduh_test$suicide_attempt==1)), 1, 0))
table(nsduh_test$suicide_attempt,pred_cl)
## pred_cl
## 0 1
## 0 126 119
## 1 34 52
confusionMatrix(data = pred_cl,nsduh_test$suicide_attempt )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 126 34
## 1 119 52
##
## Accuracy : 0.5378
## 95% CI : (0.4824, 0.5924)
## No Information Rate : 0.7402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0901
##
## Mcnemar's Test P-Value : 1.114e-11
##
## Sensitivity : 0.5143
## Specificity : 0.6047
## Pos Pred Value : 0.7875
## Neg Pred Value : 0.3041
## Prevalence : 0.7402
## Detection Rate : 0.3807
## Detection Prevalence : 0.4834
## Balanced Accuracy : 0.5595
##
## 'Positive' Class : 0
##
pred_cl<-factor(ifelse(pred_test>.5, 1, 0))
confusionMatrix(data = pred_cl,
nsduh_test$suicide_attempt,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 224 74
## 1 21 12
##
## Accuracy : 0.713
## 95% CI : (0.661, 0.7611)
## No Information Rate : 0.7402
## P-Value [Acc > NIR] : 0.8824
##
## Kappa : 0.0673
##
## Mcnemar's Test P-Value : 9.55e-08
##
## Sensitivity : 0.13953
## Specificity : 0.91429
## Pos Pred Value : 0.36364
## Neg Pred Value : 0.75168
## Prevalence : 0.25982
## Detection Rate : 0.03625
## Detection Prevalence : 0.09970
## Balanced Accuracy : 0.52691
##
## 'Positive' Class : 1
##
```