Homework 4 stats 2

library(car, quietly = T)
library(stargazer, quietly = T)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey, quietly = T)

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(questionr, quietly = T)
library(dplyr, quietly = T)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2, quietly = T)
library(haven)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

library(haven)
NSDUH_2019 <- read_sav("NSDUH_2019.SAV")
View(NSDUH_2019)

nams<-names(NSDUH_2019)
head(nams, n=10)

##  [1] "QUESTID2" "FILEDATE" "CIGEVER"  "CIGOFRSM" "CIGWILYR" "CIGTRY"  
##  [7] "CIGYFU"   "CIGMFU"   "CIGREC"   "CIG30USE"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(NSDUH_2019)<-newnames

1) Define a binary outcome of your choosing

For the purpose of the assignment the usage of variable ADWRSATP will be used to measure a previous suicide attempt in the sample. That is with 0= not having attempted suicide, and 1= having made a suicide attempt. The variable itself follows several question items beginning with if a person contemplated, then planned, and in turn have also made a suicide attempt. The data selected includes those who attempted suicide, while also having a depressive episode in the last year.

2)Fit a predictive logistic regression model using as many predictor variables as you think you need

## attempted suicide
NSDUH_2019$attempt_suicide<-Recode(NSDUH_2019$adwrsatp, recodes="1=1; 2=0;else=NA", as.factor=T)
summary(NSDUH_2019$attempt_suicide, na.rm = TRUE)

##     0     1  NA's 
##  2650   887 52599

## marital status
NSDUH_2019$marst<-Recode(NSDUH_2019$irmarit, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; else=NA", as.factor=T)
NSDUH_2019$marst<-relevel(NSDUH_2019$marst, ref='married')

## education recodes
NSDUH_2019$educ<-Recode(NSDUH_2019$ireduhighst2, recodes="1:7='LssThnHgh'; 8='highschool'; 9='someCollege'; 10='associates'; 11='colgrad';else=NA", as.factor=T)
NSDUH_2019$educ<-relevel(NSDUH_2019$educ, ref='colgrad')

## sexuality recodes
NSDUH_2019$sexuality<-Recode(NSDUH_2019$sexident, recodes="1='Heterosexual'; 2='Les/Gay'; 3='Bisexual';else=NA", as.factor=T)
NSDUH_2019$sexuality<-relevel(NSDUH_2019$sexuality, ref='Heterosexual')

## gender recodes
NSDUH_2019$male<-as.factor(ifelse(NSDUH_2019$irsex==1, "Male", "Female"))

## Race recoded items
NSDUH_2019$black<-Recode(NSDUH_2019$newrace2, recodes="2=1; 9=NA; else=0")
NSDUH_2019$white<-Recode(NSDUH_2019$newrace2, recodes="1=1; 9=NA; else=0")
NSDUH_2019$other<-Recode(NSDUH_2019$newrace2, recodes="3:4=1; 9=NA; else=0")
NSDUH_2019$mult_race<-Recode(NSDUH_2019$newrace2, recodes="6=1; 9=NA; else=0")
NSDUH_2019$asian<-Recode(NSDUH_2019$newrace2, recodes="5=1; 9=NA; else=0")
NSDUH_2019$hispanic<-Recode(NSDUH_2019$newrace2, recodes="7=1; 9=NA; else=0")
NSDUH_2019$race_eth<-Recode(NSDUH_2019$newrace2,
                          recodes="1='white'; 2='black'; 3='other'; 4='asian'; 5='mult_race'; 6='hispanic'; else=NA",
                          as.factor = T)
NSDUH_2019$lst_alc_use2<-Recode(NSDUH_2019$iralcrc, recodes="1='last 30days'; 2='12>1month'; 3='>12months'; else=NA", as.factor=T)
NSDUH_2019$dep_year2<-Recode(NSDUH_2019$amdeyr, recodes="1=1; 2=0;else=NA")
NSDUH_2019$age_cat<-Recode(NSDUH_2019$age2, recodes="7:8='18-19'; 9:10='20-21'; 11='22-23'; 12='24-25'; 13='26-29'; 14='30-34'; 15='35-49'; 16='50-64'; 17='65+'; else=NA", as.factor=T)

## Step 1 recoding variables into smaller data set
nsduh<-NSDUH_2019%>%
mutate(race = race_eth, 
         marital = marst, 
         suicide_attempt = attempt_suicide,
         educ_attain = educ,
       sex = male,
       sexiden = sexuality,
         age = age_cat,
       fam_income = income,
       last_alc_use = lst_alc_use2,
       dep_year = dep_year2)%>%
    filter(dep_year==1)%>%
  dplyr::select(race, marital, suicide_attempt, educ_attain, sexiden, age, fam_income, last_alc_use, dep_year, sex)
knitr::kable(head(nsduh))

race	marital	suicide_attempt	educ_attain	sexiden	age	fam_income	last_alc_use	dep_year	sex
white	married	0	colgrad	Heterosexual	35-49	4	last 30days	1	Male
black	widowed	NA	someCollege	Heterosexual	50-64	1	12>1month	1	Male
white	widowed	0	someCollege	Heterosexual	26-29	1	last 30days	1	Female
hispanic	separated	1	someCollege	Heterosexual	18-19	2	NA	1	Male
white	widowed	0	highschool	Les/Gay	35-49	1	>12months	1	Female
white	widowed	NA	colgrad	Heterosexual	30-34	4	last 30days	1	Female

##3) Use a 80% training/20% test split for your data.

nsduh_new <- na.omit(nsduh) 
set.seed(500)
train<- createDataPartition(y = nsduh_new$suicide_attempt,
                            p = .80,
                            list=F)

nsduh_train<-nsduh_new[train,]

## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

nsduh_test<-nsduh_new[-train,]
table(nsduh_train$suicide_attempt)

## 
##   0   1 
## 983 347

prop.table(table(nsduh_train$suicide_attempt))

## 
##         0         1 
## 0.7390977 0.2609023

summary(nsduh_train)

##         race           marital    suicide_attempt      educ_attain 
##  asian    :   4   married  :271   0:983           colgrad    :291  
##  black    : 117   divorced : 19   1:347           associates :135  
##  hispanic :  81   separated:888                   highschool :352  
##  mult_race:  47   widowed  :152                   LssThnHgh  : 94  
##  other    :  26                                   someCollege:458  
##  white    :1055                                                    
##                                                                    
##          sexiden         age        fam_income         last_alc_use    dep_year
##  Heterosexual:951   35-49  :245   Min.   :1.000   >12months  :155   Min.   :1  
##  Bisexual    :327   20-21  :192   1st Qu.:1.000   12>1month  :286   1st Qu.:1  
##  Les/Gay     : 52   22-23  :191   Median :2.000   last 30days:889   Median :1  
##                     24-25  :190   Mean   :2.398                     Mean   :1  
##                     18-19  :142   3rd Qu.:4.000                     3rd Qu.:1  
##                     26-29  :141   Max.   :4.000                     Max.   :1  
##                     (Other):229                                                
##      sex     
##  Female:848  
##  Male  :482  
##              
##              
##              
##              
##

GLM is based on incidence of having life time depression

attach(nsduh_train)
names(nsduh_train)

##  [1] "race"            "marital"         "suicide_attempt" "educ_attain"    
##  [5] "sexiden"         "age"             "fam_income"      "last_alc_use"   
##  [9] "dep_year"        "sex"

glm1<-glm(suicide_attempt~scale(fam_income)+factor(age)+factor(race)+factor(educ_attain)+factor(sexiden)+factor(last_alc_use)+factor(sex)+factor(marital),
          data=nsduh_train[,-1],
          family = binomial)
summary(glm1)

## 
## Call:
## glm(formula = suicide_attempt ~ scale(fam_income) + factor(age) + 
##     factor(race) + factor(educ_attain) + factor(sexiden) + factor(last_alc_use) + 
##     factor(sex) + factor(marital), family = binomial, data = nsduh_train[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5253  -0.8025  -0.6348   1.0269   2.2577  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -0.19909    1.28054  -0.155 0.876450    
## scale(fam_income)               -0.15879    0.07208  -2.203 0.027599 *  
## factor(age)20-21                 0.17196    0.25838   0.666 0.505711    
## factor(age)22-23                -0.15464    0.27677  -0.559 0.576341    
## factor(age)24-25                 0.28016    0.27139   1.032 0.301935    
## factor(age)26-29                 0.46194    0.29011   1.592 0.111320    
## factor(age)30-34                 0.45565    0.30282   1.505 0.132400    
## factor(age)35-49                 0.15125    0.29720   0.509 0.610808    
## factor(age)50-64                 0.10565    0.39925   0.265 0.791311    
## factor(age)65+                  -0.26926    0.70929  -0.380 0.704228    
## factor(race)black               -1.62435    1.22940  -1.321 0.186419    
## factor(race)hispanic            -1.19702    1.23412  -0.970 0.332076    
## factor(race)mult_race           -1.73863    1.26686  -1.372 0.169942    
## factor(race)other               -1.43439    1.28796  -1.114 0.265412    
## factor(race)white               -1.65990    1.21439  -1.367 0.171668    
## factor(educ_attain)associates    0.66456    0.26918   2.469 0.013555 *  
## factor(educ_attain)highschool    0.92322    0.22720   4.064 4.83e-05 ***
## factor(educ_attain)LssThnHgh     1.73318    0.29051   5.966 2.43e-09 ***
## factor(educ_attain)someCollege   0.69845    0.21239   3.289 0.001007 ** 
## factor(sexiden)Bisexual          0.58082    0.15467   3.755 0.000173 ***
## factor(sexiden)Les/Gay           0.79058    0.31425   2.516 0.011877 *  
## factor(last_alc_use)12>1month   -0.22477    0.24187  -0.929 0.352747    
## factor(last_alc_use)last 30days -0.06853    0.21391  -0.320 0.748706    
## factor(sex)Male                 -0.20105    0.14422  -1.394 0.163302    
## factor(marital)divorced         -0.10994    0.57748  -0.190 0.849010    
## factor(marital)separated        -0.24021    0.20306  -1.183 0.236829    
## factor(marital)widowed           0.03154    0.24360   0.129 0.896970    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1526.8  on 1329  degrees of freedom
## Residual deviance: 1428.5  on 1303  degrees of freedom
## AIC: 1482.5
## 
## Number of Fisher Scoring iterations: 4

##3) Report the % correct classification from the training data using the .5 decision rule and again useing the mean of the outcome decision rule

tr_pred<- predict(glm1,
                  newdata = nsduh_train,
                  type = "response")

head(tr_pred)

##          1          2          3          4          5          6 
## 0.09940754 0.36866059 0.55853517 0.19075639 0.19497092 0.46022449

tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=nsduh_train$suicide_attempt)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Attempting Suicide", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of Attempting Suicide", subtitle = "Truth")+
  geom_vline(xintercept=.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,nsduh_train$suicide_attempt)

##          
## tr_predcl   0   1
##         0 958 308
##         1  25  39

cm1<-confusionMatrix(data = tr_predcl,
                     reference = nsduh_train$suicide_attempt )
cm1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 958 308
##          1  25  39
##                                           
##                Accuracy : 0.7496          
##                  95% CI : (0.7254, 0.7727)
##     No Information Rate : 0.7391          
##     P-Value [Acc > NIR] : 0.2             
##                                           
##                   Kappa : 0.1181          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9746          
##             Specificity : 0.1124          
##          Pos Pred Value : 0.7567          
##          Neg Pred Value : 0.6094          
##              Prevalence : 0.7391          
##          Detection Rate : 0.7203          
##    Detection Prevalence : 0.9519          
##       Balanced Accuracy : 0.5435          
##                                           
##        'Positive' Class : 0               
##

tr_predcl<-factor(ifelse(tr_pred>mean(I(nsduh_train$suicide_attempt==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=nsduh_train$suicide_attempt)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Attempting Suicide", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(nsduh_train$suicide_attempt==1)))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Attempting Suicide", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(nsduh_train$suicide_attempt==1)))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = tr_predcl,
                nsduh_train$suicide_attempt,
                positive = "1" )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 623 139
##          1 360 208
##                                           
##                Accuracy : 0.6248          
##                  95% CI : (0.5982, 0.6509)
##     No Information Rate : 0.7391          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1934          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5994          
##             Specificity : 0.6338          
##          Pos Pred Value : 0.3662          
##          Neg Pred Value : 0.8176          
##              Prevalence : 0.2609          
##          Detection Rate : 0.1564          
##    Detection Prevalence : 0.4271          
##       Balanced Accuracy : 0.6166          
##                                           
##        'Positive' Class : 1               
##

Overall, the logistic regression results predicted higher odds of attempting suicide when there was a depressive episode during the year for homosexuals, bisexuals, compared to heterosexuals. As income increases, the likelihood of attempting a suicide when there is a year where a depressive episode has happened decreases. Education wise, as educcational attainment decreases compared to having a college degree, depressive episodes during the year are more likely to result in a suicide attempt, with those with less than a high school education having a 1.7 times higher chance of making a suicide attempt.In order to test the accuracy and spceficiness of the results, the use of a confusion matrix was used.

Using the 80/20 split for the data, the confusion matrix indicated in the training model had an accuracy of 75%. A decision rule of attempting suicide >.5 was used in this particular model. This is somewhat reflective of the 6 cases selected by the predictive portion, of the 6 cases only one had appraoched 55% likelihood of attempting suicide under the chosen perameters. With the odds decreasing in each subsequent case. This model appears to be pretty useful at predicting suicide attempts when a depressive episode has occured during the year. With a sensitivity of 97% and specificity of 11% this model is better at predicting suicide attempts when having a depressive episode during the year.

3a) Does changing the decision rule threshold affect your classification accuracy?

In the previous confusion matrix, it was indicated a .5 decision rule was used, while in the second the mean of the attempted suicide response variable was used. Changing the threshold of the second confusion matrix caused a decrease in accuracy 62.48% compared to 75% in the first matrix. Sensitivity decreased, at the cost of a marked increase in specificity from 11% to 63%. This model is more useful for predicting suicide attempts when there hasnt been a depressive episode in a year, when a meant threshold is used.

pred_test<-predict(glm1,
                   newdata=nsduh_test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(nsduh_test$suicide_attempt==1)), 1, 0))

table(nsduh_test$suicide_attempt,pred_cl)

##    pred_cl
##       0   1
##   0 126 119
##   1  34  52

confusionMatrix(data = pred_cl,nsduh_test$suicide_attempt )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 126  34
##          1 119  52
##                                           
##                Accuracy : 0.5378          
##                  95% CI : (0.4824, 0.5924)
##     No Information Rate : 0.7402          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0901          
##                                           
##  Mcnemar's Test P-Value : 1.114e-11       
##                                           
##             Sensitivity : 0.5143          
##             Specificity : 0.6047          
##          Pos Pred Value : 0.7875          
##          Neg Pred Value : 0.3041          
##              Prevalence : 0.7402          
##          Detection Rate : 0.3807          
##    Detection Prevalence : 0.4834          
##       Balanced Accuracy : 0.5595          
##                                           
##        'Positive' Class : 0               
##

pred_cl<-factor(ifelse(pred_test>.5, 1, 0))
confusionMatrix(data = pred_cl,
                nsduh_test$suicide_attempt,
                positive = "1" )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 224  74
##          1  21  12
##                                          
##                Accuracy : 0.713          
##                  95% CI : (0.661, 0.7611)
##     No Information Rate : 0.7402         
##     P-Value [Acc > NIR] : 0.8824         
##                                          
##                   Kappa : 0.0673         
##                                          
##  Mcnemar's Test P-Value : 9.55e-08       
##                                          
##             Sensitivity : 0.13953        
##             Specificity : 0.91429        
##          Pos Pred Value : 0.36364        
##          Neg Pred Value : 0.75168        
##              Prevalence : 0.25982        
##          Detection Rate : 0.03625        
##    Detection Prevalence : 0.09970        
##       Balanced Accuracy : 0.52691        
##                                          
##        'Positive' Class : 1              
##

```

4) Report the % correct classification from the test data using the .5 decision rule and again useing the mean of the outcome decision rule

Using the confusion matrix in the testing model resulted with a similar accuracy as presented in the training models confusion matrix. The originals accuracy using the threshold of.5 was 74% while this test model is 74%. Here in the test model the specificity and sensitivity are reversed. The model is now really useful at predicting a suicide attempt when there is no depressive epsiode during the year. The second of the test confusion matrix models returned decrease accuracy, sensitvity, and specificity.