library(car)
## Loading required package: carData
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
# Read the NSDUH 2019 SPSS file (path is relative to the working directory).
NSDUH_2019 <- read_sav("NSDUH_2019.SAV")
# Only open the data viewer in an interactive session; View() needs a GUI
# and has no use when the script is run in batch mode or knitted.
if (interactive()) {
  View(NSDUH_2019)
}
nams <- names(NSDUH_2019)
head(nams, n = 10)
##  [1] "QUESTID2" "FILEDATE" "CIGEVER"  "CIGOFRSM" "CIGWILYR" "CIGTRY"  
##  [7] "CIGYFU"   "CIGMFU"   "CIGREC"   "CIG30USE"
# Normalize the column names: strip literal underscores, then lower-case, so
# the rest of the script can refer to columns by lower-case names.
newnames <- tolower(
  gsub(pattern = "_", replacement = "", x = nams, fixed = TRUE)
)
names(NSDUH_2019) <- newnames
# Binary recode of adwrsatp: code 1 -> 1, code 2 -> 0, anything else -> NA.
NSDUH_2019[["attempt_suicide"]] <- Recode(
  NSDUH_2019[["adwrsatp"]],
  recodes = "1=1; 2=0;else=NA"
)

## Main variable hopeless last thirty days
# The six dst*30 items all get the same recode (codes 1-2 -> 3, codes 3-4 -> 2,
# code 5 -> 1, anything else -> NA), so map new column name -> source column
# and apply it in a loop. Columns are created in the order listed here.
distress_items <- c(
  hopelesslst30days    = "dsthop30",
  fltnervouslst30days  = "dstnrv30",
  fltrestlesslst30days = "dstrst30",
  fltsadlst30days      = "dstchr30",
  effortlst30days      = "dsteff30",
  fltdwnlst30days      = "dstngd30"
)
for (target in names(distress_items)) {
  NSDUH_2019[[target]] <- Recode(
    NSDUH_2019[[distress_items[[target]]]],
    recodes = "1:2=3;3:4=2;5=1; else=NA"
  )
}
# Drop the loop helpers so the global environment only gains the new columns.
rm(distress_items, target)
## marital status
# Labelled factor from irmarit with 'married' as the reference level.
# NOTE(review): the code-to-label mapping should be checked against the
# codebook entry for irmarit -- it appears to define 2 = widowed,
# 3 = divorced or separated, 4 = never married; TODO confirm before relying
# on these labels.
NSDUH_2019$marst <- relevel(
  Recode(
    NSDUH_2019$irmarit,
    recodes = "1='married'; 2='divorced'; 3='widowed'; 4='separated'; else=NA",
    as.factor = TRUE
  ),
  ref = "married"
)

## education recodes
# Codes 1-7 are collapsed into one level; 'colgrad' is the reference level.
NSDUH_2019$educ <- relevel(
  Recode(
    NSDUH_2019$ireduhighst2,
    recodes = "1:7='LssThnHgh'; 8='highschool'; 9='someCollege'; 10='associates'; 11='colgrad';else=NA",
    as.factor = TRUE
  ),
  ref = "colgrad"
)

## sexuality recodes
# Codes other than 1-3 become NA; 'Heterosexual' is the reference level.
NSDUH_2019$sexuality <- relevel(
  Recode(
    NSDUH_2019$sexident,
    recodes = "1='Heterosexual'; 2='Les/Gay'; 3='Bisexual';else=NA",
    as.factor = TRUE
  ),
  ref = "Heterosexual"
)

## gender recodes
# irsex == 1 is labelled "Male"; every other non-missing code is "Female".
NSDUH_2019$male <- factor(
  ifelse(NSDUH_2019$irsex == 1, "Male", "Female")
)

## Race recoded items
# 0/1 indicator columns, one per newrace2 group: the listed code(s) -> 1,
# code 9 -> NA, every other code -> 0.
NSDUH_2019$black <- Recode(NSDUH_2019$newrace2, recodes = "2=1; 9=NA; else=0")
NSDUH_2019$white <- Recode(NSDUH_2019$newrace2, recodes = "1=1; 9=NA; else=0")
NSDUH_2019$other <- Recode(NSDUH_2019$newrace2, recodes = "3:4=1; 9=NA; else=0")
NSDUH_2019$mult_race <- Recode(NSDUH_2019$newrace2, recodes = "6=1; 9=NA; else=0")
NSDUH_2019$asian <- Recode(NSDUH_2019$newrace2, recodes = "5=1; 9=NA; else=0")
NSDUH_2019$hispanic <- Recode(NSDUH_2019$newrace2, recodes = "7=1; 9=NA; else=0")

# Single-factor version of the same grouping. The code-to-label mapping must
# stay in sync with the indicator columns above: 1 = white, 2 = black,
# 3:4 = other, 5 = asian, 6 = mult_race, 7 = hispanic. Any other code -> NA.
NSDUH_2019$race_eth <- Recode(
  NSDUH_2019$newrace2,
  recodes = "1='white'; 2='black'; 3:4='other'; 5='asian'; 6='mult_race'; 7='hispanic'; else=NA",
  as.factor = TRUE
)
# Use 'white' as the reference level for modelling.
NSDUH_2019$race_eth <- relevel(NSDUH_2019$race_eth, ref = "white")
# Labelled factor from iralcrc (codes 1-3); anything else -> NA.
NSDUH_2019[["lst_alc_use2"]] <- Recode(
  NSDUH_2019[["iralcrc"]],
  recodes = "1='last 30days'; 2='12>1month'; 3='>12months'; else=NA",
  as.factor = TRUE
)

# Binary recode of amdeyr: code 1 -> 1, code 2 -> 0, anything else -> NA.
NSDUH_2019[["dep_year2"]] <- Recode(
  NSDUH_2019[["amdeyr"]],
  recodes = "1=1; 2=0;else=NA"
)

# Age bands from age2 codes 7-17; codes outside that range -> NA.
NSDUH_2019[["age_cat"]] <- Recode(
  NSDUH_2019[["age2"]],
  recodes = "7:8='18-19'; 9:10='20-21'; 11='22-23'; 12='24-25'; 13='26-29'; 14='30-34'; 15='35-49'; 16='50-64'; 17='65+'; else=NA",
  as.factor = TRUE
)

# Keep alcdays as-is except for the listed special codes, which become NA.
NSDUH_2019[["daysalc"]] <- Recode(
  NSDUH_2019[["alcdays"]],
  recodes = "85=NA; 91=NA;  93=NA; 94=NA; 97=NA; 98=NA "
)

# Keep wrkdhrswk2 as-is except for the listed special codes, which become NA.
NSDUH_2019[["weekhrswrkd"]] <- Recode(
  NSDUH_2019[["wrkdhrswk2"]],
  recodes = "985=NA; 994=NA; 997=NA; 998=NA; 999=NA"
)

Missing values: the outcome is alcohol use in the last 30 days, with predictors wrkdhrswk2, sexuality, race_eth, marst, and educ.

# Summarize the analysis columns; the NA's rows give the missing counts.
summary(
  NSDUH_2019[
    c("daysalc", "sexuality", "race_eth", "weekhrswrkd", "marst", "dep_year2")
  ]
)
##     daysalc              sexuality          race_eth      weekhrswrkd   
##  Min.   : 1.000   Heterosexual:38214   white    :32089   Min.   : 1.00  
##  1st Qu.: 2.000   Bisexual    : 2642   asian    :  292   1st Qu.:30.00  
##  Median : 4.000   Les/Gay     :  976   black    : 7256   Median :40.00  
##  Mean   : 7.499   NA's        :14304   hispanic : 2202   Mean   :36.47  
##  3rd Qu.:10.000                        mult_race: 2697   3rd Qu.:45.00  
##  Max.   :30.000                        other    :  752   Max.   :61.00  
##  NA's   :31467                         NA's     :10848   NA's   :28467  
##        marst         dep_year2    
##  married  :16983   Min.   :0.000  
##  divorced : 1333   1st Qu.:0.000  
##  separated:26750   Median :0.000  
##  widowed  : 4515   Mean   :0.106  
##  NA's     : 6555   3rd Qu.:0.000  
##                    Max.   :1.000  
##                    NA's   :13954

#Report the pattern of missingness among all of these variables # The summary indicates a rather large amount of missing data. The NSDUH_2019 data set has 56,136 observations. The main outcome variable has the most missing values: 31,467. It is followed closely by hours worked in a week (28,467 missing), of which, according to the codebook, over 27,000 come from legitimate skips. Race/ethnicity, depressive episodes in the past year, and sexuality also have rather large amounts of missing data, while marital status has the least missing data of these variables; according to the codebook, its missing values also come from legitimate skips. It is possible that the majority of the missing data comes from skips of this kind, but it is rather curious that race/ethnicity is missing so many values. Depressive episodes was dropped because it is not a categorical variable.

Perform a mean imputation (for numeric data) or a modal imputation (for categorical data) of all values.

Mean Imputation. The missing values were filled in with each variable's observed mean: about 7.5 days of alcohol use (7.499) and 36.47 hours worked during the week.

# Distribution of each numeric variable before imputation.
summary(NSDUH_2019[["daysalc"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   4.000   7.499  10.000  30.000   31467
summary(NSDUH_2019[["weekhrswrkd"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   30.00   40.00   36.47   45.00   61.00   28467

# Mean imputation: replace each NA with the mean of the non-missing values
# and leave observed values untouched.
impute_mean <- function(x) {
  ifelse(is.na(x), mean(x, na.rm = TRUE), x)
}

NSDUH_2019[["daysalc.imp.mean"]] <- impute_mean(NSDUH_2019[["daysalc"]])

NSDUH_2019[["weekhrswrkd.imp.mean"]] <- impute_mean(NSDUH_2019[["weekhrswrkd"]])

# Means of the imputed columns, for comparison with the summaries above.
mean(NSDUH_2019[["daysalc.imp.mean"]], na.rm = TRUE)
## [1] 7.498804
mean(NSDUH_2019[["weekhrswrkd.imp.mean"]], na.rm = TRUE)
## [1] 36.465