##Variable Selection and Research Question## The variable I have chosen is past-month heroin use (heroin_month). I predict that respondents who report heroin use will, on average, have a higher mental health score (k6score) than those who do not. ##Data Prep##

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
# Read the survey CSV from its local path into a data frame, then preview
# the first six rows to check the available columns.
dataset <- read.csv(file = "/Users/apple/Downloads/SOC333_NSDUH_2016.csv")
head(dataset, n = 6L)
##   sexident Nervous Hopeless Restless Effort Sad Worthless k6score k6category
## 1     <NA>      NA       NA       NA     NA  NA        NA      NA       <NA>
## 2 Straight       0        0        0     NA   0         0      NA       <NA>
## 3 Straight       2        1        1      0   0         0       4   Low Risk
## 4     <NA>      NA       NA       NA     NA  NA        NA      NA       <NA>
## 5 Straight       1        3        2      2   1         2      11        MMD
## 6 Straight       2        1        1      2   1         1       8        MMD
##   marij_month cocaine_month crack_month heroin_month hallucinogen_month
## 1          No            No          No           No                 No
## 2         Yes            No          No           No                 No
## 3          No            No          No           No                 No
## 4          No            No          No           No                 No
## 5          No            No          No           No                 No
## 6          No            No          No           No                 No
##   inhalant_month meth_month painrelieve_month tranq_month stimulant_month
## 1             No         No                No          No              No
## 2             No         No                No          No              No
## 3             No         No                No          No              No
## 4             No         No                No          No              No
## 5             No         No                No          No              No
## 6             No         No                No          No              No
##   sedative_month
## 1             No
## 2             No
## 3             No
## 4             No
## 5             No
## 6             No

##FilterData##

# Build the analysis subset: keep only the heroin-use indicator and the K6
# score, then drop rows whose heroin_month is not "Yes"/"No" or whose k6score
# is missing.
# c() is used to build the set of valid answers; dplyr::combine() is
# deprecated as of dplyr 1.0.0.
heroinuse <- dataset %>%
  select(heroin_month, k6score) %>%
  filter(heroin_month %in% c("Yes", "No"), !is.na(k6score))
## Warning: `combine()` is deprecated as of dplyr 1.0.0.
## Please use `vctrs::vec_c()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

##Comparison of Means##

##table##
# Mean k6score for each heroin_month group, printed as a summary table.
summarise(
  group_by(heroinuse, heroin_month),
  mentalscore = mean(k6score)
)
## # A tibble: 2 x 2
##   heroin_month mentalscore
## * <chr>              <dbl>
## 1 No                  4.42
## 2 Yes                10.2
##visualization##
# Bar chart of the mean k6score per heroin_month group; bars are coloured by
# group as well as positioned by it.
ggplot(
  data = summarise(
    group_by(heroinuse, heroin_month),
    mentalscore = mean(k6score)
  )
) +
  geom_col(
    mapping = aes(x = heroin_month, y = mentalscore, fill = heroin_month)
  )

##Interpretation## On average, respondents who reported using heroin within the last 30 days had a higher k6score (mean of 10.2) than those who did not (mean of 4.42). This suggests that people who use heroin are at greater risk of serious mental illness than those who do not. ##Comparison of Distribution##

# Distribution of k6score, drawn as one histogram panel per heroin_month group.
# binwidth = 1 gives one bar per score value instead of ggplot2's default of
# 30 bins, which does not line up with whole-number scores.
# NOTE(review): assumes k6score is integer-valued, as the previewed rows
# suggest; confirm against the codebook.
heroinuse %>%
  ggplot() +
  geom_histogram(aes(x = k6score, fill = heroin_month), binwidth = 1) +
  facet_wrap(~heroin_month)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.