library(readr)
## Warning: package 'readr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3

(Step 1) Importing the Data

# Load the NSDUH survey extract into `data`.
# NOTE(review): the path is absolute and machine-specific (drive F:); the
# script only runs where that file exists.
data <- read.csv(file = "F:/Skills Drill 3 NSDUH Data.csv")

(Step 2) Recoding the Variables

# Bin the mental distress score into three risk categories:
#   below 5 -> "Low Risk", 5 through 12 -> "Moderate Risk", 13+ -> "High Risk".
# Rows matching none of the conditions (missing scores, or a value strictly
# between 12 and 13) are left as NA.
data <- data %>%
  mutate(
    MDCategory = case_when(
      MDScore < 5 ~ "Low Risk",
      MDScore >= 5 & MDScore <= 12 ~ "Moderate Risk",
      MDScore >= 13 ~ "High Risk"
    )
  )
# Replace the numeric sexual-identity codes with readable labels
# (1 = Straight, 2 = Gay or Lesbian, 3 = Bisexual); any other code becomes NA.
# The column is overwritten in place, so re-running this step on already
# recoded data turns every value into NA.
data <- data %>%
  mutate(
    sexident = case_when(
      sexident == 1 ~ "Straight",
      sexident == 2 ~ "Gay or Lesbian",
      sexident == 3 ~ "Bisexual"
    )
  )

(Step 3) Filtering the Data

# Keep only respondents labelled "Straight" or "Gay or Lesbian" (rows with a
# missing or other identity are dropped), then retain just the three columns
# used in the analysis below.
new_data <- data %>%
  filter(sexident %in% c("Straight", "Gay or Lesbian")) %>%
  select(MDScore, sexident, MDCategory)

(Step 4) Comparing the Categorical Variables

# Cross-tabulate risk category (rows) by sexual identity (columns) and convert
# the counts to column proportions, so each identity's column sums to 1.
prop.table(table(new_data$MDCategory, new_data$sexident), margin = 2)
##                
##                 Gay or Lesbian   Straight
##   High Risk         0.14831461 0.06258647
##   Low Risk          0.51573034 0.63854300
##   Moderate Risk     0.33595506 0.29887053
Based on the table generated above, the majority of straight respondents (about 64%) fall into the low-risk mental distress category, compared with about 52% of gay or lesbian respondents. On the other hand, comparing the shares in the high-risk category, gay or lesbian respondents are more likely than straight respondents to report mental distress scores of 13 or higher (about 15% versus about 6%).

Chi-Square Test

# Pearson chi-squared test of independence between risk category and
# sexual identity.
chisq.test(x = new_data$MDCategory, y = new_data$sexident)
## 
##  Pearson's Chi-squared test
## 
## data:  new_data$MDCategory and new_data$sexident
## X-squared = 123.89, df = 2, p-value < 2.2e-16
Based on the Chi-square test above, the p-value is far below 0.05 (p < 2.2e-16). This implies that there is a statistically significant relationship between mental distress category and sexual identity.

Visualization

# Stacked bar chart: for each sexual identity, the share of respondents in
# each mental distress category. `percent` is a proportion between 0 and 1,
# computed within each identity group.
new_data %>%
  count(sexident, MDCategory) %>%
  group_by(sexident) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sexident, y = percent, fill = MDCategory)) +
  geom_col()

(Step 5) Comparing the Categorical Variable and Continuous Variable

# Mean mental distress score for each sexual identity group; missing scores
# are dropped before averaging.
new_data %>%
  group_by(sexident) %>%
  summarise(avgScore = mean(MDScore, na.rm = TRUE))
## # A tibble: 2 x 2
##   sexident       avgScore
##   <chr>             <dbl>
## 1 Gay or Lesbian     6.09
## 2 Straight           4.25
Based on the table generated above, respondents who identify as gay or lesbian have a higher average mental distress score (6.09) than respondents who identify as straight (4.25). This suggests that, on average, they report more distress.

T-test

# Two-sample t-test comparing the mean mental distress score between the two
# sexual identity groups (t.test uses the Welch unequal-variance form by
# default).
t.test(formula = MDScore ~ sexident, data = new_data)
## 
##  Welch Two Sample t-test
## 
## data:  MDScore by sexident
## t = 9.4511, df = 913.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.461793 2.227990
## sample estimates:
## mean in group Gay or Lesbian       mean in group Straight 
##                     6.091011                     4.246120
Based on the Welch two-sample t-test above, the p-value is far below 0.05 (p < 2.2e-16). This implies that the difference in mean mental distress scores between gay or lesbian respondents and straight respondents is statistically significant; the 95% confidence interval for the difference is about 1.46 to 2.23 points.

Visualization

# Simulated sampling distribution of the mean: draw 10,000 random samples of
# 40 scores each (without replacement) from the filtered data, take each
# sample's mean ignoring missing values, and plot the means as a histogram.
# No random seed is set, so the exact histogram differs from run to run.
data.frame(
  mean = replicate(10000, mean(sample(new_data$MDScore, 40), na.rm = TRUE))
) %>%
  ggplot(aes(x = mean)) +
  geom_histogram(fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.