library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the survey extract from the Downloads folder into a tibble.
Skill <- read_csv(file = "~/Downloads/Skill.csv")
## Parsed with column specification:
## cols(
##   MDScore = col_double(),
##   sexident = col_double()
## )
# Preview the first six rows to check the raw coding of both columns.
head(Skill, n = 6)
## # A tibble: 6 x 2
##   MDScore sexident
##     <dbl>    <dbl>
## 1      NA       99
## 2      NA        1
## 3       4        1
## 4      NA       99
## 5      11        1
## 6       8        1

Data Prep

# Data prep: clean the distress score, derive a risk category from it, label
# sexual identity, and keep only the two groups compared in the analysis.
Skill <- Skill %>%
  mutate(
    # Recode scores above 24 to NA *before* building the category. Previously
    # the category was derived from the raw score, so those values were
    # counted as "High Risk" while being excluded from the mean scores.
    # NOTE(review): presumably values above 24 are missing-data codes -- confirm
    # against the codebook.
    MDScore = ifelse(MDScore > 24, NA, MDScore),
    # Rows that match no condition (including NA scores) stay NA.
    Mental_Distress = case_when(
      MDScore < 5 ~ "Low risk",
      MDScore >= 5 & MDScore <= 12 ~ "Moderate risk",
      MDScore >= 13 ~ "High Risk"
    ),
    # Any other code (e.g. the 99 seen in the raw data) becomes NA.
    sexident = case_when(
      sexident == 1 ~ "Straight when sexident",
      sexident == 2 ~ "Gay or Lesbian when sexident",
      sexident == 3 ~ "Bisexual when sexident"
    )
  ) %>%
  select(MDScore, Mental_Distress, sexident) %>%
  # Restrict to the two groups under comparison; this also drops NA labels.
  filter(sexident %in% c("Straight when sexident", "Gay or Lesbian when sexident"))

4A

Null hypothesis prep

# Marginal distribution of sexual identity: share of respondents in each
# group, rounded to two decimals.
round(prop.table(table(Skill$sexident)), 2)
## 
## Gay or Lesbian when sexident       Straight when sexident 
##                         0.02                         0.98
# Marginal distribution of the mental distress category, rounded to two
# decimals (NA categories are excluded by table()'s default).
round(prop.table(table(Skill$Mental_Distress)), 2)
## 
##     High Risk      Low risk Moderate risk 
##          0.06          0.64          0.30

Null Hypothesis

Gay or Lesbian: .02 * .06 = .0012 High; .02 * .64 = .0128 Low; .02 * .30 = .006 Moderate

Straight: .98 * .06 = .0588 High; .98 * .64 = .6272 Low; .98 * .30 = .294 Moderate

# Observed distribution: distress category within each sexual identity group.
# margin = 1 makes every row sum to 1.
prop.table(table(Skill$sexident, Skill$Mental_Distress), margin = 1)
##                               
##                                 High Risk   Low risk Moderate risk
##   Gay or Lesbian when sexident 0.14831461 0.51573034    0.33595506
##   Straight when sexident       0.06258647 0.63854300    0.29887053

My actual observations show that respondents who identify as gay or lesbian are at higher risk of mental distress than respondents who identify as straight: about 15% of the gay or lesbian group falls in the high-risk category, compared with about 6% of the straight group.

If there were no relationship between the two variables, both groups would show the same distribution of mental distress risk as the sample overall: about 6% high risk, 64% low risk, and 30% moderate risk.

4B

# Pearson chi-squared test of independence between distress category and
# sexual identity.
chisq.test(x = Skill$Mental_Distress, y = Skill$sexident)
## 
##  Pearson's Chi-squared test
## 
## data:  Skill$Mental_Distress and Skill$sexident
## X-squared = 123.89, df = 2, p-value < 2.2e-16
# Strongly penalise scientific notation when printing numbers. This runs after
# the test above has printed, so it only affects output produced later on.
options(scipen = 9999)

Since the p-value is less than .05, we reject the null hypothesis that there is no relationship between sexual identity and mental distress category; the association we observed is statistically significant.

5a

# Mean distress score within each sexual identity group, ignoring missing
# scores.
summarise(
  group_by(Skill, sexident),
  MDScore = mean(MDScore, na.rm = TRUE)
)
## # A tibble: 2 x 2
##   sexident                     MDScore
##   <chr>                          <dbl>
## 1 Gay or Lesbian when sexident    6.09
## 2 Straight when sexident          4.25

This shows that respondents who identify as gay or lesbian have a higher average mental distress score (6.09) than respondents who identify as straight (4.25).

Population Distribution

# Population distribution: one histogram of MDScore per sexual identity group.
Skill %>%
  filter(sexident %in% c("Straight when sexident", "Gay or Lesbian when sexident")) %>%
  # Aesthetics are declared once here and inherited by the histogram layer.
  ggplot(aes(x = MDScore, fill = sexident)) +
  geom_histogram() +
  facet_wrap(vars(sexident))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 411 rows containing non-finite values (stat_bin).

Sampling Prep

# Sampling prep: simulate the sampling distribution of the mean MDScore for
# each group. `Heterosexual` and `Homosexual` end up as data frames with a
# single column, `mean`, holding one sample mean per row.

# Fix the RNG seed so the simulated distributions can be reproduced.
set.seed(2020)

# Draw `reps` samples of `n` observed scores (without replacement) and return
# their means as a one-column data frame named `mean`.
sample_means <- function(scores, n = 40, reps = 10000) {
  # Drop missing scores up front so every mean rests on exactly `n` values;
  # sampling first and removing NAs afterwards shrinks some samples.
  scores <- scores[!is.na(scores)]
  stopifnot(length(scores) >= n)
  data.frame(mean = replicate(reps, mean(sample(scores, n))))
}

Heterosexual <- Skill %>%
  filter(sexident == "Straight when sexident") %>%
  pull(MDScore) %>%
  sample_means()

Homosexual <- Skill %>%
  filter(sexident == "Gay or Lesbian when sexident") %>%
  pull(MDScore) %>%
  sample_means()

Sampling Distribution

# Sampling distributions of the mean MDScore for both groups on one panel.
# The layers are semi-transparent so the overlapping region stays visible, and
# fill is mapped inside aes() so a legend identifies each group.
ggplot() +
  geom_histogram(
    data = Heterosexual,
    aes(x = mean, fill = "Straight"),
    alpha = 0.6
  ) +
  geom_histogram(
    data = Homosexual,
    aes(x = mean, fill = "Gay or Lesbian"),
    alpha = 0.6
  ) +
  # Keep the original colours for each group.
  scale_fill_manual(
    name = "Group",
    values = c("Straight" = "lavender", "Gay or Lesbian" = "pink")
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The two sampling distributions overlap around a mean score of 5, but the distribution for the gay or lesbian group is centered on a higher average mental distress score than the distribution for the straight group.

Null Hypothesis

# Overall mean distress score across both groups, ignoring missing scores.
# This is the value each group would average under the null hypothesis.
summarise(Skill, MDScore = mean(MDScore, na.rm = TRUE))
## # A tibble: 1 x 1
##   MDScore
##     <dbl>
## 1    4.29

If a person’s sexual orientation made no difference to their mental distress score, then we should find that the group-wise averages are about 4.3 for members of both groups.

5b.

# Welch two-sample t-test comparing mean MDScore between the two groups.
t.test(MDScore ~ sexident, data = Skill)
## 
##  Welch Two Sample t-test
## 
## data:  MDScore by sexident
## t = 9.4511, df = 913.85, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.461793 2.227990
## sample estimates:
## mean in group Gay or Lesbian when sexident 
##                                   6.091011 
##       mean in group Straight when sexident 
##                                   4.246120
# Keep numbers printing in fixed rather than scientific notation; this repeats
# the setting already applied after the chi-squared test.
options(scipen = 9999)

Since the p-value is less than .05, we reject the null hypothesis that there is no difference between the group means; the difference we observed is statistically significant.