library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the survey extract (columns MDScore and sexident) from the local
# Downloads folder.
Skill <- readr::read_csv(file = "~/Downloads/Skill.csv")
## Parsed with column specification:
## cols(
## MDScore = col_double(),
## sexident = col_double()
## )
# Preview the first six rows of the raw data.
head(Skill, n = 6)
## # A tibble: 6 x 2
## MDScore sexident
## <dbl> <dbl>
## 1 NA 99
## 2 NA 1
## 3 4 1
## 4 NA 99
## 5 11 1
## 6 8 1
# Recode the raw survey variables and keep only the two groups being compared.
#   * MDScore:         values above 24 are set to NA.
#   * Mental_Distress: risk band derived from the cleaned MDScore
#                      (< 5 low, 5-12 moderate, >= 13 high).
#   * sexident:        numeric codes 1/2/3 mapped to text labels; any other
#                      code becomes NA.
Skill <- Skill %>%
  mutate(
    # Clean the score BEFORE banding it. mutate() evaluates its arguments in
    # order, so banding first would label out-of-range scores (> 24) as
    # "High Risk" even though the score itself is then blanked out.
    MDScore = ifelse(MDScore > 24, NA, MDScore),
    Mental_Distress = case_when(
      MDScore < 5 ~ "Low risk",
      MDScore >= 5 & MDScore <= 12 ~ "Moderate risk",
      MDScore >= 13 ~ "High Risk"
    ),
    sexident = case_when(
      sexident == 1 ~ "Straight when sexident",
      sexident == 2 ~ "Gay or Lesbian when sexident",
      sexident == 3 ~ "Bisexual when sexident"
    )
  ) %>%
  select(MDScore, Mental_Distress, sexident) %>%
  # Restrict the analysis to the straight and gay/lesbian respondents.
  filter(
    sexident %in% c("Straight when sexident", "Gay or Lesbian when sexident")
  )
Null hypothesis prep
# Marginal share of respondents in each sexual-identity group, to 2 decimals.
round(prop.table(table(Skill$sexident)), 2)
##
## Gay or Lesbian when sexident Straight when sexident
## 0.02 0.98
# Marginal share of respondents in each mental-distress band, to 2 decimals.
round(prop.table(table(Skill$Mental_Distress)), 2)
##
## High Risk Low risk Moderate risk
## 0.06 0.64 0.30
Null Hypothesis
Gay or Lesbian: .02 * .06 = .0012 High; .02 * .64 = .0128 Low; .02 * .30 = .006 Moderate
Straight: .98 * .06 = .0588 High; .98 * .64 = .6272 Low; .98 * .30 = .294 Moderate
# Observed distribution of distress bands within each sexual-identity group
# (margin = 1 makes each row sum to 1).
prop.table(table(Skill$sexident, Skill$Mental_Distress), margin = 1)
##
## High Risk Low risk Moderate risk
## Gay or Lesbian when sexident 0.14831461 0.51573034 0.33595506
## Straight when sexident 0.06258647 0.63854300 0.29887053
My actual observations show that folks who are gay or lesbian when sexident have a higher risk of mental distress compared to folks who are straight when sexident. We see this because about 15% of the gay or lesbian when sexident group is at high risk, whereas only about 6% of the straight when sexident group is.
If there were no relationship between the two variables, then both groups would show the same distribution of mental distress risk (about 6% high, 64% low, and 30% moderate in each group).
# Pearson chi-squared test of independence between distress band and
# sexual identity.
chisq.test(x = Skill$Mental_Distress, y = Skill$sexident)
##
## Pearson's Chi-squared test
##
## data: Skill$Mental_Distress and Skill$sexident
## X-squared = 123.89, df = 2, p-value < 2.2e-16
# Heavily penalise scientific notation so subsequent numbers print in fixed
# notation.
options(scipen = 9999)
Since the p-value is less than .05, we reject the null hypothesis that there is no relationship between sexual identity and mental distress level; our observed differences are statistically significant.
# Mean MDScore within each sexual-identity group, ignoring missing scores.
summarise(
  group_by(Skill, sexident),
  MDScore = mean(MDScore, na.rm = TRUE)
)
## # A tibble: 2 x 2
## sexident MDScore
## <chr> <dbl>
## 1 Gay or Lesbian when sexident 6.09
## 2 Straight when sexident 4.25
This shows that Gay or Lesbian when sexident have higher levels of mental distress compared to Straight when sexident.
Population Distribution
# Histogram of individual MDScore values, one panel per sexual-identity group.
# Skill was already restricted to the straight and gay/lesbian groups when it
# was recoded, so it is not filtered on sexident again here.
Skill %>%
  # Drop missing scores explicitly rather than relying on ggplot's
  # "removed rows containing non-finite values" warning.
  filter(!is.na(MDScore)) %>%
  ggplot(aes(x = MDScore, fill = sexident)) +
  # NOTE(review): binwidth = 1 assumes MDScore takes whole-number values
  # (the previewed rows do) - adjust if the score is fractional.
  geom_histogram(binwidth = 1) +
  facet_wrap(~sexident)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 411 rows containing non-finite values (stat_bin).
Sampling Prep
# Sampling distribution of the mean MDScore for the straight group:
# 10,000 random samples of 40 respondents each, one mean per sample.
# Missing scores are removed BEFORE sampling so that every mean is based on
# exactly 40 observed scores; sampling first and dropping NAs afterwards makes
# the effective sample size smaller than 40 and different in every replicate.
hetero_scores <- Skill %>%
  filter(sexident == "Straight when sexident", !is.na(MDScore)) %>%
  pull(MDScore)

# Result: a data frame with a single column `mean` (one row per replicate).
Heterosexual <- data.frame(
  mean = replicate(10000, mean(sample(hetero_scores, 40)))
)
# Sampling distribution of the mean MDScore for the gay/lesbian group:
# 10,000 random samples of 40 respondents each, one mean per sample.
# Missing scores are removed BEFORE sampling so that every mean is based on
# exactly 40 observed scores; sampling first and dropping NAs afterwards makes
# the effective sample size smaller than 40 and different in every replicate.
homo_scores <- Skill %>%
  filter(sexident == "Gay or Lesbian when sexident", !is.na(MDScore)) %>%
  pull(MDScore)

# Result: a data frame with a single column `mean` (one row per replicate).
Homosexual <- data.frame(
  mean = replicate(10000, mean(sample(homo_scores, 40)))
)
Sampling Distribution
# Overlay the two sampling distributions of the mean MDScore
# (lavender = straight group, pink = gay/lesbian group).
# The fills are semi-transparent so the layer drawn first stays visible where
# the distributions overlap; bins is set explicitly (ggplot's default of 30)
# to silence the stat_bin() message.
ggplot() +
  geom_histogram(
    data = Heterosexual, aes(x = mean),
    fill = "lavender", alpha = 0.6, bins = 30
  ) +
  geom_histogram(
    data = Homosexual, aes(x = mean),
    fill = "pink", alpha = 0.6, bins = 30
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The two sampling distributions overlap around a mean score of 5, but we see that Gay or Lesbian when sexident has a higher average of Mental Distress than Straight when sexident.
Null Hypothesis
# Overall mean MDScore across both groups, ignoring missing scores.
summarise(Skill, MDScore = mean(MDScore, na.rm = TRUE))
## # A tibble: 1 x 1
## MDScore
## <dbl>
## 1 4.29
If a person’s sexual orientation made no difference in their mental distress score, then we should find that the group-wise averages are 4.3 for members of both groups.
# Welch two-sample t-test comparing mean MDScore between the two groups.
t.test(formula = MDScore ~ sexident, data = Skill)
##
## Welch Two Sample t-test
##
## data: MDScore by sexident
## t = 9.4511, df = 913.85, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.461793 2.227990
## sample estimates:
## mean in group Gay or Lesbian when sexident
## 6.091011
## mean in group Straight when sexident
## 4.246120
# Heavily penalise scientific notation so subsequent numbers print in fixed
# notation.
options(scipen = 9999)
Since the p-value is less than .05, we reject the null hypothesis that there’s no difference between the means; our findings are statistically significant.