Create two samples from the gcookbook::heightweight data set , each of size 15, but one Female only, and the other Male only.
# Create initial data set from gcookbook
pop <-gcookbook::heightweight
# Set seed and sample number 'N'
set.seed(1515)
N <- 15
# Female sample of 15 students
samp_fem <- pop %>%
filter(sex == "f")
samp_fem <- (samp_fem) %>%
sample_n(N, replace=FALSE)
# Male sample of 15 students
samp_mal <- pop %>%
filter(sex == "m")
samp_mal <- samp_mal %>%
sample_n(N, replace=FALSE)
For the t-test parameters, I left the default arguments:
alternative = two.sided because we are performing a two sided t-test
mu = 0 because we are assuming the mean of the two samples are the same
var.equal = FALSE to approximate the degrees of freedom
conf.level = 0.95 because the sample size of each data frame is less than 30
# Two sample t-test for weight of both male and female samples
ttest <- t.test(samp_fem$weightLb, samp_mal$weightLb)
ttest
##
## Welch Two Sample t-test
##
## data: samp_fem$weightLb and samp_mal$weightLb
## t = 0.65267, df = 27.755, p-value = 0.5193
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.131051 15.731051
## sample estimates:
## mean of x mean of y
## 97.43333 93.63333
# Assign variable for error bars in plot below
se <- ttest$stderr
# Combine sample data frames
samp_combined <- bind_rows(samp_fem, samp_mal)
# Find the mean of each sex and add to new data frame
samp_means <- samp_combined %>%
group_by(sex) %>%
mutate(mean=mean(weightLb))
# Plot data
jitter <- ggplot(samp_means, aes(x=sex, y=weightLb, color=factor(sex))) +
geom_jitter(
width = 0.1,
size = 2,
alpha = .8) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=0.5) +
theme_bw() +
theme(
legend.position = "none") +
scale_x_discrete(labels = c("f" = "Female", "m" = "Male")) +
labs(
x = "Sex",
y = "Weight (lb)",
title = "Comparing female and male sample sets",
subtitle = paste0("Sample size of ", N, " female students and ", N, " male students")
)
jitter
My sample sets and plot are good examples of how random samples might be contrary to your general assumption of the data. I assumed that the sample sets would show that males have a higher mean weight than females, but this isn’t the case. The mean weight of females in this sample set is 97.43 and is 93.63 for males. I also thought that the error bars looked too big, but it makes sense with such a small sample size resulting in a standard error of 5.82.
As suggested, I created a grade column in my original pop data frame. I then created 10 new data frames, filtered by sex and grade, and ran a t-test for all grades. I used the default arguments for all t-tests, meaning the confidence level is 95% and the alpha is 0.05.
# Add grade variable to pop data frame
pop$grade <- as.factor(round(pop$ageYear)-6)
# Assign variables for t-test to use for later results table
alpha = 0.05
CL = 1-alpha
# Grade 6
fem_pop_g6 <- pop %>%
filter(sex == "f", grade == 6)
mal_pop_g6 <- pop %>%
filter(sex == "m", grade == 6)
ttest_g6 <- t.test(fem_pop_g6$heightIn, mal_pop_g6$heightIn, conf.level=CL)
# Grade 7
fem_pop_g7 <- pop %>%
filter(sex == "f", grade == 7)
mal_pop_g7 <- pop %>%
filter(sex == "m", grade == 7)
ttest_g7 <- t.test(fem_pop_g7$heightIn, mal_pop_g7$heightIn, conf.level=CL)
# Grade 8
fem_pop_g8 <- pop %>%
filter(sex == "f", grade == 8)
mal_pop_g8 <- pop %>%
filter(sex == "m", grade == 8)
ttest_g8 <- t.test(fem_pop_g8$heightIn, mal_pop_g8$heightIn, conf.level=CL)
# Grade 9
fem_pop_g9 <- pop %>%
filter(sex == "f", grade == 9)
mal_pop_g9 <- pop %>%
filter(sex == "m", grade == 9)
ttest_g9 <- t.test(fem_pop_g9$heightIn, mal_pop_g9$heightIn, conf.level=CL)
# Grade 10
fem_pop_g10 <- pop %>%
filter(sex == "f", grade == 10)
mal_pop_g10 <- pop %>%
filter(sex == "m", grade == 10)
ttest_g10 <- t.test(fem_pop_g10$heightIn, mal_pop_g10$heightIn, conf.level=CL)
The null hypothesis is that the there is no significant correlation between sex and height for each grade. We’ll know if the null hypothesis is true by looking at the p-value from each t-test. If the p-value is greater than 0.05, we can accept the null hypothesis. If the p-value is less than 0.05, we can reject the null hypothesis.
Results:
Grade 6
Is 0.610 > 0.05? TRUE
Grade 7
Is 0.983 > 0.05? TRUE
Grade 8
Is 0.00836 > 0.05? FALSE
Grade 9
Is 0.00288 > 0.05? FALSE
Grade 10
Is 0.000583 > 0.05? FALSE
For grades 6 and 7, the null hypothesis is TRUE. This means we’re 95% confident that there’s no statistical correlation between sex and height.
For grades 8, 9, and 10, the null hypothesis is FALSE. This means we can’t confidently rule out a statistical correlation between sex and height.
# Combine data frames created for each grade above
# Grade 6
tf6 <- t.test(fem_pop_g6$heightIn)
tm6 <- t.test(mal_pop_g6$heightIn)
g6 <- bind_rows(fem_pop_g6, mal_pop_g6)
g6plot <- ggplot(g6, aes(x=heightIn, color=factor(sex))) +
geom_density() +
geom_vline(aes(xintercept=tf6$estimate), color="#F8766D", size=1.25) +
geom_vline(aes(xintercept=tf6$conf.int[1]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tf6$conf.int[2]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm6$estimate), color="#00BFC4", size=1.25) +
geom_vline(aes(xintercept=tm6$conf.int[1]), color="#00BFC4", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm6$conf.int[2]), color="#00BFC4", alpha=.75, linetype="dashed") +
theme(
legend.position = "none"
) +
labs(title="Grade 6")
# Grade 7
tf7 <- t.test(fem_pop_g7$heightIn)
tm7 <- t.test(mal_pop_g7$heightIn)
g7 <- bind_rows(fem_pop_g7, mal_pop_g7)
g7plot <- ggplot(g7, aes(x=heightIn, color=factor(sex))) +
geom_density() +
geom_vline(aes(xintercept=tf7$estimate), color="#F8766D", size=1.25) +
geom_vline(aes(xintercept=tf7$conf.int[1]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tf7$conf.int[2]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm7$estimate), color="#00BFC4", size=1.25, alpha=0.75) +
geom_vline(aes(xintercept=tm7$conf.int[1]), color="#00BFC4", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm7$conf.int[2]), color="#00BFC4", alpha=.75, linetype="dashed") +
theme(
legend.position = "none"
) +
labs(title="Grade 7")
# Grade 8
tf8 <- t.test(fem_pop_g8$heightIn)
tm8 <- t.test(mal_pop_g8$heightIn)
g8 <- bind_rows(fem_pop_g8, mal_pop_g8)
g8plot <- ggplot(g8, aes(x=heightIn, color=factor(sex))) +
geom_density() +
geom_vline(aes(xintercept=tf8$estimate), color="#F8766D", size=1.25) +
geom_vline(aes(xintercept=tf8$conf.int[1]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tf8$conf.int[2]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm8$estimate), color="#00BFC4", size=1.25) +
geom_vline(aes(xintercept=tm8$conf.int[1]), color="#00BFC4", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm8$conf.int[2]), color="#00BFC4", alpha=.75, linetype="dashed") +
theme(
legend.position = "none"
) +
labs(title="Grade 8")
# Grade 9
tf9 <- t.test(fem_pop_g9$heightIn)
tm9 <- t.test(mal_pop_g9$heightIn)
g9 <- bind_rows(fem_pop_g9, mal_pop_g9)
g9plot <- ggplot(g9, aes(x=heightIn, color=factor(sex))) +
geom_density() +
geom_vline(aes(xintercept=tf9$estimate), color="#F8766D", size=1.25) +
geom_vline(aes(xintercept=tf9$conf.int[1]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tf9$conf.int[2]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm9$estimate), color="#00BFC4", size=1.25) +
geom_vline(aes(xintercept=tm9$conf.int[1]), color="#00BFC4", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm9$conf.int[2]), color="#00BFC4", alpha=.75, linetype="dashed") +
theme(
legend.position = "none"
) +
labs(title="Grade 9")
# Grade 10
tf10 <- t.test(fem_pop_g10$heightIn)
tm10 <- t.test(mal_pop_g10$heightIn)
g10 <- bind_rows(fem_pop_g10, mal_pop_g10)
g10plot <- ggplot(g10, aes(x=heightIn, color=factor(sex))) +
geom_density() +
geom_vline(aes(xintercept=tf10$estimate), color="#F8766D", size=1.25) +
geom_vline(aes(xintercept=tf10$conf.int[1]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tf10$conf.int[2]), color="#F8766D", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm10$estimate), color="#00BFC4", size=1.25) +
geom_vline(aes(xintercept=tm10$conf.int[1]), color="#00BFC4", alpha=.75, linetype="dashed") +
geom_vline(aes(xintercept=tm10$conf.int[2]), color="#00BFC4", alpha=.75, linetype="dashed") +
theme(
legend.position = "none"
) +
labs(title="Grade 10")
all_plots <- plot_grid(
g6plot,
g7plot,
g8plot,
g9plot,
g10plot,
align = c("hv"),
nrow = 3
)
all_plots