library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
library(effsize)
## Warning: package 'effsize' was built under R version 4.3.3
library(pwrss)
## Warning: package 'pwrss' was built under R version 4.3.3
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")
avg_height <- volley_data |>
group_by(gender) |>
summarize(avg_height = mean(w_p1_hgt, na.rm = TRUE)) |>
arrange(gender)
avg_height
## # A tibble: 2 × 2
## gender avg_height
## <chr> <dbl>
## 1 M 76.3
## 2 W 70.8
observed_diff <- (avg_height$avg_height[1] -
avg_height$avg_height[2])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 5.45239323028152"
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (a vector)
x_sample <- sample(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
avgs_women <- volley_data |>
filter(gender == 'W') |>
filter(!is.na(w_p1_hgt)) |>
pull(w_p1_hgt) |>
bootstrap(n_iter = 100)
avgs_men <- volley_data |>
filter(gender == 'M') |>
filter(!is.na(w_p1_hgt)) |>
pull(w_p1_hgt) |>
bootstrap(n_iter = 100)
diffs_in_avgs <- avgs_men - avgs_women
print(diffs_in_avgs)
## [1] 5.453027 5.546417 5.364551 5.440534 5.527743 5.329363 5.402028 5.573859
## [9] 5.527548 5.466271 5.525505 5.473404 5.381219 5.479125 5.462153 5.387044
## [17] 5.501527 5.369570 5.364736 5.434377 5.468108 5.507756 5.415890 5.401298
## [25] 5.592909 5.449208 5.426446 5.248655 5.541787 5.355899 5.339433 5.435051
## [33] 5.415237 5.554063 5.615698 5.370643 5.414869 5.411769 5.474121 5.449017
## [41] 5.510427 5.486298 5.240733 5.411394 5.511050 5.358466 5.464445 5.474624
## [49] 5.435468 5.467133 5.582044 5.316094 5.519801 5.535349 5.280181 5.377968
## [57] 5.473498 5.395357 5.428780 5.482205 5.359231 5.471118 5.436623 5.500611
## [65] 5.422705 5.523888 5.501389 5.374445 5.519606 5.569582 5.351673 5.377020
## [73] 5.456608 5.412795 5.389953 5.322413 5.411484 5.798889 5.408565 5.503142
## [81] 5.406268 5.603715 5.332259 5.463436 5.262405 5.521506 5.544649 5.415915
## [89] 5.350702 5.526363 5.443898 5.637518 5.586876 5.616119 5.340391 5.504147
## [97] 5.434405 5.546973 5.465399 5.409988
ggplot() +
geom_function(xlim = c(-10, 10),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of Gender Height Differences",
x = "Difference in Height Calculated",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-300, 300, 100)) +
theme_minimal()
cohen.d(d = filter(volley_data, gender == 'M') |>
filter(!is.na(w_p1_hgt)) |>
pull(w_p1_hgt),
f = filter(volley_data, gender == 'W') |>
filter(!is.na(w_p1_hgt)) |>
pull(w_p1_hgt),)
##
## Cohen's d
##
## d estimate: 2.196277 (large)
## 95 percent confidence interval:
## lower upper
## 2.105728 2.286825
volley_data |>
group_by(gender) |>
summarize(sd = sd(w_p1_hgt, na.rm = TRUE),
mean = mean(w_p1_hgt, na.rm = TRUE))
## # A tibble: 2 × 3
## gender sd mean
## <chr> <dbl> <dbl>
## 1 M 2.72 76.3
## 2 W 2.20 70.8
sample_size <- pwrss.t.2means(mu1 = 70.83,
mu2 = 76.28,
sd1 = 2.19,
sd2 = 2.71,
alpha = 0.1,
power = 0.80,
kappa = 1)
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 4
## n2 = 4
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 6
## Non-centrality parameter = -3.128
## Type I error rate = 0.1
## Type II error rate = 0.2
print(sample_size)
## $parms
## $parms$mu1
## [1] 70.83
##
## $parms$mu2
## [1] 76.28
##
## $parms$sd1
## [1] 2.19
##
## $parms$sd2
## [1] 2.71
##
## $parms$kappa
## [1] 1
##
## $parms$welch.df
## [1] FALSE
##
## $parms$paired
## [1] FALSE
##
## $parms$paired.r
## [1] 0.5
##
## $parms$alpha
## [1] 0.1
##
## $parms$margin
## [1] 0
##
## $parms$alternative
## [1] "not equal"
##
## $parms$verbose
## [1] TRUE
##
##
## $test
## [1] "t"
##
## $df
## [1] 6
##
## $ncp
## [1] -3.128337
##
## $power
## [1] 0.8
##
## $n
## n1 n2
## 4 4
##
## attr(,"class")
## [1] "pwrss" "t" "2means"
test <- pwrss.t.2means(mu1 = 70,
mu2 = 75,
sd1 = 1,
kappa = 1,
power = .9,
alpha = 0.05,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.9
## n1 = 3
## n2 = 3
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 4
## Non-centrality parameter = -6.124
## Type I error rate = 0.05
## Type II error rate = 0.1
plot(test)
This test shows the difference in mean height between men and women that were winning player one in each game. Male players on average have a taller height than female players.
Alpha was chosen to be .05 which means that there is a 5% risk of incorrectly rejecting the null hypothesis. In other words, there is a 5% chance that we would incorrectly assume that mens heights are higher than womens heights of player one. This is a reasonable choice for alpha since there are no serious consequences in this false result.
Power is chosen to be .99 which means there is a .99 probability of correctly rejecting the null hypothesis. This higher percentage is desirable since it indicates that there should be great evidence to reject the null hypothesis and find a correlation between gender and height.
avg_errors <- volley_data |>
filter(!is.na(w_p1_tot_errors) & !is.na(l_p1_tot_errors)) |>
select(w_p1_tot_errors, l_p1_tot_errors) |>
summarize(
avg_w1_errors = mean(w_p1_tot_errors),
avg_l1_errors = mean(l_p1_tot_errors)
)
avg_errors
## avg_w1_errors avg_l1_errors
## 1 3.177062 4.28169
observed_err_diff <- (avg_errors$avg_l1_errors -
avg_errors$avg_w1_errors)
paste("Observed Difference: ", observed_err_diff)
## [1] "Observed Difference: 1.1046277665996"
avgs_winners <- volley_data |>
filter(!is.na(w_p1_tot_errors)) |>
pull(w_p1_tot_errors) |>
bootstrap(n_iter = 100)
avgs_losers <- volley_data |>
filter(!is.na(l_p1_tot_errors)) |>
pull(l_p1_tot_errors) |>
bootstrap(n_iter = 100)
diffs_in_errs <- avgs_losers - avgs_winners
print(diffs_in_errs)
## [1] 0.9215292 1.1931590 1.3038229 1.1488934 1.1488934 1.2032193 1.1428571
## [8] 1.1247485 1.0040241 0.7464789 1.0201207 1.0160966 1.0784708 0.9114688
## [15] 0.7887324 1.2313883 1.1066398 1.1026157 0.9637827 1.1911469 1.1830986
## [22] 1.2474849 1.1891348 1.2394366 1.1146881 1.0965795 1.1710262 1.0140845
## [29] 0.8128773 1.0120724 1.3380282 0.9758551 1.1066398 0.9537223 1.0784708
## [36] 0.9698189 1.0543260 0.8490946 1.1146881 1.1609658 1.1710262 1.0583501
## [43] 0.9899396 1.0885312 0.7142857 1.3541247 1.3782696 1.1307847 1.2152918
## [50] 1.0523139 1.2776660 1.0382294 1.1851107 0.9456740 1.0704225 1.0100604
## [57] 1.3541247 1.1066398 1.0603622 0.9919517 1.1368209 1.1187123 1.1006036
## [64] 1.1287726 1.0482897 1.0643863 1.1448692 1.0000000 1.3400402 1.1086519
## [71] 1.1368209 0.9577465 0.7665996 1.0824950 1.2173038 1.1790744 0.9818913
## [78] 0.9758551 1.4245473 1.0885312 1.0563380 1.1690141 1.0845070 1.2776660
## [85] 1.0663984 1.0804829 1.1891348 1.0804829 0.8873239 1.0704225 0.9175050
## [92] 1.0704225 1.0885312 1.1368209 1.2334004 1.1046278 0.9979879 0.8148893
## [99] 1.0362173 1.0905433
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_errs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_err_diff, 10),
geom = "area") +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(-10, -observed_err_diff),
geom = "area") +
geom_function(xlim = c(-10, 10),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_err_diff,
color = paste("observed: ",
round(observed_err_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Error Differences",
x = "Difference in Error Calculated",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-10, 10, 10)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
diffs_in_errs_d <- diffs_in_errs - mean(diffs_in_errs)
# proportion of times the difference is more extreme
paste("p-value =",
sum(abs(observed_err_diff) < abs(diffs_in_errs_d)) /
length(diffs_in_errs_d))
## [1] "p-value = 0"
The p value of zero suggest that there is very strong evidence to reject the null hypothesis. In this instance, this means that the total errors between winning player one and losing player one are not equal. Losing players have a higher number of errors than winning players on average.