library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
data <- read_delim("./sports.csv", delim = ',')
## Rows: 2936 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): institution_name, city_txt, state_cd, classification_name, classif...
## dbl (21): year, unitid, zip_text, classification_code, ef_male_count, ef_fem...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_raw <- data
data <- data |>
mutate(sector = ifelse(grepl("Private", sector_name), "private","public"))
data |>
ggplot() +
geom_boxplot(mapping =
aes(x = total_rev_menwomen,
y = sector)) +
xlim(0,2000000)
## Warning: Removed 1060 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
avg_revenues <- data |>
group_by(sector) |>
summarize(avg_revenue = mean(total_rev_menwomen, na.rm = TRUE))
avg_revenues
## # A tibble: 2 × 2
## sector avg_revenue
## <chr> <dbl>
## 1 private 862279.
## 2 public 1675965.
data |>
filter(sector == "private") |>
select(total_rev_menwomen) |>
na.omit() |>
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 1410
# sample sizes
observed_diff <- (avg_revenues$avg_revenue[2] -
avg_revenues$avg_revenue[1])
paste("Observed Difference: ", observed_diff) # this is our sample observed difference b/w public and private uni
## [1] "Observed Difference: 813686.268369793"
# now we need to bootstrap to simulate sampling distribution
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (a vector)
x_sample <- sample(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
private_avgs <- data |>
filter(sector == 'private') |> # this is for private institutions
pluck("total_rev_menwomen") |>
na.omit() |>
bootstrap(n_iter = 100)
public_avgs <- data |>
filter(sector == "public") |> # this is for public
pluck("total_rev_menwomen") |>
na.omit() |>
bootstrap(n_iter = 100)
diffs_in_avg <- public_avgs - private_avgs
diffs_in_avg
## [1] 237253.07 452423.20 780769.38 910977.76 1380749.80 321115.84
## [7] 984219.62 689338.69 818565.84 599188.47 699132.87 1132527.72
## [13] 778373.10 563695.93 899179.65 551741.40 998064.87 582902.15
## [19] 507597.93 605903.33 1276159.97 -161989.32 683153.79 645880.08
## [25] 1306229.10 1780600.51 836417.04 933959.14 539937.28 767874.91
## [31] 1197022.12 1165831.36 495435.49 939142.92 990046.67 338189.66
## [37] 580930.38 727454.10 519432.04 981083.02 1555868.56 463652.40
## [43] 777203.63 847479.72 17408.89 1242306.76 552449.49 551636.69
## [49] 648858.12 450385.45 594553.83 877823.94 899513.40 668297.04
## [55] 477402.06 1083177.05 1358404.60 1076068.00 480930.06 754510.90
## [61] 655378.54 552743.00 757460.09 1181704.14 825490.04 521950.56
## [67] 941801.91 981050.48 641107.31 571057.56 601845.40 969882.26
## [73] 511206.31 309028.61 876575.33 1001059.40 1109904.18 1237431.78
## [79] 318362.23 684357.23 679071.89 626489.90 1020478.48 615950.59
## [85] 598214.16 977000.58 1172184.35 1083610.80 860927.13 439119.01
## [91] 691776.00 1378924.57 596451.69 900663.45 498303.56 1016736.91
## [97] 1304927.24 768222.85 449467.27 1181395.42
ggplot() +
geom_function(xlim = c(-1000000, 1000000),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avg))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Total Revenue Differences Sampling Distribution using Bootstrap",
x = "Difference In Revenue",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-1000000, 1000000, 250000)) +
theme_minimal()
# let's find cohen's d for our data
cohen.d(d = filter(data, sector == "public") |> pluck("total_rev_menwomen") |> na.omit(),
f = filter(data, sector == "private") |> pluck("total_rev_menwomen") |> na.omit())
##
## Cohen's d
##
## d estimate: 0.1310133 (negligible)
## 95 percent confidence interval:
## lower upper
## 0.03346384 0.22856271
# We got Cohen's d of 0.13 which is really small, so let's run the hypothesis test to see if groups are actually different or not.
# sample size calculation
test <- pwrss.t.2means(mu1 = 500000,
sd1 = sd(pluck(data, "total_rev_menwomen"), na.rm = TRUE),
kappa = 0.4,
power = .80, alpha = 0.1,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 1340
## n2 = 3350
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 4688
## Non-centrality parameter = 2.487
## Type I error rate = 0.1
## Type II error rate = 0.2
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
data |>
filter(sports == "Basketball") |>
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 206
# let's get observed difference and difference in averages for this hypothesis test.
average_revenues <- data |>
filter(sports == c("Basketball", "Soccer")) |>
group_by(sports) |>
summarise(avg_rev = mean(total_rev_menwomen, na.rm = TRUE))
average_revenues
## # A tibble: 2 × 2
## sports avg_rev
## <chr> <dbl>
## 1 Basketball 2170599.
## 2 Soccer 648908.
observed_difference <- (average_revenues$avg_rev[1] -
average_revenues$avg_rev[2])
paste("Observed Difference: ", observed_difference)
## [1] "Observed Difference: 1521690.24407913"
basketball_avgs <- data |>
filter(sports == 'Basketball') |> # this is for basketball
pluck("total_rev_menwomen") |>
na.omit() |>
bootstrap(n_iter = 100)
soccer_avgs <- data |>
filter(sports == 'Soccer') |> # this is for soccer
pluck("total_rev_menwomen") |>
na.omit() |>
bootstrap(n_iter = 100)
difference_in_averages <- basketball_avgs - soccer_avgs
difference_in_averages
## [1] 1448904.4 2130842.2 1653523.5 1490863.4 981099.9 2023321.7 1816485.6
## [8] 1840044.6 1560224.8 1569228.3 1847611.9 1964037.5 1815787.7 1536839.4
## [15] 1278682.2 1544087.1 1077788.0 1613309.3 1389765.6 1802516.5 1676089.3
## [22] 1998649.0 2216187.4 1659002.1 1690969.7 1632702.7 2115901.3 1459827.7
## [29] 1582228.1 1802016.6 1863595.1 2419799.0 1687947.2 1925257.1 1661934.4
## [36] 1858281.6 1991044.4 1947785.6 1840107.7 1978911.3 1749636.5 1545543.6
## [43] 1630835.0 1854274.8 1572556.0 1652878.4 1145827.6 1564519.4 2146435.8
## [50] 1802740.5 1916226.3 1596449.7 888894.2 1498291.7 2124791.7 1505006.7
## [57] 1667456.7 1988262.5 1779769.8 2268611.2 1893210.9 1321408.4 2011664.7
## [64] 1483871.2 1315779.4 1838912.7 1567456.2 1301094.4 1707581.7 1754081.5
## [71] 1124851.7 1372150.8 1642336.7 1636456.9 2213628.2 2084382.1 1574975.7
## [78] 2221637.7 1404581.7 1375881.9 1744499.8 1681569.1 1549884.8 2251684.2
## [85] 1191809.8 1577935.4 2068017.8 1844273.0 1754283.7 1580137.8 1933416.8
## [92] 1974389.0 1319622.2 2117225.3 2259295.0 1750970.3 1385327.5 1212334.1
## [99] 1862179.3 1996036.8
# now we have our difference in averages, we can move to calculate the p-value
# let's demean the differences first
diff_avg_demeaned <- difference_in_averages - mean(difference_in_averages)
paste("p-value ",
sum(abs(observed_difference) < abs(diff_avg_demeaned)) /
length(diff_avg_demeaned))
## [1] "p-value 0"
ggplot(average_revenues, mapping = aes(x = sports, y = avg_rev)) +
geom_col(fill = 'chocolate') +
labs(
title = "Comparing Basketball and Soccer Revenues in Indiana Colleges",
x = "Sports",
y = "Mean Revenue"
) +
theme_minimal()