Hypothesis Testing

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test
data <- read_delim("./sports.csv", delim = ',')
## Rows: 2936 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): institution_name, city_txt, state_cd, classification_name, classif...
## dbl (21): year, unitid, zip_text, classification_code, ef_male_count, ef_fem...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_raw <- data

First Null Hypothesis using Neyman-Pearson framework

= There is no difference between revenue of public and private universities.

data <- data |>
  mutate(sector = ifelse(grepl("Private", sector_name), "private","public"))
data |>
  ggplot() +
  geom_boxplot(mapping = 
                 aes(x = total_rev_menwomen, 
                     y = sector)) +
  xlim(0,2000000)
## Warning: Removed 1060 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

avg_revenues <- data |>
  group_by(sector) |>
  summarize(avg_revenue = mean(total_rev_menwomen, na.rm = TRUE)) 

avg_revenues
## # A tibble: 2 × 2
##   sector  avg_revenue
##   <chr>         <dbl>
## 1 private     862279.
## 2 public     1675965.
data |>
  filter(sector == "private") |>
  select(total_rev_menwomen) |>
  na.omit() |>
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1410
# sample sizes
observed_diff <- (avg_revenues$avg_revenue[2] -
                  avg_revenues$avg_revenue[1])
paste("Observed Difference: ", observed_diff) # this is our sample observed difference b/w public and private uni
## [1] "Observed Difference:  813686.268369793"
# now we need to bootstrap to simulate sampling distribution
bootstrap <- function (x, func=mean, n_iter=10^4) {
  # empty vector to be filled with values from each iteration
  func_values <- c(NULL)
  
  # we simulate sampling `n_iter` times
  for (i in 1:n_iter) {
    # pull the sample (a vector)
    x_sample <- sample(x, size = length(x), replace = TRUE)
    
    # add on this iteration's value to the collection
    func_values <- c(func_values, func(x_sample))
  }
  
  return(func_values)
}
private_avgs <- data |>
  filter(sector == 'private') |> # this is for private institutions
  pluck("total_rev_menwomen") |>
  na.omit() |>
  bootstrap(n_iter = 100)

public_avgs <- data |>
  filter(sector == "public") |> # this is for public 
  pluck("total_rev_menwomen") |>
  na.omit() |>
  bootstrap(n_iter = 100)

diffs_in_avg <- public_avgs - private_avgs
diffs_in_avg
##   [1]  237253.07  452423.20  780769.38  910977.76 1380749.80  321115.84
##   [7]  984219.62  689338.69  818565.84  599188.47  699132.87 1132527.72
##  [13]  778373.10  563695.93  899179.65  551741.40  998064.87  582902.15
##  [19]  507597.93  605903.33 1276159.97 -161989.32  683153.79  645880.08
##  [25] 1306229.10 1780600.51  836417.04  933959.14  539937.28  767874.91
##  [31] 1197022.12 1165831.36  495435.49  939142.92  990046.67  338189.66
##  [37]  580930.38  727454.10  519432.04  981083.02 1555868.56  463652.40
##  [43]  777203.63  847479.72   17408.89 1242306.76  552449.49  551636.69
##  [49]  648858.12  450385.45  594553.83  877823.94  899513.40  668297.04
##  [55]  477402.06 1083177.05 1358404.60 1076068.00  480930.06  754510.90
##  [61]  655378.54  552743.00  757460.09 1181704.14  825490.04  521950.56
##  [67]  941801.91  981050.48  641107.31  571057.56  601845.40  969882.26
##  [73]  511206.31  309028.61  876575.33 1001059.40 1109904.18 1237431.78
##  [79]  318362.23  684357.23  679071.89  626489.90 1020478.48  615950.59
##  [85]  598214.16  977000.58 1172184.35 1083610.80  860927.13  439119.01
##  [91]  691776.00 1378924.57  596451.69  900663.45  498303.56 1016736.91
##  [97] 1304927.24  768222.85  449467.27 1181395.42
ggplot() +
  geom_function(xlim = c(-1000000, 1000000), 
    fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avg))) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff)))) +
  labs(title = "Total Revenue Differences Sampling Distribution using Bootstrap",
       x = "Difference In Revenue",
       y = "Probability Density",
       color = "") +
   scale_x_continuous(breaks = seq(-1000000, 1000000, 250000)) +
  theme_minimal()

# let's find cohen's d for our data

cohen.d(d = filter(data, sector == "public") |> pluck("total_rev_menwomen") |> na.omit(),
        f = filter(data, sector == "private") |> pluck("total_rev_menwomen") |> na.omit())
## 
## Cohen's d
## 
## d estimate: 0.1310133 (negligible)
## 95 percent confidence interval:
##      lower      upper 
## 0.03346384 0.22856271
# We got Cohen's d of 0.13 which is really small, so let's run the hypothesis test to see if groups are actually different or not.
# sample size calculation

test <- pwrss.t.2means(mu1 = 500000, 
                       sd1 = sd(pluck(data, "total_rev_menwomen"), na.rm = TRUE),
                       kappa = 0.4,
                       power = .80, alpha = 0.1, 
                       alternative = "not equal")
##  Difference between Two means 
##  (Independent Samples t Test) 
##  H0: mu1 = mu2 
##  HA: mu1 != mu2 
##  ------------------------------ 
##   Statistical power = 0.8 
##   n1 = 1340 
##   n2 = 3350 
##  ------------------------------ 
##  Alternative = "not equal" 
##  Degrees of freedom = 4688 
##  Non-centrality parameter = 2.487 
##  Type I error rate = 0.1 
##  Type II error rate = 0.2
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'

It looks like our current sample size of about 1900 total is not enough to reach the desired power. After running the sample size calculation, our sample sizes needed to be 1340 for first and 3350 for second sample. Before running this test, I chose the power to be 0.80 which seemed sufficient in this context. For alpha of 0.1, it was fine to allow 10% type I error.

We can’t conclude anything for first hypothesis due to lack of data.

Second Null Hypothesis using Fisher’s Significance Testing

= There is no difference between Basketball and Soccer’s revenue, on average.

data |>
  filter(sports == "Basketball") |>
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   206
# let's get observed difference and difference in averages for this hypothesis test.

average_revenues <- data |>
  filter(sports == c("Basketball", "Soccer")) |>
  group_by(sports) |>
  summarise(avg_rev = mean(total_rev_menwomen, na.rm = TRUE))

average_revenues
## # A tibble: 2 × 2
##   sports      avg_rev
##   <chr>         <dbl>
## 1 Basketball 2170599.
## 2 Soccer      648908.
observed_difference <- (average_revenues$avg_rev[1] -
                  average_revenues$avg_rev[2])
paste("Observed Difference: ", observed_difference)
## [1] "Observed Difference:  1521690.24407913"
basketball_avgs <- data |>
  filter(sports == 'Basketball') |> # this is for basketball
  pluck("total_rev_menwomen") |>
  na.omit() |>
  bootstrap(n_iter = 100)

soccer_avgs <- data |>
  filter(sports == 'Soccer') |> # this is for soccer
  pluck("total_rev_menwomen") |>
  na.omit() |>
  bootstrap(n_iter = 100)

difference_in_averages <- basketball_avgs - soccer_avgs
difference_in_averages
##   [1] 1448904.4 2130842.2 1653523.5 1490863.4  981099.9 2023321.7 1816485.6
##   [8] 1840044.6 1560224.8 1569228.3 1847611.9 1964037.5 1815787.7 1536839.4
##  [15] 1278682.2 1544087.1 1077788.0 1613309.3 1389765.6 1802516.5 1676089.3
##  [22] 1998649.0 2216187.4 1659002.1 1690969.7 1632702.7 2115901.3 1459827.7
##  [29] 1582228.1 1802016.6 1863595.1 2419799.0 1687947.2 1925257.1 1661934.4
##  [36] 1858281.6 1991044.4 1947785.6 1840107.7 1978911.3 1749636.5 1545543.6
##  [43] 1630835.0 1854274.8 1572556.0 1652878.4 1145827.6 1564519.4 2146435.8
##  [50] 1802740.5 1916226.3 1596449.7  888894.2 1498291.7 2124791.7 1505006.7
##  [57] 1667456.7 1988262.5 1779769.8 2268611.2 1893210.9 1321408.4 2011664.7
##  [64] 1483871.2 1315779.4 1838912.7 1567456.2 1301094.4 1707581.7 1754081.5
##  [71] 1124851.7 1372150.8 1642336.7 1636456.9 2213628.2 2084382.1 1574975.7
##  [78] 2221637.7 1404581.7 1375881.9 1744499.8 1681569.1 1549884.8 2251684.2
##  [85] 1191809.8 1577935.4 2068017.8 1844273.0 1754283.7 1580137.8 1933416.8
##  [92] 1974389.0 1319622.2 2117225.3 2259295.0 1750970.3 1385327.5 1212334.1
##  [99] 1862179.3 1996036.8
# now we have our difference in averages, we can move to calculate the p-value
# let's demean the differences first
diff_avg_demeaned <- difference_in_averages - mean(difference_in_averages)
paste("p-value ", 
      sum(abs(observed_difference) < abs(diff_avg_demeaned)) /
        length(diff_avg_demeaned))
## [1] "p-value  0"

We have got a p-value of <0.001, which clearly means we need to reject our null hypothesis. The p-value of zero is telling us if we assume the null hypothesis to be true, 0% of times our observed difference will be this much or more extreme. So we conclude with high confidence that there is in fact a difference between basketball and soccer revenue.

Visualizations:

we’re doing visualization for second hypothesis

ggplot(average_revenues, mapping = aes(x = sports, y = avg_rev)) +
geom_col(fill = 'chocolate') +
labs(
  title = "Comparing Basketball and Soccer Revenues in Indiana Colleges",
  x = "Sports",
  y = "Mean Revenue"
) +
theme_minimal()