library(ggthemes)
library(ggrepel)
## Loading required package: ggplot2
library(effsize)
library(pwrss)
## 
## Attaching package: 'pwrss'
## The following object is masked from 'package:stats':
## 
##     power.t.test
# Load conflicted so ambiguous function names raise explicit errors; the conflict_prefer() calls below pick dplyr's versions
library(conflicted)  

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag", "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.

Week 7 Data Dive - Hypothesis Testing

# Read the previously cleaned NCAA dataset from the working directory
ncaa <- read.csv(file = "./ncaa_clean.csv", header = TRUE)

Null Hypothesis 1

Null Hypothesis: The number of non-football athletes is equal for football and non-football D1 schools.

Alternative Hypothesis: The number of non-football athletes is not equal for football and non-football D1 schools.

# One row per institution for 2019: total non-football participants
# (men + women) and the lowest classification code found for the school.
# Only rows with classification codes 1, 2 or 3 are kept.
null1 <- ncaa |>
  filter(
    year == 2019,
    sports != "Football",
    classification_code %in% 1:3
  ) |>
  group_by(institution_name) |>
  summarise(
    athletes = sum(sum_partic_men, na.rm = TRUE) +
      sum(sum_partic_women, na.rm = TRUE),
    division = min(classification_code)
  )
# number of D1 schools in the 2019 sample
nrow(null1)
## [1] 350
# code 3 is treated as "no football"; any other kept code counts as football
null1 <- null1 |>
  mutate(Football = division != 3)
# how many D1 programs fall on each side of the split
sum(null1$Football)
## [1] 254
sum(!null1$Football)
## [1] 96

Using Cohen’s D:

# Cohen's d for total non-football athletes:
# schools with a football team versus schools without one
cohen.d(
  d = null1 |> filter(Football) |> pull(athletes),
  f = null1 |> filter(!Football) |> pull(athletes)
)
## 
## Cohen's d
## 
## d estimate: 0.3695696 (small)
## 95 percent confidence interval:
##     lower     upper 
## 0.1323369 0.6068023

A potential problem comes from the small sample size of schools without a football team. Although it’s almost one hundred, it’s still a bit smaller than you’d hope. Even so, the d estimate is small (about 0.37), so this warrants further testing of the null hypothesis.

Performing the test

I think it’s appropriate to use a z-test because of the large sample size available and known population variance.

The significance level, α, will be set at 0.02, a relatively low value, because of the high cost of getting this figure wrong. Revenue appropriation and scholarship availability could be altered by these types of results, so we don’t want a lot of false positives.

The Type II error rate, \(\beta\), will be set to 0.15 (that is, a power of \(1 - \beta = 0.85\)), per a test done below.

I think a meaningful difference in the number of athletes would be 20. This would be enough athletes to support an additional non-revenue sport.

# Standard deviation and mean of the athlete totals within each group
null1 |>
  group_by(Football) |>
  summarise(
    across(athletes, list(sd = sd, mean = mean), .names = "{.fn}")
  )
## # A tibble: 2 × 3
##   Football    sd  mean
##   <lgl>    <dbl> <dbl>
## 1 FALSE     108.  396.
## 2 TRUE      179.  456.
# Non-centrality parameter for the z-test: mean difference divided by its
# standard error.
# NOTE(review): both inputs are hard-coded. 60 matches the gap between the
# two group means (456 - 396); where 17.9 comes from is not shown here --
# confirm it, or derive both values from null1.
delta <- 60
se <- 17.9
ncp_test <- delta / se
# power of a two-sided z-test at alpha = 0.02 for that ncp
power.z.test(
  alpha = 0.02,
  ncp = ncp_test,
  alternative = "not equal",
  plot = TRUE
)

##      power ncp.alt ncp.null alpha  z.crit.1 z.crit.2
##  0.8474617   3.352        0  0.02 -2.326348 2.326348
# Per-group sample size needed for 85% power in a two-sided, two-sample
# t-test at alpha = 0.02 with equal group sizes (kappa = 1).
# NOTE(review): mu1 = 0.4 with the remaining mean/sd arguments left at their
# defaults looks like a standardized difference close to the Cohen's d
# estimate (0.37) -- confirm that this is the intent.
test <- pwrss.t.2means(
  mu1 = 0.4,
  kappa = 1,
  alpha = 0.02,
  power = 0.85,
  alternative = "not equal"
)
##  Difference between Two means 
##  (Independent Samples t Test) 
##  H0: mu1 = mu2 
##  HA: mu1 != mu2 
##  ------------------------------ 
##   Statistical power = 0.85 
##   n1 = 143 
##   n2 = 143 
##  ------------------------------ 
##  Alternative = "not equal" 
##  Degrees of freedom = 284 
##  Non-centrality parameter = 3.382 
##  Type I error rate = 0.02 
##  Type II error rate = 0.15
# draw the power-analysis plot for the result
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'

This shows we would need at least 143 samples in each category of our data. Unfortunately, we don’t have that much data from D1 schools without football (only 96). Our lack of data comes from there simply being a limited number of Division 1 colleges and universities to look to.

If we were to find a z-statistic, it would be as follows:

# z statistic: the non-centrality value computed earlier
print(ncp_test)
## [1] 3.351955
# two-sided critical value at alpha = 0.02 (upper 1% point of the normal)
print(qnorm(0.02 / 2, lower.tail = FALSE))
## [1] 2.326348

Because the z statistic (3.35) is higher than the critical value (2.33), we would reject the null hypothesis. Since there’s not enough data, we can’t say for sure this is accurate, but if it were, it would mean that schools with football teams sponsor more athletes as an institution, even when excluding football players, than schools without. In other words, having a football team is associated with a school carrying more athletes in its non-football sports.

# Box plot of non-football athlete totals, split by football status
ggplot(null1, aes(x = Football, y = athletes)) +
  geom_boxplot() +
  labs(title = "Athletes for D1 schools with and without a football team")

Null Hypothesis 2

Null Hypothesis: The number of athletes per sport, excluding football, is equal for football and non-football D1 schools.

# One row per institution: total non-football participants (men + women),
# lowest classification code, and the number of non-football rows for the
# school (presumably one row per sport -- verify against the data).
# NOTE(review): this uses year 2018, whereas null1 is built from 2019 --
# confirm the different year is intentional.
null2 <- ncaa |>
  filter(
    year == 2018,
    sports != "Football",
    classification_code %in% 1:3
  ) |>
  group_by(institution_name) |>
  summarise(
    athletes = sum(sum_partic_men, na.rm = TRUE) +
      sum(sum_partic_women, na.rm = TRUE),
    division = min(classification_code),
    sports = n()
  ) |>
  mutate(
    # code 3 is treated as "no football"; any other kept code is football
    Football = division != 3,
    # athletes per sport (on average)
    ApS = athletes / sports
  )

Test: t-test

Getting the p-value:

# group sizes: D1 programs with, then without, a football team
sum(null2$Football)
## [1] 253
sum(!null2$Football)
## [1] 96
# standard deviation and mean of athletes-per-sport within each group
null2 |>
  group_by(Football) |>
  summarise(
    across(ApS, list(sd = sd, mean = mean), .names = "{.fn}")
  )
## # A tibble: 2 × 3
##   Football    sd  mean
##   <lgl>    <dbl> <dbl>
## 1 FALSE     6.68  27.1
## 2 TRUE      6.28  29.8
# Hand-computed t statistic from the rounded group summaries: difference in
# mean athletes-per-sport (non-football minus football) divided by a
# standard error built from sd = 6.3 and the 96 non-football schools.
# NOTE(review): this is a one-sample-style statistic; a two-sample test such
# as t.test(ApS ~ Football, data = null2) may be more appropriate -- confirm.
t_score <- (27.1 - 29.8) / (6.3 / sqrt(96))
# degrees of freedom: number of non-football schools minus one
df_ <- sum(null2['Football'] == FALSE) - 1
# Two-sided p-value. Uses the computed statistic and degrees of freedom
# rather than the rounded literals -4 and 95, so the p-value always matches
# t_score and df_. -abs() keeps the lookup in the lower tail whatever the
# sign of the statistic.
p_value <- 2 * pt(q = -abs(t_score), df = df_, lower.tail = TRUE)

p_value
## (with t_score near -4.2 this is roughly 6e-05, down from the 0.0001253495
## obtained with q = -4; re-knit to refresh the exact value)

Our p-value is on the order of 0.0001, which is very small. This provides strong evidence that we should reject the null hypothesis, meaning that the number of athletes per sport is not equal between football and non-football D1 schools.

# Box plot of athletes per sport, split by football status
ggplot(null2, aes(x = Football, y = ApS)) +
  geom_boxplot() +
  labs(
    title = "Athletes per sport for D1 schools with and without a football team"
  )