Reading the data from sports reference for 2022-2025 seasons. Removing games against non-power conference opponents, with the exception of NCAA tournament games.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
d23 = read.csv("book3.csv")
d22 = read.csv("22.csv", sep = ",", header = T)
d24 = read.csv("24.csv", sep = ",", header = T)
d25 = read.csv("25.csv", sep = ",", header = T)
dat = bind_rows(list(d22,d23,d24,d25))
dat = dat[-c(67, 1, 2, 6, 11, 34, 35, 36, 39, 44, 46, 68, 71, 72, 73, 106, 107, 110, 111, 117), ]

Creating factors based on 3 point percentage, offensive rebounding percentage, and a combination of the two. The final sums are the values used for the above average games toward the end of the video.

thrsc = dat %>%
  mutate(wp = if_else(W.L == "W", 1,0)) %>%
  mutate(tpc = if_else(X3P. <= 0.267, 1,
                       if_else(X3P. <= 0.318, 2,
                               if_else(X3P. <= 0.375, 3, 4)))) %>%
  select(G, Date, wp, tpc, X3P., W.L, Opp, Tm, Opp.1, X3P, X3PA, ORB, FGA, FG)
thrsc = thrsc %>%
  mutate(opc = ORB/(FGA-FG)) %>%
  mutate(opct = if_else(opc <= 0.267, 1,
                        if_else(opc <= 0.318, 2,
                                if_else(opc <= 0.380, 3, 4))))
finale = thrsc %>%
  mutate(fin = if_else(tpc == 1 & opct == 1, 1,
                       if_else(tpc == 1 & opct == 2, 2,
                               if_else(tpc == 1 & opct == 3, 3,
                                       if_else(tpc == 1 & opct == 4, 4,
                                               if_else(tpc == 2 & opct == 1, 5,
                                                       if_else(tpc == 2 & opct == 2, 6,
                                                               if_else(tpc == 2 & opct == 3, 7,
                                                                       if_else(tpc == 2 & opct == 4, 8,
                                                                               if_else(tpc == 3 & opct == 1, 9,
                                                                                       if_else(tpc == 3 & opct == 2, 10,
                                                                                               if_else(tpc == 3 & opct == 3, 11,
                                                                                                       if_else(tpc == 3 & opct == 4, 12,
                                                                                                               if_else(tpc == 4 & opct == 1, 13,
                                                                                                                       if_else(tpc == 4 & opct == 2, 14,
                                                                                                                               if_else(tpc == 4 & opct == 3, 15, 16)))))))))))))))) %>%
  select(fin, W.L, wp) %>%
  arrange(fin)
t1o1 = 2/11
t1o2 = 2/4
t1o3 = 6/8
t1o4 = 2/5
t2o1 = 4/9
t2o2 = 7/9
t2o3 = 4/5
t2o4 = 3/5
t3o1 = 3/7
t3o2 = 2/3
t3o3 = 4/5
t3o4 = 7/9
t4o1 = 6/7
t4o2 = 4/7
t4o3 = 7/7
t4o4 = 5/6
t34 = (3+2+4+7+6+4+7+5) / (7+3+5+9+7+7+7+6)
o34 = (6+2+4+3+4+7+7+5) / (8+5+5+5+5+9+7+6)
c(t34, o34)
## [1] 0.745098 0.760000

Organizing the data for 3 point percentage in order to create NCAA tournament model.

quantile(dat$X3P., probs = c(0.25, 0.5, 0.75))
##   25%   50%   75% 
## 0.267 0.318 0.380
d1 = thrsc %>%
  filter(tpc == 1) %>%
  select(wp) %>%
  arrange(wp)
d2 = thrsc %>%
  filter(tpc == 2) %>%
  select(wp) %>%
  arrange(wp)
d3 = thrsc %>%
  filter(tpc == 3) %>%
  select(wp) %>%
  arrange(wp)
d4 = thrsc %>%
  filter(tpc == 4) %>%
  select(wp) %>%
  arrange(wp)
w1 = 12/28
w2 = 17/27
w3 = 17/25
w4 = 21/26

Creating a tibble based on each combination of results for 3 point percentage in order to simulate a 6 game sample in the NCAA tournament.

one = c(w1, w2, w3, w4)
two = c(w1, w2, w3, w4)
thrr = c(w1, w2, w3, w4)
four = c(w1, w2, w3, w4)
five = c(w1, w2, w3, w4)
six = c(w1, w2, w3, w4)
tour = expand.grid(g1 = one, g2 = two, g3 = thrr, g4 = four, g5 = five, g6 = six)
head(tour)
##          g1        g2        g3        g4        g5        g6
## 1 0.4285714 0.4285714 0.4285714 0.4285714 0.4285714 0.4285714
## 2 0.6296296 0.4285714 0.4285714 0.4285714 0.4285714 0.4285714
## 3 0.6800000 0.4285714 0.4285714 0.4285714 0.4285714 0.4285714
## 4 0.8076923 0.4285714 0.4285714 0.4285714 0.4285714 0.4285714
## 5 0.4285714 0.6296296 0.4285714 0.4285714 0.4285714 0.4285714
## 6 0.6296296 0.6296296 0.4285714 0.4285714 0.4285714 0.4285714

Looping through every distinct combination of the tibble, giving each separate p-value for winning the NCAA tournament.

win = apply(tour, 1L, \(p) prod(dbinom(1, 1, prob = p)))

Summing each individual p-value and dividing by the amount of trials, giving the mean proportion that the Illini win the National Championship in any given year. Finally, we subtract the proportion that they win 0 National Championship over a 20 year sample to find the proportion that they win AT LEAST one National Championship in this sample.

x = sum(win)/4096
x
## [1] 0.06647851
1 - dbinom(0, 20, prob = x)
## [1] 0.7473692