The goal of this project was to create a quantitative way of ranking coaches. The idea was that coaches would be rating on three main categories which make up a composite score. The three categories are as follow…
Recruiting Score: Simply compared to expectations at their current school, how well does each coach recruit? (25%)
Player Development Score: Based on the level of talent coming into each school, which coaches turn that talent into program success? (45%)
Talent Maximization Score: Which coaches are best at turning their NFL draft talent into program success? (30%)
In order to help determine what expectations should be at certain programs we create our own pseudo-conferences of programs that are roughly on the same tier. It’s not fool proof but it’s better than using conferences or subjective tiers.
teamInfo <- cfbd_team_info()
coachesDf <- cfbd_coaches() %>% filter(games > 3)
talent <- cfbd_team_talent()
fpi12 <- espn_ratings_fpi(year = 2012)
fpi13 <- espn_ratings_fpi(year = 2013)
fpi14 <- espn_ratings_fpi(year = 2014)
fpi15 <- espn_ratings_fpi(year = 2015)
fpi16 <- espn_ratings_fpi(year = 2016)
fpi17 <- espn_ratings_fpi(year = 2017)
fpi18 <- espn_ratings_fpi(year = 2018)
fpi19 <- espn_ratings_fpi(year = 2019)
fpi20 <- espn_ratings_fpi(year = 2020)
fpi21 <- espn_ratings_fpi(year = 2021)
fpi <- rbind(fpi17, fpi18, fpi19, fpi20, fpi21)
# Compiling Data
rec12 <- cfbd_recruiting_team(year = 2012)
rec13 <- cfbd_recruiting_team(year = 2013)
rec14 <- cfbd_recruiting_team(year = 2014)
rec15 <- cfbd_recruiting_team(year = 2015)
rec16 <- cfbd_recruiting_team(year = 2016)
rec17 <- cfbd_recruiting_team(year = 2017)
rec18 <- cfbd_recruiting_team(year = 2018)
rec19 <- cfbd_recruiting_team(year = 2019)
rec20 <- cfbd_recruiting_team(year = 2020)
rec21 <- cfbd_recruiting_team(year = 2021)
#espn_cfb_schedule(year = 2018) -> sch18 (Not available)
espn_cfb_schedule(year = 2019) -> sch19
espn_cfb_schedule(year = 2020) -> sch20
espn_cfb_schedule(year = 2021) -> sch21
## Year Talent Info
draftDF <- data.frame()
for(i in 2015:2021) {
cfbd_draft_picks(year = i) %>%
select(college_team, name, pre_draft_grade) %>%
group_by(college_team) %>%
summarise_at(vars(pre_draft_grade), list(sum)) %>%
arrange(-pre_draft_grade) %>%
rename(totalGrade = pre_draft_grade) -> temp
temp$year <- i
draftDF <- rbind(draftDF, temp)
}
clusteringFPI <- rbind(fpi12, fpi13, fpi14, fpi15, fpi16, fpi17, fpi18, fpi19, fpi20, fpi21)
recruitingFPI <- rbind(rec12, rec13, rec14, rec15, rec16, rec17, rec18, rec19, rec20, rec21)
scheduleClust <- rbind(sch19, sch20, sch21)
# Number of primetime games played will be used to compare teams
scheduleClust %>%
filter(broadcast_name == "ESPN" | broadcast_name == "CBS" |
broadcast_name == "FOX" | broadcast_name == "ABC") %>%
mutate(home_team_id1 = as.numeric(home_team_id), count = 1) %>%
group_by(home_team_id) %>%
summarise_at(vars(count), list(sum)) -> homePrimetime
scheduleClust %>%
filter(broadcast_name == "ESPN" | broadcast_name == "CBS" |
broadcast_name == "FOX" | broadcast_name == "ABC") %>%
mutate(home_team_id1 = as.numeric(home_team_id), count = 1) %>%
group_by(away_team_id) %>%
summarise_at(vars(count), list(sum)) -> awayPrimetime
# Counting number of times teams played in primetime
left_join(homePrimetime, awayPrimetime, by = c("home_team_id" = "away_team_id")) %>%
mutate(count.y = if_else(is.na(count.y), 0, count.y),
count.x = if_else(is.na(count.x), 0, count.x)) %>%
mutate(games = count.x + count.y) %>%
select(-c(count.x, count.y)) %>%
rename(team_id = home_team_id) -> primeTimeGames
# Stadium Capacity can be used to measure school/fan investment
teamInfo %>%
select(school, capacity, latitude, longitude) -> info
# What kind of talent do these schools draw?
recruitingFPI %>%
select(school = team, points, year) -> recFpi
#How many Coaches have these schools had in the last 12 years?
coachesDf$one <- 1
coachesDf %>%
mutate(Coach = paste(first_name, last_name)) %>%
# Weird data abnormality, no special significance
mutate(school = if_else(school == "SOUTH FLORIDA", "South Florida", school)) -> coaches1
schools <- unique(coaches1$school)
coach_list <- data.frame()
for(i in schools) {
coaches1 %>%
filter(school == i) %>%
mutate(newCoach = ifelse(Coach == lag(Coach), 0, 1)) -> temp
coach_list <- rbind(coach_list, temp)
}
coach_list %>%
filter(year >= 2010) %>%
drop_na(newCoach) %>%
group_by(school) %>%
summarise(Coaches = sum(newCoach)) -> coach_list1
# What kind of NFL talent do these schools produce?
draftDF %>%
group_by(college_team, year) %>%
summarise_at(vars(totalGrade), list(mean)) %>%
mutate(totalGrade = if_else(is.na(totalGrade), 0, totalGrade)) %>%
summarise_at(vars(totalGrade), list(mean)) -> draftGradeSums
# Weeks spent in the top 2 after week 9 since 1950
rankings <- data.frame()
for(i in 1950:2022) {
temp <- cfbd_rankings(year = i)
rankings <- rbind(rankings, temp)
}
rankings %>%
filter(rank <= 2 & poll == "AP Top 25" & week > 11) %>%
pull(school) %>%
table() %>%
data.frame() %>%
rename(school = 1) -> top1
clusteringFPI %>%
select(year, team_id, name, fpi) %>%
rename(school = name) %>%
left_join(primeTimeGames) %>%
mutate(games = if_else(is.na(games), 0, games)) %>%
left_join(coach_list1) %>%
left_join(info) %>%
left_join(recFpi, by = c("school" = "school", "year" = "year")) %>%
drop_na(capacity, points, games, fpi) %>%
mutate(team_id = as.numeric(team_id), fpi = scale(as.numeric(fpi)),
points = scale(as.numeric(points))) %>%
group_by(school) %>%
summarise_at(vars(team_id, fpi, capacity, longitude, latitude, points, games, Coaches),
list(mean, sd)) %>%
select(school, capacity = capacity_fn1, rec_avg = points_fn1, fpi_avg = fpi_fn1,
fpi_sd = fpi_fn2, prime_games = games_fn1,
coaches = Coaches_fn1, lat = latitude_fn1, long = longitude_fn1) %>%
left_join(draftGradeSums, by = c("school" = "college_team")) %>%
left_join(top1) %>%
mutate(totalGrade = if_else(is.na(totalGrade), 0, totalGrade),
Freq = if_else(is.na(Freq), 0, as.double(Freq))) %>%
drop_na(fpi_sd, lat, long)-> cluster
## Joining, by = "team_id"
## Joining, by = "school"
## Joining, by = "school"
## Joining, by = "school"
cluster %>%
mutate(lat = if_else(school == "Hawaii", 32.7157, lat),
long = if_else(school == "Hawaii", -117.1611 , lat)) %>%
mutate(prime_games = scale(prime_games), Freq = scale(Freq), lat = scale(lat),
long = scale(long), capacity = scale(capacity)) %>%
select(school, capacity, rec_avg, Freq, fpi_avg, fpi_sd, prime_games, coaches, lat, long) %>%
select(-c(rec_avg, fpi_avg, fpi_sd, coaches))-> cluster2
cluster2 %>% pull(school) -> schools
# Clustering can only be done with quantitative data, so it's important we keep
# the names attached somehow, turning tibble to a dataframe
cluster2 %>%
select(-school) %>% as.data.frame() -> cluster3
row.names(cluster3) = schools
Here we use K-means clustering to create distinct program clusters, which help establish program expectations
## Determining Optimal Number of Clusters and Creating Clusters
set.seed(13)
fviz_nbclust(cluster3, FUNcluster = kmeans, )
# 9 clusters is determined to be the optimal amount
k <- kmeans(cluster3, 6, nstart = 50)
We probably need more than 2 different clusters to establish distinct tiers so we go with 6 total clusters, which still creates really distinct clusters for us.
## Visualizing Clusters and Attaching Clusters to our data used in our regression analysis
fviz_cluster(k, data = cluster3, ggtheme = theme_minimal(), repel = TRUE,
palette = "Dark2", xlab = FALSE, ylab = FALSE, main = "Our 6 new 'Pseudo-Conferences'")
## Warning: ggrepel: 68 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
cbind(cluster3, cluster = k$cluster) %>%
mutate(school = row.names(cluster3)) %>%
as_tibble() %>%
select(school, cluster) -> clusters
Now that we have established some sort of baseline level of expectations for each program, we prepare other data to get scores for each coach in the 3 aforementioned areas.
## Home Game Info
gms21 <- cfbd_game_info(year = 2021)
gms19 <- cfbd_game_info(year = 2019)
gms18 <- cfbd_game_info(year = 2018)
gms17 <- cfbd_game_info(year = 2017)
gms16 <- cfbd_game_info(year = 2016)
games <- rbind(gms16, gms17, gms18, gms19, gms21)
# Gathering teams' average attendance
games %>%
group_by(home_team) %>%
summarise_at(vars(attendance), list(mean)) %>% drop_na(attendance) -> attendance
# At this point, I'm not sure if I had already done this but this is me gathering data for the regression models
left_join(coachesDf, talent) %>%
mutate(CoachName = paste(first_name, last_name)) %>%
select(CoachName, hire_date, school, year, talent) %>%
filter(year >= 2015) -> coach_talent
## Joining, by = c("school", "year")
coach_talent %>%
right_join(fpi, by = c("school" = "name", "year" = "year")) %>%
left_join(teamInfo, by = c("school" = 'school')) %>%
left_join(attendance, by = c("school" = "home_team")) %>%
left_join(draftDF, by = c("year" = "year", "school" = "college_team")) %>%
left_join(clusters) %>%
mutate(clusters = as.factor(cluster),
talent = if_else(is.na(talent), 0, talent)) -> fullCoachData
## Joining, by = "school"
The idea here is that we are trying to predict a team’s success (measured by FPI) with two variables, the NFL draft talent on a roster as well as the coach. the ‘totalGrade’ variable will be a big contributor, but good coaches will turn this talent into a higher FPI more often.
talentFpiModel <- lm(fpi ~ CoachName + totalGrade, data = fullCoachData)
talentFpiModel$coefficients %>% as.data.frame() %>% head(205) %>% slice(-1) -> talFpi_coeff
row.names(talFpi_coeff) -> coaches
talFpiCoeff <- tibble(coaches, talFpi_coeff$.)
talFpiCoeff %>%
mutate(coaches = substring(coaches, 10)) %>%
right_join(fullCoachData, by = c("coaches" = "CoachName")) %>%
rename(talFpiCoeff = `talFpi_coeff$.`) %>%
arrange(-talFpiCoeff) %>%
select(coaches, talFpiCoeff, school, year) %>%
group_by(school) %>%
#filter(year == 2022) %>%
slice(which.max(year)) %>%
drop_na(coaches) -> tfd2
# Rescaling these coefficients to create a score from 0 for the worst coach at mazimizing NFL talent, to the best coach
tfd2 %>%
cbind(talFpiScore = scales::rescale(tfd2$talFpiCoeff, by = c(0,100))*100) %>%
select(-talFpiCoeff) -> talFpiData
Similar methodology as the score above but we want to see which coaches provide the biggest boost to the talent recruited to their schools.
coachModel <- lm(fpi ~ CoachName + talent, data = fullCoachData)
coachModel$coefficients %>% as.data.frame() %>% head(205) %>% slice(-1) -> coach_coeff
row.names(coach_coeff) -> coaches
coachCoeff <- tibble(coaches, coach_coeff$.)
coachCoeff %>%
mutate(coaches = substring(coaches, 10)) %>%
right_join(fullCoachData, by = c("coaches" = "CoachName")) %>%
rename(talentMaxCoeff = `coach_coeff$.`) %>%
arrange(-talentMaxCoeff) %>%
select(coaches, talentMaxCoeff, school, year, talent) %>%
group_by(coaches) -> cd
cd %>%
summarise_at(vars(talentMaxCoeff), list(mean)) -> cd1
cd %>%
left_join(cd1) %>%
group_by(school) %>%
slice(which.max(year)) %>%
drop_na(coaches) -> cd2
## Joining, by = c("coaches", "talentMaxCoeff")
cd2 %>%
cbind(talentMaxScore = scales::rescale(cd2$talentMaxCoeff, by = c(0,100))*100) %>%
right_join(cfbd_team_info()) %>%
unique() -> fullCoachData2
## Joining, by = "school"
When we consider who the best recruiters are, we must consider what type of program these schools are, thus we use the cluster variable. We also use attendance as a proxy for fan engagement, which may vary in clusters outside the most elite one or two.
rec17 <- cfbd_recruiting_team(year = 2017)
rec18 <- cfbd_recruiting_team(year = 2018)
rec19 <- cfbd_recruiting_team(year = 2019)
rec20 <- cfbd_recruiting_team(year = 2020)
rec21 <- cfbd_recruiting_team(year = 2021)
recruiting_data <- rbind(rec17,rec18,rec19,rec20,rec21)
fullCoachData %>%
left_join(recruiting_data, by = c("year" = "year", "school" = "team")) %>%
drop_na(capacity, attendance) %>%
filter(capacity > 0 & attendance > 0) %>%
mutate(attendanceRate = attendance/capacity) -> fullCoachData3
# Predicting 247 composite recruiting class rankings
talentModel <- lm(points ~ CoachName + attendance + cluster, data = fullCoachData3)
talentModel$coefficients %>% as.data.frame() -> talent_coeff
row.names(talent_coeff) -> talent
talentCoeff <- tibble(talent, talent_coeff$.)
talentCoeff %>%
rename(recruitingCoeff = `talent_coeff$.`) %>%
filter(grepl("CoachName", talent)) %>%
mutate(coaches = substring(talent, 10)) %>%
select(-talent) %>%
right_join(fullCoachData, by = c("coaches" = "CoachName")) %>%
arrange(-recruitingCoeff) %>%
select(coaches, recruitingCoeff, school, year, talent) %>%
group_by(coaches) -> td
td %>%
summarise_at(vars(recruitingCoeff), list(mean)) -> td1
td %>%
left_join(td1) %>%
group_by(school) %>%
slice(which.max(year)) %>%
drop_na(coaches, recruitingCoeff) -> td2
## Joining, by = c("coaches", "recruitingCoeff")
td2 %>%
cbind(recruitingScore = scales::rescale(td2$recruitingCoeff, by = c(0,100))*100) -> talentData2
## What coaches are still coaching in 2022, who switched jobs?
cfbd_coaches(year = 2022) -> coaches22
coaches22 %>%
mutate(name = paste(first_name, last_name)) %>%
left_join(teamInfo) %>%
select(School = school, Conference = conference, name) -> c22
## Joining, by = "school"
fullCoachData3 %>%
left_join(c22, by = c("coaches" = "name")) %>%
left_join(talFpiData1) %>%
left_join(talentData2) %>%
ungroup() %>%
select(-c(school, conference)) %>%
drop_na(School) %>%
select(Coach = coaches, School, talentMaxScore, talentToWinsScore = talFpiScore, recruitingScore) %>%
mutate(TotalScore = talentMaxScore*0.45 + talentToWinsScore*0.3 + recruitingScore*0.25) %>%
unique() %>%
arrange(-TotalScore) -> coachesFinal
## Joining, by = c("coaches", "school", "year")
## Joining, by = c("coaches", "school", "year", "talent")
With that we get our final rankings… Here are my top 50 coaches for the 2022 College Football Season
coachesFinal %>% head(50) %>% select(-c(talentToWinsScore, talentMaxScore, recruitingScore)) %>%
print.data.frame()
## Coach School TotalScore
## 1 Nick Saban Alabama 95.44401
## 2 Kirby Smart Georgia 92.83131
## 3 Dabo Swinney Clemson 91.90546
## 4 Ryan Day Ohio State 87.13453
## 5 Lincoln Riley USC 78.00064
## 6 Brian Kelly LSU 77.22907
## 7 Jimbo Fisher Texas A&M 73.11528
## 8 Mario Cristobal Miami 71.23343
## 9 James Franklin Penn State 71.03596
## 10 Jim Harbaugh Michigan 70.50144
## 11 Kyle Whittingham Utah 67.77052
## 12 Paul Chryst Wisconsin 67.60723
## 13 Josh Heupel Tennessee 66.66868
## 14 Steve Sarkisian Texas 65.36049
## 15 Dave Aranda Baylor 63.74103
## 16 Gus Malzahn UCF 63.41658
## 17 Kirk Ferentz Iowa 63.10057
## 18 Matt Campbell Iowa State 63.10005
## 19 Mike Gundy Oklahoma State 61.81591
## 20 P.J. Fleck Minnesota 59.31576
## 21 Chip Kelly UCLA 59.30991
## 22 Bryan Harsin Auburn 59.11102
## 23 David Shaw Stanford 59.10795
## 24 Lane Kiffin Ole Miss 58.87092
## 25 Pat Fitzgerald Northwestern 58.50977
## 26 Jeff Hafley Boston College 56.65616
## 27 Jeff Brohm Purdue 55.42310
## 28 Dana Holgorsen Houston 54.61399
## 29 Herm Edwards Arizona State 54.45544
## 30 Sonny Dykes TCU 54.02118
## 31 Scott Frost Nebraska 54.01579
## 32 Mack Brown North Carolina 53.95132
## 33 Billy Napier Florida 53.93470
## 34 Mark Stoops Kentucky 53.86225
## 35 Sam Pittman Arkansas 53.48258
## 36 Dave Clawson Wake Forest 52.70264
## 37 Dave Doeren NC State 52.45664
## 38 Dino Babers Syracuse 51.36355
## 39 Luke Fickell Cincinnati 49.88894
## 40 Neal Brown West Virginia 49.81804
## 41 Jason Candle Toledo 49.79845
## 42 Scott Satterfield Louisville 49.76225
## 43 Willie Fritz Tulane 49.75831
## 44 Tom Allen Indiana 49.51678
## 45 Justin Wilcox California 49.40662
## 46 Kalani Sitake BYU 47.71799
## 47 Mike Locksley Maryland 47.24080
## 48 Eli Drinkwitz Missouri 46.81002
## 49 Shane Beamer South Carolina 44.87334
## 50 Bret Bielema Illinois 44.72336
cfbd_ratings_elo() %>%
left_join(coachesDf, by = c("team" = "school", "year")) %>%
mutate(coach = paste(first_name, last_name)) %>%
select(year, team, elo, coach) %>%
left_join(coachesFinal, by = c("coach" = "Coach")) %>%
left_join(clusters, by = c("team" = "school")) %>%
drop_na(TotalScore, cluster) %>%
mutate(cluster = as.factor(cluster)) -> validate
# Adjusting for program expectations w/cluster variable
validateModel <- lm(elo ~ coach + cluster, data = validate)
data.frame(validate$elo, validateModel$fitted.values) %>%
rename(TrueElo = 1, PredElo = 2) %>%
ggplot(aes(x = PredElo, y = TrueElo, alpha = 0.8)) +
geom_point() +
geom_smooth() +
stat_regline_equation(label.y = 2300, label.x = 1200, aes(label = ..rr.label..)) +
theme_light() +
theme(legend.position="none") +
ggtitle("How well does my TotalScore Variable predict success?",
subtitle = "Here success is measured by ELO for a technical reason that made it more convenient")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
After comparing our fitted values to the true values, we see that our coach scores are fairly indicative of success.
One area this model could be stronger is comparing coaches at group of 5 schools relative to their peers at Power 5 schools, when comparing coaches at smaller schools the cream rises to the top but when compared to the entire FBS pool of coaches some coaches aren’t getting their due (looking at Luke Fickell)
I’m working on weighting more recent results more strongly. I think Mack Brown is earning more legacy points than he is for his current body of work
I’m working on creating an objective way of identifying “exceptions to the rule” within clusters. Schools like Northwestern and Stanford have different recruiting strategies that their coaches shouldn’t shoulder the blame for. The service academies don’t have really any draft talent given their commitments, so how else can measure their roster’s talent apart from the team’s overall sucess.
At this point this draft is pretty outdated. Results will vary from my current draft and even some methodoology to some degree. But for the most part this covers how I went about these rankings and I’ll continue to circle back to update whatever may need updating as the season progresses.