One shortcoming of the Bradley-Terry team ratings (which used wins and losses and logistic regression) is that, when rating teams, they entirely ignore margins of victory. In what follows, we’ll build a model that rates teams based on point differentials and also predicts point differentials. We can then use the residuals of this model to turn these predicted point differentials into projected win probabilities.

Loading the Data and Packages

library(dplyr); library(data.table); library(reshape2)
library(lme4) 

TourneySeeds <- fread("/home/jcross/MarchMadness/data/NCAATourneySeeds.csv")
SampleSubmission <- fread("/home/jcross/MarchMadness/data/SampleSubmissionStage1.csv")
Seasons <- fread("/home/jcross/MarchMadness/data/Seasons.csv")
Teams <- fread("/home/jcross/MarchMadness/data/Teams.csv")
TourneySlots <- fread("/home/jcross/MarchMadness/data/NCAATourneySlots.csv")
TourneyDetailedResults <- fread("/home/jcross/MarchMadness/data/NCAATourneyDetailedResults.csv")
TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/NCAATourneyCompactResults.csv")

# TourneySeeds <- fread("/home/jcross/MarchMadness/data/WNCAATourneySeeds.csv")
# SampleSubmission <- fread("/home/jcross/MarchMadness/data/WSampleSubmissionStage1.csv")
# Seasons <- fread("/home/jcross/MarchMadness/data/WSeasons.csv")
# Teams <- fread("/home/jcross/MarchMadness/data/WTeams.csv")
# TourneySlots <- fread("/home/jcross/MarchMadness/data/WNCAATourneySlots.csv")
# TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/WNCAATourneyCompactResults.csv")

#new
RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/RegularSeasonCompactResults.csv")
#RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/WRegularSeasonCompactResults.csv")

Some Familiar Work on the Tables

TourneySeeds <- TourneySeeds %>% 
    mutate(SeedNum = gsub("[A-Z+a-z]", "", Seed)) %>% select(Season, TeamID, SeedNum)

## Regular Season Results

RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/RegularSeasonCompactResults.csv",
                                     data.table=FALSE)

RegularSeasonCompactResults <- RegularSeasonCompactResults %>% mutate(home = case_when(
                      WLoc == "N" ~ 0,
                      WLoc == "H" ~ 1,
                      WLoc == "A" ~ -1,
                      TRUE ~ 0
))


# Making a Data Frame of "Game to Predict" using the Sample Submission

games.to.predict <- cbind(SampleSubmission$ID, colsplit(SampleSubmission$ID, pattern = "_", names = c('season', 'team1', 'team2')))   

temp <- left_join(games.to.predict, TourneySeeds, by=c("season"="Season", "team1"="TeamID"))
games.to.predict <- left_join(temp, TourneySeeds, by=c("season"="Season", "team2"="TeamID"))
colnames(games.to.predict)[c(1,5:6)] <- c("Id", "team1seed", "team2seed")
games.to.predict <- games.to.predict %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
games.to.predict <- games.to.predict %>% mutate(seed_diff = team2seed-team1seed)
games.to.predict$home <- 0

Our data wrangling looks much like it did for the Bradley-Terry model except that we now also compute the point differential for each game:

season <- 2015
sub1 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1, ptdiff=WScore-LScore) %>% 
    select(team1, team2, home, outcome, ptdiff)
  sub2 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0, ptdiff=LScore-WScore) %>% 
    select(team1, team2, home, outcome, ptdiff)
  reg.results <- rbind(sub1, sub2)
  
  
head(reg.results) #to take a look at what we've created
##   team1 team2 home outcome ptdiff
## 1  1103  1420    1       1     17
## 2  1104  1406    1       1     28
## 3  1112  1291    1       1     23
## 4  1113  1152    1       1     36
## 5  1119  1102    1       1      6
## 6  1120  1454    1       1     10

Once again, we used mixed effects model but this time we use a linear regression to predict point differential instead of a logistic regression to predict the log odds of victory.

  m.ptdiff <- lmer(ptdiff ~ home +  (1 | team1) + (1 | team2), data = reg.results)

  re <- ranef(m.ptdiff)$team1
  teamquality = data.frame(TeamID= as.numeric(row.names(re)),
                         quality=re[,"(Intercept)"])
left_join(teamquality %>% top_n(10, quality) %>% 
              arrange(desc(quality)), Teams)
##    TeamID  quality       TeamName FirstD1Season LastD1Season
## 1    1246 25.47505       Kentucky          1985         2019
## 2    1458 21.68493      Wisconsin          1985         2019
## 3    1112 21.53109        Arizona          1985         2019
## 4    1181 20.80873           Duke          1985         2019
## 5    1437 20.25855      Villanova          1985         2019
## 6    1211 19.57700        Gonzaga          1985         2019
## 7    1438 19.51281       Virginia          1985         2019
## 8    1428 18.31339           Utah          1985         2019
## 9    1314 17.89928 North Carolina          1985         2019
## 10   1326 17.21508        Ohio St          1985         2019

The highest rated teams is, again, Kentucky and here the interpretation of these coefficients is straight forward. Kentucky would be expected to beat the average team by 24.5 points and be expected to beat Wisconsin, the second highest rated team by 3.8 points. But what is Kentucky’s chance of winning that game? To estimate Kentucky’s win percentage, we can look at the residuals of our model.

  hist(resid(m.ptdiff))

  qqnorm(resid(m.ptdiff))

  summary(m.ptdiff)
## Linear mixed model fit by REML ['lmerMod']
## Formula: ptdiff ~ home + (1 | team1) + (1 | team2)
##    Data: reg.results
## 
## REML criterion at convergence: 82113.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2747 -0.6406  0.0000  0.6406  4.2747 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  team1    (Intercept)  70.0     8.367  
##  team2    (Intercept)  70.0     8.367  
##  Residual             102.6    10.128  
## Number of obs: 10708, groups:  team1, 351; team2, 351
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept) -4.878e-09  6.392e-01    0.00
## home         3.458e+00  1.094e-01   31.59
## 
## Correlation of Fixed Effects:
##      (Intr)
## home 0.000

The straight line in the Q-Q plot, means that the residuals (how far the point different was from our model’s predictions) were normally distributed. Based on our model summary, we can expect our point differential predictions to have a root mean square error (RMSE) of 10.13 points. In order for Kentucky to lose to Wisconsin, our prediction would have to be 3.8 point too optimistic, or put another way, we’d need to get a result that is \(\frac{25.5 - 21.7}{10.1} = 0.38\) standard deviations or further below average (from Kentucky’s point) of view.

  pnorm(3.8, mean=0, sd=10.1)
## [1] 0.6466299

Using the pnorm function, we can estimate that Kentucky has a 64.7% chance of victory in a game in which they are favored by 3.8 points if our lines have a RMSE of 10.1 points. To make predictions for all of the games in the test set, we need to make ratings for every team in every season. We can do this using a for loop.

for (season in 2014:2018){

  sub1 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1, ptdiff=WScore-LScore) %>% 
    select(team1, team2, home, outcome, ptdiff)
  sub2 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0, ptdiff=LScore-WScore) %>% 
    select(team1, team2, home, outcome, ptdiff)
  reg.results <- rbind(sub1, sub2)
  
  m.ptdiff <- lmer(ptdiff ~ home +  (1 | team1) + (1 | team2), data = reg.results) 
  pred.pt.diffs <- predict(m.ptdiff, games.to.predict[games.to.predict$season==season,], 
            type="response")
  games.to.predict[games.to.predict$season==season,"Pred"]<- pnorm(pred.pt.diffs, mean=0, sd=10.1)
}

Writing to a .csv file

write.csv(games.to.predict %>% select(Id, Pred), 'ptdiff_submission.csv', 
          row.names=FALSE)

Tweaking the Model and Combining Forecasts

We can also tweak the code is several ways. Shown below are three possible tweaks: 1) reducing the point differential for games that got out of hand so that a 30 point victory is seen as not that much more compelling than a 20 point victory and 2) doing a weighted regression with games closer to the tournament given more weight and 3) increasing the estimated error in the point differentials (from 10.1 points to 10.5, in this case) since team qualities may change in the tournament.

for (season in 2014:2018){
  
  sub1 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1, ptdiff=WScore-LScore, dw=.99^(132-DayNum)) %>% 
    select(team1, team2, home, outcome, ptdiff, dw)
  sub2 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0, ptdiff=LScore-WScore, dw=.99^(132-DayNum)) %>% 
    select(team1, team2, home, outcome, ptdiff, dw)
  reg.results <- rbind(sub1, sub2)
  
  reg.results <- reg.results %>% mutate(ptdiff = ifelse(abs(ptdiff)>15,sign(ptdiff)*(15+sqrt(abs(ptdiff-15))),ptdiff))
  
  m.ptdiff <- lmer(ptdiff ~ home +  (1 | team1) + (1 | team2), data = reg.results) 
  pred.pt.diffs <- predict(m.ptdiff, games.to.predict[games.to.predict$season==season,], 
            type="response")
  games.to.predict[games.to.predict$season==season,"point_diff_pred"]<- pnorm(pred.pt.diffs, mean=0, sd=10.5)
}

We can also combine these forecast with the Bradley-Terry forecasts and the Seed Benchmark forecasts to see if a small ensemble of forecasts performs better than any of the individual models:

Bradley-Terry Forecasts

for (pred.season in 2014:2018){
  print(pred.season)
sub1 <- RegularSeasonCompactResults %>% filter(Season==pred.season) %>% mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1) %>% select(team1, team2, home, outcome)

sub2 <- RegularSeasonCompactResults %>% filter(Season==pred.season) %>% mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0) %>% select(team1, team2, home, outcome)

reg.results <- rbind(sub1, sub2)
  
  mbt <- glmer(outcome ~ home +  (1 | team1) + (1 | team2), data = reg.results, family = binomial) 
  
  games.to.predict[games.to.predict$season==pred.season,"bt_pred"]<- predict(mbt, newdata=games.to.predict[games.to.predict$season==pred.season,], type="response")
}
## [1] 2014
## [1] 2015
## [1] 2016
## [1] 2017
## [1] 2018

Seed-Based Forecasts

games.to.predict <- games.to.predict %>% mutate(seed.pred = 0.5 + .03*(team2seed-team1seed))

Here we’ll give 40% of the weight to the seed benchmark, 30% to the Bradley-Terry model and 30% to the point differential model.

games.to.predict <- games.to.predict %>% mutate(Pred = .4*seed.pred + .3*bt_pred + .3*point_diff_pred)
write.csv(games.to.predict %>% select(Id, Pred), 'aggregate_submission.csv', row.names=FALSE)