Loading the Data and Packages

library(dplyr); library(data.table); library(reshape2)
library(lme4) 

TourneySeeds <- fread("/home/jcross/MarchMadness/data/NCAATourneySeeds.csv")
SampleSubmission <- fread("/home/jcross/MarchMadness/data/SampleSubmissionStage1.csv")
Seasons <- fread("/home/jcross/MarchMadness/data/Seasons.csv")
Teams <- fread("/home/jcross/MarchMadness/data/Teams.csv")
TourneySlots <- fread("/home/jcross/MarchMadness/data/NCAATourneySlots.csv")
TourneyDetailedResults <- fread("/home/jcross/MarchMadness/data/NCAATourneyDetailedResults.csv")
TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/NCAATourneyCompactResults.csv")

# TourneySeeds <- fread("/home/jcross/MarchMadness/data/WNCAATourneySeeds.csv")
# SampleSubmission <- fread("/home/jcross/MarchMadness/data/WSampleSubmissionStage1.csv")
# Seasons <- fread("/home/jcross/MarchMadness/data/WSeasons.csv")
# Teams <- fread("/home/jcross/MarchMadness/data/WTeams.csv")
# TourneySlots <- fread("/home/jcross/MarchMadness/data/WNCAATourneySlots.csv")
# TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/WNCAATourneyCompactResults.csv")

#new
RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/RegularSeasonCompactResults.csv")
#WRegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/WRegularSeasonCompactResults.csv")

Some Familiar Work on the Tables

TourneySeeds <- TourneySeeds %>% 
    mutate(SeedNum = gsub("[A-Z+a-z]", "", Seed)) %>% select(Season, TeamID, SeedNum)

## Regular Season Results

RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/RegularSeasonCompactResults.csv",
                                     data.table=FALSE)

RegularSeasonCompactResults <- RegularSeasonCompactResults %>% mutate(home = case_when(
                      WLoc == "N" ~ 0,
                      WLoc == "H" ~ 1,
                      WLoc == "A" ~ -1,
                      TRUE ~ 0
))


# Making a Data Frame of "Game to Predict" using the Sample Submission

games.to.predict <- cbind(SampleSubmission$ID, colsplit(SampleSubmission$ID, pattern = "_", names = c('season', 'team1', 'team2')))   

temp <- left_join(games.to.predict, TourneySeeds, by=c("season"="Season", "team1"="TeamID"))
games.to.predict <- left_join(temp, TourneySeeds, by=c("season"="Season", "team2"="TeamID"))
colnames(games.to.predict)[c(1,5:6)] <- c("Id", "team1seed", "team2seed")
games.to.predict <- games.to.predict %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
games.to.predict <- games.to.predict %>% mutate(seed_diff = team2seed-team1seed)
games.to.predict$home <- 0

Tournament Results

To find out more about our models, we want to see how they peform on tournament data going all the way back to 1985 (not just the last 5 years, like our sample submission).

We need to do a little work to get the tournament data ready (and make it look like the SampleSubmission data).

tc1 <- TourneyCompactResults %>% filter(WTeamID < LTeamID) %>% mutate(ID = paste(Season, "_", WTeamID, "_", LTeamID, sep=""), outcome=1, team1=WTeamID, team2=LTeamID)
head(tc1)

tc2 <- TourneyCompactResults %>% filter(WTeamID > LTeamID) %>% mutate(ID = paste(Season, "_", LTeamID, "_", WTeamID, sep=""), outcome=0,team1=LTeamID, team2=WTeamID)
head(tc2)

results <- rbind(tc1, tc2) %>% select(ID, outcome, team1, team2, Season, DayNum)

temp <- left_join(results, TourneySeeds, by=c("Season"="Season", "team1"="TeamID"))
results.full <- left_join(temp, TourneySeeds, by=c("Season"="Season", "team2"="TeamID"))
colnames(results.full) <- c(colnames(results.full[,1:6]), "team1seed", "team2seed")
results.full <- results.full %>% mutate(team1seed = as.numeric(team1seed), team2seed=as.numeric(team2seed))
results.full$home <- 0
head(results.full)

We also need our own Log Loss Formula:

LogLoss <- function(pred, res){
  (-1/length(pred)) * sum (res * log(pred) + (1-res)*log(1-pred))
}

Seed Based Forecasts

results.full <- results.full %>% mutate(seed.pred = 0.5 + .03*(team2seed-team1seed))

Bradley-Terry

for (pred.season in 1985:2018){
  print(pred.season)
sub1 <- RegularSeasonCompactResults %>% filter(Season==pred.season) %>% mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1) %>% select(team1, team2, home, outcome)

sub2 <- RegularSeasonCompactResults %>% filter(Season==pred.season) %>% mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0) %>% select(team1, team2, home, outcome)

reg.results <- rbind(sub1, sub2)
  
  mbt <- glmer(outcome ~ home +  (1 | team1) + (1 | team2), data = reg.results, family = binomial) 
  
  results.full[results.full$Season==pred.season,"bt_pred"]<- predict(mbt, newdata=results.full[results.full$Season==pred.season,], type="response")
}

Fancy Point Differential Model

for (season in 1985:2018){
  
  sub1 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1, ptdiff=WScore-LScore, dw=.99^(132-DayNum)) %>% 
    select(team1, team2, home, outcome, ptdiff, dw)
  sub2 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0, ptdiff=LScore-WScore, dw=.99^(132-DayNum)) %>% 
    select(team1, team2, home, outcome, ptdiff, dw)
  reg.results <- rbind(sub1, sub2)
  
  reg.results <- reg.results %>% mutate(ptdiff = ifelse(abs(ptdiff)>15,sign(ptdiff)*(15+sqrt(abs(ptdiff-15))),ptdiff))
  
  m.ptdiff <- lmer(ptdiff ~ home +  (1 | team1) + (1 | team2), data = reg.results) 
  pred.pt.diffs <- predict(m.ptdiff, results.full[results.full$Season==season,], 
            type="response")
  results.full[results.full$Season==season,"point_diff_pred"]<- pnorm(pred.pt.diffs, mean=0, sd=10.5)
}

Getting logodds of different prediction models

results.full <- results.full %>% mutate(BT_logodds = log((bt_pred)/(1-bt_pred)),
                                        seed_logodds = log((seed.pred)/(1-seed.pred)),
                                        ptdiff_logodds = log((point_diff_pred)/(1-point_diff_pred))
                                       )

library(stats)

glm <- glm(outcome~BT_logodds+seed_logodds+ptdiff_logodds+0, data=results.full, family=binomial)
summary(glm)

#removing the BT_logodds from the model

glm <- glm(outcome~seed_logodds+ptdiff_logodds+0, data=results.full, family=binomial)
summary(glm)

results.full$combined.prediction <- predict(glm, type="response")

Now, looking at the results:

LogLoss(results.full$bt_pred, results.full$outcome)
LogLoss(results.full$seed.pred, results.full$outcome)
LogLoss(results.full$point_diff_pred, results.full$outcome)

LogLoss(results.full$combined.prediction, results.full$outcome)

Which of these predictions count as out of sample predictions?

library(caret)

fitControl <- trainControl(## 10-fold CV
  method = "cv",
  number = 50,
  savePredictions = TRUE
)

lreg<-train(outcome~seed_logodds+ptdiff_logodds+0, data=results.full,method="glm",family=binomial(),
             trControl=fitControl)
# you can ignore the error message

LogLoss(results.full$combined.prediction, results.full$outcome) # in sample log loss
LogLoss(lreg$pred$pred, lreg$pred$obs) # out of sample log loss