So far we’ve built this systems for forecasting tournament game results. It’s often true that a combination of systems works better than any of individul systems. In this code, we combine two forecasts: the linear seed based forecasts and the Bradley-Terry forecasts, giving 75% of the weight to the seed based forecasts. This combination is more accurate than either system on its own.

Loading the Data and Packages

.libPaths(c("/home/rstudioshared", "/home/rstudioshared/packages", "/home/rstudioshared/shared_files/packages"))

library(data.table); library(dplyr); library(reshape)

load('/home/rstudioshared/shared_files/data/march_madness/MMdata.RData')

Making the seed-based predictions

games.to.predict <- cbind(SampleSubmission$Id, colsplit(SampleSubmission$Id, split = "_", names = c('season', 'team1', 'team2')))   

temp <- left_join(games.to.predict, TourneySeeds, by=c("season"="Season", "team1"="Team"))
games.to.predict <- left_join(temp, TourneySeeds, by=c("season"="Season", "team2"="Team"))
colnames(games.to.predict)[c(1,5:6)] <- c("Id", "team1seed", "team2seed")
games.to.predict <- games.to.predict %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))

temp <- left_join(as.data.frame(TourneyCompactResults), TourneySeeds, by=c("Season", "Wteam"="Team"))
compact.results <- left_join(temp, TourneySeeds, by=c("Season", "Lteam"="Team"))


set1 <- compact.results %>% select(SeedNum.x, SeedNum.y) %>% mutate(result=1)
set2 <- compact.results %>% select(SeedNum.y, SeedNum.x) %>% mutate(result=0)
colnames(set1) <- c("team1seed", "team2seed", "team1win")
colnames(set2) <- c("team1seed", "team2seed", "team1win")
full.set <- rbind(set1, set2)
full.set <- full.set %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
m.seed.diff <- lm(team1win~ I(team2seed-team1seed), data=full.set)

games.to.predict$seed.pred <- predict(m.seed.diff, games.to.predict, type="response")

Making the Bradley-Terry Predictions

RegularSeasonCompactResults$home <- RegularSeasonCompactResults$Wloc
levels(RegularSeasonCompactResults$home) <- list("-1"="A", "1"="H", "0"="N")
RegularSeasonCompactResults$home <- 
  as.numeric(as.character(RegularSeasonCompactResults$home))

library(lme4)

games.to.predict$home <- 0
for (season in 2012:2015){
  sub1 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(Wteam), team2=as.factor(Lteam), outcome=1) %>% 
    select(team1, team2, home, outcome)
  sub2 <- RegularSeasonCompactResults %>% 
    filter(Season==season) %>% 
    mutate(team1=as.factor(Lteam), team2=as.factor(Wteam), home=-1*home, outcome=0) %>% 
    select(team1, team2, home, outcome)
  reg.results <- rbind(sub1, sub2)
  mbt <- glmer(outcome ~ home +  (1 | team1) + (1 | team2), data = reg.results, 
               family = binomial) 
  games.to.predict[games.to.predict$season==season,"BT_pred"]<- 
    predict(mbt, games.to.predict[games.to.predict$season==season,], 
            type="response")
}

Combining the predictions with 75% of the weight given to the seed predictions an then writing them to a .csv file

games.to.predict <- games.to.predict %>% mutate(Pred = 0.25*BT_pred + 0.75*seed.pred)

write.csv(games.to.predict %>% select(Id, Pred), 'aggregate_submission.csv', 
          row.names=FALSE)

Combining Forecasts

Jared Cross

February 26, 2016

Loading the Data and Packages

Making the seed-based predictions

Making the Bradley-Terry Predictions

Combining the predictions with 75% of the weight given to the seed predictions an then writing them to a .csv file