So far we’ve built this systems for forecasting tournament game results. It’s often true that a combination of systems works better than any of individul systems. In this code, we combine two forecasts: the linear seed based forecasts and the Bradley-Terry forecasts, giving 75% of the weight to the seed based forecasts. This combination is more accurate than either system on its own.
.libPaths(c("/home/rstudioshared", "/home/rstudioshared/packages", "/home/rstudioshared/shared_files/packages"))
library(data.table); library(dplyr); library(reshape)
load('/home/rstudioshared/shared_files/data/march_madness/MMdata.RData')
games.to.predict <- cbind(SampleSubmission$Id, colsplit(SampleSubmission$Id, split = "_", names = c('season', 'team1', 'team2')))
temp <- left_join(games.to.predict, TourneySeeds, by=c("season"="Season", "team1"="Team"))
games.to.predict <- left_join(temp, TourneySeeds, by=c("season"="Season", "team2"="Team"))
colnames(games.to.predict)[c(1,5:6)] <- c("Id", "team1seed", "team2seed")
games.to.predict <- games.to.predict %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
temp <- left_join(as.data.frame(TourneyCompactResults), TourneySeeds, by=c("Season", "Wteam"="Team"))
compact.results <- left_join(temp, TourneySeeds, by=c("Season", "Lteam"="Team"))
set1 <- compact.results %>% select(SeedNum.x, SeedNum.y) %>% mutate(result=1)
set2 <- compact.results %>% select(SeedNum.y, SeedNum.x) %>% mutate(result=0)
colnames(set1) <- c("team1seed", "team2seed", "team1win")
colnames(set2) <- c("team1seed", "team2seed", "team1win")
full.set <- rbind(set1, set2)
full.set <- full.set %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
m.seed.diff <- lm(team1win~ I(team2seed-team1seed), data=full.set)
games.to.predict$seed.pred <- predict(m.seed.diff, games.to.predict, type="response")
RegularSeasonCompactResults$home <- RegularSeasonCompactResults$Wloc
levels(RegularSeasonCompactResults$home) <- list("-1"="A", "1"="H", "0"="N")
RegularSeasonCompactResults$home <-
as.numeric(as.character(RegularSeasonCompactResults$home))
library(lme4)
games.to.predict$home <- 0
for (season in 2012:2015){
sub1 <- RegularSeasonCompactResults %>%
filter(Season==season) %>%
mutate(team1=as.factor(Wteam), team2=as.factor(Lteam), outcome=1) %>%
select(team1, team2, home, outcome)
sub2 <- RegularSeasonCompactResults %>%
filter(Season==season) %>%
mutate(team1=as.factor(Lteam), team2=as.factor(Wteam), home=-1*home, outcome=0) %>%
select(team1, team2, home, outcome)
reg.results <- rbind(sub1, sub2)
mbt <- glmer(outcome ~ home + (1 | team1) + (1 | team2), data = reg.results,
family = binomial)
games.to.predict[games.to.predict$season==season,"BT_pred"]<-
predict(mbt, games.to.predict[games.to.predict$season==season,],
type="response")
}
games.to.predict <- games.to.predict %>% mutate(Pred = 0.25*BT_pred + 0.75*seed.pred)
write.csv(games.to.predict %>% select(Id, Pred), 'aggregate_submission.csv',
row.names=FALSE)