Zach Mayer created a package for simulating the NCAA tournament based on a set of Kaggle predictions. Let’s download it and give it a spin.
#install.packages("devtools")
#library(devtools)
#devtools::install_github('zachmayer/kaggleNCAA')
library(dplyr)
library("kaggleNCAA")
This package treats one set of predictions as “the truth”, meaning that the real-life probability of each team winning each game is assumed to match the prediction. We can then explore how the tournament might play out (and what our Log Loss might be) if this really were the truth. It may turn out that, even if we somehow knew the truth, a different set of predictions would be more likely to win in a pool of many similar predictions.
dat <- parseBracket('/home/jcross/MarchMadness/data/aggregate_submission.csv', w=0)
#you could also use your own projections.
sim <- simTourney(dat, 5, progress=TRUE)
head(sim)
Notice that this package gives us aggregate results over the (in this case) five simulations. In order to judge our models, we need to look at the individual simulations. To do that, we need to overwrite two of the functions in this package. We can do so by running the following code:
sim_tourney_internal <- function(preds, Year=2018){
  all_rounds <- sort(unique(preds$round))

  #Evaluate the playin rounds
  if(all_rounds[1] == 0){
    r <- 0L
    round_teamid_1 <- preds[round == r, list(slot=next_slot, teamid_1=winner, keep_teamid_1=1L)]
    round_teamid_2 <- preds[round == r, list(slot=next_slot, teamid_2=winner, keep_teamid_2=1L)]
    preds <- merge(preds, round_teamid_1, by=c('slot', 'teamid_1'), all.x=TRUE)
    preds <- merge(preds, round_teamid_2, by=c('slot', 'teamid_2'), all.x=TRUE)
    preds[is.na(keep_teamid_1) & teamid_1_playedin == (r + 1L) & round == 1L, keep := 0L]
    preds[is.na(keep_teamid_2) & teamid_2_playedin == (r + 1L) & round == 1L, keep := 0L]
    preds <- preds[keep==1L,]
    preds[, c('keep_teamid_1', 'keep_teamid_2') := NULL]
    all_rounds <- all_rounds[2:length(all_rounds)]
  }

  #Evaluate the regular rounds
  for(r in 1:5){
    round_teamid_1 <- preds[round == r, list(slot=next_slot, teamid_1=winner, keep_teamid_1=1L)]
    round_teamid_2 <- preds[round == r, list(slot=next_slot, teamid_2=winner, keep_teamid_2=1L)]
    preds <- merge(preds, round_teamid_1, by=c('slot', 'teamid_1'), all.x=TRUE)
    preds <- merge(preds, round_teamid_2, by=c('slot', 'teamid_2'), all.x=TRUE)
    preds[is.na(keep_teamid_1) & round == (r + 1L), keep := 0L]
    preds[is.na(keep_teamid_2) & round == (r + 1L), keep := 0L]
    preds <- preds[keep==1L,]
    preds[, c('keep_teamid_1', 'keep_teamid_2') := NULL]
  }

  preds <- preds[, list(slot, round, teamid_1, teamid_2, women, winner, loser)]
  preds[, Id := paste(Year, teamid_1, teamid_2, sep="_")]
  preds[, outcome := ifelse(teamid_1 == winner, 1, 0)]
  data.table::setkeyv(preds, 'slot')
  return(preds)
}
simTourney <- function(preds, N=1000, year=2018, progress=TRUE, upset_bias=0,
                       w=NULL, parallel=FALSE){
  utils::data('all_slots', package='kaggleNCAA', envir=environment())

  #Checks
  if(progress & parallel){
    stop("Can't use a progress bar in parallel. Please set either progress or parallel to FALSE")
  }

  #Subset the data
  preds <- preds[season==year,]

  #Decide men or women
  if(is.null(w)){
    w <- preds[1, women]
    message(paste('assuming women =', w))
  }
  stopifnot(w == 0 | w == 1)

  #Join slots to the predictions
  n1 <- nrow(preds)
  preds <- merge(preds, all_slots, by=c('season', 'teamid_1', 'teamid_2', 'women'))
  stopifnot(n1 == nrow(preds))

  #Determine seeds
  preds[, seed_1_int := as.integer(substr(seed_1, 2, 3))]
  preds[, seed_2_int := as.integer(substr(seed_2, 2, 3))]

  #Add some columns for tracking the simulation
  preds[, rand := stats::runif(.N)]
  preds[, winner := ifelse(pred > rand, teamid_1, teamid_2)]
  preds[, loser := ifelse(pred <= rand, teamid_1, teamid_2)]
  preds[, keep := 1L]

  #Add upset bias
  if(upset_bias != 0){
    preds[seed_1_int > seed_2_int, rand := rand - upset_bias]
    preds[seed_1_int < seed_2_int, rand := rand + upset_bias]
  }

  #Decide on progress bars
  if(progress){
    apply_fun <- pbapply::pblapply
  } else if(parallel){
    apply_fun <- function(idx, fun){
      foreach(i=idx) %dopar% fun(i)
    }
  } else{
    apply_fun <- lapply
  }

  #Run the simulation
  sims_list <- apply_fun(1:N, function(x) {
    preds[, rand := stats::runif(.N)]
    if(upset_bias != 0){
      preds[seed_1_int > seed_2_int, rand := rand - upset_bias]
      preds[seed_1_int < seed_2_int, rand := rand + upset_bias]
    }
    preds[, winner := ifelse(pred > rand, teamid_1, teamid_2)]
    preds[, loser := ifelse(pred <= rand, teamid_1, teamid_2)]
    sim_tourney_internal(preds, Year=year)
  })
  stopifnot(length(sims_list) == N)
  return(sims_list)
}
Now, let’s try that again:
dat <- parseBracket('/home/jcross/MarchMadness/data/aggregate_submission.csv', w=0)
#you could also use your own projections.
sim <- simTourney(dat, 5, progress=TRUE)
str(sim)
sim[[1]] # the results of the first simulation
sim[[2]] # the results of the second simulation
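Since sim is now a plain list with one data table per simulated tournament, we can build whatever summaries we like. As a quick sketch (assuming each result keeps the round and winner columns created by our modified sim_tourney_internal), here is how often each team wins the title across our five simulations:
# For each simulated tournament, pull the winner of the final round
# (the championship game) and tabulate champions across simulations.
champs <- sapply(sim, function(x) x[round == max(round), winner])
sort(table(champs), decreasing = TRUE)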
You can run 100 simulations and time how long that takes:
t <- Sys.time()
sim <- simTourney(dat, 100, progress=TRUE)
Sys.time() - t
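The runtime should scale roughly linearly with the number of simulations, so a back-of-the-envelope estimate for a larger run is:
# Rough estimate only: 1000 simulations should take about 10x the time of 100
elapsed <- as.numeric(difftime(Sys.time(), t, units = "secs"))
elapsed * (1000 / 100)  # approximate seconds for 1000 simulations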
Now, let’s write functions to calculate the Log Loss of a set of predictions based on the outcome of each simulation.
LogLoss <- function(pred, res){
  (-1/length(pred)) * sum(res * log(pred) + (1 - res) * log(1 - pred))
}
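As a quick sanity check (the two example games here are arbitrary):
# One confident correct pick (0.9) and one coin flip (0.5), both won by the
# favorite, should give a log loss of about 0.40.
LogLoss(pred = c(0.9, 0.5), res = c(1, 1))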
calcLogLosses <- function(list_of_sim_results, predictions, Year=2018) {
  list_of_log_losses <- lapply(list_of_sim_results, function(x)
    suppressWarnings(
      left_join(x,
                predictions %>%
                  mutate(season = substr(Id, 1, 4)) %>%
                  filter(season == Year),
                by = c("Id")) %>%
        filter(round != 0) %>%
        summarize(LogLoss(Pred, outcome))
    )
  )
  return(as.vector(unlist(list_of_log_losses)))
}
Let’s see how well “perfect” predictions do (when our predicted probabilities match the probabilities used to simulate the tournament).
preds <- read.csv("/home/jcross/MarchMadness/data/aggregate_submission.csv")
perfect_loglosses <- calcLogLosses(sim, preds)
summary(perfect_loglosses)
hist(perfect_loglosses)
Notice that 2018 was a very hard year to project. Here’s a table showing the scores of some of the best Kaggle March Madness performers over the years. This should give us a sense of what’s a competitive score for our 2014-2018 submissions as well as what’s a good score for 2018.
We can compare these results to how we would have performed simply by using the seed-based predictions:
preds <- read.csv("/home/jcross/MarchMadness/data/seed_submission.csv")
seed_loglosses <- calcLogLosses(sim, preds)
# how often do the perfect predictions beat the seed predictions?
mean(perfect_loglosses < seed_loglosses)
# notice that the two sets of predictions perform well in the same simulated tournaments
# some tournaments are much easier to forecast than others and we don't know what kind of
# tournament we will get
plot(perfect_loglosses, seed_loglosses)
cor(seed_loglosses, perfect_loglosses)
hist(perfect_loglosses - seed_loglosses)
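It can also help to look at the seed-based scores and the score differences directly:
# Distribution of seed-based scores, and of how much worse they are than the
# "perfect" predictions, across the simulated tournaments
summary(seed_loglosses)
summary(perfect_loglosses - seed_loglosses)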
Our perfect predictions rate one first-round game, “2018_1199_1281”, as nearly a toss-up. Let’s see what happens when we let the seed predictions guess at this game:
preds <- read.csv("/home/jcross/MarchMadness/data/seed_submission.csv")
preds[preds$Id == "2018_1199_1281",]$Pred <- 0.999
seed_loglosses <- calcLogLosses(sim, preds)
mean(perfect_loglosses < seed_loglosses)
plot(perfect_loglosses, seed_loglosses)
cor(seed_loglosses, perfect_loglosses)
hist(perfect_loglosses - seed_loglosses)
How should we interpret these results?
Try other guessing strategies. Can guessing be beneficial? How would you determine the “correct” amount of guessing?
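As one possible starting point, here is a sketch that varies how aggressively we guess on that same toss-up game (the guess levels below are arbitrary) and checks how often each version beats the perfect predictions:
# Sweep over a few arbitrary guess levels for the toss-up game and see how
# often the resulting predictions beat the "perfect" ones across simulations.
for (p in c(0.5, 0.75, 0.9, 0.999)) {
  guess_preds <- read.csv("/home/jcross/MarchMadness/data/seed_submission.csv")
  guess_preds[guess_preds$Id == "2018_1199_1281", ]$Pred <- p
  guess_loglosses <- calcLogLosses(sim, guess_preds)
  cat("guess =", p, "-> beats perfect predictions in",
      round(100 * mean(guess_loglosses < perfect_loglosses), 1), "% of simulations\n")
}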