library(dplyr); library(data.table); library(reshape2)
library(lme4)
TourneySeeds <- fread("/home/jcross/MarchMadness/data/NCAATourneySeeds.csv")
SampleSubmission <- fread("/home/jcross/MarchMadness/data/SampleSubmissionStage1.csv")
Seasons <- fread("/home/jcross/MarchMadness/data/Seasons.csv")
Teams <- fread("/home/jcross/MarchMadness/data/Teams.csv")
TourneySlots <- fread("/home/jcross/MarchMadness/data/NCAATourneySlots.csv")
TourneyDetailedResults <- fread("/home/jcross/MarchMadness/data/NCAATourneyDetailedResults.csv")
TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/NCAATourneyCompactResults.csv")
# TourneySeeds <- fread("/home/jcross/MarchMadness/data/WNCAATourneySeeds.csv")
# SampleSubmission <- fread("/home/jcross/MarchMadness/data/WSampleSubmissionStage1.csv")
# Seasons <- fread("/home/jcross/MarchMadness/data/WSeasons.csv")
# Teams <- fread("/home/jcross/MarchMadness/data/WTeams.csv")
# TourneySlots <- fread("/home/jcross/MarchMadness/data/WNCAATourneySlots.csv")
# TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/WNCAATourneyCompactResults.csv")
#new
RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/RegularSeasonCompactResults.csv")
#WRegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/WRegularSeasonCompactResults.csv")
TourneySeeds <- TourneySeeds %>%
mutate(SeedNum = gsub("[A-Z+a-z]", "", Seed)) %>% select(Season, TeamID, SeedNum)
## Regular Season Results
RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/RegularSeasonCompactResults.csv",
data.table=FALSE)
RegularSeasonCompactResults <- RegularSeasonCompactResults %>% mutate(home = case_when(
WLoc == "N" ~ 0,
WLoc == "H" ~ 1,
WLoc == "A" ~ -1,
TRUE ~ 0
))
# Making a Data Frame of "Game to Predict" using the Sample Submission
games.to.predict <- cbind(SampleSubmission$ID, colsplit(SampleSubmission$ID, pattern = "_", names = c('season', 'team1', 'team2')))
temp <- left_join(games.to.predict, TourneySeeds, by=c("season"="Season", "team1"="TeamID"))
games.to.predict <- left_join(temp, TourneySeeds, by=c("season"="Season", "team2"="TeamID"))
colnames(games.to.predict)[c(1,5:6)] <- c("Id", "team1seed", "team2seed")
games.to.predict <- games.to.predict %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
games.to.predict <- games.to.predict %>% mutate(seed_diff = team2seed-team1seed)
games.to.predict$home <- 0
To find out more about our models, we want to see how they peform on tournament data going all the way back to 1985 (not just the last 5 years, like our sample submission).
We need to do a little work to get the tournament data ready (and make it look like the SampleSubmission data).
tc1 <- TourneyCompactResults %>% filter(WTeamID < LTeamID) %>% mutate(ID = paste(Season, "_", WTeamID, "_", LTeamID, sep=""), outcome=1, team1=WTeamID, team2=LTeamID)
head(tc1)
tc2 <- TourneyCompactResults %>% filter(WTeamID > LTeamID) %>% mutate(ID = paste(Season, "_", LTeamID, "_", WTeamID, sep=""), outcome=0,team1=LTeamID, team2=WTeamID)
head(tc2)
results <- rbind(tc1, tc2) %>% select(ID, outcome, team1, team2, Season, DayNum)
temp <- left_join(results, TourneySeeds, by=c("Season"="Season", "team1"="TeamID"))
results.full <- left_join(temp, TourneySeeds, by=c("Season"="Season", "team2"="TeamID"))
colnames(results.full) <- c(colnames(results.full[,1:6]), "team1seed", "team2seed")
results.full <- results.full %>% mutate(team1seed = as.numeric(team1seed), team2seed=as.numeric(team2seed))
results.full$home <- 0
head(results.full)
We also need our own Log Loss Formula:
LogLoss <- function(pred, res){
(-1/length(pred)) * sum (res * log(pred) + (1-res)*log(1-pred))
}
results.full <- results.full %>% mutate(seed.pred = 0.5 + .03*(team2seed-team1seed))
for (pred.season in 1985:2018){
print(pred.season)
sub1 <- RegularSeasonCompactResults %>% filter(Season==pred.season) %>% mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1) %>% select(team1, team2, home, outcome)
sub2 <- RegularSeasonCompactResults %>% filter(Season==pred.season) %>% mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0) %>% select(team1, team2, home, outcome)
reg.results <- rbind(sub1, sub2)
mbt <- glmer(outcome ~ home + (1 | team1) + (1 | team2), data = reg.results, family = binomial)
results.full[results.full$Season==pred.season,"bt_pred"]<- predict(mbt, newdata=results.full[results.full$Season==pred.season,], type="response")
}
for (season in 1985:2018){
sub1 <- RegularSeasonCompactResults %>%
filter(Season==season) %>%
mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1, ptdiff=WScore-LScore, dw=.99^(132-DayNum)) %>%
select(team1, team2, home, outcome, ptdiff, dw)
sub2 <- RegularSeasonCompactResults %>%
filter(Season==season) %>%
mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0, ptdiff=LScore-WScore, dw=.99^(132-DayNum)) %>%
select(team1, team2, home, outcome, ptdiff, dw)
reg.results <- rbind(sub1, sub2)
reg.results <- reg.results %>% mutate(ptdiff = ifelse(abs(ptdiff)>15,sign(ptdiff)*(15+sqrt(abs(ptdiff-15))),ptdiff))
m.ptdiff <- lmer(ptdiff ~ home + (1 | team1) + (1 | team2), data = reg.results)
pred.pt.diffs <- predict(m.ptdiff, results.full[results.full$Season==season,],
type="response")
results.full[results.full$Season==season,"point_diff_pred"]<- pnorm(pred.pt.diffs, mean=0, sd=10.5)
}
results.full <- results.full %>% mutate(BT_logodds = log((bt_pred)/(1-bt_pred)),
seed_logodds = log((seed.pred)/(1-seed.pred)),
ptdiff_logodds = log((point_diff_pred)/(1-point_diff_pred))
)
library(stats)
glm <- glm(outcome~BT_logodds+seed_logodds+ptdiff_logodds+0, data=results.full, family=binomial)
summary(glm)
#removing the BT_logodds from the model
glm <- glm(outcome~seed_logodds+ptdiff_logodds+0, data=results.full, family=binomial)
summary(glm)
results.full$combined.prediction <- predict(glm, type="response")
Now, looking at the results:
LogLoss(results.full$bt_pred, results.full$outcome)
LogLoss(results.full$seed.pred, results.full$outcome)
LogLoss(results.full$point_diff_pred, results.full$outcome)
LogLoss(results.full$combined.prediction, results.full$outcome)
Which of these predictions count as out of sample predictions?
library(caret)
fitControl <- trainControl(## 10-fold CV
method = "cv",
number = 50,
savePredictions = TRUE
)
lreg<-train(outcome~seed_logodds+ptdiff_logodds+0, data=results.full,method="glm",family=binomial(),
trControl=fitControl)
# you can ignore the error message
LogLoss(results.full$combined.prediction, results.full$outcome) # in sample log loss
LogLoss(lreg$pred$pred, lreg$pred$obs) # out of sample log loss