THE DATA
library(dplyr); library(tidyr)
reg <- read.csv('/home/rstudioshared/shared_files/data/RegularSeasonCompactResults.csv')
tourney <- read.csv('/home/rstudioshared/shared_files/data/TourneyCompactResults.csv')
teams <- read.csv('/home/rstudioshared/shared_files/data/teams.csv')
sub <- read.csv('/home/rstudioshared/shared_files/data/sample_submission.csv')
reg1 <- reg %>% rename(team = Wteam, opp.team=Lteam, score=Wscore, opp.score=Lscore) %>% mutate(win=1, loc= (Wloc=="H") - (Wloc=="A"))
reg2 <- reg %>% rename(team= Lteam, opp.team=Wteam, score=Lscore, opp.score=Wscore) %>% mutate(win=0, loc= (Wloc=="A") - (Wloc=="H"))
reg <- rbind(reg1, reg2)
Basic Functions
RMSE <- function(predictions, actuals){
sqrt(mean((predictions-actuals)^2))
}
LogLoss <- function(predictions, actuals){
(-1/length(predictions)) * sum (actuals * log(predictions) + (1-actuals)*log(1-predictions))
}
GameWeight <- function(rating, opp.rating, loc, HFA, power=0.5){
(dnorm(qnorm(Ewins(rating, opp.rating, loc, HFA)))/dnorm(0))^power}
MarginWins <- function(margin){pnorm(margin/10)}
Ewins <- function(rating, opp.rating, loc=0, HFA=100)
{ 1/(1 + 10^((opp.rating-rating-loc*HFA)/400))}
RatingAdjust <- function(rating, opp.rating, wins, K=32, loc, HFA=100, margin=0, Kmargin=0, gw.power=0.5){
rating + (K*(wins - Ewins(rating, opp.rating, loc, HFA)) +
Kmargin*(MarginWins(margin) - Ewins(rating, opp.rating, loc, HFA)))*
GameWeight(rating=rating, opp.rating=opp.rating, loc=loc, HFA=HFA, power=gw.power)
}
The Work Horse
updateELO <- function(df, K = 25, Kmargin=0, start.elos=NA, HFA=100, gw.power=0){
d <- df
print(nrow(d))
d$elo.start <- NA
d$elo.end <- NA
d$opp.elo.start <- NA
d$opp.elo.end <- NA
d <- d %>% arrange(Daynum)
if (!is.data.frame(start.elos)) {
d[!duplicated(d$team),"elo.start"] <- 1500
d[!duplicated(d$team),"opp.elo.start"] <- 1500
} else {
n<-start.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"team"],start.elos$team),]$elo.end
n[is.na(n)] <- 1500
d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),]$elo.start <- n
n<-start.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"opp.team"],start.elos$team),]$elo.end
n[is.na(n)] <- 1500
d[is.na(d$opp.elo.end),][!duplicated(d[is.na(d$opp.elo.end),]$team),]$opp.elo.start<-n
}
d <- d %>% mutate(elo.end=RatingAdjust(elo.start, opp.elo.start, wins=win, K=K, margin=score-opp.score, loc=loc, Kmargin=Kmargin, gw.power=gw.power))
d <- d %>% mutate(opp.elo.end=RatingAdjust(opp.elo.start, elo.start, wins=1-win, K=K, margin=score-opp.score,loc=loc, Kmargin=Kmargin, gw.power=gw.power))
temp.elos <- d %>% filter(!is.na(elo.end)) %>% group_by(team) %>% top_n(1, Daynum) %>% select(elo.end)
while(sum(is.na(d$elo.end))>0){
n<-temp.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"team"],temp.elos$team),]$elo.end
d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),]$elo.start <- n[!is.na(n)]
n<-temp.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"opp.team"],temp.elos$team),]$elo.end
d[is.na(d$opp.elo.end),][!duplicated(d[is.na(d$opp.elo.end),]$team),]$opp.elo.start<-n[!is.na(n)]
d <- d %>% mutate(elo.end=RatingAdjust(elo.start, opp.elo.start, wins=win, K=K, margin=score-opp.score, loc=loc, HFA=HFA, Kmargin=Kmargin, gw.power=gw.power))
d <- d %>% mutate(opp.elo.end=RatingAdjust(opp.elo.start, elo.start, wins=1-win, K=K, margin=score-opp.score,loc=loc, HFA=HFA, Kmargin=Kmargin, gw.power=gw.power))
temp.elos <- d %>% filter(!is.na(elo.end)) %>% group_by(team) %>% top_n(1, Daynum) %>% select(elo.end)
}
final.elos <- d %>% group_by(team) %>% top_n(1, Daynum) %>% select(elo.end)
return(final.elos %>% arrange(desc(elo.end)))
}
Functions for Regressing towards the mean, Evaluating Predictions and Submitting to Kaggle
RegressElos <- function(df, regression=0.5){
df[,2] <- regression*1500 + (1-regression)*df[,2]
return(df)
}
EvaluateELO <- function(elo_df, tourney_df, season){
t_df <- tourney_df %>% filter(Season==season) %>% mutate(win=1, loc=0)
t_df <- left_join(t_df, elo_df, by=c("Wteam"="team"))
t_df <- left_join(t_df, elo_df, by=c("Lteam"="team"))
t_df <- t_df %>% mutate(prediction = Ewins(elo.end.x, elo.end.y))
print(t_df %>% summarize(rmse = RMSE(prediction, win), logloss = LogLoss(prediction,win)))
}
CheckELO <- function(K=20, Kmargin=20, R=0.1, start.season=2012, end.season=2016, gw.power=0){
results <- data.frame(season=numeric(), RMSE=numeric(), LogLoss=numeric())
f <- updateELO(reg %>% filter(Season==start.season), K=K, Kmargin=Kmargin, gw.power=0)
res <- EvaluateELO(f, tourney, season=start.season)
results[1,] <- as.numeric(c(start.season, res))
if(length(start.season:end.season)>1){
for (i in seq_along((start.season:end.season)[-1])){
f <- RegressElos(f, R)
f <- updateELO(reg %>% filter(Season==start.season+i), K=K, Kmargin=Kmargin, gw.power=0,start.elos=f)
res <- EvaluateELO(f, tourney, season=start.season+i)
results[1+i,] <- as.numeric(c(start.season+i, res))
}
}
return(results)
}
CreateSubmissionFile <- function(K=20, Kmargin=20, R=0.1, start.season=2010, end.season=2016, gw.power=0){
sub <- sub[,c("id", "pred")] %>% separate(id, into=c("year", "team", "opp.team"), sep="_", remove=FALSE)
elos.by.year <- data.frame(year=character(), team = character(), elo=numeric())
f <- updateELO(reg %>% filter(Season==start.season), K=K, Kmargin=Kmargin, gw.power=gw.power)
elos.by.year <- rbind(elos.by.year, data.frame(year=as.character(start.season), team=as.character(f$team), elo=f$elo.end))
if(length(start.season:end.season)>1){
for (i in seq_along((start.season:end.season)[-1])){
f <- RegressElos(f, R)
f <- updateELO(reg %>% filter(Season==start.season+i), K=K, Kmargin=Kmargin, gw.power=gw.power, start.elos=f)
elos.by.year <- rbind(elos.by.year, data.frame(year=as.character(start.season+i), team=as.character(f$team), elo=f$elo.end))
}
}
sub <- left_join(sub, elos.by.year, by=c("year", "team"))
sub <- left_join(sub, elos.by.year, by=c("year", "opp.team" = "team"))
sub <- sub %>% mutate(pred = Ewins(elo.x, elo.y, loc=0, HFA=100))
write.csv(sub %>% select(id, pred), 'submission.csv', row.names=FALSE)
}
Functions in Action
results <- CheckELO(K=35, Kmargin=15, R=0.1, start.season=2009, end.season=2016, gw.power=0.5)
results
## season RMSE LogLoss
## 1 2009 0.4488340 0.5897275
## 2 2010 0.4367674 0.5658740
## 3 2011 0.4571054 0.6022060
## 4 2012 0.4376569 0.5706315
## 5 2013 0.4614511 0.6103910
## 6 2014 0.4502553 0.5849283
## 7 2015 0.4223022 0.5331219
## 8 2016 0.4289351 0.5485168
results %>% filter(season>=2012) %>% summarize(rmse = mean(RMSE), logloss = mean(LogLoss))
## rmse logloss
## 1 0.4401201 0.5695179
CreateSubmissionFile(K=35, Kmargin=15, R=0.1, start.season=2009, end.season=2016, gw.power=0.5)