THE DATA

library(dplyr); library(tidyr)

reg <- read.csv('/home/rstudioshared/shared_files/data/RegularSeasonCompactResults.csv')
tourney <- read.csv('/home/rstudioshared/shared_files/data/TourneyCompactResults.csv')
teams <- read.csv('/home/rstudioshared/shared_files/data/teams.csv')
sub <- read.csv('/home/rstudioshared/shared_files/data/sample_submission.csv')


reg1 <- reg %>% rename(team = Wteam, opp.team=Lteam, score=Wscore, opp.score=Lscore) %>% mutate(win=1, loc= (Wloc=="H") - (Wloc=="A"))
reg2 <- reg %>% rename(team= Lteam, opp.team=Wteam, score=Lscore, opp.score=Wscore) %>% mutate(win=0, loc= (Wloc=="A") - (Wloc=="H"))
reg <- rbind(reg1, reg2)

Basic Functions

RMSE <- function(predictions, actuals){
  sqrt(mean((predictions-actuals)^2))
}

LogLoss <- function(predictions, actuals){
  (-1/length(predictions)) * sum (actuals * log(predictions) + (1-actuals)*log(1-predictions))
}

GameWeight <- function(rating, opp.rating, loc, HFA, power=0.5){
  (dnorm(qnorm(Ewins(rating, opp.rating, loc, HFA)))/dnorm(0))^power}

MarginWins <- function(margin){pnorm(margin/10)}

Ewins <- function(rating, opp.rating, loc=0, HFA=100)
{ 1/(1 + 10^((opp.rating-rating-loc*HFA)/400))}

RatingAdjust <- function(rating, opp.rating, wins, K=32, loc, HFA=100, margin=0, Kmargin=0, gw.power=0.5){
  rating + (K*(wins - Ewins(rating, opp.rating, loc, HFA)) + 
          Kmargin*(MarginWins(margin) - Ewins(rating, opp.rating, loc, HFA)))*
        GameWeight(rating=rating, opp.rating=opp.rating, loc=loc, HFA=HFA, power=gw.power)
        
}

The Work Horse

updateELO <- function(df, K = 25, Kmargin=0, start.elos=NA, HFA=100, gw.power=0){
  d <- df
  print(nrow(d))
  d$elo.start <- NA
  d$elo.end <- NA
  d$opp.elo.start <- NA
  d$opp.elo.end <- NA
  d <- d %>% arrange(Daynum)
  
  if (!is.data.frame(start.elos)) {
    d[!duplicated(d$team),"elo.start"] <- 1500
    d[!duplicated(d$team),"opp.elo.start"] <- 1500
  } else {
    n<-start.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"team"],start.elos$team),]$elo.end
    n[is.na(n)] <- 1500
    d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),]$elo.start <- n
    n<-start.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"opp.team"],start.elos$team),]$elo.end
    n[is.na(n)] <- 1500
    d[is.na(d$opp.elo.end),][!duplicated(d[is.na(d$opp.elo.end),]$team),]$opp.elo.start<-n
  }
  d <- d %>% mutate(elo.end=RatingAdjust(elo.start, opp.elo.start, wins=win, K=K, margin=score-opp.score, loc=loc, Kmargin=Kmargin, gw.power=gw.power))
  d <- d %>% mutate(opp.elo.end=RatingAdjust(opp.elo.start, elo.start, wins=1-win, K=K, margin=score-opp.score,loc=loc, Kmargin=Kmargin, gw.power=gw.power))
  temp.elos <- d %>% filter(!is.na(elo.end)) %>% group_by(team) %>% top_n(1, Daynum) %>% select(elo.end)
  
  while(sum(is.na(d$elo.end))>0){
    n<-temp.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"team"],temp.elos$team),]$elo.end
    d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),]$elo.start <- n[!is.na(n)]
    n<-temp.elos[match(d[is.na(d$elo.end),][!duplicated(d[is.na(d$elo.end),]$team),"opp.team"],temp.elos$team),]$elo.end
    d[is.na(d$opp.elo.end),][!duplicated(d[is.na(d$opp.elo.end),]$team),]$opp.elo.start<-n[!is.na(n)]
    d <- d %>% mutate(elo.end=RatingAdjust(elo.start, opp.elo.start, wins=win, K=K, margin=score-opp.score, loc=loc, HFA=HFA, Kmargin=Kmargin, gw.power=gw.power))
    d <- d %>% mutate(opp.elo.end=RatingAdjust(opp.elo.start, elo.start, wins=1-win, K=K, margin=score-opp.score,loc=loc, HFA=HFA, Kmargin=Kmargin, gw.power=gw.power))
    temp.elos <- d %>% filter(!is.na(elo.end)) %>% group_by(team) %>% top_n(1, Daynum) %>% select(elo.end)
  }
  final.elos <- d %>% group_by(team) %>% top_n(1, Daynum) %>% select(elo.end)
  return(final.elos %>% arrange(desc(elo.end)))
}

Functions for Regressing towards the mean, Evaluating Predictions and Submitting to Kaggle

RegressElos <- function(df, regression=0.5){
  df[,2] <- regression*1500 + (1-regression)*df[,2]
  return(df)
}



EvaluateELO <- function(elo_df, tourney_df, season){
  t_df <- tourney_df %>% filter(Season==season) %>% mutate(win=1, loc=0)
  t_df <- left_join(t_df, elo_df, by=c("Wteam"="team"))
  t_df <- left_join(t_df, elo_df, by=c("Lteam"="team"))
  t_df <- t_df %>% mutate(prediction = Ewins(elo.end.x, elo.end.y))
print(t_df %>% summarize(rmse = RMSE(prediction, win), logloss = LogLoss(prediction,win)))
}

CheckELO <- function(K=20, Kmargin=20, R=0.1, start.season=2012, end.season=2016, gw.power=0){
  results <- data.frame(season=numeric(), RMSE=numeric(), LogLoss=numeric())
  f <- updateELO(reg %>% filter(Season==start.season), K=K, Kmargin=Kmargin, gw.power=0)
  res <- EvaluateELO(f, tourney, season=start.season)
  results[1,] <- as.numeric(c(start.season, res))
  if(length(start.season:end.season)>1){
    for (i in seq_along((start.season:end.season)[-1])){
      f <- RegressElos(f, R)
      f <- updateELO(reg %>% filter(Season==start.season+i), K=K, Kmargin=Kmargin, gw.power=0,start.elos=f)
      res <- EvaluateELO(f, tourney, season=start.season+i)
      results[1+i,] <- as.numeric(c(start.season+i, res))
    }
      
  }
  return(results)
}

CreateSubmissionFile <- function(K=20, Kmargin=20, R=0.1, start.season=2010, end.season=2016, gw.power=0){
  sub <- sub[,c("id", "pred")]  %>% separate(id, into=c("year", "team", "opp.team"), sep="_", remove=FALSE)
  elos.by.year <- data.frame(year=character(), team = character(), elo=numeric())
  f <- updateELO(reg %>% filter(Season==start.season), K=K, Kmargin=Kmargin, gw.power=gw.power)
  elos.by.year <- rbind(elos.by.year, data.frame(year=as.character(start.season), team=as.character(f$team), elo=f$elo.end))
  if(length(start.season:end.season)>1){
    for (i in seq_along((start.season:end.season)[-1])){
      f <- RegressElos(f, R)
      f <- updateELO(reg %>% filter(Season==start.season+i), K=K, Kmargin=Kmargin, gw.power=gw.power, start.elos=f)
      elos.by.year <- rbind(elos.by.year, data.frame(year=as.character(start.season+i), team=as.character(f$team), elo=f$elo.end))
    }
  }
  sub <- left_join(sub, elos.by.year, by=c("year", "team"))
  sub <- left_join(sub, elos.by.year, by=c("year", "opp.team" = "team"))
  sub <- sub %>% mutate(pred = Ewins(elo.x, elo.y, loc=0, HFA=100))
  write.csv(sub %>% select(id, pred), 'submission.csv', row.names=FALSE)
  
}

Functions in Action

  1. Testing a combination of parameters:
results <- CheckELO(K=35, Kmargin=15, R=0.1, start.season=2009, end.season=2016, gw.power=0.5)
results
##   season      RMSE   LogLoss
## 1   2009 0.4488340 0.5897275
## 2   2010 0.4367674 0.5658740
## 3   2011 0.4571054 0.6022060
## 4   2012 0.4376569 0.5706315
## 5   2013 0.4614511 0.6103910
## 6   2014 0.4502553 0.5849283
## 7   2015 0.4223022 0.5331219
## 8   2016 0.4289351 0.5485168
results %>% filter(season>=2012) %>% summarize(rmse = mean(RMSE), logloss = mean(LogLoss))
##        rmse   logloss
## 1 0.4401201 0.5695179
  1. Making a Submission File
CreateSubmissionFile(K=35, Kmargin=15, R=0.1, start.season=2009, end.season=2016, gw.power=0.5)