EC313 Forecasting Project: Formula 1 Race Outcomes

Chris Tucker

25/03/2015

Introduction

Literature/Context

Human/Judgemental forecasts

Other predictions/forecasts

Bookmaker Prices

Issues/Limitations

Data & Method

Driver ELO calculated, subjective, weighted to those who have driven more races. Autosport also have a driver ranking system but that is flawed team/car factors and DNF’s and the data is only available for 2014.

f1.4mula1 <- read.csv("C:\\Users\\User1\\Documents\\4mula1.csv", stringsAsFactors=FALSE)
f1.4mula1$date <- as.Date(paste(f1.4mula1$date,f1.4mula1$year,sep=" "),"%d %b %Y")
f1.4mula1 <- f1.4mula1[order(f1.4mula1$driver0,f1.4mula1$date),]
f1.4mula1$driver.exp <- sequence(rle(f1.4mula1$driver0)$lengths)-1
f1.4mula1 <- f1.4mula1[order(f1.4mula1$team0,f1.4mula1$date),]
f1.4mula1$team.exp <- sequence(rle(f1.4mula1$team0)$lengths)-1
f1.4mula1$id <- paste(f1.4mula1$year,f1.4mula1$date,f1.4mula1$race0,sep="-")
f1.4mula1$pos2 <- f1.4mula1$position
f1.4mula1$pos2[regexpr("^\\d+ laps",f1.4mula1$time)>-1] <- 1000000
f1_races <- f1.4mula1[duplicated(f1.4mula1[,c("id")])==F,c("id","date")]
f1_races <- f1_races[order(f1_races$id),]
f1.4mula1$ones <- 1
f1_races.field <- aggregate(f1.4mula1$ones,by=list(f1.4mula1$id),FUN=sum)
colnames(f1_races.field) <- gsub("Group.1","id",colnames(f1_races.field))
colnames(f1_races.field) <- gsub("x","field",colnames(f1_races.field))
f1.4mula1 <- merge(f1.4mula1,f1_races.field,by=c("id"),all.x=T)
f1.4mula1$ones <- NULL
f1_elo <- data.frame("id"=f1_races$id,stringsAsFactors=F) #date = substr(f1_races$id,6,15)
f1_outcome <- data.frame("id"=f1_races$id,stringsAsFactors=F)
drivers <- f1.4mula1$driver0[duplicated(f1.4mula1$driver0)==F]
#next line creates a teams variable - would love to do elo for teams but no time
#teams <- f1.4mula1$team0[duplicated(f1.4mula1$team0)==F]
for (i in drivers) {
f1_elo[[i]] <- rep(1000,nrow(f1_races))
f1_outcome[[i]] <- rep(NA,nrow(f1_races))
}
#this creates outcome in right formula for elo calculation
f1.4mula1$outcome <- (f1.4mula1$field-f1.4mula1$position)/(f1.4mula1$field*(f1.4mula1$field-1)/2)
#empty variable for recent form variables
f1.4mula1$last.pos <- NA
f1.4mula1$last.5.pos <- NA
#finally make sure data ordered correctly for Elo calculation
f1.4mula1 <- f1.4mula1[order(f1.4mula1$id),]
for (i in 1:NROW(f1_races)) { #loop to calculate elo scores, updating for every race
#print(i/NROW(f1_races))
full.field <- f1.4mula1[f1.4mula1$id==f1_races$id[i],c("driver0","position","outcome")]
field <- NROW(full.field)
#first update
for(d in 1:field) { #
if(i>1) {
f1.4mula1$last.pos[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <-
f1_outcome[i-1,full.field$driver0[d]]
}
if(i>5) {
f1.4mula1$last.5.pos[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <-
mean(f1_outcome[c(i-6,i-1),full.field$driver0[d]],na.rm=T)
}
}
#now do elo
for(d in 1:field) { #need to get strengths for every driver in race
f1_races[i,paste("R",d,sep=".")] <- f1_elo[i,full.field$driver0[d]]
f1.4mula1$Elo[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <- f1_elo[i,full.field$driver0[d]]
}
for(d in 1:field) { #now to calculated expected finish
rest <- f1_races[i,grep("R[.]",colnames(f1_races))]
rest <- rest[-grep(paste("R.",d,"$",sep=""),colnames(rest))]
f1_races[i,paste("E",d,sep=".")] <- sum(1/(1+(10^((rest-f1_races[i,paste("R",d,sep=".")])/400))),na.rm=T)/(field*(field-1)/2)
f1.4mula1$Exp.pos[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <- f1_races[i,paste("E",d,sep=".")]
f1_races[i,paste("A",d,sep=".")] <- full.field$outcome[d]
}
if (i<nrow(f1_elo)) {
for(d in 1:field) { #need to get strengths for every driver in race
if(is.na(f1_races[i,paste("A",d,sep=".")])==F){
f1_elo[(i+1):nrow(f1_elo),full.field$driver0[d]] <- rep(f1_races[i,paste("R",d,sep=".")] + 40*(f1_races[i,paste("A",d,sep=".")] - f1_races[i,paste("E",d,sep=".")]),nrow(f1_elo)-i)
f1_outcome[i,full.field$driver0[d]] <- full.field$position[d]
}
}
}
}
f1_elo$raceid<-c(1:917)
plot(f1_elo$raceid,f1_elo$lewis_hamilton,type="l",col=1,xlim=range(650,917),xlab="Races over time",ylab="ELO",main=paste("ELO of Drivers",sep=""),ylim=range(970,1150),xaxt="n")
lines(f1_elo$raceid,f1_elo$sebastian_vettel,type="l",col=2)
lines(f1_elo$raceid,f1_elo$fernando_alonso,type="l",col=3)
lines(f1_elo$raceid,f1_elo$nico_rosberg,type="l",col=4)
legend("topleft",pch=c(15),ncol=4,col=c(1:4),
legend=c("Hamilton","Vettel","Alonso","Rosberg"),bty="n")

An Ordered Logistic Model will be built to determine the probabilities of each driver finishing in each position in the top 10 using various factors to determine race outcome, including:

Results

Results

library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     drivers
f1.reg <- lm(position ~ driver.exp + team.exp + last.pos + Elo + Exp.pos+grid, 
             data=f1.4mula1[f1.4mula1$year<2015,],na.action=na.exclude)
forecast.race <- f1.4mula1[f1.4mula1$year==2015,]
forecast.race$"(Intercept)" <- 1
forecast.race$predictions <- predict(f1.reg,forecast.race)
model.ord <- polr(as.factor(pos2) ~ driver.exp + team.exp + last.pos + Elo + Exp.pos +grid, 
                  data=f1.4mula1[f1.4mula1$year<2015,], method = "logistic")
Phat.2    <- predict(model.ord, newdata=forecast.race, type="probs")
forecast.race$P1 <- Phat.2[,1]
forecast.race$P2 <- Phat.2[,2]
forecast.race$P3 <- Phat.2[,3]
forecast.race$P4 <- Phat.2[,4]
forecast.race$P5 <- Phat.2[,5]
forecast.race$P6 <- Phat.2[,6]
forecast.race$P7 <- Phat.2[,7]
forecast.race$P8 <- Phat.2[,8]
forecast.race$P9 <- Phat.2[,9]
forecast.race$P10 <- Phat.2[,10]
forecast.race$Pother <- rowSums(Phat.2[,11:c(NCOL(Phat.2)-1)])
forecast.race$PDNF <- Phat.2[,NCOL(Phat.2)]
forecast.race$p1rank <- rank(-forecast.race$P1)
forecast.race$drid<-c(1:18)
plot(forecast.race$drid,forecast.race$P1,ylim=range(0,0.4),col=2,pch=15,type="p",xaxt="n",xlab="",main=paste("Forecasts of Driver Positions for Australia 2015",sep=""),ylab="Probability of Outcome")
for(i in colnames(forecast.race)[grep("[.]h",colnames(forecast.race))]) {
  for(j in forecast.race$drid) {
    lines(j,1/forecast.race[forecast.race$drid==j,i],col="darkred",pch=0,type="p",cex=0.5)
  }  
} 
lines(forecast.race$drid,forecast.race$P2,col=3,pch=16,type="p")
for(i in colnames(forecast.race)[grep("[.]h",colnames(forecast.race))]) {
  for(j in forecast.race$drid) {
    lines(j,1/forecast.race[forecast.race$drid==j,i],col="darkgreen",pch=0,type="p",cex=0.5)
  }  
}
lines(forecast.race$drid,forecast.race$P3,col=4,pch=17,type="p")
for(i in colnames(forecast.race)[grep("[.]h",colnames(forecast.race))]) {
  for(j in forecast.race$drid) {
    lines(j,1/forecast.race[forecast.race$drid==j,i],col="darkgreen",pch=0,type="p",cex=0.5)
  }  
}
legend("topleft",ncol=3,pch=c(15,16,17),col=c(2:4),
       legend=c("Prob(P1)","Prob(P2)","Prob(P3)"),bty="n")
axis(1,at=forecast.race$drid,labels=paste(forecast.race$driver1),las=2,cex.axis=0.65)

Comparison of Forecast vs Actual

df<-data.frame("Driver"=forecast.race$driver1,"Forecast Pos."=forecast.race$p1rank,"Actual Pos"=forecast.race$position,"Difference"=(forecast.race$p1rank - forecast.race$position))
df<-df[df$Actual.Pos <= 11, ]
df<-df[df$Forecast.Pos. <= 15, ]
plot(df$Forecast.Pos.,df$Actual.Pos,xlim=range(1:15),ylim=range(1,11),xlab="Forecasted Position",ylab="Actual Position",main=paste("Forecasted vs. Actual for Australia 2015",sep=""))
abline(0,1)

Conclusions and Next Steps

Replicate using 2014 as a test year - Will Hamilton and Mercedes be as dominant?

Looking to extend the basic model by forecasting grid positions so that future races can be forecast more than a day in advance, and use model selection techniques to include some of the following factors:

Try and beat judgemental forecasts!