EC313 Forecasting Project: Formula 1 Race Outcomes

Chris Tucker

25/03/2015

Introduction

Formula 1 is big business with budgets for the top teams bordering £0.5 billion.
Data collection, cleaning & manipulation, constructing a model, training it on the 2014 season
The 2014 season saw Mercedes walk away with the constructors title and Hamilton winning the drivers championship.
This project will aim to forecast the outcome of races (top 10 positions only).

Literature/Context

Human/Judgemental forecasts

Forecasting model will be better than judgemental forecasts?

Other predictions/forecasts

Bookmaker Prices

Season odds vs Race odds, Spearmans rank of 0.953

Issues/Limitations

5 drivers with only 1 F1 race experience
No driver has raced at the Mexico City circuit

Data & Method

Driver ELO calculated, subjective, weighted to those who have driven more races. Autosport also have a driver ranking system but that is flawed team/car factors and DNF’s and the data is only available for 2014.

f1.4mula1 <- read.csv("C:\\Users\\User1\\Documents\\4mula1.csv", stringsAsFactors=FALSE)
f1.4mula1$date <- as.Date(paste(f1.4mula1$date,f1.4mula1$year,sep=" "),"%d %b %Y")
f1.4mula1 <- f1.4mula1[order(f1.4mula1$driver0,f1.4mula1$date),]
f1.4mula1$driver.exp <- sequence(rle(f1.4mula1$driver0)$lengths)-1
f1.4mula1 <- f1.4mula1[order(f1.4mula1$team0,f1.4mula1$date),]
f1.4mula1$team.exp <- sequence(rle(f1.4mula1$team0)$lengths)-1
f1.4mula1$id <- paste(f1.4mula1$year,f1.4mula1$date,f1.4mula1$race0,sep="-")
f1.4mula1$pos2 <- f1.4mula1$position
f1.4mula1$pos2[regexpr("^\\d+ laps",f1.4mula1$time)>-1] <- 1000000
f1_races <- f1.4mula1[duplicated(f1.4mula1[,c("id")])==F,c("id","date")]
f1_races <- f1_races[order(f1_races$id),]
f1.4mula1$ones <- 1
f1_races.field <- aggregate(f1.4mula1$ones,by=list(f1.4mula1$id),FUN=sum)
colnames(f1_races.field) <- gsub("Group.1","id",colnames(f1_races.field))
colnames(f1_races.field) <- gsub("x","field",colnames(f1_races.field))
f1.4mula1 <- merge(f1.4mula1,f1_races.field,by=c("id"),all.x=T)
f1.4mula1$ones <- NULL
f1_elo <- data.frame("id"=f1_races$id,stringsAsFactors=F) #date = substr(f1_races$id,6,15)
f1_outcome <- data.frame("id"=f1_races$id,stringsAsFactors=F)
drivers <- f1.4mula1$driver0[duplicated(f1.4mula1$driver0)==F]
#next line creates a teams variable - would love to do elo for teams but no time
#teams <- f1.4mula1$team0[duplicated(f1.4mula1$team0)==F]
for (i in drivers) {
f1_elo[[i]] <- rep(1000,nrow(f1_races))
f1_outcome[[i]] <- rep(NA,nrow(f1_races))
}
#this creates outcome in right formula for elo calculation
f1.4mula1$outcome <- (f1.4mula1$field-f1.4mula1$position)/(f1.4mula1$field*(f1.4mula1$field-1)/2)
#empty variable for recent form variables
f1.4mula1$last.pos <- NA
f1.4mula1$last.5.pos <- NA
#finally make sure data ordered correctly for Elo calculation
f1.4mula1 <- f1.4mula1[order(f1.4mula1$id),]
for (i in 1:NROW(f1_races)) { #loop to calculate elo scores, updating for every race
#print(i/NROW(f1_races))
full.field <- f1.4mula1[f1.4mula1$id==f1_races$id[i],c("driver0","position","outcome")]
field <- NROW(full.field)
#first update
for(d in 1:field) { #
if(i>1) {
f1.4mula1$last.pos[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <-
f1_outcome[i-1,full.field$driver0[d]]
}
if(i>5) {
f1.4mula1$last.5.pos[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <-
mean(f1_outcome[c(i-6,i-1),full.field$driver0[d]],na.rm=T)
}
}
#now do elo
for(d in 1:field) { #need to get strengths for every driver in race
f1_races[i,paste("R",d,sep=".")] <- f1_elo[i,full.field$driver0[d]]
f1.4mula1$Elo[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <- f1_elo[i,full.field$driver0[d]]
}
for(d in 1:field) { #now to calculated expected finish
rest <- f1_races[i,grep("R[.]",colnames(f1_races))]
rest <- rest[-grep(paste("R.",d,"$",sep=""),colnames(rest))]
f1_races[i,paste("E",d,sep=".")] <- sum(1/(1+(10^((rest-f1_races[i,paste("R",d,sep=".")])/400))),na.rm=T)/(field*(field-1)/2)
f1.4mula1$Exp.pos[f1.4mula1$driver0==full.field$driver0[d] & f1.4mula1$id==f1_races$id[i]] <- f1_races[i,paste("E",d,sep=".")]
f1_races[i,paste("A",d,sep=".")] <- full.field$outcome[d]
}
if (i<nrow(f1_elo)) {
for(d in 1:field) { #need to get strengths for every driver in race
if(is.na(f1_races[i,paste("A",d,sep=".")])==F){
f1_elo[(i+1):nrow(f1_elo),full.field$driver0[d]] <- rep(f1_races[i,paste("R",d,sep=".")] + 40*(f1_races[i,paste("A",d,sep=".")] - f1_races[i,paste("E",d,sep=".")]),nrow(f1_elo)-i)
f1_outcome[i,full.field$driver0[d]] <- full.field$position[d]
}
}
}
}
f1_elo$raceid<-c(1:917)
plot(f1_elo$raceid,f1_elo$lewis_hamilton,type="l",col=1,xlim=range(650,917),xlab="Races over time",ylab="ELO",main=paste("ELO of Drivers",sep=""),ylim=range(970,1150),xaxt="n")
lines(f1_elo$raceid,f1_elo$sebastian_vettel,type="l",col=2)
lines(f1_elo$raceid,f1_elo$fernando_alonso,type="l",col=3)
lines(f1_elo$raceid,f1_elo$nico_rosberg,type="l",col=4)
legend("topleft",pch=c(15),ncol=4,col=c(1:4),
legend=c("Hamilton","Vettel","Alonso","Rosberg"),bty="n")

An Ordered Logistic Model will be built to determine the probabilities of each driver finishing in each position in the top 10 using various factors to determine race outcome, including:

Driver ELO
Likelihood of DNF
Grid Position
Team and driver experience
Most recent finishing positions (driver form)

Results

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     drivers

f1.reg <- lm(position ~ driver.exp + team.exp + last.pos + Elo + Exp.pos+grid, 
             data=f1.4mula1[f1.4mula1$year<2015,],na.action=na.exclude)
forecast.race <- f1.4mula1[f1.4mula1$year==2015,]
forecast.race$"(Intercept)" <- 1
forecast.race$predictions <- predict(f1.reg,forecast.race)
model.ord <- polr(as.factor(pos2) ~ driver.exp + team.exp + last.pos + Elo + Exp.pos +grid, 
                  data=f1.4mula1[f1.4mula1$year<2015,], method = "logistic")
Phat.2    <- predict(model.ord, newdata=forecast.race, type="probs")
forecast.race$P1 <- Phat.2[,1]
forecast.race$P2 <- Phat.2[,2]
forecast.race$P3 <- Phat.2[,3]
forecast.race$P4 <- Phat.2[,4]
forecast.race$P5 <- Phat.2[,5]
forecast.race$P6 <- Phat.2[,6]
forecast.race$P7 <- Phat.2[,7]
forecast.race$P8 <- Phat.2[,8]
forecast.race$P9 <- Phat.2[,9]
forecast.race$P10 <- Phat.2[,10]
forecast.race$Pother <- rowSums(Phat.2[,11:c(NCOL(Phat.2)-1)])
forecast.race$PDNF <- Phat.2[,NCOL(Phat.2)]
forecast.race$p1rank <- rank(-forecast.race$P1)
forecast.race$drid<-c(1:18)
plot(forecast.race$drid,forecast.race$P1,ylim=range(0,0.4),col=2,pch=15,type="p",xaxt="n",xlab="",main=paste("Forecasts of Driver Positions for Australia 2015",sep=""),ylab="Probability of Outcome")
for(i in colnames(forecast.race)[grep("[.]h",colnames(forecast.race))]) {
  for(j in forecast.race$drid) {
    lines(j,1/forecast.race[forecast.race$drid==j,i],col="darkred",pch=0,type="p",cex=0.5)
  }  
} 
lines(forecast.race$drid,forecast.race$P2,col=3,pch=16,type="p")
for(i in colnames(forecast.race)[grep("[.]h",colnames(forecast.race))]) {
  for(j in forecast.race$drid) {
    lines(j,1/forecast.race[forecast.race$drid==j,i],col="darkgreen",pch=0,type="p",cex=0.5)
  }  
}
lines(forecast.race$drid,forecast.race$P3,col=4,pch=17,type="p")
for(i in colnames(forecast.race)[grep("[.]h",colnames(forecast.race))]) {
  for(j in forecast.race$drid) {
    lines(j,1/forecast.race[forecast.race$drid==j,i],col="darkgreen",pch=0,type="p",cex=0.5)
  }  
}
legend("topleft",ncol=3,pch=c(15,16,17),col=c(2:4),
       legend=c("Prob(P1)","Prob(P2)","Prob(P3)"),bty="n")
axis(1,at=forecast.race$drid,labels=paste(forecast.race$driver1),las=2,cex.axis=0.65)

Comparison of Forecast vs Actual

df<-data.frame("Driver"=forecast.race$driver1,"Forecast Pos."=forecast.race$p1rank,"Actual Pos"=forecast.race$position,"Difference"=(forecast.race$p1rank - forecast.race$position))
df<-df[df$Actual.Pos <= 11, ]
df<-df[df$Forecast.Pos. <= 15, ]
plot(df$Forecast.Pos.,df$Actual.Pos,xlim=range(1:15),ylim=range(1,11),xlab="Forecasted Position",ylab="Actual Position",main=paste("Forecasted vs. Actual for Australia 2015",sep=""))
abline(0,1)

Rsquare = 0.484

Conclusions and Next Steps

Replicate using 2014 as a test year - Will Hamilton and Mercedes be as dominant?

Looking to extend the basic model by forecasting grid positions so that future races can be forecast more than a day in advance, and use model selection techniques to include some of the following factors:

Home Advantage
In Race Penalties
Number of race wins for drivers/teams
Ability of drivers to overtake on the first lap
Others factors including, safety cars, pit-stop times, team budget

Try and beat judgemental forecasts!