library(data.table)
library(ggthemes)
library(lubridate)
library(modelr)
library(ohenery)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(readr)
library(ggrepel)
library(skimr)
library(kableExtra)
library(mlogit)
library(DT)
library(visdat)
library(tidytext)
library(DataExplorer)
race_result_race <- suppressMessages(read_csv("race-result-race.csv"))
race_result_horse <- suppressMessages(read_csv("race-result-horse.csv"))
rdf <- suppressMessages(inner_join(race_result_horse, race_result_race))
tdf <- rdfjdiff <- function(x, y, t="equal", i=0, j=0){
case_when(
t == 'equal'~ if_else(x ==y,1L,0L),
t == 'rng' ~ if_else(x >= y-i & x <= y+j,1L,0L),
TRUE ~ 0L
)
}
rwm <- function(subject, d){
max <- 400
den <- 270
n <- map_dbl(d, ~ sum(between(as.double(difftime(.x, d, units = "days")), 1e-9, max) * subject * exp(as.numeric((-if_else((.x-d)<270,(.x-d)/3,(.x-d)/2)/den)))))
d <- map_dbl(d, ~ sum(between(as.double(difftime(.x, d, units = "days")), 1e-9, max) * exp(as.numeric((-if_else((.x-d)<270,(.x-d)/3,(.x-d)/2)/den)))))
if_else(is.na(n/d),0,n/d)
}
miAvg <- function(subject, dte, max){
num <- map_dbl(dte, ~ mean(between(as.double(difftime(.x, dte, units = "days")), 1e-9, max) * subject ))
den <- map_dbl(dte, ~ mean(between(as.double(difftime(.x, dte, units = "days")), 1e-9, max) ))
if_else(is.na(num/den),0,(num/den))
}[Describe data set here]
A summary of the data set provided for this analysis is set forth below:
| VARIABLE_NAME | DEFINITION | Additional Comments |
|---|---|---|
finishing_position |
Order of finish for the horse | We will focus on the first three finishing positions (win, place & show) |
horse_number |
Horse program number | Non |
horse_name |
Horse’s name | None |
horse_id |
Unique identifier | An alternative to using the horse’s name - likely to remove from dat set |
jockey |
Name of jockey that rode horse | Do good jockey’s win more races |
trainer |
Name of horse’s trainer | Do good trainers win more races |
actual_weight |
Horse true weight carried | The more weight a horse carries the slower she will run |
declared_weight |
Weight declared in racing program | The difference between declared and and actual weig |
draw |
Post position the horse broke from | Outside post position may lead to ground loss (running further) |
length_behind_winner |
# of length the horse was behind winner at finish | Smaller distance means better performance |
running_position_1 |
Position at first call | Provides information how race was run |
running_position_2 |
Position at second call | Provides information how race was run |
running_position_3 |
Position at third call | Provides information how race was run |
running_position_4 |
Position at fourth call | Provides information how race was run |
finish_time |
finishing time of the horse | How fast the race was run |
win_odds |
The horse’s win odds | Reflects publics opinion of the horse |
running_position_5 |
Position at fifth call | Provides information how race was run |
running_position_6 |
Position at sixth call | Remove |
race_id |
Unique race identifier (ie 2014-001) | None |
src |
Html page data was scraped from | Remove from data set. |
race_date |
Data of race | None |
race_course |
Course the race was run on | None |
race_number |
The race number on the particular race card | None |
race_class |
The class of the race | Reflects the estimated quality of the race |
race_distance |
The distance of the race | TBD |
track_condition |
Condition of the racing surface | TBD |
race_name |
Name of the race, if any | Remove |
track |
Race course thre race was run on | TBD |
sectional_time |
Race time splits for the race | Will need to data wrangle this variable |
incident_report |
Race comments / incidents | Explore the informational content |
Our feature engineering will focus on both normalizing variable and creating new variable. All of these efforts are aimed at improving the our models ability to pick winning wagers. The following is a table of the feature engineering performed on our race_data data.
| VARIABLE_NAME | DEFINITION | derived from |
|---|---|---|
fp_norm |
Normalized finishing_position | finishing_position |
dr_norm |
Normalized draw of the horse | draw |
fc_norm |
Normalized first call position of the horse | running_position_1 |
bl_norm |
||
log_odds |
log() of the win_odds | win_odds |
wt_rel |
weight adjustment (relative weight / 5) | actual_weight |
wt_relxdis |
wt_rel x distance of race | |
jpct |
jockey winning percentage by year | Do good jockey’s win more races |
tpct |
trainer winning percentage | Do good trainers win more races |
jcp |
odds adjusted cum probability of jockey wins | The more weight a horse carries the slower she will run |
tcp |
odds adjusted cum probability of trainer wins | The difference between declared and and actual weig |
draw |
Post position the horse broke from | Outside post position may lead to ground loss (runnin |
| fp3_rw1 | Recency-weighted_1 mean of last 3 fp_norm | |
| fp3_rw2 | Recency-weighted_2 mean of last 3 fp_norm | |
| fc3_rw1 | Recency-weighted_1 mean of last 3 fc_norm | |
| fc3_rw2 | Recency-weighted_2 mean of last 3 fc_norm |
estimated ground loss
columns <- c("one", "two", "three", "four", "five", "six", "seven")
tdf <- tdf %>%
head(n=5) %>%
select(race_id, horse_name, incident_report, finishing_position)%>%
mutate(htext = str_extract_all(incident_report,(str_c('(?<=^|\\.)[^.]*\\b',horse_name,'\\b[^.]*\\.?')))) %>%
select(-incident_report) %>%
unnest(htext)
datatable(tdf)race_data <- race_data %>%
select(-ind) %>%
mutate(track = factor(track)) %>%
mutate(pew = (1/win_odds)*.82) %>%
mutate(yr = year(race_date)) %>%
group_by(race_id) %>%
arrange(finishing_position) %>%
filter(finishing_position < 99) %>%
mutate(n = n()) %>%
mutate(fp_nrm = (finishing_position - min(finishing_position)) /(max(finishing_position)- min(finishing_position))) %>%
mutate(fc_nrm = (running_position_1 - min(running_position_1)) /(max(running_position_1)- min(running_position_1)) ) %>%
mutate(dr_nrm = (draw - min(draw)) /(max(draw)- min(draw))) %>%
mutate(bl_nrm = 0.5-(((length_behind_winner*10)/(race_distance *3.28084))-0.5)) %>%
mutate(log_odds = log(win_odds)) %>%
mutate(wt_rel =(actual_weight - sum(if_else(actual_weight <800,actual_weight, 0))/sum(if_else(actual_weight <800,1, 0)))/5) %>%
mutate(wt_relxdis = wt_rel * race_distance) %>%
ungroup() %>%
arrange(jockey) %>%
group_by(yr, jockey) %>%
mutate(jraces = n()) %>%
mutate(jwins = sum(if_else(finishing_position ==1,1,0))) %>%
mutate(jwins_ex = sum(pew)) %>%
mutate(jockeyPct = jwins / jraces) %>%
mutate(jvar = sum(pew*(1-pew))) %>%
mutate(jockeyCP = pnorm(jwins, jwins_ex, jvar^.5)) %>%
ungroup() %>%
arrange(trainer) %>%
group_by(yr, trainer) %>%
mutate(traces = n()) %>%
mutate(twins = sum(if_else(finishing_position ==1,1,0))) %>%
mutate(twins_ex = sum(pew)) %>%
mutate(trainerPct = twins / traces) %>%
mutate(tvar = sum(pew*(1-pew))) %>%
mutate(trainerCP = pnorm(twins, twins_ex, tvar^.5)) %>%
ungroup() %>%
select(-jraces, -traces, -twins, -jwins, -jwins_ex, -twins_ex, -jvar, -tvar) %>%
arrange(race_date, race_number, finishing_position) %>%
# ******************************
#Arrange for past performance view
# *******************************
group_by(horse_name) %>%
arrange(horse_name, desc(race_date)) %>%
# rest - days since last race
mutate(rest = as.double(difftime(race_date,lead(race_date,1),units="days"))) %>%
mutate(rest = if_else(is.na(rest),0,rest)) %>%
# dayssince - rest since last race - median rest
mutate(dayssince = rest-median(rest)) %>%
# races last 90 days - counts the number of races in last 90 days
mutate(raceslast90 = map_int(race_date, ~ sum(between(as.numeric(difftime(.x, race_date, units = "days")), 1e-9, 90)))) %>%
# Recency weighted normalized finish position - finish1
#mutate(finish1 = rwm(fp_nrm,race_date)) %>%
mutate(finish1 = mean(lead(fp_nrm,1),lead(fp_nrm,2),lead(fp_nrm,3), trim=0, na.rm=TRUE)) %>%
# Finish Position
mutate(finish2 = mean(lead(finishing_position,1),lead(finishing_position,2),lead(finishing_position,3), trim=0, na.rm=TRUE)) %>%
# First Call
mutate(firstcall = if_else(!is.na(lead(running_position_1,1)), lead(running_position_1,1),lag(running_position_1,1))) %>%
# First Call Distance
mutate(firstcalldist = firstcall * race_distance) %>%
# Beaten Length
mutate(finish2 = mean(lead(bl_nrm,1),lead(bl_nrm,2),lead(bl_nrm,3), trim=0, na.rm=TRUE)) %>%
# Win History
mutate(winhistory = mean(lead(winner,1),lead(winner,2),lead(winner,3), trim=0, na.rm=TRUE)) %>%
# Meters Per Second
mutate(mps = if_else(!is.na(race_distance / finish_time),race_distance/finish_time,0)) %>%
# Class Move
mutate(classMove = if_else(!is.na(as.numeric(race_class)-as.numeric(lead(race_class,1))),as.numeric(race_class)-as.numeric(lead(race_class,1)),0)) %>%
# MoveOnTurn
mutate(moveOnTurn = if_else(!is.na(as.numeric(lead(running_position_3,1))-as.numeric(lead(running_position_2,1))),as.numeric(running_position_3)-as.numeric(lead(running_position_2,1)),0)) %>%
# Recency Meters Per Second
mutate(mrMps = mean(lead(mps,1),lead(mps,2),lead(mps,3), trim=0, na.rm=TRUE)) %>%
# running style
mutate(runstyle = mean(running_position_2/field)) %>%
mutate(runstyle = case_when(
runstyle <=0.33 ~ 1L,
runstyle <=0.66 ~ 2L,
runstyle >0.66 ~ 3L,
TRUE ~ 0L
)) %>%
mutate(postDiff = case_when(
draw == 1 & runstyle == 1 ~ -4.5,
draw == 2 & runstyle == 1 ~ -1.33,
draw == 3 & runstyle == 1 ~ -4.99,
draw == 4 & runstyle == 1 ~ -6.99,
draw == 5 & runstyle == 1 ~ -2.65,
draw == 6 & runstyle == 1 ~ 6.11,
draw == 7 & runstyle == 1 ~ -1.83,
draw == 8 & runstyle == 1 ~ -3.97,
draw == 9 & runstyle == 1 ~ 4.41,
draw == 10 & runstyle == 1 ~ -1.26,
draw == 11 & runstyle == 1 ~ -5.82,
draw == 11 & runstyle == 1 ~ -5.82,
draw == 13 & runstyle == 1 ~ -8.50,
draw == 14 & runstyle == 1 ~ -8.50,
draw == 15 & runstyle == 1 ~ -8.50,
draw == 16 & runstyle == 1 ~ -8.50,
draw == 1 & runstyle == 2 ~ 1.11,
draw == 2 & runstyle == 2 ~ -2.03,
draw == 3 & runstyle == 2 ~ 1.03,
draw == 4 & runstyle == 2 ~ 1.13,
draw == 5 & runstyle == 2 ~ -4.67,
draw == 6 & runstyle == 2 ~ 0.66,
draw == 7 & runstyle == 2 ~ -2.75,
draw == 8 & runstyle == 2 ~ -2.32,
draw == 9 & runstyle == 2 ~ 0.31,
draw == 10 & runstyle == 2 ~ 3.48,
draw == 11 & runstyle == 2 ~ 0.79,
draw == 11 & runstyle == 2 ~ 1.51,
draw == 13 & runstyle == 2 ~ 4.59,
draw == 14 & runstyle == 2 ~ 4.59,
draw == 15 & runstyle == 2 ~ 4.59,
draw == 16 & runstyle == 2 ~ 4.59,
draw == 1 & runstyle == 3 ~ 3.04,
draw == 2 & runstyle == 3 ~ -2.72,
draw == 3 & runstyle == 3 ~ 8.34,
draw == 4 & runstyle == 3 ~ 0.81,
draw == 5 & runstyle == 3 ~ 1.03,
draw == 6 & runstyle == 3 ~ -2.38,
draw == 7 & runstyle == 3 ~ -6.66,
draw == 8 & runstyle == 3 ~ 0.84,
draw == 9 & runstyle == 3 ~ 1.16,
draw == 10 & runstyle == 3 ~ -0.88,
draw == 11 & runstyle == 3 ~ -3.22,
draw == 11 & runstyle == 3 ~ 2.82,
draw == 13 & runstyle == 3 ~ -1.01,
draw == 14 & runstyle == 3 ~ -1.01,
draw == 15 & runstyle == 3 ~ -1.01,
draw == 16 & runstyle == 3 ~ -1.01,
TRUE ~ 0
)) %>%
# eft
mutate(eft = (race_distance + ((wt_rel + postDiff)*0.30))/ mrMps) %>%
ungroup() %>%
group_by(race_id) %>%
arrange(race_id, race_number, finishing_position) %>%
group_by(race_id) %>%
mutate(raceShape = if_else(!is.na(lead(runstyle,1)),runstyle/mean(runstyle,trim=0,na.rm=TRUE),0)) %>%
filter(horse_name !="No Horse") %>%
select(-new_horse_number, -altkey, -datekey) %>%
filter(!is.na(eft)) %>%
ungroup() %>%
select(race_id, finishing_position, win_odds, log_odds, wt_rel,wt_relxdis, jockeyCP, jockeyPct, trainerCP, trainerPct, rest, raceslast90, finish1, finish2, firstcall, firstcalldist, winhistory, mps, classMove, moveOnTurn, mrMps, runstyle, postDiff, eft, raceShape, pew, horse_name, row_nbr, winner) %>%
mutate(row_nbr = race_id) %>%
mutate(row_nbr = str_replace(row_nbr,'(-)','')) %>%
mutate(row_nbr = as.numeric(row_nbr))
vis_dat(race_data, warn_large_data = FALSE)
skim(race_data)
plot_bar(race_data)
plot_histogram(race_data)
plot_boxplot(race_data, by = "winner")plot_bar(race_data)
AVTM <- race_data %>%
group_by(race_id) %>%
filter(finishing_position <= 3) %>%
select(race_id, race_date, race_number, finishing_position, actual_weight, finish_time, race_course, race_class, track_condition, track, race_distance) %>%
mutate(avtm = mean(finish_time)) %>%
filter(finishing_position == 1) %>%
mutate(race_course =factor(race_course)) %>%
mutate(race_class =factor(race_class)) %>%
mutate(race_class =fct_other(race_class,keep=c("Class 4","Class 1","Class 2","Class 3","Class 5"),other_level="Class 0")) %>%
mutate(track_condition = factor(track_condition)) %>%
mutate(track_condition = fct_other(track_condition,keep=c("GOOD","GOOD TO FIRM","FAST"),other_level="OFF")) %>%
mutate(race_distance = factor(race_distance)) %>%
ungroup()
plot_bar(AVTM)
plot_histogram(AVTM)
plot_boxplot(AVTM, by = "avtm")
skim(AVTM)
mod_time <- lm(avtm ~ race_distance + race_class + actual_weight + race_course + track , data = AVTM)
summary(mod_time)
AVTM <- AVTM %>%
add_predictions(mod_time, "patm") %>%
add_residuals(mod_time, "dres") %>%
mutate(dres = dres / as.numeric(race_distance)) %>%
group_by(race_date,track) %>%
mutate(avg_dres = mean(dres)) %>%
mutate(tnf = avg_dres * as.numeric(race_distance)) %>%
ungroup()
mod_time2 <- lm(avtm ~ race_distance + race_class + actual_weight + race_course + track + tnf , data = AVTM)
summary(mod_time2)
AVTM <- AVTM %>%
add_predictions(mod_time2, "patm2")
AVTM <- AVTM %>%
mutate(true_tnf = tnf) %>%
mutate(tnf = 0) %>%
mutate(race_class = 'Class 3') %>%
add_predictions(mod_time2, "base_time") %>%
select(race_id, true_tnf, base_time)
race_data <- race_data %>%
inner_join(AVTM, by="race_id") %>%
mutate(rsfactor = factor(runstyle)) %>%
mutate(snt = ((finish_time+ true_tnf)/base_time)-1) %>%
arrange(race_date, race_number, finishing_position) %>%
mutate(recencySNT = rwm(snt,race_date)) %>%
mutate(recencyTime = rwm(finish_time, race_date)) %>%
mutate(recencyDistance = rwm(race_distance, race_date)) %>%
mutate(estTime = (race_distance + postDiff + wt_rel)/(recencyTime/recencyDistance))
skim(race_data)
datatable(head(race_data, n =50))options("scipen"=100, "digits"=4)
training_data %>%
filter(finishing_position < 99) %>% # Remove dummy rows
group_by(race_id) %>%
ungroup()
ha_mod0 <- hensm(finishing_position ~ log_odds + wt_rel + finish1 + classMove + firstcall + firstcalldist + winhistory + postDiff + moveOnTurn + mrMps + runstyle + eft + raceShape, data = training_data, group = race_id)
#ha_mod0 <- hensm(finishing_position ~ log_odds + wt_rel + jockeyCP + jockeyPct + trainerCP + trainerPct + finish1 + finish2 + firstcall + firstcalldist + #winhistory + postDiff + classMove + moveOnTurn + mrMps + runstyle + eft + raceShape, data = training_data, group = race_id)
# print(he_mod0)
print(ha_mod0)
# Second round of modeling to remove bias caused by use public odds
prd <- holdout_data %>%
mutate(prd_mu=as.numeric(predict(ha_mod0,newdata=.,group=race_id,type='mu'))) %>%
select(race_id, horse_name, finishing_position, pew, win_odds, prd_mu) %>%
mutate(er = prd_mu * win_odds)
kable(head(prd,n=50), digits = 4, format.args = list(scientific = FALSE)) %>%
kable_styling()
#print(prd, n=1000)prd <- prd %>%
rename(div = win_odds) %>%
mutate(bet = if_else(er>1,1,0)) %>%
mutate(result = if_else(bet==0,0,if_else(finishing_position==1,div,-1))) %>%
mutate(wins = if_else(result>0,1,0)) %>%
mutate(ttlwins = sum(wins)) %>%
mutate(nbr_bets =accumulate(bet, `+`, .init = 0)[-1]) %>%
mutate(bal = accumulate(result, `+`, .init = 100)[-1])
ggplot(prd, aes(x=nbr_bets, y =log(bal/100), group=1)) +
geom_line() +
theme_fivethirtyeight() +
ggtitle("Holdout Betting Strategy Results") +
theme(axis.title = element_text()) + xlab('Number of Bets') + ylab('log(wealth / initial wealth)')
kable(tail(prd,10)) %>%
kable_styling()