library(data.table)
library(ggthemes)
library(lubridate)
library(modelr)
library(ohenery)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(readr)
library(ggrepel)
library(skimr)
library(kableExtra)
library(mlogit)
library(DT)
library(visdat)
library(tidytext)
library(DataExplorer)
race_result_race <- suppressMessages(read_csv("race-result-race.csv"))
race_result_horse <- suppressMessages(read_csv("race-result-horse.csv"))
rdf <- suppressMessages(inner_join(race_result_horse, race_result_race))
tdf <- rdf

Helper Functions

Analysis


Data

[Describe data set here]

A summary of the data set provided for this analysis is set forth below:

VARIABLE_NAME DEFINITION Additional Comments
finishing_position Order of finish for the horse We will focus on the first three finishing positions (win, place & show)
horse_number Horse program number Non
horse_name Horse’s name None
horse_id Unique identifier An alternative to using the horse’s name - likely to remove from dat set
jockey Name of jockey that rode horse Do good jockey’s win more races
trainer Name of horse’s trainer Do good trainers win more races
actual_weight Horse true weight carried The more weight a horse carries the slower she will run
declared_weight Weight declared in racing program The difference between declared and and actual weig
draw Post position the horse broke from Outside post position may lead to ground loss (running further)
length_behind_winner # of length the horse was behind winner at finish Smaller distance means better performance
running_position_1 Position at first call Provides information how race was run
running_position_2 Position at second call Provides information how race was run
running_position_3 Position at third call Provides information how race was run
running_position_4 Position at fourth call Provides information how race was run
finish_time finishing time of the horse How fast the race was run
win_odds The horse’s win odds Reflects publics opinion of the horse
running_position_5 Position at fifth call Provides information how race was run
running_position_6 Position at sixth call Remove
race_id Unique race identifier (ie 2014-001) None
src Html page data was scraped from Remove from data set.
race_date Data of race None
race_course Course the race was run on None
race_number The race number on the particular race card None
race_class The class of the race Reflects the estimated quality of the race
race_distance The distance of the race TBD
track_condition Condition of the racing surface TBD
race_name Name of the race, if any Remove
track Race course thre race was run on TBD
sectional_time Race time splits for the race Will need to data wrangle this variable
incident_report Race comments / incidents Explore the informational content

Data Preprocessing

Feature Engineering

Our feature engineering will focus on both normalizing variable and creating new variable. All of these efforts are aimed at improving the our models ability to pick winning wagers. The following is a table of the feature engineering performed on our race_data data.

Figure X Schedule of Feature Engineering

VARIABLE_NAME DEFINITION derived from
fp_norm Normalized finishing_position finishing_position
dr_norm Normalized draw of the horse draw
fc_norm Normalized first call position of the horse running_position_1
bl_norm
log_odds log() of the win_odds win_odds
wt_rel weight adjustment (relative weight / 5) actual_weight
wt_relxdis wt_rel x distance of race
jpct jockey winning percentage by year Do good jockey’s win more races
tpct trainer winning percentage Do good trainers win more races
jcp odds adjusted cum probability of jockey wins The more weight a horse carries the slower she will run
tcp odds adjusted cum probability of trainer wins The difference between declared and and actual weig
draw Post position the horse broke from Outside post position may lead to ground loss (runnin
fp3_rw1 Recency-weighted_1 mean of last 3 fp_norm
fp3_rw2 Recency-weighted_2 mean of last 3 fp_norm
fc3_rw1 Recency-weighted_1 mean of last 3 fc_norm
fc3_rw2 Recency-weighted_2 mean of last 3 fc_norm

estimated ground loss

FEATURE ENGINEERING

race_data <- race_data %>% 
  select(-ind) %>%
  mutate(track = factor(track)) %>% 
  mutate(pew = (1/win_odds)*.82) %>% 
  mutate(yr = year(race_date)) %>% 
  group_by(race_id) %>%
  arrange(finishing_position) %>% 
  filter(finishing_position < 99) %>% 
  mutate(n = n()) %>% 
  mutate(fp_nrm = (finishing_position - min(finishing_position)) /(max(finishing_position)- min(finishing_position))) %>% 
  mutate(fc_nrm = (running_position_1 - min(running_position_1)) /(max(running_position_1)- min(running_position_1)) ) %>% 
  mutate(dr_nrm = (draw - min(draw)) /(max(draw)- min(draw))) %>%
  mutate(bl_nrm = 0.5-(((length_behind_winner*10)/(race_distance *3.28084))-0.5)) %>% 
  mutate(log_odds = log(win_odds)) %>%
  mutate(wt_rel =(actual_weight - sum(if_else(actual_weight <800,actual_weight, 0))/sum(if_else(actual_weight <800,1, 0)))/5) %>% 
  mutate(wt_relxdis = wt_rel * race_distance) %>% 
  ungroup() %>% 
  arrange(jockey) %>% 
  group_by(yr, jockey) %>% 
  mutate(jraces = n()) %>% 
  mutate(jwins = sum(if_else(finishing_position ==1,1,0))) %>%
  mutate(jwins_ex = sum(pew)) %>% 
  mutate(jockeyPct = jwins / jraces) %>% 
  mutate(jvar = sum(pew*(1-pew))) %>% 
  mutate(jockeyCP = pnorm(jwins, jwins_ex, jvar^.5)) %>% 
  ungroup() %>% 
  arrange(trainer) %>% 
  group_by(yr, trainer) %>% 
  mutate(traces = n()) %>% 
  mutate(twins = sum(if_else(finishing_position ==1,1,0))) %>%
  mutate(twins_ex = sum(pew)) %>% 
  mutate(trainerPct = twins / traces) %>% 
  mutate(tvar = sum(pew*(1-pew))) %>% 
  mutate(trainerCP = pnorm(twins, twins_ex, tvar^.5)) %>% 
  ungroup() %>% 
  select(-jraces, -traces, -twins, -jwins, -jwins_ex, -twins_ex, -jvar, -tvar) %>% 
  arrange(race_date, race_number, finishing_position) %>% 
  
  # ******************************
  #Arrange for past performance view
  # *******************************
  group_by(horse_name) %>%
  arrange(horse_name, desc(race_date)) %>% 
  
  # rest - days since last race
  mutate(rest = as.double(difftime(race_date,lead(race_date,1),units="days"))) %>% 
  mutate(rest = if_else(is.na(rest),0,rest)) %>%
  
  # dayssince - rest since last race - median rest
  mutate(dayssince = rest-median(rest)) %>%
  
  # races last 90 days - counts the number of races in last 90 days
  mutate(raceslast90 = map_int(race_date, ~ sum(between(as.numeric(difftime(.x, race_date, units = "days")), 1e-9, 90)))) %>%
   
  # Recency weighted normalized finish position - finish1
    #mutate(finish1 = rwm(fp_nrm,race_date)) %>%
    mutate(finish1 = mean(lead(fp_nrm,1),lead(fp_nrm,2),lead(fp_nrm,3), trim=0, na.rm=TRUE)) %>% 
  
  # Finish Position
    mutate(finish2 = mean(lead(finishing_position,1),lead(finishing_position,2),lead(finishing_position,3), trim=0, na.rm=TRUE)) %>% 
  
  # First Call
    mutate(firstcall = if_else(!is.na(lead(running_position_1,1)), lead(running_position_1,1),lag(running_position_1,1))) %>% 
  
  # First Call Distance
    mutate(firstcalldist = firstcall * race_distance) %>% 
  
  # Beaten Length
    mutate(finish2 = mean(lead(bl_nrm,1),lead(bl_nrm,2),lead(bl_nrm,3), trim=0, na.rm=TRUE)) %>% 
  
  # Win History
    mutate(winhistory = mean(lead(winner,1),lead(winner,2),lead(winner,3), trim=0, na.rm=TRUE)) %>%   
  
  # Meters Per Second
    mutate(mps = if_else(!is.na(race_distance / finish_time),race_distance/finish_time,0)) %>% 
  
  # Class Move
    mutate(classMove = if_else(!is.na(as.numeric(race_class)-as.numeric(lead(race_class,1))),as.numeric(race_class)-as.numeric(lead(race_class,1)),0)) %>% 
  
  # MoveOnTurn
    mutate(moveOnTurn = if_else(!is.na(as.numeric(lead(running_position_3,1))-as.numeric(lead(running_position_2,1))),as.numeric(running_position_3)-as.numeric(lead(running_position_2,1)),0)) %>% 
  
  # Recency Meters Per Second
    mutate(mrMps = mean(lead(mps,1),lead(mps,2),lead(mps,3), trim=0, na.rm=TRUE)) %>% 
  
  # running style
  mutate(runstyle = mean(running_position_2/field)) %>% 
  mutate(runstyle = case_when(
    runstyle <=0.33 ~ 1L,
    runstyle <=0.66 ~ 2L,
    runstyle >0.66 ~ 3L,
    TRUE ~ 0L
  )) %>% 
  mutate(postDiff = case_when(
    draw == 1 & runstyle == 1 ~ -4.5,
    draw == 2 & runstyle == 1 ~ -1.33,
    draw == 3 & runstyle == 1 ~ -4.99,
    draw == 4 & runstyle == 1 ~ -6.99,
    draw == 5 & runstyle == 1 ~ -2.65,
    draw == 6 & runstyle == 1 ~ 6.11,
    draw == 7 & runstyle == 1 ~ -1.83,
    draw == 8 & runstyle == 1 ~ -3.97,
    draw == 9 & runstyle == 1 ~ 4.41,
    draw == 10 & runstyle == 1 ~ -1.26,
    draw == 11 & runstyle == 1 ~ -5.82,
    draw == 11 & runstyle == 1 ~ -5.82,
    draw == 13 & runstyle == 1 ~ -8.50,
    draw == 14 & runstyle == 1 ~ -8.50,
    draw == 15 & runstyle == 1 ~ -8.50,
    draw == 16 & runstyle == 1 ~ -8.50,
    draw == 1 & runstyle == 2 ~ 1.11,
    draw == 2 & runstyle == 2 ~ -2.03,
    draw == 3 & runstyle == 2 ~ 1.03,
    draw == 4 & runstyle == 2 ~ 1.13,
    draw == 5 & runstyle == 2 ~ -4.67,
    draw == 6 & runstyle == 2 ~ 0.66,
    draw == 7 & runstyle == 2 ~ -2.75,
    draw == 8 & runstyle == 2 ~ -2.32,
    draw == 9 & runstyle == 2 ~ 0.31,
    draw == 10 & runstyle == 2 ~ 3.48,
    draw == 11 & runstyle == 2 ~ 0.79,
    draw == 11 & runstyle == 2 ~ 1.51,
    draw == 13 & runstyle == 2 ~ 4.59,
    draw == 14 & runstyle == 2 ~ 4.59,
    draw == 15 & runstyle == 2 ~ 4.59,
    draw == 16 & runstyle == 2 ~ 4.59,
    draw == 1 & runstyle == 3 ~ 3.04,
    draw == 2 & runstyle == 3 ~ -2.72,
    draw == 3 & runstyle == 3 ~ 8.34,
    draw == 4 & runstyle == 3 ~ 0.81,
    draw == 5 & runstyle == 3 ~ 1.03,
    draw == 6 & runstyle == 3 ~ -2.38,
    draw == 7 & runstyle == 3 ~ -6.66,
    draw == 8 & runstyle == 3 ~ 0.84,
    draw == 9 & runstyle == 3 ~ 1.16,
    draw == 10 & runstyle == 3 ~ -0.88,
    draw == 11 & runstyle == 3 ~ -3.22,
    draw == 11 & runstyle == 3 ~ 2.82,
    draw == 13 & runstyle == 3 ~ -1.01,
    draw == 14 & runstyle == 3 ~ -1.01,
    draw == 15 & runstyle == 3 ~ -1.01,
    draw == 16 & runstyle == 3 ~ -1.01,
    TRUE ~ 0
  )) %>% 
  
  # eft 
    mutate(eft = (race_distance + ((wt_rel + postDiff)*0.30))/ mrMps) %>% 
  
  ungroup() %>% 
  group_by(race_id) %>% 
  arrange(race_id, race_number, finishing_position) %>% 
  group_by(race_id) %>% 
  mutate(raceShape = if_else(!is.na(lead(runstyle,1)),runstyle/mean(runstyle,trim=0,na.rm=TRUE),0)) %>% 
  filter(horse_name !="No Horse") %>%
  select(-new_horse_number, -altkey, -datekey) %>% 
  filter(!is.na(eft)) %>% 
  ungroup() %>%
  select(race_id, finishing_position, win_odds, log_odds, wt_rel,wt_relxdis, jockeyCP, jockeyPct, trainerCP, trainerPct, rest, raceslast90, finish1, finish2, firstcall, firstcalldist, winhistory, mps, classMove, moveOnTurn, mrMps, runstyle, postDiff, eft, raceShape, pew, horse_name, row_nbr, winner) %>% 
  mutate(row_nbr = race_id) %>% 
  mutate(row_nbr = str_replace(row_nbr,'(-)','')) %>% 
  mutate(row_nbr = as.numeric(row_nbr))

  vis_dat(race_data, warn_large_data = FALSE)

  skim(race_data)
  
  plot_bar(race_data)
  plot_histogram(race_data)
  plot_boxplot(race_data, by = "winner")

NORMALIZED SPEED FIGURES

plot_bar(race_data)

AVTM <- race_data %>% 
  group_by(race_id) %>% 
  filter(finishing_position <= 3) %>% 
  select(race_id, race_date, race_number, finishing_position, actual_weight, finish_time, race_course, race_class, track_condition, track, race_distance) %>%
  mutate(avtm = mean(finish_time)) %>% 
  filter(finishing_position == 1) %>% 
  mutate(race_course =factor(race_course)) %>% 
  mutate(race_class =factor(race_class)) %>% 
  mutate(race_class =fct_other(race_class,keep=c("Class 4","Class 1","Class 2","Class 3","Class 5"),other_level="Class 0")) %>% 
  mutate(track_condition = factor(track_condition)) %>%
  mutate(track_condition = fct_other(track_condition,keep=c("GOOD","GOOD TO FIRM","FAST"),other_level="OFF")) %>% 
  mutate(race_distance = factor(race_distance)) %>% 
  ungroup()

  plot_bar(AVTM)
  plot_histogram(AVTM)
  plot_boxplot(AVTM, by = "avtm")
  
  skim(AVTM)
  mod_time <- lm(avtm ~ race_distance + race_class + actual_weight + race_course + track , data = AVTM)
  summary(mod_time)
  
  
  
  AVTM <- AVTM %>% 
    add_predictions(mod_time, "patm") %>% 
    add_residuals(mod_time, "dres") %>% 
    mutate(dres = dres / as.numeric(race_distance)) %>% 
    group_by(race_date,track) %>% 
    mutate(avg_dres = mean(dres)) %>% 
    mutate(tnf = avg_dres * as.numeric(race_distance)) %>% 
    ungroup()
  
  
  mod_time2 <- lm(avtm ~ race_distance + race_class + actual_weight + race_course + track + tnf , data = AVTM)
  
  summary(mod_time2)  
  
  
  AVTM <- AVTM %>% 
    add_predictions(mod_time2, "patm2")


AVTM <- AVTM %>% 
  mutate(true_tnf = tnf) %>% 
  mutate(tnf = 0) %>% 
  mutate(race_class = 'Class 3') %>% 
  add_predictions(mod_time2, "base_time") %>% 
  select(race_id, true_tnf, base_time) 
  
race_data <- race_data %>%   
  inner_join(AVTM, by="race_id") %>% 
  mutate(rsfactor = factor(runstyle)) %>% 
  mutate(snt = ((finish_time+ true_tnf)/base_time)-1) %>% 
  arrange(race_date, race_number, finishing_position) %>% 
  mutate(recencySNT = rwm(snt,race_date)) %>% 
  mutate(recencyTime = rwm(finish_time, race_date)) %>% 
  mutate(recencyDistance = rwm(race_distance, race_date)) %>%
  mutate(estTime = (race_distance + postDiff + wt_rel)/(recencyTime/recencyDistance))

 
 
  
skim(race_data)
datatable(head(race_data, n =50))

Model Evaluation and Betting Strategy