Helper Functions

jdiff <- function(x, y, t="equal", i=0, j=0){
  case_when(
    t == 'equal'~ if_else(x ==y,1L,0L),
    t == 'rng' ~ if_else(x >= y-i & x <= y+j,1L,0L),
    TRUE ~ 0L
  )
}



rwm <- function(subject, d){
 max <- 400
 den <- 270  
  n <- map_dbl(d, ~ sum(between(as.double(difftime(.x, d, units = "days")), 1e-9, max) * subject * exp(as.numeric((-if_else((.x-d)<270,(.x-d)/3,(.x-d)/2)/den)))))
  d <- map_dbl(d, ~ sum(between(as.double(difftime(.x, d, units = "days")), 1e-9, max) *  exp(as.numeric((-if_else((.x-d)<270,(.x-d)/3,(.x-d)/2)/den)))))
if_else(is.na(n/d),0,n/d)
}


miAvg <- function(subject, dte, max){
  
  num <- map_dbl(dte, ~ mean(between(as.double(difftime(.x, dte, units = "days")), 1e-9, max) * subject ))
  
  den <- map_dbl(dte, ~ mean(between(as.double(difftime(.x, dte, units = "days")), 1e-9, max) ))
  
  if_else(is.na(num/den),0,(num/den))
}

Analysis

Data

[Describe data set here]

A summary of the data set provided for this analysis is set forth below:

VARIABLE_NAME	DEFINITION	Additional Comments
`finishing_position`	Order of finish for the horse	We will focus on the first three finishing positions (win, place & show)
`horse_number`	Horse program number	Non
`horse_name`	Horse’s name	None
`horse_id`	Unique identifier	An alternative to using the horse’s name - likely to remove from dat set
`jockey`	Name of jockey that rode horse	Do good jockey’s win more races
`trainer`	Name of horse’s trainer	Do good trainers win more races
`actual_weight`	Horse true weight carried	The more weight a horse carries the slower she will run
`declared_weight`	Weight declared in racing program	The difference between declared and and actual weig
`draw`	Post position the horse broke from	Outside post position may lead to ground loss (running further)
`length_behind_winner`	# of length the horse was behind winner at finish	Smaller distance means better performance
`running_position_1`	Position at first call	Provides information how race was run
`running_position_2`	Position at second call	Provides information how race was run
`running_position_3`	Position at third call	Provides information how race was run
`running_position_4`	Position at fourth call	Provides information how race was run
`finish_time`	finishing time of the horse	How fast the race was run
`win_odds`	The horse’s win odds	Reflects publics opinion of the horse
`running_position_5`	Position at fifth call	Provides information how race was run
`running_position_6`	Position at sixth call	Remove
`race_id`	Unique race identifier (ie 2014-001)	None
`src`	Html page data was scraped from	Remove from data set.
`race_date`	Data of race	None
`race_course`	Course the race was run on	None
`race_number`	The race number on the particular race card	None
`race_class`	The class of the race	Reflects the estimated quality of the race
`race_distance`	The distance of the race	TBD
`track_condition`	Condition of the racing surface	TBD
`race_name`	Name of the race, if any	Remove
`track`	Race course thre race was run on	TBD
`sectional_time`	Race time splits for the race	Will need to data wrangle this variable
`incident_report`	Race comments / incidents	Explore the informational content

Data Preprocessing

Feature Engineering

Our feature engineering will focus on both normalizing variable and creating new variable. All of these efforts are aimed at improving the our models ability to pick winning wagers. The following is a table of the feature engineering performed on our race_data data.

Figure X Schedule of Feature Engineering

VARIABLE_NAME	DEFINITION	derived from
`fp_norm`	Normalized finishing_position	finishing_position
`dr_norm`	Normalized draw of the horse	draw
`fc_norm`	Normalized first call position of the horse	running_position_1
`bl_norm`
`log_odds`	log() of the win_odds	win_odds
`wt_rel`	weight adjustment (relative weight / 5)	actual_weight
`wt_relxdis`	wt_rel x distance of race
`jpct`	jockey winning percentage by year	Do good jockey’s win more races
`tpct`	trainer winning percentage	Do good trainers win more races
`jcp`	odds adjusted cum probability of jockey wins	The more weight a horse carries the slower she will run
`tcp`	odds adjusted cum probability of trainer wins	The difference between declared and and actual weig
`draw`	Post position the horse broke from	Outside post position may lead to ground loss (runnin
fp3_rw1	Recency-weighted_1 mean of last 3 fp_norm
fp3_rw2	Recency-weighted_2 mean of last 3 fp_norm
fc3_rw1	Recency-weighted_1 mean of last 3 fc_norm
fc3_rw2	Recency-weighted_2 mean of last 3 fc_norm

estimated ground loss

INCIDENT DATA

columns <- c("one", "two", "three", "four", "five", "six", "seven")

tdf <- tdf %>%
  head(n=5) %>% 
  select(race_id, horse_name, incident_report, finishing_position)%>% 
  mutate(htext = str_extract_all(incident_report,(str_c('(?<=^|\\.)[^.]*\\b',horse_name,'\\b[^.]*\\.?')))) %>% 
  select(-incident_report) %>% 
  unnest(htext) 
  

  datatable(tdf)

FEATURE ENGINEERING

race_data <- race_data %>% 
  select(-ind) %>%
  mutate(track = factor(track)) %>% 
  mutate(pew = (1/win_odds)*.82) %>% 
  mutate(yr = year(race_date)) %>% 
  group_by(race_id) %>%
  arrange(finishing_position) %>% 
  filter(finishing_position < 99) %>% 
  mutate(n = n()) %>% 
  mutate(fp_nrm = (finishing_position - min(finishing_position)) /(max(finishing_position)- min(finishing_position))) %>% 
  mutate(fc_nrm = (running_position_1 - min(running_position_1)) /(max(running_position_1)- min(running_position_1)) ) %>% 
  mutate(dr_nrm = (draw - min(draw)) /(max(draw)- min(draw))) %>%
  mutate(bl_nrm = 0.5-(((length_behind_winner*10)/(race_distance *3.28084))-0.5)) %>% 
  mutate(log_odds = log(win_odds)) %>%
  mutate(wt_rel =(actual_weight - sum(if_else(actual_weight <800,actual_weight, 0))/sum(if_else(actual_weight <800,1, 0)))/5) %>% 
  mutate(wt_relxdis = wt_rel * race_distance) %>% 
  ungroup() %>% 
  arrange(jockey) %>% 
  group_by(yr, jockey) %>% 
  mutate(jraces = n()) %>% 
  mutate(jwins = sum(if_else(finishing_position ==1,1,0))) %>%
  mutate(jwins_ex = sum(pew)) %>% 
  mutate(jockeyPct = jwins / jraces) %>% 
  mutate(jvar = sum(pew*(1-pew))) %>% 
  mutate(jockeyCP = pnorm(jwins, jwins_ex, jvar^.5)) %>% 
  ungroup() %>% 
  arrange(trainer) %>% 
  group_by(yr, trainer) %>% 
  mutate(traces = n()) %>% 
  mutate(twins = sum(if_else(finishing_position ==1,1,0))) %>%
  mutate(twins_ex = sum(pew)) %>% 
  mutate(trainerPct = twins / traces) %>% 
  mutate(tvar = sum(pew*(1-pew))) %>% 
  mutate(trainerCP = pnorm(twins, twins_ex, tvar^.5)) %>% 
  ungroup() %>% 
  select(-jraces, -traces, -twins, -jwins, -jwins_ex, -twins_ex, -jvar, -tvar) %>% 
  arrange(race_date, race_number, finishing_position) %>% 
  
  # ******************************
  #Arrange for past performance view
  # *******************************
  group_by(horse_name) %>%
  arrange(horse_name, desc(race_date)) %>% 
  
  # rest - days since last race
  mutate(rest = as.double(difftime(race_date,lead(race_date,1),units="days"))) %>% 
  mutate(rest = if_else(is.na(rest),0,rest)) %>%
  
  # dayssince - rest since last race - median rest
  mutate(dayssince = rest-median(rest)) %>%
  
  # races last 90 days - counts the number of races in last 90 days
  mutate(raceslast90 = map_int(race_date, ~ sum(between(as.numeric(difftime(.x, race_date, units = "days")), 1e-9, 90)))) %>%
   
  # Recency weighted normalized finish position - finish1
    #mutate(finish1 = rwm(fp_nrm,race_date)) %>%
    mutate(finish1 = mean(lead(fp_nrm,1),lead(fp_nrm,2),lead(fp_nrm,3), trim=0, na.rm=TRUE)) %>% 
  
  # Finish Position
    mutate(finish2 = mean(lead(finishing_position,1),lead(finishing_position,2),lead(finishing_position,3), trim=0, na.rm=TRUE)) %>% 
  
  # First Call
    mutate(firstcall = if_else(!is.na(lead(running_position_1,1)), lead(running_position_1,1),lag(running_position_1,1))) %>% 
  
  # First Call Distance
    mutate(firstcalldist = firstcall * race_distance) %>% 
  
  # Beaten Length
    mutate(finish2 = mean(lead(bl_nrm,1),lead(bl_nrm,2),lead(bl_nrm,3), trim=0, na.rm=TRUE)) %>% 
  
  # Win History
    mutate(winhistory = mean(lead(winner,1),lead(winner,2),lead(winner,3), trim=0, na.rm=TRUE)) %>%   
  
  # Meters Per Second
    mutate(mps = if_else(!is.na(race_distance / finish_time),race_distance/finish_time,0)) %>% 
  
  # Class Move
    mutate(classMove = if_else(!is.na(as.numeric(race_class)-as.numeric(lead(race_class,1))),as.numeric(race_class)-as.numeric(lead(race_class,1)),0)) %>% 
  
  # MoveOnTurn
    mutate(moveOnTurn = if_else(!is.na(as.numeric(lead(running_position_3,1))-as.numeric(lead(running_position_2,1))),as.numeric(running_position_3)-as.numeric(lead(running_position_2,1)),0)) %>% 
  
  # Recency Meters Per Second
    mutate(mrMps = mean(lead(mps,1),lead(mps,2),lead(mps,3), trim=0, na.rm=TRUE)) %>% 
  
  # running style
  mutate(runstyle = mean(running_position_2/field)) %>% 
  mutate(runstyle = case_when(
    runstyle <=0.33 ~ 1L,
    runstyle <=0.66 ~ 2L,
    runstyle >0.66 ~ 3L,
    TRUE ~ 0L
  )) %>% 
  mutate(postDiff = case_when(
    draw == 1 & runstyle == 1 ~ -4.5,
    draw == 2 & runstyle == 1 ~ -1.33,
    draw == 3 & runstyle == 1 ~ -4.99,
    draw == 4 & runstyle == 1 ~ -6.99,
    draw == 5 & runstyle == 1 ~ -2.65,
    draw == 6 & runstyle == 1 ~ 6.11,
    draw == 7 & runstyle == 1 ~ -1.83,
    draw == 8 & runstyle == 1 ~ -3.97,
    draw == 9 & runstyle == 1 ~ 4.41,
    draw == 10 & runstyle == 1 ~ -1.26,
    draw == 11 & runstyle == 1 ~ -5.82,
    draw == 11 & runstyle == 1 ~ -5.82,
    draw == 13 & runstyle == 1 ~ -8.50,
    draw == 14 & runstyle == 1 ~ -8.50,
    draw == 15 & runstyle == 1 ~ -8.50,
    draw == 16 & runstyle == 1 ~ -8.50,
    draw == 1 & runstyle == 2 ~ 1.11,
    draw == 2 & runstyle == 2 ~ -2.03,
    draw == 3 & runstyle == 2 ~ 1.03,
    draw == 4 & runstyle == 2 ~ 1.13,
    draw == 5 & runstyle == 2 ~ -4.67,
    draw == 6 & runstyle == 2 ~ 0.66,
    draw == 7 & runstyle == 2 ~ -2.75,
    draw == 8 & runstyle == 2 ~ -2.32,
    draw == 9 & runstyle == 2 ~ 0.31,
    draw == 10 & runstyle == 2 ~ 3.48,
    draw == 11 & runstyle == 2 ~ 0.79,
    draw == 11 & runstyle == 2 ~ 1.51,
    draw == 13 & runstyle == 2 ~ 4.59,
    draw == 14 & runstyle == 2 ~ 4.59,
    draw == 15 & runstyle == 2 ~ 4.59,
    draw == 16 & runstyle == 2 ~ 4.59,
    draw == 1 & runstyle == 3 ~ 3.04,
    draw == 2 & runstyle == 3 ~ -2.72,
    draw == 3 & runstyle == 3 ~ 8.34,
    draw == 4 & runstyle == 3 ~ 0.81,
    draw == 5 & runstyle == 3 ~ 1.03,
    draw == 6 & runstyle == 3 ~ -2.38,
    draw == 7 & runstyle == 3 ~ -6.66,
    draw == 8 & runstyle == 3 ~ 0.84,
    draw == 9 & runstyle == 3 ~ 1.16,
    draw == 10 & runstyle == 3 ~ -0.88,
    draw == 11 & runstyle == 3 ~ -3.22,
    draw == 11 & runstyle == 3 ~ 2.82,
    draw == 13 & runstyle == 3 ~ -1.01,
    draw == 14 & runstyle == 3 ~ -1.01,
    draw == 15 & runstyle == 3 ~ -1.01,
    draw == 16 & runstyle == 3 ~ -1.01,
    TRUE ~ 0
  )) %>% 
  
  # eft 
    mutate(eft = (race_distance + ((wt_rel + postDiff)*0.30))/ mrMps) %>% 
  
  ungroup() %>% 
  group_by(race_id) %>% 
  arrange(race_id, race_number, finishing_position) %>% 
  group_by(race_id) %>% 
  mutate(raceShape = if_else(!is.na(lead(runstyle,1)),runstyle/mean(runstyle,trim=0,na.rm=TRUE),0)) %>% 
  filter(horse_name !="No Horse") %>%
  select(-new_horse_number, -altkey, -datekey) %>% 
  filter(!is.na(eft)) %>% 
  ungroup() %>%
  select(race_id, finishing_position, win_odds, log_odds, wt_rel,wt_relxdis, jockeyCP, jockeyPct, trainerCP, trainerPct, rest, raceslast90, finish1, finish2, firstcall, firstcalldist, winhistory, mps, classMove, moveOnTurn, mrMps, runstyle, postDiff, eft, raceShape, pew, horse_name, row_nbr, winner) %>% 
  mutate(row_nbr = race_id) %>% 
  mutate(row_nbr = str_replace(row_nbr,'(-)','')) %>% 
  mutate(row_nbr = as.numeric(row_nbr))

  vis_dat(race_data, warn_large_data = FALSE)

  skim(race_data)
  
  plot_bar(race_data)
  plot_histogram(race_data)
  plot_boxplot(race_data, by = "winner")

NORMALIZED SPEED FIGURES

plot_bar(race_data)

AVTM <- race_data %>% 
  group_by(race_id) %>% 
  filter(finishing_position <= 3) %>% 
  select(race_id, race_date, race_number, finishing_position, actual_weight, finish_time, race_course, race_class, track_condition, track, race_distance) %>%
  mutate(avtm = mean(finish_time)) %>% 
  filter(finishing_position == 1) %>% 
  mutate(race_course =factor(race_course)) %>% 
  mutate(race_class =factor(race_class)) %>% 
  mutate(race_class =fct_other(race_class,keep=c("Class 4","Class 1","Class 2","Class 3","Class 5"),other_level="Class 0")) %>% 
  mutate(track_condition = factor(track_condition)) %>%
  mutate(track_condition = fct_other(track_condition,keep=c("GOOD","GOOD TO FIRM","FAST"),other_level="OFF")) %>% 
  mutate(race_distance = factor(race_distance)) %>% 
  ungroup()

  plot_bar(AVTM)
  plot_histogram(AVTM)
  plot_boxplot(AVTM, by = "avtm")
  
  skim(AVTM)
  mod_time <- lm(avtm ~ race_distance + race_class + actual_weight + race_course + track , data = AVTM)
  summary(mod_time)
  
  
  
  AVTM <- AVTM %>% 
    add_predictions(mod_time, "patm") %>% 
    add_residuals(mod_time, "dres") %>% 
    mutate(dres = dres / as.numeric(race_distance)) %>% 
    group_by(race_date,track) %>% 
    mutate(avg_dres = mean(dres)) %>% 
    mutate(tnf = avg_dres * as.numeric(race_distance)) %>% 
    ungroup()
  
  
  mod_time2 <- lm(avtm ~ race_distance + race_class + actual_weight + race_course + track + tnf , data = AVTM)
  
  summary(mod_time2)  
  
  
  AVTM <- AVTM %>% 
    add_predictions(mod_time2, "patm2")


AVTM <- AVTM %>% 
  mutate(true_tnf = tnf) %>% 
  mutate(tnf = 0) %>% 
  mutate(race_class = 'Class 3') %>% 
  add_predictions(mod_time2, "base_time") %>% 
  select(race_id, true_tnf, base_time) 
  
race_data <- race_data %>%   
  inner_join(AVTM, by="race_id") %>% 
  mutate(rsfactor = factor(runstyle)) %>% 
  mutate(snt = ((finish_time+ true_tnf)/base_time)-1) %>% 
  arrange(race_date, race_number, finishing_position) %>% 
  mutate(recencySNT = rwm(snt,race_date)) %>% 
  mutate(recencyTime = rwm(finish_time, race_date)) %>% 
  mutate(recencyDistance = rwm(race_distance, race_date)) %>%
  mutate(estTime = (race_distance + postDiff + wt_rel)/(recencyTime/recencyDistance))

 
 
  
skim(race_data)
datatable(head(race_data, n =50))

Holdout Data

set.seed(4763)
holdout_races <- sample(2014001:2016807, 600, replace=F)

holdout_data <- race_data %>% 
  filter(row_nbr %in% holdout_races)


#Create test set
training_data  <- anti_join(race_data, holdout_data, by = 'row_nbr')

str(training_data)
str(holdout_data)

Model Building

options("scipen"=100, "digits"=4)
  
training_data %>%
  filter(finishing_position < 99) %>% # Remove dummy rows
  group_by(race_id) %>% 
  ungroup() 
  
  
ha_mod0 <- hensm(finishing_position ~ log_odds + wt_rel + finish1 + classMove + firstcall + firstcalldist + winhistory + postDiff + moveOnTurn + mrMps + runstyle + eft + raceShape, data = training_data, group = race_id)  


#ha_mod0 <- hensm(finishing_position ~ log_odds + wt_rel +  jockeyCP + jockeyPct + trainerCP + trainerPct  + finish1 + finish2 + firstcall + firstcalldist + #winhistory + postDiff + classMove + moveOnTurn + mrMps + runstyle + eft + raceShape, data = training_data, group = race_id)  
  
# print(he_mod0)
  print(ha_mod0)

  
  

# Second round of modeling to remove bias caused by use public odds  
    


prd <- holdout_data %>%
 mutate(prd_mu=as.numeric(predict(ha_mod0,newdata=.,group=race_id,type='mu'))) %>% 
 select(race_id, horse_name, finishing_position, pew, win_odds, prd_mu) %>% 
 mutate(er = prd_mu * win_odds) 
  
  
 kable(head(prd,n=50), digits = 4, format.args = list(scientific = FALSE)) %>% 
 kable_styling()

   
 #print(prd, n=1000)

Final Project - Searching for Positive Returns With Harville and Henry - Appendix

CODE SET TO eval = FALSE