p <- getURL("https://raw.githubusercontent.com/MundyMSDS/DATA607/master/pps.csv")
past_performances <- read_csv(p, col_names = c("c1","c2","c3","c4","c5","c6","c7","c8","c9","c10","c11","c12","c13","c14","c15","c16",
"c17","c18","c19","c20","c21","c22","c23","c24","c25","c26","c27","c28","c29","c30","c31","c32","c33","c34","c35","c36","c37","c38","c39","c40","c41","c42","c43","c44","c45","c46"),
col_types = ("cccccccccccccccccccccccccccccccccccccccccccccc"))
past_performances## # A tibble: 4,871 x 46
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 H Sarat~ SAR 2019~ 10 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 R Maiden 2019~ 1 Dirt 1 1/~ $90,~ (UP ~ 7 <NA> <NA> <NA>
## 3 E SAR 2019~ 1 "\"E~ <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 E SAR 2019~ 1 Trif~ <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 E SAR 2019~ 1 Supe~ <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 E SAR 2019~ 1 Pick~ <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 7 E SAR 2019~ 1 "Dou~ 1 2019~ SAR Stre~ Stre~ "\"N~ Geld~
## 8 I 2019 Vet ~ <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 9 P WO Stre~ J Ca~ Jul ~ 95 9 Maid~ $65K 1 1/~ Synth Fast~
## 10 P WO Stre~ J Ca~ Jun ~ 94 10 Maid~ $95K 1 1/~ Turf Firm~
## # ... with 4,861 more rows, and 34 more variables: c13 <chr>, c14 <chr>,
## # c15 <chr>, c16 <chr>, c17 <chr>, c18 <chr>, c19 <chr>, c20 <chr>,
## # c21 <chr>, c22 <chr>, c23 <chr>, c24 <chr>, c25 <chr>, c26 <chr>,
## # c27 <chr>, c28 <chr>, c29 <chr>, c30 <chr>, c31 <chr>, c32 <chr>,
## # c33 <chr>, c34 <chr>, c35 <chr>, c36 <chr>, c37 <chr>, c38 <chr>,
## # c39 <chr>, c40 <chr>, c41 <chr>, c42 <chr>, c43 <chr>, c44 <chr>,
## # c45 <chr>, c46 <chr>
past_performances <- past_performances %>%
mutate(record_id = row_number() ) %>%
filter(c1 == 'P' | c1 == 'I') %>%
mutate(info= ifelse(c1=='I',c3,'')) %>%
mutate(horse = ifelse(c1=='I', lead(c3,1),c3)) %>%
mutate(horse = ifelse(c1=='I', lead(horse,1),c3)) %>%
mutate(yr = ifelse (c1 == 'I' & str_sub(c2,1,1)=='2',c2,NA)) %>%
mutate(record_id = row_number()) %>%
separate(c8, into=c("race_cls", "race_price"), sep = ";", remove = FALSE) %>%
group_by(horse) %>% mutate(sub_record_id = seq(n()))
pp <- past_performances %>%
select(record_id, horse)
head(pp) %>%
kable() %>%
kable_styling() | record_id | horse |
|---|---|
| 1 | Street Talkin Guy |
| 2 | Street Talkin Guy |
| 3 | Street Talkin Guy |
| 4 | Street Talkin Guy |
| 5 | One Eyed Jack |
| 6 | Four Ten |
get_pp_code <- function(dt_str, track, race, finish) {
d <- str_trunc(dt_str,2, side = "left", ellipsis = "")
m <- str_sub(dt_str,5,2)
m <- month(ymd(dt_str),label=TRUE, abbr=TRUE)
yr <- str_trunc(dt_str,4, side = "right", ellipsis = "")
yr <- str_trunc(yr,2, side = "left", ellipsis = "")
r <- str_pad(race, 2, "left", "0")
p <- str_pad(finish,2, "left", "0")
s <- "_"
x <- str_c(d,m,yr,s,r,track,p, sep="")
}
get_furlongs <- function(d) {
case_when(
str_detect(d,'3 Furlongs') ~ 300,
str_detect(d,'3 1/2 Furlongs') ~ 350,
str_detect(d,'3 3/4 Furlongs') ~ 375,
str_detect(d,'4 Furlongs') ~ 400,
str_detect(d,'4 1/4 Furlongs') ~ 425,
str_detect(d,'4 1/2 Furlongs') ~ 450,
str_detect(d,'5 Furlongs') ~ 500,
str_detect(d,'About 5 Furlongs') ~ 500,
str_detect(d,'5 1/4 Furlongs') ~ 525,
str_detect(d,'5 Furlongs 80 Yards') ~ 536,
str_detect(d,'5 1/2 Furlongs') ~ 550,
str_detect(d,'About 5 1/2 Furlongs') ~ 550,
str_detect(d,'6 Furlongs') ~ 600,
str_detect(d,'About 6 Furlongs') ~ 600,
str_detect(d,'6 1/2 Furlongs') ~ 650,
str_detect(d,'About 6 1/2 Furlongs') ~ 650,
str_detect(d,'7 Furlongs') ~ 700,
str_detect(d,'About 7 Furlongs') ~ 700,
str_detect(d,'7 1/2 Furlongs') ~ 750,
str_detect(d,'1 Mile') ~ 800,
str_detect(d,'About 1 Mile') ~ 800,
str_detect(d,'1 Mile 40 Yards') ~ 818,
str_detect(d,'About 1 Mile 40 Yards') ~ 818,
str_detect(d,'1 Mile 70 Yards') ~ 832,
str_detect(d,'About 1 Mile 70 Yards') ~ 832,
str_detect(d,'1 1/16 Miles') ~ 850,
str_detect(d,'About 1 1/16 Miles') ~ 850,
str_detect(d,'1 1/8 Miles') ~ 900,
str_detect(d,'1 3/16 Miles') ~ 950,
str_detect(d,'1 1/4 Miles') ~ 1000,
str_detect(d,'1 5/16 Miles') ~ 1040,
str_detect(d,'1 5/16 Miles') ~ 1050,
str_detect(d,'1 3/8 Miles') ~ 1100,
str_detect(d,'1 7/16 Miles') ~ 1150,
str_detect(d,'1 1/2 Miles') ~ 1200,
str_detect(d,'1 9/16 Miles') ~ 1250,
str_detect(d,'1 5/8 Miles') ~ 1300,
str_detect(d,'1 11/16 Miles') ~ 1350,
str_detect(d,'1 3/4 Miles') ~ 1400,
str_detect(d,'1 13/16 Miles') ~ 1450,
str_detect(d,'1 7/8 Miles') ~ 1500,
str_detect(d,'1 15/16 Miles') ~ 1550,
str_detect(d,'2 Miles') ~ 1600,
TRUE ~ 0
)
}past_performances <- past_performances %>%
group_by(horse) %>% mutate(sub_record_id = seq(n())) %>%
mutate(yr = ifelse(sub_record_id == 1 & is.na(yr), toString(year(now())),yr)) %>%
tidyr::fill(yr) %>%
ungroup(horse) %>%
arrange(record_id) %>%
mutate(mth_day = c5) %>%
separate(c5, into=c("r_mth", "r_day"), sep = " ", remove = FALSE) %>%
mutate(r_mth = match(r_mth, month.abb)) %>%
mutate(r_mth = str_pad(r_mth,2,"left", "0")) %>%
mutate(date_str = str_c(yr,r_mth,r_day,sep='')) %>%
arrange(record_id) %>%
#renaming columns
rename(trk_cde = c2) %>%
rename(race_nbr = c7) %>%
rename(jockey = c4) %>%
rename(off_odds = c17) %>%
rename(dist_str = c10) %>%
#Using helper functions to create distance variable
mutate(distance = get_furlongs(dist_str)) %>%
#renaming columns
rename(surface = c11) %>%
rename(race_txt = c14) %>%
rename(surface_cond = c12) %>%
rename(wt = c15) %>%
rename(eq_med = c16) %>%
rename(post = c18) %>%
rename(field = c19) %>%
rename(finish = c28) %>%
#using helper function to create pp_cde column
mutate(pp_cde = get_pp_code(date_str, trk_cde, race_nbr,finish)) %>%
#convering serveral variables from character to integer
mutate(post = as.integer(post)) %>%
mutate(finish = as.integer(finish)) %>%
mutate(field = as.integer(field)) %>%
ungroup()
pp <- past_performances %>%
select(record_id, horse, pp_cde, race_cls, jockey, off_odds, distance, surface, surface_cond, post, field, finish)
head(pp) %>%
kable() %>%
kable_styling() | record_id | horse | pp_cde | race_cls | jockey | off_odds | distance | surface | surface_cond | post | field | finish |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Street Talkin Guy | NA | NA | NA | NA | 0 | NA | NA | NA | NA | NA |
| 2 | Street Talkin Guy | 07Jul19_09WO02 | Maiden Special Weight | J Campbell | 6-1 | 850 | Synth | Fast 7 | 1 | 8 | 2 |
| 3 | Street Talkin Guy | 08Jun19_10WO10 | Maiden Special Weight | J Campbell | 8-1 | 850 | Turf | Firm 9 | 13 | 13 | 10 |
| 4 | Street Talkin Guy | 19May19_01WO03 | Maiden Special Weight | J Campbell | 14-1 | 650 | Synth | Fast 8 | 1 | 7 | 3 |
| 5 | One Eyed Jack | 29Jun19_06BEL03 | Maiden Special Weight | J Ortiz | 3-1 | 600 | Dirt | Fast 7 | 2 | 6 | 3 |
| 6 | Four Ten | 17Jul19_07SAR03 | Maiden Special Weight | J Alvarado | 3-1 | 700 | Dirt | Sloppy 6 | 1 | 5 | 3 |
pp <- pp %>%
filter(!is.na(pp_cde))
pp <- pp %>%
select(horse, post, field, finish)
#select(horse, pp_cde, race_cls, jockey, off_odds, distance, surface, surface_cond, post, field, finish)
head(pp) %>%
kable() %>%
kable_styling()| horse | post | field | finish |
|---|---|---|---|
| Street Talkin Guy | 1 | 8 | 2 |
| Street Talkin Guy | 13 | 13 | 10 |
| Street Talkin Guy | 1 | 7 | 3 |
| One Eyed Jack | 2 | 6 | 3 |
| Four Ten | 1 | 5 | 3 |
| Four Ten | 6 | 8 | 8 |
pp <- pp %>%
#creating a list column
group_by(horse) %>%
nest() %>%
#add new variables that will enable me to answer questions
mutate(starts=map_int(data,nrow)) %>%
mutate(win =map_dbl(data, ~sum(.$finish==1, na.rm=TRUE))) %>%
mutate(place =map_dbl(data, ~sum(.$finish<=2, na.rm=TRUE))) %>%
mutate(show =map_dbl(data, ~sum(.$finish<=3, na.rm=TRUE))) %>%
mutate(avg_finish =map_dbl(data, ~mean(.$finish, na.rm=TRUE))) %>%
mutate(win_pct = win/starts) %>%
mutate(itm_pct = show/starts) %>%
arrange(desc(win))## # A tibble: 77 x 9
## horse data starts win place show avg_finish win_pct itm_pct
## <chr> <list> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Noble Tho~ <tibble ~ 24 8 14 17 2.83 0.333 0.708
## 2 First App~ <tibble ~ 26 6 12 14 3.62 0.231 0.538
## 3 Irish Val~ <tibble ~ 23 6 11 16 2.78 0.261 0.696
## 4 Mo Maveri~ <tibble ~ 17 5 9 11 3 0.294 0.647
## 5 Dontblame~ <tibble ~ 11 4 6 9 2.82 0.364 0.818
## 6 Super Dude <tibble ~ 11 4 7 9 2.36 0.364 0.818
## 7 Dover Cli~ <tibble ~ 18 4 10 10 3.44 0.222 0.556
## 8 Wantagh Q~ <tibble ~ 47 4 9 15 5.15 0.0851 0.319
## 9 Cross Bor~ <tibble ~ 16 4 7 8 3.62 0.25 0.5
## 10 Free Kitty <tibble ~ 22 4 12 17 3.14 0.182 0.773
## # ... with 67 more rows
select(pp, horse, starts, win, place, show, win_pct, itm_pct, avg_finish) %>%
head(10L,pp) %>%
kable() %>%
kable_styling()| horse | starts | win | place | show | win_pct | itm_pct | avg_finish |
|---|---|---|---|---|---|---|---|
| Noble Thought | 24 | 8 | 14 | 17 | 0.3333333 | 0.7083333 | 2.833333 |
| First Appeal | 26 | 6 | 12 | 14 | 0.2307692 | 0.5384615 | 3.615385 |
| Irish Valor | 23 | 6 | 11 | 16 | 0.2608696 | 0.6956522 | 2.782609 |
| Mo Maverick | 17 | 5 | 9 | 11 | 0.2941176 | 0.6470588 | 3.000000 |
| Dontblamerocket | 11 | 4 | 6 | 9 | 0.3636364 | 0.8181818 | 2.818182 |
| Super Dude | 11 | 4 | 7 | 9 | 0.3636364 | 0.8181818 | 2.363636 |
| Dover Cliffs | 18 | 4 | 10 | 10 | 0.2222222 | 0.5555556 | 3.444444 |
| Wantagh Queen | 47 | 4 | 9 | 15 | 0.0851064 | 0.3191489 | 5.148936 |
| Cross Border | 16 | 4 | 7 | 8 | 0.2500000 | 0.5000000 | 3.625000 |
| Free Kitty | 22 | 4 | 12 | 17 | 0.1818182 | 0.7727273 | 3.136364 |
select(pp, horse, starts, win, place, show, win_pct, itm_pct, avg_finish) %>%
arrange(desc(win_pct)) %>%
head(10L,pp) %>%
kable() %>%
kable_styling()| horse | starts | win | place | show | win_pct | itm_pct | avg_finish |
|---|---|---|---|---|---|---|---|
| Sister Peacock | 7 | 4 | 6 | 7 | 0.5714286 | 1.0000000 | 1.571429 |
| O’Keeffe | 7 | 4 | 6 | 6 | 0.5714286 | 0.8571429 | 2.000000 |
| Big Bennys Tribute | 4 | 2 | 3 | 3 | 0.5000000 | 0.7500000 | 3.000000 |
| South of France | 7 | 3 | 5 | 6 | 0.4285714 | 0.8571429 | 2.428571 |
| Missmizz | 7 | 3 | 4 | 5 | 0.4285714 | 0.7142857 | 2.857143 |
| Karama | 5 | 2 | 4 | 4 | 0.4000000 | 0.8000000 | 2.600000 |
| Skamania | 8 | 3 | 4 | 4 | 0.3750000 | 0.5000000 | 3.000000 |
| Dontblamerocket | 11 | 4 | 6 | 9 | 0.3636364 | 0.8181818 | 2.818182 |
| Super Dude | 11 | 4 | 7 | 9 | 0.3636364 | 0.8181818 | 2.363636 |
| Noble Thought | 24 | 8 | 14 | 17 | 0.3333333 | 0.7083333 | 2.833333 |
select(pp, horse, starts, win, place, show, win_pct, itm_pct, avg_finish) %>%
arrange(desc(itm_pct)) %>%
head(10L,pp) %>%
kable() %>%
kable_styling()| horse | starts | win | place | show | win_pct | itm_pct | avg_finish |
|---|---|---|---|---|---|---|---|
| Sister Peacock | 7 | 4 | 6 | 7 | 0.5714286 | 1.0000000 | 1.571429 |
| Graded On a Curve | 3 | 1 | 2 | 3 | 0.3333333 | 1.0000000 | 2.000000 |
| Chelsea Cloisters | 6 | 1 | 4 | 6 | 0.1666667 | 1.0000000 | 2.166667 |
| One Eyed Jack | 1 | 0 | 0 | 1 | 0.0000000 | 1.0000000 | 3.000000 |
| Majority Rules | 2 | 0 | 2 | 2 | 0.0000000 | 1.0000000 | 2.000000 |
| Morning Gold | 1 | 0 | 1 | 1 | 0.0000000 | 1.0000000 | 2.000000 |
| O’Keeffe | 7 | 4 | 6 | 6 | 0.5714286 | 0.8571429 | 2.000000 |
| South of France | 7 | 3 | 5 | 6 | 0.4285714 | 0.8571429 | 2.428571 |
| Abyssinian | 6 | 2 | 4 | 5 | 0.3333333 | 0.8333333 | 2.333333 |
| Dontblamerocket | 11 | 4 | 6 | 9 | 0.3636364 | 0.8181818 | 2.818182 |