data from https://www.kaggle.com/datasets/robikscube/pga-tour-golf-data-20152022

with the US open a day away, I’m interested in looking at data from past years to see where gaining strokes compared to the field is the biggest predictor of success. the US open is known for extremely tough roughs, so I was thinking that perhaps this tournament, compared to others, might be predicted by success in certain areas (perhaps around the green or approach) more than putting for example.

dat<- fread("/Users/claire/Documents/Script//ASA All PGA Raw Data - Tourn Level.csv", header=T, data.table=F)


dat$finish2<- ifelse(dat$Finish=="CUT", 999, ifelse(dat$Finish=="DQ" | dat$Finish=="MDF" | dat$Finish=="WD" | dat$Finish=="W/D", NA, dat$Finish))

# that code creates a new variable (finish2) that counts CUT as 999, and anyone we don't care about as NA. now we can get rid of the "T" (stands for a tie) and convert the variable to numeric

dat$finish2<- gsub(pattern = 'T', x = dat$finish2, replacement = '', fixed = T)
dat$finish2<- as.numeric(dat$finish2)

dat$finish_cat<- ifelse(dat$finish2==1, "first", ifelse(dat$finish2==999, "cut", ifelse(dat$finish2>1, "not cut", NA)))

table(dat$finish_cat)
## 
##     cut   first not cut 
##   12741     251   17075

US open

# unique(dat$`tournament name`)

us<- dat %>% filter(`tournament name`=="U.S. Open")
table(us$season)
## 
## 2015 2016 2017 2018 2019 2021 2022 
##  109  100  116  112  121  262  103
head(us)
##   Player_initial_last tournament id player id hole_par strokes hole_DKP
## 1           A. Hadwin     401353222      5548      280     279     58.0
## 2            A. Noren     401353222      3832      140     146     17.0
## 3           A. Putnam     401353222      5502      280     286     45.5
## 4           A. Schenk     401353222     10372      280     284     53.0
## 5            A. Scott     401353222       388      280     282     54.0
## 6             A. Wise     401353222     10577      280     285     54.0
##   hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP n_rounds made_cut pos
## 1     52.9       60          3        8.6          3        4        1   7
## 2     11.8       22          0        0.0          0        2        0  NA
## 3     37.5       51          0        0.0          0        4        1  31
## 4     42.8       54          0       12.2          0        4        1  24
## 5     46.8       56          0        1.2          0        4        1  14
## 6     43.1       52          0        6.4          0        4        1  27
##   finish_DKP finish_FDP finish_SDP total_DKP total_FDP total_SDP
## 1         10         10          7      71.0      71.5        70
## 2          0          0          0      17.0      11.8        22
## 3          2          1          0      47.5      38.5        51
## 4          4          3          1      57.0      58.0        55
## 5          6          5          3      60.0      53.0        59
## 6          3          2          0      57.0      51.5        52
##            player Unnamed: 2 Unnamed: 3 Unnamed: 4 tournament name
## 1     Adam Hadwin         NA         NA         NA       U.S. Open
## 2 Alexander Noren         NA         NA         NA       U.S. Open
## 3   Andrew Putnam         NA         NA         NA       U.S. Open
## 4     Adam Schenk         NA         NA         NA       U.S. Open
## 5      Adam Scott         NA         NA         NA       U.S. Open
## 6      Aaron Wise         NA         NA         NA       U.S. Open
##                             course       date purse season no_cut Finish
## 1 The Country Club - Brookline, MA 2022-06-19  17.5   2022      0     T7
## 2 The Country Club - Brookline, MA 2022-06-19  17.5   2022      0    CUT
## 3 The Country Club - Brookline, MA 2022-06-19  17.5   2022      0    T31
## 4 The Country Club - Brookline, MA 2022-06-19  17.5   2022      0    T24
## 5 The Country Club - Brookline, MA 2022-06-19  17.5   2022      0    T14
## 6 The Country Club - Brookline, MA 2022-06-19  17.5   2022      0    T27
##   sg_putt sg_arg sg_app sg_ott sg_t2g sg_total finish2 finish_cat
## 1    1.30   0.80   0.54   0.21   1.54     2.85       7    not cut
## 2   -1.42   0.73   0.87  -0.81   0.80    -0.62     999        cut
## 3    1.43   0.60  -0.48  -0.45  -0.34     1.09      31    not cut
## 4    0.66   0.65   0.14   0.14   0.93     1.59      24    not cut
## 5    0.54   0.79   1.20  -0.43   1.55     2.10      14    not cut
## 6    0.09   1.08  -0.34   0.53   1.26     1.34      27    not cut

look at strokes gained and how they relate to each other and finish

corrs<- us[,c(32:37,38)]
res2 <- rcorr(as.matrix(corrs))


corrs<- res2$r

corrplot(corrs, method = "color",
         type = "full",  number.cex = .6,
         addCoef.col = "black", 
         tl.col = "black", tl.srt = 90) 

the correlation plot above is almost exactly what i would expect. ultimately, strokes gained don’t relate that strongly to each other (rs=|0.01-0.09|), with the exception of strokes gained tee to green and total. I’m not 100% sure how strokes gained tee to green and total are calculated, but they definitely are comprised of strokes gained in those other categories (e.g. strokes gained tee to green = strokes gained off the tee, on the approach and around the green). as is such, strokes gained tee to green does not correlate strongly with strokes gained putting. this is good as it shows our variables are largely independent of each other! however, it also shows we might be leery of including strokes gained tee to green and strokes gained total in our models, as they might induce multicollinearity. strokes gained total correlates very highly with strokes gained tee to green, r=0.80.

otherwise, strokes gained total and tee to green are most strongly associated with finish in the US open (rs=-0.55 - -0.68). the other strokes gained categories have correlations ranging from -.21 to -.42 with finish. this means, on average as strokes gained increase, average finish in the US open decreases.

1. predict who will make the cut (1/0)

this is a classic classification machine learning problem. we can run logistic regression now, and see what predicts making the cut, but we cannot directly score or train a classification model until we have the finishing results from this tournament.

train<- us %>% select(player, `player id`, season, starts_with("sg"), finish_cat)

head(train)
##            player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1     Adam Hadwin      5548   2022    1.30   0.80   0.54   0.21   1.54     2.85
## 2 Alexander Noren      3832   2022   -1.42   0.73   0.87  -0.81   0.80    -0.62
## 3   Andrew Putnam      5502   2022    1.43   0.60  -0.48  -0.45  -0.34     1.09
## 4     Adam Schenk     10372   2022    0.66   0.65   0.14   0.14   0.93     1.59
## 5      Adam Scott       388   2022    0.54   0.79   1.20  -0.43   1.55     2.10
## 6      Aaron Wise     10577   2022    0.09   1.08  -0.34   0.53   1.26     1.34
##   finish_cat
## 1    not cut
## 2        cut
## 3    not cut
## 4    not cut
## 5    not cut
## 6    not cut
summary(train)
##     player            player id           season        sg_putt       
##  Length:923         Min.   :     16   Min.   :2015   Min.   :-3.5800  
##  Class :character   1st Qu.:   1367   1st Qu.:2017   1st Qu.:-0.8800  
##  Mode  :character   Median :   4513   Median :2019   Median : 0.0100  
##                     Mean   : 178933   Mean   :2019   Mean   :-0.0717  
##                     3rd Qu.:   9025   3rd Qu.:2021   3rd Qu.: 0.8000  
##                     Max.   :4705886   Max.   :2022   Max.   : 2.8400  
##                                                      NA's   :442      
##      sg_arg            sg_app            sg_ott            sg_t2g       
##  Min.   :-3.7200   Min.   :-3.8100   Min.   :-2.9700   Min.   :-4.7500  
##  1st Qu.:-0.4900   1st Qu.:-0.5700   1st Qu.:-0.3600   1st Qu.:-0.9700  
##  Median : 0.0300   Median : 0.1200   Median : 0.0400   Median : 0.0300  
##  Mean   :-0.0178   Mean   : 0.0274   Mean   : 0.0093   Mean   : 0.0191  
##  3rd Qu.: 0.5600   3rd Qu.: 0.7200   3rd Qu.: 0.4200   3rd Qu.: 1.1600  
##  Max.   : 2.1200   Max.   : 3.0700   Max.   : 2.0300   Max.   : 4.4800  
##  NA's   :442       NA's   :442       NA's   :442       NA's   :442      
##     sg_total       finish_cat       
##  Min.   :-6.090   Length:923        
##  1st Qu.:-1.220   Class :character  
##  Median :-0.030   Mode  :character  
##  Mean   :-0.051                     
##  3rd Qu.: 1.280                     
##  Max.   : 5.570                     
##  NA's   :442
# train %>% filter(is.na(sg_putt)) # if anyone is NA they WD'd or were DQ'd

train<- train %>% filter(!is.na(finish_cat)) # no NAs

train$finish_class<- ifelse(train$finish_cat=="cut", 0, 1)

table(train$finish_cat)
## 
##     cut   first not cut 
##     235       4     239
table(train$finish_class)
## 
##   0   1 
## 235 243
train$finish_cat<- NULL


## saving for ML model in python

# py<- train %>% select(-c(player, sg_t2g, sg_total))
# fwrite(py, "usopen_classification_train.csv", sep=',')

load strokes gained data from 2023 season up through the RBC canadian open

these data I copy and pasted into excel sheets from https://www.pgatour.com/stats/strokes-gained.

names<- c("sg_total", "sg_ott", "sg_putt", "sg_app", "sg_arg", "sg_t2g")

dat<- list()
for (i in names) {
  file<- paste0("/Users/claire/Desktop/usopen/",i, ".csv")
  dat[[i]]<- fread(file, header=T, data.table = F)
}

tmp1<- merge(dat[[1]], dat[[2]], by="player")
tmp2<- merge(tmp1, dat[[3]], by="player")
tmp3<- merge(tmp2, dat[[4]], by="player")
tmp4<- merge(tmp3, dat[[5]], by="player")
test<- merge(tmp4, dat[[6]], by="player")

head(test)
##           player sg_total sg_ott sg_putt sg_app sg_arg sg_t2g
## 1 Aaron Baddeley    0.404 -0.423   0.351  0.025  0.534  0.053
## 2      Aaron Rai    0.491  0.265  -0.391  0.390  0.136  0.882
## 3     Aaron Wise   -0.138 -0.170   0.559 -0.795  0.228 -0.696
## 4    Adam Hadwin    0.825  0.048   0.356  0.350  0.103  0.469
## 5      Adam Long   -0.264 -0.285   0.406 -0.539  0.094 -0.670
## 6    Adam Schenk    0.625  0.024   0.355  0.099  0.106  0.271
nrow(test)
## [1] 198
rm(tmp1,tmp2,tmp3,tmp4,dat)

now we have training data from 2019-2022 and testing data that is only strokes gained in 2023 thus far.

head(test)
##           player sg_total sg_ott sg_putt sg_app sg_arg sg_t2g
## 1 Aaron Baddeley    0.404 -0.423   0.351  0.025  0.534  0.053
## 2      Aaron Rai    0.491  0.265  -0.391  0.390  0.136  0.882
## 3     Aaron Wise   -0.138 -0.170   0.559 -0.795  0.228 -0.696
## 4    Adam Hadwin    0.825  0.048   0.356  0.350  0.103  0.469
## 5      Adam Long   -0.264 -0.285   0.406 -0.539  0.094 -0.670
## 6    Adam Schenk    0.625  0.024   0.355  0.099  0.106  0.271
head(train)
##            player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1     Adam Hadwin      5548   2022    1.30   0.80   0.54   0.21   1.54     2.85
## 2 Alexander Noren      3832   2022   -1.42   0.73   0.87  -0.81   0.80    -0.62
## 3   Andrew Putnam      5502   2022    1.43   0.60  -0.48  -0.45  -0.34     1.09
## 4     Adam Schenk     10372   2022    0.66   0.65   0.14   0.14   0.93     1.59
## 5      Adam Scott       388   2022    0.54   0.79   1.20  -0.43   1.55     2.10
## 6      Aaron Wise     10577   2022    0.09   1.08  -0.34   0.53   1.26     1.34
##   finish_class
## 1            1
## 2            0
## 3            1
## 4            1
## 5            1
## 6            1
test$season<- 2023

test<- test %>% select(colnames(train)[c(1,3:9)])

head(train)
##            player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1     Adam Hadwin      5548   2022    1.30   0.80   0.54   0.21   1.54     2.85
## 2 Alexander Noren      3832   2022   -1.42   0.73   0.87  -0.81   0.80    -0.62
## 3   Andrew Putnam      5502   2022    1.43   0.60  -0.48  -0.45  -0.34     1.09
## 4     Adam Schenk     10372   2022    0.66   0.65   0.14   0.14   0.93     1.59
## 5      Adam Scott       388   2022    0.54   0.79   1.20  -0.43   1.55     2.10
## 6      Aaron Wise     10577   2022    0.09   1.08  -0.34   0.53   1.26     1.34
##   finish_class
## 1            1
## 2            0
## 3            1
## 4            1
## 5            1
## 6            1
head(test)
##           player season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Aaron Baddeley   2023   0.351  0.534  0.025 -0.423  0.053    0.404
## 2      Aaron Rai   2023  -0.391  0.136  0.390  0.265  0.882    0.491
## 3     Aaron Wise   2023   0.559  0.228 -0.795 -0.170 -0.696   -0.138
## 4    Adam Hadwin   2023   0.356  0.103  0.350  0.048  0.469    0.825
## 5      Adam Long   2023   0.406  0.094 -0.539 -0.285 -0.670   -0.264
## 6    Adam Schenk   2023   0.355  0.106  0.099  0.024  0.271    0.625
sum(test$player %in% train$player) # 121 in the testing set are in the training set
## [1] 121
ids<- train %>% select(player, `player id`) %>%
  distinct(player, `player id`)
test$`player id` <- factor(test$player, levels=ids$player, labels=ids$`player id`)


## saving for ML model in python


# py<- test %>% select(-c(sg_t2g, sg_total))
# py$`player id` <- factor(py$player, levels=ids$player, labels=ids$`player id`)
# py<- na.omit(py) 
# py$player<- NULL
# py<- py %>% select(colnames(train[c(2:7)]))
# fwrite(py, "usopen_test.csv", sep=',')
# fwrite(ids, "us_open_ids.csv")

in the training data what predicts making the cut or not?

here, we let strokes gained predict whether someone made the cut or not, and included random intercepts and slopes for both player and season. this allows each strokes gained category to have it’s own intercept (or starting point) per season and per player but still draws from the same joint distribution. allowing random intercepts means each season strokes gained in a certain area may be higher or lower than in other seasons. just the same, some players might be gaining more strokes in certain areas than others due to specific skill sets or injuries, so it is important to allow random intercepts for players when they are measured more than once. allowing random intercepts helps account for non-independence of observations induced by occurrences of multiple players and years.

it also allows each strokes gained category to have it’s own slope (or effect on making the cut) per season because the course, as well as each player, changes year to year, and some courses might have tighter fairways or harder greens, worse weather etc.

mod<- glm(finish_class~sg_ott+sg_app+sg_putt+sg_t2g+sg_arg + (season|season) + (1|`player id`), data=train)
summary(mod)
## 
## Call:
## glm(formula = finish_class ~ sg_ott + sg_app + sg_putt + sg_t2g + 
##     sg_arg + (season | season) + (1 | `player id`), data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.64916  -0.32210   0.02423   0.29593   0.87129  
## 
## Coefficients: (2 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.51829    0.01706  30.374   <2e-16 ***
## sg_ott               5.71459    2.59785   2.200   0.0283 *  
## sg_app               5.73966    2.59702   2.210   0.0276 *  
## sg_putt              0.16912    0.01498  11.292   <2e-16 ***
## sg_t2g              -5.55827    2.59743  -2.140   0.0329 *  
## sg_arg               5.74931    2.59791   2.213   0.0274 *  
## season | seasonTRUE       NA         NA      NA       NA    
## 1 | `player id`TRUE       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1383402)
## 
##     Null deviance: 119.467  on 477  degrees of freedom
## Residual deviance:  65.297  on 472  degrees of freedom
## AIC: 418.96
## 
## Number of Fisher Scoring iterations: 2
exp(coef(mod))
##         (Intercept)              sg_ott              sg_app             sg_putt 
##        1.679149e+00        3.032601e+02        3.109586e+02        1.184267e+00 
##              sg_t2g              sg_arg season | seasonTRUE 1 | `player id`TRUE 
##        3.855443e-03        3.139753e+02                  NA                  NA

strokes gained mostly everywhere else still increase one’s odds of making the cut. however, it seems like strokes gained tee to green has a negative impact on making the cut. looking at the logit coefficients exponentiated above, every additional stroke gained putting, for example, increases one’s chances of making the cut by 1.18. every stroke gained off the tee increases one’s chances of making the cut by about 303. every additional stroke gained tee to green increases one’s odds of missing the cut by 0.004. a very small chance, but odd it is in that direction.

mod<- glm(finish_class~sg_ott+sg_app+sg_putt+sg_arg + (season|season) + (1|`player id`), data=train)
summary(mod)
## 
## Call:
## glm(formula = finish_class ~ sg_ott + sg_app + sg_putt + sg_arg + 
##     (season | season) + (1 | `player id`), data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.63557  -0.32433   0.02811   0.30099   0.92712  
## 
## Coefficients: (2 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.51745    0.01712  30.219  < 2e-16 ***
## sg_ott               0.15571    0.02638   5.902 6.87e-09 ***
## sg_app               0.18236    0.01579  11.550  < 2e-16 ***
## sg_putt              0.17086    0.01501  11.382  < 2e-16 ***
## sg_arg               0.19021    0.02110   9.014  < 2e-16 ***
## season | seasonTRUE       NA         NA      NA       NA    
## 1 | `player id`TRUE       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1393871)
## 
##     Null deviance: 119.47  on 477  degrees of freedom
## Residual deviance:  65.93  on 473  degrees of freedom
## AIC: 421.58
## 
## Number of Fisher Scoring iterations: 2
exp(coef(mod))
##         (Intercept)              sg_ott              sg_app             sg_putt 
##            1.677751            1.168486            1.200052            1.186329 
##              sg_arg season | seasonTRUE 1 | `player id`TRUE 
##            1.209507                  NA                  NA

if we remove strokes gained tee to green, the odds ratios make a bit more sense, and are all in the direction that we would expect: gaining strokes in every area increases your chances of making the cut significantly.

now we can make predictions based on who will make the cut. using the predict function, we get probabilities of making the cut. therefore, i am just going to round() to predict who will and who will not make the cut

test$preds <- mod %>% predict(test, type = "response")
## Warning in Ops.factor(1, `player id`): '|' not meaningful for factors
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
hist(test$preds)

test$preds<- round(test$preds)

hist(test$preds)

madecut<- test %>% filter(preds==1) %>%
  ggplot(aes(x=player, y=sg_total, color=sg_total)) + 
  geom_point() +
  ylab("average total strokes gained 2023") +
  xlab("player")+
  ggtitle("predicted to make cut")+
  theme(legend.position = "blank", axis.text.x = element_text(angle=75, size=4, hjust = 1)) +
  scale_colour_gradient(low="red", high="darkgreen")

missedcut<- test %>% filter(preds==0) %>%
  ggplot(aes(x=player, y=sg_total, color=sg_total)) + 
  geom_point() +
  ylab("average total strokes gained 2023") +
  xlab("player")+
  ggtitle("predicted to miss cut")+
  theme(legend.position = "blank", axis.text.x = element_text(angle=75, size=4, hjust = 1))+
  scale_colour_gradient(low="red", high="darkgreen")


ggarrange(madecut, missedcut, nrow=2, ncol=1, common.legend = F)

2. predict overall finish

here, we can run simple linear regression models to predict each person’s finish based on strokes gained.

linear<- us %>% select(player, `player id`, season, starts_with("sg"), finish2) %>%
  filter(!is.na(finish2))

head(linear)
##            player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1     Adam Hadwin      5548   2022    1.30   0.80   0.54   0.21   1.54     2.85
## 2 Alexander Noren      3832   2022   -1.42   0.73   0.87  -0.81   0.80    -0.62
## 3   Andrew Putnam      5502   2022    1.43   0.60  -0.48  -0.45  -0.34     1.09
## 4     Adam Schenk     10372   2022    0.66   0.65   0.14   0.14   0.93     1.59
## 5      Adam Scott       388   2022    0.54   0.79   1.20  -0.43   1.55     2.10
## 6      Aaron Wise     10577   2022    0.09   1.08  -0.34   0.53   1.26     1.34
##   finish2
## 1       7
## 2     999
## 3      31
## 4      24
## 5      14
## 6      27
hist(linear$finish2)

linear$finish2<- ifelse(linear$finish2==999, 78, linear$finish2)

mod<- lmer(finish2~sg_ott+sg_app+sg_putt+sg_t2g+sg_arg +(season|season) + (1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_ott + sg_app + sg_putt + sg_t2g + sg_arg + (season |  
##     season) + (1 | `player id`)
##    Data: linear
## 
## REML criterion at convergence: 3939
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1196 -0.7250 -0.2138  0.7812  2.0883 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev. Corr       
##  player id (Intercept)   1.8660  1.3660             
##  season    (Intercept)   0.0000  0.0000             
##            season2021    0.1682  0.4101    NaN      
##            season2022   40.0114  6.3255    NaN -0.80
##  Residual              230.0580 15.1677             
## Number of obs: 478, groups:  player id, 245; season, 3
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)   52.4833     0.8305  63.198
## sg_ott      -298.5576   107.2274  -2.784
## sg_app      -298.3450   107.1924  -2.783
## sg_putt      -11.5875     0.6154 -18.828
## sg_t2g       286.2186   107.2042   2.670
## sg_arg      -298.3895   107.2276  -2.783
## 
## Correlation of Fixed Effects:
##         (Intr) sg_ott sg_app sg_ptt sg_t2g
## sg_ott   0.070                            
## sg_app   0.070  1.000                     
## sg_putt  0.084 -0.042 -0.042              
## sg_t2g  -0.070 -1.000 -1.000  0.043       
## sg_arg   0.070  1.000  1.000 -0.043 -1.000
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
## saving for ML model in python

# py<- linear %>% select(-c(sg_t2g, sg_total))
# py$`player id` <- factor(py$player, levels=ids$player, labels=ids$`player id`)
# py<- na.omit(py) 
# py$player<- NULL
# py<- py %>% select(colnames(train[c(2:7)]), finish2)
# fwrite(py, "usopen_regression_train.csv", sep=',')

the linear regression model is very similar to the logistic regression model, but now we are predicting overall finish in the US open, not just making/missing the cut. here, we designated everyone who missed the cut a score of 78 (the last place across all the US open data was 77). again, we see strokes gained almost everywhere makes a big impact on finish. for all variables except strokes gained tee to green, additional strokes gained decrease one’s predicting finishing place. for example from the output above, for each additional stroke gained putting, a player’s expected finish decreases by 11. for every additional stroke gained off the tee, a player’s expected finish decreases by 247. oddly enough again, for every additional stroke gained tee to green, a player’s expected finish is actually increasing substantially. i’m not entirely sure how that works given strokes gained tee to green has to be some combination of strokes gained off the tee, on the approach and around the green.

mod<- lmer(finish2~sg_putt+sg_t2g +(season|season) + (1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_putt + sg_t2g + (season | season) + (1 | `player id`)
##    Data: linear
## 
## REML criterion at convergence: 3962.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1152 -0.7498 -0.2153  0.7732  2.0530 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.  Corr       
##  player id (Intercept) 1.594e+00 1.262e+00            
##  season    (Intercept) 5.233e-08 2.288e-04            
##            season2021  1.337e-07 3.656e-04 -1.00      
##            season2022  3.243e+01 5.695e+00 -0.83  0.83
##  Residual              2.328e+02 1.526e+01            
## Number of obs: 478, groups:  player id, 245; season, 3
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  52.6072     0.7861   66.92
## sg_putt     -11.6545     0.6173  -18.88
## sg_t2g      -12.1671     0.4502  -27.03
## 
## Correlation of Fixed Effects:
##         (Intr) sg_ptt
## sg_putt 0.095        
## sg_t2g  0.025  0.055 
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

interestingly, if we just look at strokes gained tee to green and putting, they both drastically decrease one’s expected finish with every additional stroke gained. that is the direction we would expect. I would guess perhaps there is some sort of suppression effect occurring when strokes gained tee to green is in the model with all other variables that comprise strokes gained tee to green. it might make sense thus to take it out of the machine learning model to follow.

mod<- lmer(finish2~sg_ott+sg_app+sg_putt+sg_arg +(season|season) + (1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_ott + sg_app + sg_putt + sg_arg + (season | season) +  
##     (1 | `player id`)
##    Data: linear
## 
## REML criterion at convergence: 3957.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1233 -0.7478 -0.2205  0.7683  2.0504 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.  Corr       
##  player id (Intercept) 1.691e+00 1.300e+00            
##  season    (Intercept) 8.567e-08 2.927e-04            
##            season2021  1.452e-08 1.205e-04 -1.00      
##            season2022  3.278e+01 5.725e+00 -0.99  0.99
##  Residual              2.333e+02 1.528e+01            
## Number of obs: 478, groups:  player id, 245; season, 3
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  52.6012     0.7877   66.78
## sg_ott      -12.2875     1.0860  -11.31
## sg_app      -12.1614     0.6498  -18.71
## sg_putt     -11.6544     0.6189  -18.83
## sg_arg      -12.1148     0.8668  -13.98
## 
## Correlation of Fixed Effects:
##         (Intr) sg_ott sg_app sg_ptt
## sg_ott   0.002                     
## sg_app   0.009 -0.087              
## sg_putt  0.094  0.034  0.060       
## sg_arg   0.033  0.056 -0.060 -0.012
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

finally, removing tee to green from the original model we get much more realistic results. every additional stroke gained in every area (off the tee, on the approach, around the green and putting) all decrease one’s expected finish by about 11 to 12.

set.seed(2)

preProcess <- c("center","scale")
trControl <- trainControl(method = "repeatedcv",number = 10,repeats = 10)

model <- train(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=linear, preProcess = preProcess, trControl=trControl) ### run the model 


test$finish_pred <- predict(model, test) 

head(test)
##           player season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total player id
## 1 Aaron Baddeley   2023   0.351  0.534  0.025 -0.423  0.053    0.404        16
## 2      Aaron Rai   2023  -0.391  0.136  0.390  0.265  0.882    0.491      <NA>
## 3     Aaron Wise   2023   0.559  0.228 -0.795 -0.170 -0.696   -0.138     10577
## 4    Adam Hadwin   2023   0.356  0.103  0.350  0.048  0.469    0.825      5548
## 5      Adam Long   2023   0.406  0.094 -0.539 -0.285 -0.670   -0.264      6015
## 6    Adam Schenk   2023   0.355  0.106  0.099  0.024  0.271    0.625     10372
##   preds finish_pred
## 1     1    48.94883
## 2     1    49.30457
## 3     0    70.06233
## 4     1    36.80317
## 5     0    62.19587
## 6     1    47.03707
plot<- test %>% arrange(finish_pred) 

plot<- plot[1:50,]

ggplot(plot, aes(x = reorder(player, finish_pred), y=finish_pred))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Predicted finish for 2023 US open",x="Player", y = "predicted finish")+
  theme_minimal()+
  geom_text(
    aes(label = round(finish_pred,0)),
    colour = "white", size = 2,
    vjust = 1.5, position = position_dodge(.9)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

a simple linear prediction model here suggests Scottie will finish the lowest, followed by Jon Rahm, Tyrell Hatton, Xander, Rory and Tony Finau. not too bad if you ask me, although the lowest it is predicting someone to go is 16, so maybe not entirely precise. those guys are definitely mostly playing really well right now though!

2.2 predict overall finish for only those who made the cut last year

linear<- us %>% select(player, `player id`, season, starts_with("sg"), finish2) %>%
  filter(!is.na(finish2)) %>%
  filter(finish2<900)

head(linear)
##          player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1   Adam Hadwin      5548   2022    1.30   0.80   0.54   0.21   1.54     2.85
## 2 Andrew Putnam      5502   2022    1.43   0.60  -0.48  -0.45  -0.34     1.09
## 3   Adam Schenk     10372   2022    0.66   0.65   0.14   0.14   0.93     1.59
## 4    Adam Scott       388   2022    0.54   0.79   1.20  -0.43   1.55     2.10
## 5    Aaron Wise     10577   2022    0.09   1.08  -0.34   0.53   1.26     1.34
## 6  Brian Harman      1225   2022    0.58  -0.63   0.37   0.28   0.01     0.59
##   finish2
## 1       7
## 2      31
## 3      24
## 4      14
## 5      27
## 6      43
hist(linear$finish2)

table(linear$finish2)
## 
##  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17 19 21 22 23 24 26 27 28 30 31 
##  4  5  6  5  2  2 11  5  3  2  6  6  7  4  5  5  7  7  1  8  3  5  3  4  1 10 
## 32 34 35 37 38 40 43 46 47 48 49 50 51 52 53 54 55 56 57 58 59 61 62 63 64 65 
##  3  4 12  6  5  6 12  3  2  1  7  4  3  6  1  1  4  3  4  5  2  2  2  1  1  8 
## 68 70 72 76 77 
##  2  2  2  1  1
mod<- lmer(finish2~sg_ott+sg_app+sg_putt+sg_arg +(1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_ott + sg_app + sg_putt + sg_arg + (1 | `player id`)
##    Data: linear
## 
## REML criterion at convergence: 1529.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0208 -0.6297 -0.1544  0.5621  5.3308 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  player id (Intercept)  0.00    0.000   
##  Residual              32.66    5.715   
## Number of obs: 243, groups:  player id, 146
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  48.2335     0.4932   97.80
## sg_ott      -13.1841     0.7036  -18.74
## sg_app      -14.2677     0.4637  -30.77
## sg_putt     -14.4425     0.4197  -34.41
## sg_arg      -13.9635     0.6140  -22.74
## 
## Correlation of Fixed Effects:
##         (Intr) sg_ott sg_app sg_ptt
## sg_ott  -0.153                     
## sg_app  -0.455 -0.185              
## sg_putt -0.397  0.079  0.167       
## sg_arg  -0.339  0.014  0.021  0.081
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

the results are about the same, but each additional stroke gained is reducing one’s expected finish a bit more, as expected as these are only the players who performed well enough to make the cut.

sgt<- ggplot(linear, aes(x=sg_total, y=finish2, color=finish2)) + 
  geom_point() +
  facet_wrap(~season)+
  ylab("Finish") +
  xlab("Strokes gained total")+
  geom_hline(yintercept = 10, color="red")+
  scale_colour_gradientn(colours = terrain.colors(10))

sgt2g<- ggplot(linear, aes(x=sg_t2g, y=finish2, color=finish2)) + 
  geom_point() +
  facet_wrap(~season)+
  ylab("Finish") +
  xlab("Strokes gained tee to green")+
  geom_hline(yintercept = 10, color="red")+
  scale_colour_gradientn(colours = terrain.colors(10))

sgott<- ggplot(linear, aes(x=sg_ott, y=finish2, color=finish2)) + 
  geom_point() +
  facet_wrap(~season)+
  ylab("Finish") +
  xlab("Strokes gained off the tee")+
  geom_hline(yintercept = 10, color="red")+
  scale_colour_gradientn(colours = terrain.colors(10))

sgapp<- ggplot(linear, aes(x=sg_app, y=finish2, color=finish2)) + 
  geom_point() +
  facet_wrap(~season)+
  ylab("Finish") +
  xlab("Strokes gained on the approach")+
  geom_hline(yintercept = 10, color="red")+
  scale_colour_gradientn(colours = terrain.colors(10))

sgarg<- ggplot(linear, aes(x=sg_arg, y=finish2, color=finish2)) + 
  geom_point() +
  facet_wrap(~season)+
  ylab("Finish") +
  xlab("Strokes gained around the green")+
  geom_hline(yintercept = 10, color="red")+
  scale_colour_gradientn(colours = terrain.colors(10))

sgp<- ggplot(linear, aes(x=sg_putt, y=finish2, color=finish2)) + 
  geom_point() +
  facet_wrap(~season)+
  ylab("Finish") +
  xlab("Strokes gained putting")+
  geom_hline(yintercept = 10, color="red")+
  scale_colour_gradientn(colours = terrain.colors(10))

the below plot shows all strokes gained categories and their relationships with finish over the last 3 years looked at in this analysis. strokes gained total has the clearest association with finish, which makes sense (green = lower finish = better). the red line represents a top 10 finish at a US open. for the most part, people finishing in the top 10 are also in the positive in strokes gained categories (but not always!)

ggarrange(sgt, sgt2g, sgott, sgapp, sgarg, sgp, nrow=3, ncol=2, common.legend = T)

compare these predictions to the predictions of sci-kit learn models in python

i did very similar anlayses in python (which I am just learning, and not as comfortable with yet), so I want to compare the model outputs.

it seems like sci-kit learn has better fleshed out models (and model tuning!) compared to R, so I have a hunch those should perform better

sk_class<- fread("sklearn_cut_predictions.csv", header=T, data.table=F)
head(sk_class)
##   V1         player preds
## 1  0 Aaron Baddeley     1
## 2  1     Aaron Wise     0
## 3  2    Adam Hadwin     1
## 4  3      Adam Long     1
## 5  4    Adam Schenk     1
## 6  5     Adam Scott     1
sk_class$V1<- NULL
colnames(sk_class)[2]<- "preds_cut"

sk_linear<- fread("sklearn_linear_preds.csv", header=T, data.table = F)
head(sk_linear)
##   V1 player id finish_preds         player
## 1  0        16     48.98445 Aaron Baddeley
## 2  1     10577     71.39682     Aaron Wise
## 3  2      5548     35.49096    Adam Hadwin
## 4  3      6015     61.67538      Adam Long
## 5  4     10372     45.30434    Adam Schenk
## 6  5       388     44.55440     Adam Scott
sk_linear$V1<- NULL

select just classification predictions from the R model

r_class<- test %>% select(player, preds)
class<- merge(r_class, sk_class, by="player")

head(class)
##           player preds preds_cut
## 1 Aaron Baddeley     1         1
## 2     Aaron Wise     0         0
## 3    Adam Hadwin     1         1
## 4      Adam Long     0         1
## 5    Adam Schenk     1         1
## 6     Adam Scott     1         1
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
tetrachoric(class[,2:3]) ### the classification models correlate at 81%. not super high but pretty good
## Call: tetrachoric(x = class[, 2:3])
## tetrachoric correlation 
##           preds prds_
## preds     1.00       
## preds_cut 0.81  1.00 
## 
##  with tau of 
##     preds preds_cut 
##     -0.46     -0.94
table(class$preds) ### the R model is predicting 39 people to miss the cut
## 
##  0  1 
## 39 82
table(class$preds_cut) ### the sci kit learn model is only predicting 21 people to miss the cut
## 
##   0   1 
##  21 100

the below plot shows who was predicted to the make cut or not in both models.

if someone has a 2 on the y axis (confidence number) they were predicted to make the cut in both models. if someone only has a 1, they were only predicted to make the cut in one of the models. finally, if someone has a zero, or no bar, they were predicted to miss the cut in both models.

class_long <- class %>% 
  gather(preds, preds_cut, key = "model", value = "count") %>% 
  mutate(model = recode(model, preds = "R classification model", preds_cut = "sci-kit learn model"))

ggplot(class_long, aes(x = reorder(player, -count), y = count, fill = model)) + 
  geom_bar(stat = "identity", position = "stack", color = "black") +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=80, size=5, hjust = 1))+
  xlab("player")+
  ylab("confidence number")

select just linear predictions from the R model

r_linear<- test %>% select(player, finish_pred)
lin<- merge(r_linear, sk_linear, by="player")

head(lin)
##           player finish_pred player id finish_preds
## 1 Aaron Baddeley    48.94883        16     48.98445
## 2     Aaron Wise    70.06233     10577     71.39682
## 3    Adam Hadwin    36.80317      5548     35.49096
## 4      Adam Long    62.19587      6015     61.67538
## 5    Adam Schenk    47.03707     10372     45.30434
## 6     Adam Scott    40.80673       388     44.55440
cor.test(lin$finish_pred, lin$finish_preds)
## 
##  Pearson's product-moment correlation
## 
## data:  lin$finish_pred and lin$finish_preds
## t = 110.95, df = 119, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9931230 0.9966524
## sample estimates:
##       cor 
## 0.9952012
### wow! the two linear predictions correlate at almost 100% 
r_plot<- ggplot(plot, aes(x = reorder(player, finish_pred), y=finish_pred))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Predicted finish for 2023 US open from R",x="Player", y = "predicted finish")+
  theme_minimal()+
  geom_text(
    aes(label = round(finish_pred,0)),
    colour = "white", size = 2,
    vjust = 1.5, position = position_dodge(.9)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))


plot2<- lin %>% arrange(finish_preds) 

plot2<- plot2[1:50,]

py_plot<- ggplot(plot2, aes(x = reorder(player, finish_preds), y=finish_preds))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Predicted finish for 2023 US open from sci-kit learn",x="Player", y = "predicted finish")+
  theme_minimal()+
  geom_text(
    aes(label = round(finish_preds,0)),
    colour = "white", size = 2,
    vjust = 1.5, position = position_dodge(.9)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))


ggarrange(r_plot, py_plot, nrow=2, ncol=1, common.legend = F)

the people we are most confident will make the cut and finish top 20?

top<- merge(lin, class, by="player")

top<- top %>% filter(preds==1 & preds_cut==1) %>%
  filter(finish_pred< 40 & finish_preds<40)

season<- test %>% select(player, starts_with("sg"))

top<- merge(top, season, by="player")

these are 33 people we’re pretty confident will do well. here, we plot their strokes gained from the 2023 season thus far, arranged in order of predicted finish:

sgt<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_total, color=finish_preds)) + 
  geom_point() +
  xlab("Player") +
  ggtitle("Strokes gained total")+
    theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
  scale_colour_gradientn(colours = terrain.colors(10))

sgt2g<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_t2g, color=finish_preds)) + 
  geom_point() +
  xlab("Player") +
  ggtitle("Strokes gained tee to green")+
  theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
  scale_colour_gradientn(colours = terrain.colors(10))

sgott<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_ott, color=finish_preds)) + 
  geom_point() +
  xlab("Player") +
  ggtitle("Strokes gained off the tee")+
    theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
  scale_colour_gradientn(colours = terrain.colors(10))

sgapp<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_app, color=finish_preds)) + 
  geom_point() +
  xlab("Player") +
  ggtitle("Strokes gained on the approach")+
    theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
  scale_colour_gradientn(colours = terrain.colors(10))

sgarg<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_arg, color=finish_preds)) + 
  geom_point() +
  xlab("Player") +
  ggtitle("Strokes gained around the green")+
    theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
  scale_colour_gradientn(colours = terrain.colors(10))

sgp<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_putt, color=finish_preds)) + 
  geom_point() +
  xlab("Player") +
  ggtitle("Strokes gained putting")+
    theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
  scale_colour_gradientn(colours = terrain.colors(10))
ggarrange(sgt, sgt2g, sgott, sgapp, sgarg, sgp, nrow=3, ncol=2, common.legend = F)