data from https://www.kaggle.com/datasets/robikscube/pga-tour-golf-data-20152022
with the US open a day away, I'm interested in looking at data from past years to see where gaining strokes on the field is the biggest predictor of success. the US open is known for extremely tough rough, so I suspect that success at this tournament, compared to others, might be better predicted by certain strokes-gained categories (perhaps around the green or on the approach) than by others, like putting.
dat<- fread("/Users/claire/Documents/Script//ASA All PGA Raw Data - Tourn Level.csv", header=T, data.table=F)
dat$finish2<- ifelse(dat$Finish=="CUT", 999, ifelse(dat$Finish=="DQ" | dat$Finish=="MDF" | dat$Finish=="WD" | dat$Finish=="W/D", NA, dat$Finish))
# that code creates a new variable (finish2) that codes CUT as 999 and anyone we don't care about (WD/DQ/MDF) as NA. now we can strip the "T" (which stands for a tie) and convert the variable to numeric
dat$finish2<- gsub(pattern = 'T', x = dat$finish2, replacement = '', fixed = T)
dat$finish2<- as.numeric(dat$finish2)
dat$finish_cat<- ifelse(dat$finish2==1, "first", ifelse(dat$finish2==999, "cut", ifelse(dat$finish2>1, "not cut", NA)))
table(dat$finish_cat)
##
## cut first not cut
## 12741 251 17075
# unique(dat$`tournament name`)
us<- dat %>% filter(`tournament name`=="U.S. Open")
table(us$season)
##
## 2015 2016 2017 2018 2019 2021 2022
## 109 100 116 112 121 262 103
head(us)
## Player_initial_last tournament id player id hole_par strokes hole_DKP
## 1 A. Hadwin 401353222 5548 280 279 58.0
## 2 A. Noren 401353222 3832 140 146 17.0
## 3 A. Putnam 401353222 5502 280 286 45.5
## 4 A. Schenk 401353222 10372 280 284 53.0
## 5 A. Scott 401353222 388 280 282 54.0
## 6 A. Wise 401353222 10577 280 285 54.0
## hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP n_rounds made_cut pos
## 1 52.9 60 3 8.6 3 4 1 7
## 2 11.8 22 0 0.0 0 2 0 NA
## 3 37.5 51 0 0.0 0 4 1 31
## 4 42.8 54 0 12.2 0 4 1 24
## 5 46.8 56 0 1.2 0 4 1 14
## 6 43.1 52 0 6.4 0 4 1 27
## finish_DKP finish_FDP finish_SDP total_DKP total_FDP total_SDP
## 1 10 10 7 71.0 71.5 70
## 2 0 0 0 17.0 11.8 22
## 3 2 1 0 47.5 38.5 51
## 4 4 3 1 57.0 58.0 55
## 5 6 5 3 60.0 53.0 59
## 6 3 2 0 57.0 51.5 52
## player Unnamed: 2 Unnamed: 3 Unnamed: 4 tournament name
## 1 Adam Hadwin NA NA NA U.S. Open
## 2 Alexander Noren NA NA NA U.S. Open
## 3 Andrew Putnam NA NA NA U.S. Open
## 4 Adam Schenk NA NA NA U.S. Open
## 5 Adam Scott NA NA NA U.S. Open
## 6 Aaron Wise NA NA NA U.S. Open
## course date purse season no_cut Finish
## 1 The Country Club - Brookline, MA 2022-06-19 17.5 2022 0 T7
## 2 The Country Club - Brookline, MA 2022-06-19 17.5 2022 0 CUT
## 3 The Country Club - Brookline, MA 2022-06-19 17.5 2022 0 T31
## 4 The Country Club - Brookline, MA 2022-06-19 17.5 2022 0 T24
## 5 The Country Club - Brookline, MA 2022-06-19 17.5 2022 0 T14
## 6 The Country Club - Brookline, MA 2022-06-19 17.5 2022 0 T27
## sg_putt sg_arg sg_app sg_ott sg_t2g sg_total finish2 finish_cat
## 1 1.30 0.80 0.54 0.21 1.54 2.85 7 not cut
## 2 -1.42 0.73 0.87 -0.81 0.80 -0.62 999 cut
## 3 1.43 0.60 -0.48 -0.45 -0.34 1.09 31 not cut
## 4 0.66 0.65 0.14 0.14 0.93 1.59 24 not cut
## 5 0.54 0.79 1.20 -0.43 1.55 2.10 14 not cut
## 6 0.09 1.08 -0.34 0.53 1.26 1.34 27 not cut
corrs<- us[,c(32:37,38)] # columns 32-37 are the six sg_* variables; 38 is finish2
res2 <- rcorr(as.matrix(corrs))
corrs<- res2$r
corrplot(corrs, method = "color",
type = "full", number.cex = .6,
addCoef.col = "black",
tl.col = "black", tl.srt = 90)
the correlation plot above is almost exactly what I would expect. the individual strokes-gained categories don't relate strongly to each other (|r| = 0.01-0.09), with the exception of strokes gained tee to green and strokes gained total. I'm not 100% sure how those two are calculated, but they are definitely composites of the other categories (e.g. strokes gained tee to green = strokes gained off the tee + on the approach + around the green). consistent with that, strokes gained tee to green does not correlate strongly with strokes gained putting. this is good, as it shows our variables are largely independent of each other! however, it also means we should be wary of including strokes gained tee to green and strokes gained total in our models, as they might introduce multicollinearity: strokes gained total correlates very highly with strokes gained tee to green, r = 0.80.
otherwise, strokes gained total and tee to green are most strongly associated with finish in the US open (r = -0.55 to -0.68). the other strokes gained categories have correlations with finish ranging from -0.21 to -0.42. this means that, on average, as strokes gained increase, finishing position in the US open decreases (i.e., players finish closer to 1st).
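a quick sanity check of that composition (a sketch, assuming the sg_* columns follow the usual strokes-gained definitions; not run here):
# with(us, cor(sg_t2g, sg_ott + sg_app + sg_arg, use = "complete.obs"))
# # a correlation near 1 would confirm sg_t2g is (approximately) the sum of the other three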
this is a classic machine learning classification problem. we can run logistic regression now on past years and see what predicts making the cut, but we cannot directly score predictions for this year's tournament until we have its finishing results.
train<- us %>% select(player, `player id`, season, starts_with("sg"), finish_cat)
head(train)
## player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Adam Hadwin 5548 2022 1.30 0.80 0.54 0.21 1.54 2.85
## 2 Alexander Noren 3832 2022 -1.42 0.73 0.87 -0.81 0.80 -0.62
## 3 Andrew Putnam 5502 2022 1.43 0.60 -0.48 -0.45 -0.34 1.09
## 4 Adam Schenk 10372 2022 0.66 0.65 0.14 0.14 0.93 1.59
## 5 Adam Scott 388 2022 0.54 0.79 1.20 -0.43 1.55 2.10
## 6 Aaron Wise 10577 2022 0.09 1.08 -0.34 0.53 1.26 1.34
## finish_cat
## 1 not cut
## 2 cut
## 3 not cut
## 4 not cut
## 5 not cut
## 6 not cut
summary(train)
## player player id season sg_putt
## Length:923 Min. : 16 Min. :2015 Min. :-3.5800
## Class :character 1st Qu.: 1367 1st Qu.:2017 1st Qu.:-0.8800
## Mode :character Median : 4513 Median :2019 Median : 0.0100
## Mean : 178933 Mean :2019 Mean :-0.0717
## 3rd Qu.: 9025 3rd Qu.:2021 3rd Qu.: 0.8000
## Max. :4705886 Max. :2022 Max. : 2.8400
## NA's :442
## sg_arg sg_app sg_ott sg_t2g
## Min. :-3.7200 Min. :-3.8100 Min. :-2.9700 Min. :-4.7500
## 1st Qu.:-0.4900 1st Qu.:-0.5700 1st Qu.:-0.3600 1st Qu.:-0.9700
## Median : 0.0300 Median : 0.1200 Median : 0.0400 Median : 0.0300
## Mean :-0.0178 Mean : 0.0274 Mean : 0.0093 Mean : 0.0191
## 3rd Qu.: 0.5600 3rd Qu.: 0.7200 3rd Qu.: 0.4200 3rd Qu.: 1.1600
## Max. : 2.1200 Max. : 3.0700 Max. : 2.0300 Max. : 4.4800
## NA's :442 NA's :442 NA's :442 NA's :442
## sg_total finish_cat
## Min. :-6.090 Length:923
## 1st Qu.:-1.220 Class :character
## Median :-0.030 Mode :character
## Mean :-0.051
## 3rd Qu.: 1.280
## Max. : 5.570
## NA's :442
# train %>% filter(is.na(sg_putt)) # if anyone is NA they WD'd or were DQ'd
train<- train %>% filter(!is.na(finish_cat)) # no NAs
train$finish_class<- ifelse(train$finish_cat=="cut", 0, 1)
table(train$finish_cat)
##
## cut first not cut
## 235 4 239
table(train$finish_class)
##
## 0 1
## 235 243
train$finish_cat<- NULL
## saving for ML model in python
# py<- train %>% select(-c(player, sg_t2g, sg_total))
# fwrite(py, "usopen_classification_train.csv", sep=',')
I copied and pasted these data into spreadsheets from https://www.pgatour.com/stats/strokes-gained and saved each as a CSV.
names<- c("sg_total", "sg_ott", "sg_putt", "sg_app", "sg_arg", "sg_t2g")
dat<- list()
for (i in names) {
file<- paste0("/Users/claire/Desktop/usopen/",i, ".csv")
dat[[i]]<- fread(file, header=T, data.table = F)
}
tmp1<- merge(dat[[1]], dat[[2]], by="player")
tmp2<- merge(tmp1, dat[[3]], by="player")
tmp3<- merge(tmp2, dat[[4]], by="player")
tmp4<- merge(tmp3, dat[[5]], by="player")
test<- merge(tmp4, dat[[6]], by="player")
head(test)
## player sg_total sg_ott sg_putt sg_app sg_arg sg_t2g
## 1 Aaron Baddeley 0.404 -0.423 0.351 0.025 0.534 0.053
## 2 Aaron Rai 0.491 0.265 -0.391 0.390 0.136 0.882
## 3 Aaron Wise -0.138 -0.170 0.559 -0.795 0.228 -0.696
## 4 Adam Hadwin 0.825 0.048 0.356 0.350 0.103 0.469
## 5 Adam Long -0.264 -0.285 0.406 -0.539 0.094 -0.670
## 6 Adam Schenk 0.625 0.024 0.355 0.099 0.106 0.271
nrow(test)
## [1] 198
rm(tmp1,tmp2,tmp3,tmp4,dat)
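as an aside, the chain of pairwise merges above can be collapsed with Reduce (a sketch, assuming every file shares the player column; same result as the tmp1-tmp4 approach):
# files<- paste0("/Users/claire/Desktop/usopen/", names, ".csv")
# sg_list<- lapply(files, fread, header=T, data.table=F)
# test<- Reduce(function(x, y) merge(x, y, by="player"), sg_list)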
now we have training data from the 2019-2022 US opens and testing data consisting of each player's strokes gained in 2023 thus far.
head(test)
## player sg_total sg_ott sg_putt sg_app sg_arg sg_t2g
## 1 Aaron Baddeley 0.404 -0.423 0.351 0.025 0.534 0.053
## 2 Aaron Rai 0.491 0.265 -0.391 0.390 0.136 0.882
## 3 Aaron Wise -0.138 -0.170 0.559 -0.795 0.228 -0.696
## 4 Adam Hadwin 0.825 0.048 0.356 0.350 0.103 0.469
## 5 Adam Long -0.264 -0.285 0.406 -0.539 0.094 -0.670
## 6 Adam Schenk 0.625 0.024 0.355 0.099 0.106 0.271
head(train)
## player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Adam Hadwin 5548 2022 1.30 0.80 0.54 0.21 1.54 2.85
## 2 Alexander Noren 3832 2022 -1.42 0.73 0.87 -0.81 0.80 -0.62
## 3 Andrew Putnam 5502 2022 1.43 0.60 -0.48 -0.45 -0.34 1.09
## 4 Adam Schenk 10372 2022 0.66 0.65 0.14 0.14 0.93 1.59
## 5 Adam Scott 388 2022 0.54 0.79 1.20 -0.43 1.55 2.10
## 6 Aaron Wise 10577 2022 0.09 1.08 -0.34 0.53 1.26 1.34
## finish_class
## 1 1
## 2 0
## 3 1
## 4 1
## 5 1
## 6 1
test$season<- 2023
test<- test %>% select(colnames(train)[c(1,3:9)])
head(train)
## player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Adam Hadwin 5548 2022 1.30 0.80 0.54 0.21 1.54 2.85
## 2 Alexander Noren 3832 2022 -1.42 0.73 0.87 -0.81 0.80 -0.62
## 3 Andrew Putnam 5502 2022 1.43 0.60 -0.48 -0.45 -0.34 1.09
## 4 Adam Schenk 10372 2022 0.66 0.65 0.14 0.14 0.93 1.59
## 5 Adam Scott 388 2022 0.54 0.79 1.20 -0.43 1.55 2.10
## 6 Aaron Wise 10577 2022 0.09 1.08 -0.34 0.53 1.26 1.34
## finish_class
## 1 1
## 2 0
## 3 1
## 4 1
## 5 1
## 6 1
head(test)
## player season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Aaron Baddeley 2023 0.351 0.534 0.025 -0.423 0.053 0.404
## 2 Aaron Rai 2023 -0.391 0.136 0.390 0.265 0.882 0.491
## 3 Aaron Wise 2023 0.559 0.228 -0.795 -0.170 -0.696 -0.138
## 4 Adam Hadwin 2023 0.356 0.103 0.350 0.048 0.469 0.825
## 5 Adam Long 2023 0.406 0.094 -0.539 -0.285 -0.670 -0.264
## 6 Adam Schenk 2023 0.355 0.106 0.099 0.024 0.271 0.625
sum(test$player %in% train$player) # 121 in the testing set are in the training set
## [1] 121
ids<- train %>% select(player, `player id`) %>%
distinct(player, `player id`)
test$`player id` <- factor(test$player, levels=ids$player, labels=ids$`player id`)
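a note: the factor() trick works for this lookup because each player maps to a single id, but a match()-based lookup (a sketch) is more direct, keeps `player id` numeric rather than turning it into a factor (the factor version is what triggers the '|' warning in predict() later), and still returns NA for 2023 players not in the training ids:
# test$`player id` <- ids$`player id`[match(test$player, ids$player)]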
## saving for ML model in python
# py<- test %>% select(-c(sg_t2g, sg_total))
# py$`player id` <- factor(py$player, levels=ids$player, labels=ids$`player id`)
# py<- na.omit(py)
# py$player<- NULL
# py<- py %>% select(colnames(train[c(2:7)]))
# fwrite(py, "usopen_test.csv", sep=',')
# fwrite(ids, "us_open_ids.csv")
here, we let strokes gained predict whether someone made the cut or not, with random intercepts and slopes for both player and season. random intercepts allow each strokes-gained category its own intercept (or starting point) per season and per player, while still drawing from the same joint distribution: strokes gained in a certain area may run higher in some seasons than in others, and some players might gain more strokes in certain areas than others due to specific skill sets or injuries, so it is important to allow random intercepts for players who are measured more than once. this accounts for the non-independence of observations that comes from the same players and years appearing multiple times.
random slopes likewise let each strokes-gained category have its own effect on making the cut per season, because the course, as well as each player, changes year to year; some courses might have tighter fairways or harder greens, worse weather, etc.
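for reference, the mixed-effects logistic model described above would be fit with lme4::glmer rather than glm() (a sketch, not run here; random slopes by season could be added as (1 + sg_ott + sg_app + sg_putt + sg_arg | season) if the data supported them):
# library(lme4)
# mod_mixed<- glmer(finish_class ~ sg_ott + sg_app + sg_putt + sg_arg +
#                     (1 | season) + (1 | `player id`),
#                   family = binomial, data = train)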
# nb: glm() does not understand lme4-style random-effect syntax; (season|season) and (1|`player id`) get evaluated as elementwise logical '|' comparisons (hence the NA rows below), and with no family argument this fits a gaussian linear probability model, not logistic regression
mod<- glm(finish_class~sg_ott+sg_app+sg_putt+sg_t2g+sg_arg + (season|season) + (1|`player id`), data=train)
summary(mod)
##
## Call:
## glm(formula = finish_class ~ sg_ott + sg_app + sg_putt + sg_t2g +
## sg_arg + (season | season) + (1 | `player id`), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.64916 -0.32210 0.02423 0.29593 0.87129
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.51829 0.01706 30.374 <2e-16 ***
## sg_ott 5.71459 2.59785 2.200 0.0283 *
## sg_app 5.73966 2.59702 2.210 0.0276 *
## sg_putt 0.16912 0.01498 11.292 <2e-16 ***
## sg_t2g -5.55827 2.59743 -2.140 0.0329 *
## sg_arg 5.74931 2.59791 2.213 0.0274 *
## season | seasonTRUE NA NA NA NA
## 1 | `player id`TRUE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1383402)
##
## Null deviance: 119.467 on 477 degrees of freedom
## Residual deviance: 65.297 on 472 degrees of freedom
## AIC: 418.96
##
## Number of Fisher Scoring iterations: 2
exp(coef(mod))
## (Intercept) sg_ott sg_app sg_putt
## 1.679149e+00 3.032601e+02 3.109586e+02 1.184267e+00
## sg_t2g sg_arg season | seasonTRUE 1 | `player id`TRUE
## 3.855443e-03 3.139753e+02 NA NA
strokes gained almost everywhere else still increase one's odds of making the cut; however, strokes gained tee to green appears to have a negative effect. one caveat when reading the exponentiated coefficients above as odds ratios: because the model was fit with a gaussian family (and the random-effect terms were dropped), they are only rough approximations. taking them at face value, every additional stroke gained putting multiplies one's odds of making the cut by about 1.18, every stroke gained off the tee by about 303, and every additional stroke gained tee to green by 0.004, i.e. working against making the cut. a very small number, but odd that it is in that direction; inflated, opposite-signed coefficients like these are a classic symptom of multicollinearity.
# same caveats as above: the random-effect-style terms are ignored and the family is gaussian
mod<- glm(finish_class~sg_ott+sg_app+sg_putt+sg_arg + (season|season) + (1|`player id`), data=train)
summary(mod)
##
## Call:
## glm(formula = finish_class ~ sg_ott + sg_app + sg_putt + sg_arg +
## (season | season) + (1 | `player id`), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.63557 -0.32433 0.02811 0.30099 0.92712
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.51745 0.01712 30.219 < 2e-16 ***
## sg_ott 0.15571 0.02638 5.902 6.87e-09 ***
## sg_app 0.18236 0.01579 11.550 < 2e-16 ***
## sg_putt 0.17086 0.01501 11.382 < 2e-16 ***
## sg_arg 0.19021 0.02110 9.014 < 2e-16 ***
## season | seasonTRUE NA NA NA NA
## 1 | `player id`TRUE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1393871)
##
## Null deviance: 119.47 on 477 degrees of freedom
## Residual deviance: 65.93 on 473 degrees of freedom
## AIC: 421.58
##
## Number of Fisher Scoring iterations: 2
exp(coef(mod))
## (Intercept) sg_ott sg_app sg_putt
## 1.677751 1.168486 1.200052 1.186329
## sg_arg season | seasonTRUE 1 | `player id`TRUE
## 1.209507 NA NA
if we remove strokes gained tee to green, the exponentiated coefficients make a lot more sense and are all in the direction we would expect: gaining strokes in every area significantly increases your chances of making the cut.
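as a check on those numbers, refitting with family = binomial puts the coefficients on the log-odds scale, so exp(coef()) gives true odds ratios (a sketch, not run here):
# mod_logit<- glm(finish_class ~ sg_ott + sg_app + sg_putt + sg_arg,
#                 family = binomial, data = train)
# exp(coef(mod_logit))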
now we can make predictions about who will make the cut. using the predict function, we get fitted values that approximate probabilities of making the cut, so I am just going to round() them to classify who will and who will not make the cut. (the warnings below come from the leftover random-effect-style terms: `player id` is a factor in the test set, so '|' is not meaningful for it, and the fit is rank-deficient because those terms were aliased.)
test$preds <- mod %>% predict(test, type = "response")
## Warning in Ops.factor(1, `player id`): '|' not meaningful for factors
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
hist(test$preds)
test$preds<- round(test$preds)
hist(test$preds)
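since the gaussian fit can produce values outside [0, 1], an explicit cutoff (a sketch, not run since preds is already rounded above) is a slightly safer version of the round() call, as it always returns 0 or 1:
# test$preds<- as.integer(test$preds > 0.5)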
madecut<- test %>% filter(preds==1) %>%
ggplot(aes(x=player, y=sg_total, color=sg_total)) +
geom_point() +
ylab("average total strokes gained 2023") +
xlab("player")+
ggtitle("predicted to make cut")+
theme(legend.position = "blank", axis.text.x = element_text(angle=75, size=4, hjust = 1)) +
scale_colour_gradient(low="red", high="darkgreen")
missedcut<- test %>% filter(preds==0) %>%
ggplot(aes(x=player, y=sg_total, color=sg_total)) +
geom_point() +
ylab("average total strokes gained 2023") +
xlab("player")+
ggtitle("predicted to miss cut")+
theme(legend.position = "blank", axis.text.x = element_text(angle=75, size=4, hjust = 1))+
scale_colour_gradient(low="red", high="darkgreen")
ggarrange(madecut, missedcut, nrow=2, ncol=1, common.legend = F)
here, we can fit linear mixed models to predict each person's finish based on strokes gained.
linear<- us %>% select(player, `player id`, season, starts_with("sg"), finish2) %>%
filter(!is.na(finish2))
head(linear)
## player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Adam Hadwin 5548 2022 1.30 0.80 0.54 0.21 1.54 2.85
## 2 Alexander Noren 3832 2022 -1.42 0.73 0.87 -0.81 0.80 -0.62
## 3 Andrew Putnam 5502 2022 1.43 0.60 -0.48 -0.45 -0.34 1.09
## 4 Adam Schenk 10372 2022 0.66 0.65 0.14 0.14 0.93 1.59
## 5 Adam Scott 388 2022 0.54 0.79 1.20 -0.43 1.55 2.10
## 6 Aaron Wise 10577 2022 0.09 1.08 -0.34 0.53 1.26 1.34
## finish2
## 1 7
## 2 999
## 3 31
## 4 24
## 5 14
## 6 27
hist(linear$finish2)
linear$finish2<- ifelse(linear$finish2==999, 78, linear$finish2) # recode missed cuts (999) as 78, one place worse than the worst observed finish (77)
# nb: (season|season) is an unusual term (a season slope grouped by season itself); a plain random intercept would be (1|season), and the singular fit below partly reflects this
mod<- lmer(finish2~sg_ott+sg_app+sg_putt+sg_t2g+sg_arg +(season|season) + (1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_ott + sg_app + sg_putt + sg_t2g + sg_arg + (season |
## season) + (1 | `player id`)
## Data: linear
##
## REML criterion at convergence: 3939
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1196 -0.7250 -0.2138 0.7812 2.0883
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## player id (Intercept) 1.8660 1.3660
## season (Intercept) 0.0000 0.0000
## season2021 0.1682 0.4101 NaN
## season2022 40.0114 6.3255 NaN -0.80
## Residual 230.0580 15.1677
## Number of obs: 478, groups: player id, 245; season, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 52.4833 0.8305 63.198
## sg_ott -298.5576 107.2274 -2.784
## sg_app -298.3450 107.1924 -2.783
## sg_putt -11.5875 0.6154 -18.828
## sg_t2g 286.2186 107.2042 2.670
## sg_arg -298.3895 107.2276 -2.783
##
## Correlation of Fixed Effects:
## (Intr) sg_ott sg_app sg_ptt sg_t2g
## sg_ott 0.070
## sg_app 0.070 1.000
## sg_putt 0.084 -0.042 -0.042
## sg_t2g -0.070 -1.000 -1.000 0.043
## sg_arg 0.070 1.000 1.000 -0.043 -1.000
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
## saving for ML model in python
# py<- linear %>% select(-c(sg_t2g, sg_total))
# py$`player id` <- factor(py$player, levels=ids$player, labels=ids$`player id`)
# py<- na.omit(py)
# py$player<- NULL
# py<- py %>% select(colnames(train[c(2:7)]), finish2)
# fwrite(py, "usopen_regression_train.csv", sep=',')
the linear mixed model is very similar to the classification model above, but now we are predicting overall finish in the US open, not just making/missing the cut. here, we assigned everyone who missed the cut a finish of 78 (last place across all the US open data here was 77). again, we see strokes gained almost everywhere makes a big impact on finish. for all variables except strokes gained tee to green, additional strokes gained lower one's predicted finishing place. for example, from the output above, each additional stroke gained putting lowers a player's expected finish by about 11.6 places, and each additional stroke gained off the tee by about 299. oddly enough, each additional stroke gained tee to green substantially worsens a player's expected finish. I'm not entirely sure how that works, given strokes gained tee to green has to be some combination of strokes gained off the tee, on the approach, and around the green; the correlations of fixed effects in the output (±1.00 among sg_ott, sg_app, sg_arg, and sg_t2g) point to exactly that redundancy.
mod<- lmer(finish2~sg_putt+sg_t2g +(season|season) + (1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_putt + sg_t2g + (season | season) + (1 | `player id`)
## Data: linear
##
## REML criterion at convergence: 3962.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1152 -0.7498 -0.2153 0.7732 2.0530
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## player id (Intercept) 1.594e+00 1.262e+00
## season (Intercept) 5.233e-08 2.288e-04
## season2021 1.337e-07 3.656e-04 -1.00
## season2022 3.243e+01 5.695e+00 -0.83 0.83
## Residual 2.328e+02 1.526e+01
## Number of obs: 478, groups: player id, 245; season, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 52.6072 0.7861 66.92
## sg_putt -11.6545 0.6173 -18.88
## sg_t2g -12.1671 0.4502 -27.03
##
## Correlation of Fixed Effects:
## (Intr) sg_ptt
## sg_putt 0.095
## sg_t2g 0.025 0.055
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
interestingly, if we just look at strokes gained tee to green and putting, each additional stroke gained in either drastically improves one's expected finish, which is the direction we would expect. my guess is that there is some sort of suppression effect when strokes gained tee to green sits in the model alongside the variables that comprise it. it thus makes sense to leave it out of the machine learning model to follow.
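one way to check the suppression/collinearity story (a sketch, assuming the car package is installed): variance inflation factors on a fixed-effects-only version of the model. enormous VIFs for sg_t2g and its components would confirm the redundancy (if the components summed to sg_t2g exactly, lm() would alias one of them instead).
# library(car)
# vif(lm(finish2 ~ sg_ott + sg_app + sg_putt + sg_t2g + sg_arg, data = linear))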
mod<- lmer(finish2~sg_ott+sg_app+sg_putt+sg_arg +(season|season) + (1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_ott + sg_app + sg_putt + sg_arg + (season | season) +
## (1 | `player id`)
## Data: linear
##
## REML criterion at convergence: 3957.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1233 -0.7478 -0.2205 0.7683 2.0504
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## player id (Intercept) 1.691e+00 1.300e+00
## season (Intercept) 8.567e-08 2.927e-04
## season2021 1.452e-08 1.205e-04 -1.00
## season2022 3.278e+01 5.725e+00 -0.99 0.99
## Residual 2.333e+02 1.528e+01
## Number of obs: 478, groups: player id, 245; season, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 52.6012 0.7877 66.78
## sg_ott -12.2875 1.0860 -11.31
## sg_app -12.1614 0.6498 -18.71
## sg_putt -11.6544 0.6189 -18.83
## sg_arg -12.1148 0.8668 -13.98
##
## Correlation of Fixed Effects:
## (Intr) sg_ott sg_app sg_ptt
## sg_ott 0.002
## sg_app 0.009 -0.087
## sg_putt 0.094 0.034 0.060
## sg_arg 0.033 0.056 -0.060 -0.012
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
finally, removing tee to green from the original model gives much more realistic results: every additional stroke gained in each area (off the tee, on the approach, around the green, and putting) improves one's expected finish by about 11 to 12 places.
set.seed(2)
preProcess <- c("center","scale")
trControl <- trainControl(method = "repeatedcv",number = 10,repeats = 10)
model <- train(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=linear, preProcess = preProcess, trControl=trControl) ### run the model; nb: no method is specified, so caret uses its default, method="rf" (a random forest)
test$finish_pred <- predict(model, test)
head(test)
## player season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total player id
## 1 Aaron Baddeley 2023 0.351 0.534 0.025 -0.423 0.053 0.404 16
## 2 Aaron Rai 2023 -0.391 0.136 0.390 0.265 0.882 0.491 <NA>
## 3 Aaron Wise 2023 0.559 0.228 -0.795 -0.170 -0.696 -0.138 10577
## 4 Adam Hadwin 2023 0.356 0.103 0.350 0.048 0.469 0.825 5548
## 5 Adam Long 2023 0.406 0.094 -0.539 -0.285 -0.670 -0.264 6015
## 6 Adam Schenk 2023 0.355 0.106 0.099 0.024 0.271 0.625 10372
## preds finish_pred
## 1 1 48.94883
## 2 1 49.30457
## 3 0 70.06233
## 4 1 36.80317
## 5 0 62.19587
## 6 1 47.03707
plot<- test %>% arrange(finish_pred)
plot<- plot[1:50,]
ggplot(plot, aes(x = reorder(player, finish_pred), y=finish_pred))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Predicted finish for 2023 US open",x="Player", y = "predicted finish")+
theme_minimal()+
geom_text(
aes(label = round(finish_pred,0)),
colour = "white", size = 2,
vjust = 1.5, position = position_dodge(.9)) +
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
the caret model (a random forest, per the note above) suggests Scottie will finish the lowest, followed by Jon Rahm, Tyrrell Hatton, Xander, Rory, and Tony Finau. not too bad if you ask me, although the lowest it predicts anyone to finish is 16th, so maybe not entirely precise. those guys are definitely mostly playing really well right now, though!
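an explicitly linear version (a sketch, not run here) would be:
# model_lm<- train(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=linear,
#                  method="lm", preProcess=preProcess, trControl=trControl)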
linear<- us %>% select(player, `player id`, season, starts_with("sg"), finish2) %>%
filter(!is.na(finish2)) %>%
filter(finish2<900)
head(linear)
## player player id season sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## 1 Adam Hadwin 5548 2022 1.30 0.80 0.54 0.21 1.54 2.85
## 2 Andrew Putnam 5502 2022 1.43 0.60 -0.48 -0.45 -0.34 1.09
## 3 Adam Schenk 10372 2022 0.66 0.65 0.14 0.14 0.93 1.59
## 4 Adam Scott 388 2022 0.54 0.79 1.20 -0.43 1.55 2.10
## 5 Aaron Wise 10577 2022 0.09 1.08 -0.34 0.53 1.26 1.34
## 6 Brian Harman 1225 2022 0.58 -0.63 0.37 0.28 0.01 0.59
## finish2
## 1 7
## 2 31
## 3 24
## 4 14
## 5 27
## 6 43
hist(linear$finish2)
table(linear$finish2)
##
## 1 2 3 4 5 6 7 8 9 10 12 13 14 15 16 17 19 21 22 23 24 26 27 28 30 31
## 4 5 6 5 2 2 11 5 3 2 6 6 7 4 5 5 7 7 1 8 3 5 3 4 1 10
## 32 34 35 37 38 40 43 46 47 48 49 50 51 52 53 54 55 56 57 58 59 61 62 63 64 65
## 3 4 12 6 5 6 12 3 2 1 7 4 3 6 1 1 4 3 4 5 2 2 2 1 1 8
## 68 70 72 76 77
## 2 2 2 1 1
mod<- lmer(finish2~sg_ott+sg_app+sg_putt+sg_arg +(1|`player id`), data=linear)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: finish2 ~ sg_ott + sg_app + sg_putt + sg_arg + (1 | `player id`)
## Data: linear
##
## REML criterion at convergence: 1529.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0208 -0.6297 -0.1544 0.5621 5.3308
##
## Random effects:
## Groups Name Variance Std.Dev.
## player id (Intercept) 0.00 0.000
## Residual 32.66 5.715
## Number of obs: 243, groups: player id, 146
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 48.2335 0.4932 97.80
## sg_ott -13.1841 0.7036 -18.74
## sg_app -14.2677 0.4637 -30.77
## sg_putt -14.4425 0.4197 -34.41
## sg_arg -13.9635 0.6140 -22.74
##
## Correlation of Fixed Effects:
## (Intr) sg_ott sg_app sg_ptt
## sg_ott -0.153
## sg_app -0.455 -0.185
## sg_putt -0.397 0.079 0.167
## sg_arg -0.339 0.014 0.021 0.081
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
the results are about the same, but each additional stroke gained improves one's expected finish a bit more, which makes sense, as these are only the players who performed well enough to make the cut.
sgt<- ggplot(linear, aes(x=sg_total, y=finish2, color=finish2)) +
geom_point() +
facet_wrap(~season)+
ylab("Finish") +
xlab("Strokes gained total")+
geom_hline(yintercept = 10, color="red")+
scale_colour_gradientn(colours = terrain.colors(10))
sgt2g<- ggplot(linear, aes(x=sg_t2g, y=finish2, color=finish2)) +
geom_point() +
facet_wrap(~season)+
ylab("Finish") +
xlab("Strokes gained tee to green")+
geom_hline(yintercept = 10, color="red")+
scale_colour_gradientn(colours = terrain.colors(10))
sgott<- ggplot(linear, aes(x=sg_ott, y=finish2, color=finish2)) +
geom_point() +
facet_wrap(~season)+
ylab("Finish") +
xlab("Strokes gained off the tee")+
geom_hline(yintercept = 10, color="red")+
scale_colour_gradientn(colours = terrain.colors(10))
sgapp<- ggplot(linear, aes(x=sg_app, y=finish2, color=finish2)) +
geom_point() +
facet_wrap(~season)+
ylab("Finish") +
xlab("Strokes gained on the approach")+
geom_hline(yintercept = 10, color="red")+
scale_colour_gradientn(colours = terrain.colors(10))
sgarg<- ggplot(linear, aes(x=sg_arg, y=finish2, color=finish2)) +
geom_point() +
facet_wrap(~season)+
ylab("Finish") +
xlab("Strokes gained around the green")+
geom_hline(yintercept = 10, color="red")+
scale_colour_gradientn(colours = terrain.colors(10))
sgp<- ggplot(linear, aes(x=sg_putt, y=finish2, color=finish2)) +
geom_point() +
facet_wrap(~season)+
ylab("Finish") +
xlab("Strokes gained putting")+
geom_hline(yintercept = 10, color="red")+
scale_colour_gradientn(colours = terrain.colors(10))
the plot below shows all strokes gained categories and their relationships with finish over the three seasons in this analysis. strokes gained total has the clearest association with finish, which makes sense (green = lower finish number = better). the red line marks a top-10 finish at a US open. for the most part, people finishing in the top 10 are also in the positive across strokes gained categories (but not always!).
ggarrange(sgt, sgt2g, sgott, sgapp, sgarg, sgp, nrow=3, ncol=2, common.legend = T)
i did very similar analyses in python (which I am just learning, and am not as comfortable with yet), so I want to compare the model outputs.
scikit-learn seems to have more fully fleshed-out models (and model tuning!) than what I used in R, so I have a hunch those should perform better.
sk_class<- fread("sklearn_cut_predictions.csv", header=T, data.table=F)
head(sk_class)
## V1 player preds
## 1 0 Aaron Baddeley 1
## 2 1 Aaron Wise 0
## 3 2 Adam Hadwin 1
## 4 3 Adam Long 1
## 5 4 Adam Schenk 1
## 6 5 Adam Scott 1
sk_class$V1<- NULL
colnames(sk_class)[2]<- "preds_cut"
sk_linear<- fread("sklearn_linear_preds.csv", header=T, data.table = F)
head(sk_linear)
## V1 player id finish_preds player
## 1 0 16 48.98445 Aaron Baddeley
## 2 1 10577 71.39682 Aaron Wise
## 3 2 5548 35.49096 Adam Hadwin
## 4 3 6015 61.67538 Adam Long
## 5 4 10372 45.30434 Adam Schenk
## 6 5 388 44.55440 Adam Scott
sk_linear$V1<- NULL
select just the classification predictions from the R model:
r_class<- test %>% select(player, preds)
class<- merge(r_class, sk_class, by="player")
head(class)
## player preds preds_cut
## 1 Aaron Baddeley 1 1
## 2 Aaron Wise 0 0
## 3 Adam Hadwin 1 1
## 4 Adam Long 0 1
## 5 Adam Schenk 1 1
## 6 Adam Scott 1 1
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
tetrachoric(class[,2:3]) ### the classification models' predictions correlate at r = 0.81. not super high, but pretty good
## Call: tetrachoric(x = class[, 2:3])
## tetrachoric correlation
## preds prds_
## preds 1.00
## preds_cut 0.81 1.00
##
## with tau of
## preds preds_cut
## -0.46 -0.94
table(class$preds) ### the R model is predicting 39 people to miss the cut
##
## 0 1
## 39 82
table(class$preds_cut) ### the sci kit learn model is only predicting 21 people to miss the cut
##
## 0 1
## 21 100
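a simpler way to quantify agreement (a sketch): cross-tab the two sets of calls and compute the proportion of players classified the same.
# table(R = class$preds, sklearn = class$preds_cut)
# mean(class$preds == class$preds_cut)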
the plot below shows who was predicted to make the cut, or not, in both models.
if someone has a 2 on the y axis (the "confidence number"), they were predicted to make the cut in both models; if someone has a 1, they were predicted to make the cut in only one of the models; and if someone has a zero (no bar), they were predicted to miss the cut in both models.
class_long <- class %>%
gather(preds, preds_cut, key = "model", value = "count") %>%
mutate(model = recode(model, preds = "R classification model", preds_cut = "sci-kit learn model"))
ggplot(class_long, aes(x = reorder(player, -count), y = count, fill = model)) +
geom_bar(stat = "identity", position = "stack", color = "black") +
theme(legend.position = "bottom", axis.text.x = element_text(angle=80, size=5, hjust = 1))+
xlab("player")+
ylab("confidence number")
select just the linear predictions from the R model:
r_linear<- test %>% select(player, finish_pred)
lin<- merge(r_linear, sk_linear, by="player")
head(lin)
## player finish_pred player id finish_preds
## 1 Aaron Baddeley 48.94883 16 48.98445
## 2 Aaron Wise 70.06233 10577 71.39682
## 3 Adam Hadwin 36.80317 5548 35.49096
## 4 Adam Long 62.19587 6015 61.67538
## 5 Adam Schenk 47.03707 10372 45.30434
## 6 Adam Scott 40.80673 388 44.55440
cor.test(lin$finish_pred, lin$finish_preds)
##
## Pearson's product-moment correlation
##
## data: lin$finish_pred and lin$finish_preds
## t = 110.95, df = 119, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9931230 0.9966524
## sample estimates:
## cor
## 0.9952012
### wow! the two sets of linear predictions correlate at r = 0.995, almost perfectly
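a scatter of the two sets of predictions makes that agreement visible (a sketch):
# ggplot(lin, aes(x=finish_pred, y=finish_preds)) +
#   geom_point() +
#   geom_abline(linetype="dashed") +
#   xlab("R predicted finish") + ylab("scikit-learn predicted finish")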
r_plot<- ggplot(plot, aes(x = reorder(player, finish_pred), y=finish_pred))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Predicted finish for 2023 US open from R",x="Player", y = "predicted finish")+
theme_minimal()+
geom_text(
aes(label = round(finish_pred,0)),
colour = "white", size = 2,
vjust = 1.5, position = position_dodge(.9)) +
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
plot2<- lin %>% arrange(finish_preds)
plot2<- plot2[1:50,]
py_plot<- ggplot(plot2, aes(x = reorder(player, finish_preds), y=finish_preds))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Predicted finish for 2023 US open from sci-kit learn",x="Player", y = "predicted finish")+
theme_minimal()+
geom_text(
aes(label = round(finish_preds,0)),
colour = "white", size = 2,
vjust = 1.5, position = position_dodge(.9)) +
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
ggarrange(r_plot, py_plot, nrow=2, ncol=1, common.legend = F)
who are the people we are most confident will make the cut and finish well (predicted better than 40th in both models)?
top<- merge(lin, class, by="player")
top<- top %>% filter(preds==1 & preds_cut==1) %>%
filter(finish_pred< 40 & finish_preds<40)
season<- test %>% select(player, starts_with("sg"))
top<- merge(top, season, by="player")
these are 33 people we're pretty confident will do well. here, we plot their strokes gained from the 2023 season thus far, ordered by predicted finish:
sgt<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_total, color=finish_preds)) +
geom_point() +
xlab("Player") +
ggtitle("Strokes gained total")+
theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
scale_colour_gradientn(colours = terrain.colors(10))
sgt2g<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_t2g, color=finish_preds)) +
geom_point() +
xlab("Player") +
ggtitle("Strokes gained tee to green")+
theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
scale_colour_gradientn(colours = terrain.colors(10))
sgott<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_ott, color=finish_preds)) +
geom_point() +
xlab("Player") +
ggtitle("Strokes gained off the tee")+
theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
scale_colour_gradientn(colours = terrain.colors(10))
sgapp<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_app, color=finish_preds)) +
geom_point() +
xlab("Player") +
ggtitle("Strokes gained on the approach")+
theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
scale_colour_gradientn(colours = terrain.colors(10))
sgarg<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_arg, color=finish_preds)) +
geom_point() +
xlab("Player") +
ggtitle("Strokes gained around the green")+
theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
scale_colour_gradientn(colours = terrain.colors(10))
sgp<- ggplot(top, aes(x = reorder(player, finish_preds), y=sg_putt, color=finish_preds)) +
geom_point() +
xlab("Player") +
ggtitle("Strokes gained putting")+
theme(legend.position = "blank", axis.text.x = element_text(angle=60, size=4, hjust = 1))+
scale_colour_gradientn(colours = terrain.colors(10))
ggarrange(sgt, sgt2g, sgott, sgapp, sgarg, sgp, nrow=3, ncol=2, common.legend = F)