The are many stats used to measure the performance of PGA tour golfers. Here I would like to consider some of those metrics as predictors of success and as a method of classification. We will compare one relatively recently introduced group of metrics, strokes gained1, with some more traditional measures. We will also try several different models as predictors of success against several different outcomes, such as proportion of top 10 finishes, money earned, and ranking.
The data for this project was provided by Jack Schultz, a blogger who wrote a post about scraping PGA.com stats data using Python2 and was kind enough to provide a copy of the data he had already scraped.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
library(tidyverse)
library(broom)
library(rpart)
library(plotly)
library(knitr)
# get around superfluous error message
my_parse_number <- function(x, ...) {
if (mode(x) == 'numeric') {
x
} else {
parse_number(x, ...)
}
}
golfers <- bind_rows(lapply(2004:2018, function(y) {
cbind(year = y,
reduce(.f = inner_join, by = 'name', .x = list(
## Outcomes - rankings, money, and wins
# All-Around Ranking
read_csv(paste0('tourstats/stats_csv_years/', y, '/127.csv')) %>%
filter(EVENTS >= 5) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
transmute(name = `PLAYER NAME`,
ranking = my_parse_number(`RANK THIS WEEK`)),
# Money per Event Leaders
read_csv(paste0('tourstats/stats_csv_years/', y, '/154.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
transmute(name = `PLAYER NAME`,
money_per_event = my_parse_number(`MONEY PER EVENT`)),
# Total Money (Official and Unofficial)
read_csv(paste0('tourstats/stats_csv_years/', y, '/194.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
transmute(name = `PLAYER NAME`,
total_money = my_parse_number(MONEY)),
# Top 10 Finishes
read_csv(paste0('tourstats/stats_csv_years/', y, '/138.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
transmute(name = `PLAYER NAME`,
top_10_prop = `TOP 10` / EVENTS),
# Victory Leaders
read_csv(paste0('tourstats/stats_csv_years/', y, '/300.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
transmute(name = `PLAYER NAME`,
victories_prop = replace_na(VICTORIES, 0) / EVENTS),
## Percentages
# Driving Distance
read_csv(paste0('tourstats/stats_csv_years/', y, '/101.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
driving_distance = AVG.),
# Driving Accuracy Percentage
read_csv(paste0('tourstats/stats_csv_years/', y, '/102.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
driving_accuracy_pct = `%`),
# Greens in Regulation Percentage
read_csv(paste0('tourstats/stats_csv_years/', y, '/103.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
greens_in_regulation_pct = `%`),
# Scrambling
read_csv(paste0('tourstats/stats_csv_years/', y, '/130.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
scrambling_pct = `%`),
# One-Putt Percentage
read_csv(paste0('tourstats/stats_csv_years/', y, '/413.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
one_putt_pct = `%`),
# Rough Tendency
read_csv(paste0('tourstats/stats_csv_years/', y, '/02435.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
rough_tendency = `%`),
# Fairway Bunker Tendency
read_csv(paste0('tourstats/stats_csv_years/', y, '/01008.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
fairway_bunker_tendency = `%`),
# 3-Putt Avoidance
read_csv(paste0('tourstats/stats_csv_years/', y, '/426.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
three_putt_avoidance = `%`),
# 3-Putts per Round
read_csv(paste0('tourstats/stats_csv_years/', y, '/400.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
three_putts_per_round = AVG),
## "Strokes Gained" metrics
# SG: Tee-to-Green
read_csv(paste0('tourstats/stats_csv_years/', y, '/02674.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
sg_tee_to_green = AVERAGE),
# SG: Putting
read_csv(paste0('tourstats/stats_csv_years/', y, '/02564.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
sg_putting = AVERAGE),
# SG: Total
read_csv(paste0('tourstats/stats_csv_years/', y, '/02675.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
sg_total = AVERAGE),
# SG: Off-the-Tee
read_csv(paste0('tourstats/stats_csv_years/', y, '/02567.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
sg_off_the_tee = AVERAGE),
# SG: Approach-the-Green
read_csv(paste0('tourstats/stats_csv_years/', y, '/02568.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
sg_approach_the_green = AVERAGE),
# SG: Around-the-Green
read_csv(paste0('tourstats/stats_csv_years/', y, '/02569.csv')) %>%
distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
select(name = `PLAYER NAME`,
sg_around_the_green = AVERAGE)
# End of list, reduce
))
# End of cbind
)
# End of lapply, bind_rows
}))
# Store the variables we are using as "outcomes" and as "metrics" (predictors).
outcomes <- c('ranking', 'money_per_event', 'total_money', 'top_10_prop', 'victories_prop')
metrics <- golfers %>%
select(-name, -year, -!!outcomes) %>%
names
In this analysis there are fifteen years of PGA tour player data (2004-2018) with 22 variables. In addition to player name and tour year, there are variables that we will be using as “outcomes” (response) and others as “metrics” (predictors). For each year we only look at players who were ranked for at least five events and only those having complete data. Even with those limits, there are a good number of golfers per year to analyze. Here is a count of golfers per year in the result:
golfers %>%
with(table(year))
## year
## 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
## 164 162 161 159 163 156 165 143 157 146 149 150 148 160 160
Because there are many different variables to choose from (having already selected only a subset of the available variables from PGA.com), there are probably more than we really need. We can look at the correlation matrix to get a sense of what kind of colinearities we may be facing, at least in terms of pairwise colinearity. Showing the whole matrix directly would be a lot of numbers, so instead let’s only examine the highest (in absolute value) pairings.
golfers %>%
select(!!metrics) %>%
cor %>%
abs %>%
replace(upper.tri(., TRUE), NA) %>%
as_tibble(rownames = 'var1') %>%
gather(key = 'var2', value = 'cor', ... = -var1, na.rm = TRUE) %>%
arrange(-cor) %>%
top_n(10, cor) %>%
kable
| var1 | var2 | cor |
|---|---|---|
| three_putts_per_round | three_putt_avoidance | 0.9947672 |
| rough_tendency | driving_accuracy_pct | 0.9090466 |
| sg_total | sg_tee_to_green | 0.8510743 |
| sg_approach_the_green | sg_tee_to_green | 0.8130344 |
| sg_approach_the_green | sg_total | 0.6943186 |
| sg_off_the_tee | sg_tee_to_green | 0.6561241 |
| sg_putting | three_putt_avoidance | 0.6284127 |
| sg_putting | three_putts_per_round | 0.6217183 |
| sg_off_the_tee | driving_distance | 0.6057743 |
| driving_accuracy_pct | driving_distance | 0.6032331 |
What we see are a few different ways that variables may be similar. First, three_putts_per_round and three_putt_avoidance are really just two different ways of measuring the same thing, and there is no surprise that rough_tendency is related to driving_accuracy_pct, so again there is some redundancy in the way different metrics measure the same thing.
We also have metrics that are directly related by definition. The strokes gained measures count the drive, approach, and putting in two different ways, both adding up to the same total (sg_total).
One formulation is:
sg_tee_to_green + sg_putting = sg_total
A somewhat newer version breaks down tee-to-green into more categories:
sg_off_the_tee + sg_approach_the_green + sg_around_the_green + sg_putting = sg_total
This newer formulation has been applied to the older data so we have this for all of the years in our data set. Having examined areas where there may be redundancies, let’s select two groups of variables to compare in some models. First, there are the more traditional measures, excluding some apparent redundancies as noted above.
metrics_trad <- c('driving_distance',
'driving_accuracy_pct',
'greens_in_regulation_pct',
'scrambling_pct',
'one_putt_pct',
'rough_tendency',
'fairway_bunker_tendency',
'three_putt_avoidance')
writeLines(metrics_trad)
## driving_distance
## driving_accuracy_pct
## greens_in_regulation_pct
## scrambling_pct
## one_putt_pct
## rough_tendency
## fairway_bunker_tendency
## three_putt_avoidance
Then there are the strokes gained measures, using the more nuanced newer version and excluding the total.
metrics_sg <- c('sg_off_the_tee',
'sg_approach_the_green',
'sg_around_the_green',
'sg_putting')
writeLines(metrics_sg)
## sg_off_the_tee
## sg_approach_the_green
## sg_around_the_green
## sg_putting
Let’s try some classic linear regression models on these sets of variables and see how well they perform. If a golfer is skilled in more of these areas, one would expect them to be a top 10 finisher more often, so let’s try to predict that.
These are the performance metrics of a linear model using the traditional measures:
golfers %>%
select(y = top_10_prop, !!metrics_trad) %>%
lm(y ~ ., .) %>%
glance %>%
kable
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.3289553 | 0.3266552 | 0.0890619 | 143.0198 | 0 | 9 | 2346.302 | -4672.604 | -4615.012 | 18.51335 | 2334 |
And these are the metrics using strokes gained measures:
golfers %>%
select(y = top_10_prop, !!metrics_sg) %>%
lm(y ~ ., .) %>%
glance %>%
kable
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.5957839 | 0.5950923 | 0.0690639 | 861.5087 | 0 | 5 | 2940.119 | -5868.238 | -5833.683 | 11.15185 | 2338 |
Among other indicators, we see lower AIC and BIC and higher R-squared for the model built on strokes gained, which suggests these measures are a better indicator of outcomes over the traditional metrics. But perhaps this is particular to top 10 finishes, what if we compare these models against other outcomes? Here we compare both models on each outcome using AIC:
map_dfr(outcomes, function(x) {
bind_rows(
golfers %>%
select(y = !!x, !!metrics_trad) %>%
lm(y ~ ., .) %>%
glance %>%
cbind(outcome = x,
Predictors = 'Tradtional',
.,
stringsAsFactors = FALSE),
golfers %>%
select(y = !!x, !!metrics_sg) %>%
lm(y ~ ., .) %>%
glance %>%
cbind(outcome = x,
Predictors = 'Strokes gained',
.,
stringsAsFactors = FALSE)
)
}) %>%
select(outcome, Predictors, AIC) %>%
ggplot(aes(x = 0, y = AIC, color = Predictors)) +
geom_point() +
scale_fill_brewer(palette = 'Set1') +
theme_light() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
facet_wrap(. ~outcome, scales = 'free_y')
It appears that the strokes gained metrics consistently outperform the traditional.
The strokes gained metrics already have a relatively small number of predictors (four), but there are eight predictors in the traditional metrics. Maybe a third set of predictors created by running principal components analysis on the traditional metrics would do better.
pca_trad <- golfers %>%
select(!!metrics_trad) %>%
prcomp()
summary(pca_trad)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 10.0393 5.0153 3.33630 2.33694 1.32929 1.31127
## Proportion of Variance 0.6848 0.1709 0.07563 0.03711 0.01201 0.01168
## Cumulative Proportion 0.6848 0.8557 0.93133 0.96844 0.98045 0.99213
## PC7 PC8
## Standard deviation 0.90017 0.59000
## Proportion of Variance 0.00551 0.00237
## Cumulative Proportion 0.99763 1.00000
It looks like we can capture over ninety percent of the variability with just three components. Taking only three components also allows us to try an interesting visualization using a selection of famous golfers:
u <- as.data.frame(pca_trad$x)
names(u) <- paste0('PC', 1:ncol(u))
golfers <- cbind(golfers, u)
famous_golfers <- c('Dustin Johnson',
'Jordan Spieth',
'Phil Mickelson',
'Rickie Fowler',
'Rory McIlroy',
'Tiger Woods')
golfers %>%
filter(name %in% famous_golfers) %>%
plot_ly(x = ~PC1, y = ~PC2, z = ~PC3,
color = ~name,
size = ~top_10_prop,
hoverinfo = 'text',
text = ~paste0(.$name, ', ', .$year)) %>%
add_markers()
Although it is more interesting if we use variables that are more meaningful:
golfers %>%
filter(name %in% famous_golfers) %>%
plot_ly(x = ~driving_distance, y = ~driving_accuracy_pct, z = ~one_putt_pct,
color = ~name,
size = 1,
hoverinfo = 'text',
text = ~paste0(.$name, ', ', .$year)) %>%
add_markers() %>%
layout(scene = list(camera = list(eye = list(x = -1.9, y = -2.2, z = 0.3))))
When looking for a model to predict success from performance metrics, we should also consider using something other than linear regression. One possible alternative is a classification and regression tree (CART) based on analysis of variance (ANOVA). The implementation of the model used below does not readily provide for measures such as AIC as the linear model before, but we can compute a measure like root mean square error (RMSE). In the following, we take the three sets of predictor variables – traditional, strokes gained, and the PCA reduction of the traditional – and build both a linear regression and CART model for each. Prior to model building we split the data into training and testing sets to make sure we aren’t overfitting.
set.seed(-1)
train_set <- sample(nrow(golfers), .8 * nrow(golfers))
# Rescale values so RMSE is consistently scaled
train <- golfers[train_set,] %>%
mutate_if(is_double, scale)
test <- golfers[-train_set,] %>%
mutate_if(is_double, scale)
map_dfr(outcomes, function(x) {
lm_trad <- lm(y ~ .,
select(train, y = !!x, !!metrics_trad))
lm_sg <- lm(y ~ .,
select(train, y = !!x, !!metrics_sg))
lm_pca <- lm(y ~ .,
select(train, y = !!x, PC1, PC2, PC3))
cart_trad <- rpart(y ~ .,
data = select(train, y = !!x, !!metrics_trad),
method = 'anova')
cart_sg <- rpart(y ~ .,
data = select(train, y = !!x, !!metrics_sg),
method = 'anova')
cart_pca <- rpart(y ~ .,
data = select(train, y = !!x, PC1, PC2, PC3),
method = 'anova')
data.frame(ttp = test$top_10_prop,
ltp = predict(lm_trad, test),
lsp = predict(lm_sg, test),
lpp = predict(lm_pca, test),
ctp = predict(cart_trad, test),
csp = predict(cart_sg, test),
cpp = predict(cart_pca, test)) %>%
mutate(lte = (ttp - ltp) ^ 2,
lse = (ttp - lsp) ^ 2,
lpe = (ttp - lpp) ^ 2,
cte = (ttp - ctp) ^ 2,
cse = (ttp - csp) ^ 2,
cpe = (ttp - cpp) ^ 2) %>%
summarise(outcome = x,
`Traditional, Linear` = sqrt(mean(lte)),
`Strokes gained, Linear` = sqrt(mean(lse)),
`PCA, Linear` = sqrt(mean(lpe)),
`Traditional, CART` = sqrt(mean(cte)),
`Strokes gained, CART` = sqrt(mean(cse)),
`PCA, CART` = sqrt(mean(cpe)))
}) %>%
gather('model', 'RMSE', ... = -outcome) %>%
ggplot(aes(x = 0, y = RMSE, fill = model)) +
geom_col(position = 'dodge') +
scale_fill_brewer(palette = 'Paired') +
theme_light() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
facet_wrap(. ~outcome)
It is interesting to note the outlier that ranking is here. In most cases the linear models actually perform slightly better than CART, with the strokes gained metrics performing better in general. With ranking, however, the linear strokes gained model performs the worst of all.
Overall though it appears that a linear regression model of the strokes gained metrics will have the best performance in predicting success.
Another interesting question is how the coefficients of the predictors have changed over time. This next step will create a linear model for each year of the data, collect the coefficients, and plot their changes over time. For an outcome measure let’s predict the proportion of top 10 finishes top_10_prop. The following plot shows the actual values overlayed with a smoothing curve.
lin_mod_coeffs <- function(d) {
this_mod <- lm(top_10_prop ~ ., select(d, top_10_prop, !!metrics_sg))
tidy(this_mod)
}
golfers %>%
filter(ranking <= 25) %>%
mutate_if(is_double, scale) %>%
group_by(year) %>%
nest %>%
mutate(coeffs = map(data, lin_mod_coeffs)) %>%
unnest(coeffs) %>%
filter(term != '(Intercept)') %>%
select(Year = year, term, Coefficient = estimate) %>%
mutate(Metric = factor(term,
levels = c('sg_off_the_tee',
'sg_approach_the_green',
'sg_around_the_green',
'sg_putting'),
labels = c('Off the tee',
'Approach the green',
'Around the green',
'Putting'))) %>%
ggplot(aes(x = Year, color = Metric, y = Coefficient)) +
geom_line(alpha = 0.4, linetyp = 'dashed') +
geom_smooth(method = 'loess', se = FALSE, size = .8) +
scale_y_continuous(limits = c(0, 1))
It is interesting to observe the rise and fall of importance of different aspects over time. In this time span the off the tee play and putting tend to rise and fall together. In some recent years both of these measures are rising, but there is a more recent decline and lately a slow rise in the short game around the green. Perhaps good scrambling play is the differentiator of the future.
Finally, let’s see if we can use the strokes gained performance metrics to form clusters of similar golfers. First we consider only golfers who were ever in the top five ranking at some point in this data set, and for each golfer we take the mean of their scores over the years available. Performing complete linkage hierarchical clustering provides an interesting picture. The number of clusters chosen is somewhat arbitrary, but it lends well to description as we shall see below.
golfers_mean_sg <- golfers %>%
group_by(name) %>%
filter(min(ranking) <= 5) %>%
summarise_at(metrics_sg, mean)
gclusters <- golfers_mean_sg %>%
column_to_rownames('name') %>%
dist %>%
hclust
plot(gclusters, axes = FALSE, ann = FALSE)
rect.hclust(gclusters, 5)
Now let’s repeat that clustering, this time keeping all golfers and taking a look at the characteristics of each cluster. Also, let’s give a name to each cluster by finding the top earning player in each. From here on, the golfer names represent clusters of many golfers (with one exception), with the one name being a representative of that group. Here are a count of the members of each group.
golfers_mean_sg <- golfers %>%
group_by(name) %>%
summarise_at(metrics_sg, mean)
gclusters <- golfers_mean_sg %>%
column_to_rownames('name') %>%
dist %>%
hclust
golfers_w_cluster <- golfers_mean_sg %>%
mutate(cluster = cutree(gclusters, 5)) %>%
select(name, cluster) %>%
merge(golfers, by = 'name')
golfers_w_cluster <- golfers_w_cluster %>%
group_by(name, cluster) %>%
summarise(max_top_10 = max(top_10_prop)) %>%
group_by(cluster) %>%
slice(which.max(max_top_10)) %>%
select(cluster_name = name,
cluster) %>%
inner_join(golfers_w_cluster, by = 'cluster')
golfers_w_cluster %>%
group_by(cluster_name) %>%
summarise(count = n_distinct(name))
## # A tibble: 5 x 2
## cluster_name count
## <chr> <int>
## 1 Jason Dufner 79
## 2 Luke Donald 173
## 3 Rory McIlroy 201
## 4 Tiger Woods 1
## 5 Tommy Gainey 43
What does each cluster look like on the metrics?
golfers_w_cluster %>%
group_by(cluster_name) %>%
summarise_at(metrics_sg, mean) %>%
gather(... = -cluster_name) %>%
mutate(Metric = factor(key,
levels = c('sg_off_the_tee',
'sg_approach_the_green',
'sg_around_the_green',
'sg_putting'),
labels = c('Off the tee',
'Approach the green',
'Around the green',
'Putting'))) %>%
ggplot(aes(x = cluster_name, y = value, fill = Metric)) +
geom_col(position = 'dodge') +
xlab('Cluster name') +
ylab('Strokes gained') +
scale_y_continuous(limits = c(-0.5, 1.5)) +
theme_light()
We could summarize the groups like so:
Comparing a few other metrics by cluster:
golfers_w_cluster %>%
group_by(cluster_name) %>%
ggplot(aes(x = cluster_name, y = ranking)) +
geom_boxplot() +
xlab('Cluster name') +
ylab('Yearly ranking') +
theme_light()
golfers_w_cluster %>%
group_by(cluster_name) %>%
ggplot(aes(x = cluster_name, y = money_per_event)) +
geom_boxplot() +
xlab('Cluster name') +
ylab('Year average money earned per event') +
theme_light()
golfers_w_cluster %>%
group_by(cluster_name) %>%
ggplot(aes(x = cluster_name, y = top_10_prop)) +
geom_boxplot() +
xlab('Cluster name') +
ylab('Year average proportion of events in the top 10') +
theme_light()
And lastly, returning to an earlier idea, now coloring by cluster. Here shown are averages by player over all of the years, and plotting on metrics that were used in the clustering, minus the approach shot, since we only have three dimensions to work with! Zooming on the plot is useful on this one.
golfers_w_cluster %>%
group_by(name, cluster_name) %>%
summarise_at(metrics_sg, mean) %>%
plot_ly(x = ~sg_off_the_tee, y = ~sg_around_the_green, z = ~sg_putting,
color = ~cluster_name,
size = 1,
hoverinfo = 'text',
text = ~name) %>%
add_markers()