The are many stats used to measure the performance of PGA tour golfers. Here I would like to consider some of those metrics as predictors of success and as a method of classification. We will compare one relatively recently introduced group of metrics, strokes gained1, with some more traditional measures. We will also try several different models as predictors of success against several different outcomes, such as proportion of top 10 finishes, money earned, and ranking.

The data for this project was provided by Jack Schultz, a blogger who wrote a post about scraping PGA.com stats data using Python2 and was kind enough to provide a copy of the data he had already scraped.

knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
library(tidyverse)
library(broom)
library(rpart)
library(plotly)
library(knitr)

# get around superfluous error message
my_parse_number <- function(x, ...) {
  if (mode(x) == 'numeric') {
    x
  } else {
    parse_number(x, ...)
  }
}

golfers <- bind_rows(lapply(2004:2018, function(y) {
  cbind(year = y,
        reduce(.f = inner_join, by = 'name', .x = list(
          
          ## Outcomes - rankings, money, and wins
          
          # All-Around Ranking  
          read_csv(paste0('tourstats/stats_csv_years/', y, '/127.csv')) %>%
            filter(EVENTS >= 5) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            transmute(name = `PLAYER NAME`,
                      ranking = my_parse_number(`RANK THIS WEEK`)),
          
          # Money per Event Leaders 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/154.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            transmute(name = `PLAYER NAME`,
                      money_per_event = my_parse_number(`MONEY PER EVENT`)),
          
          # Total Money (Official and Unofficial)   
          read_csv(paste0('tourstats/stats_csv_years/', y, '/194.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            transmute(name = `PLAYER NAME`,
                      total_money = my_parse_number(MONEY)),
          
          # Top 10 Finishes 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/138.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            transmute(name = `PLAYER NAME`,
                      top_10_prop = `TOP 10` / EVENTS),
          
          # Victory Leaders 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/300.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            transmute(name = `PLAYER NAME`,
                      victories_prop = replace_na(VICTORIES, 0) / EVENTS),
          
          ## Percentages
          # Driving Distance    
          read_csv(paste0('tourstats/stats_csv_years/', y, '/101.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   driving_distance = AVG.),
          
          # Driving Accuracy Percentage 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/102.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   driving_accuracy_pct = `%`),
          
          
          # Greens in Regulation Percentage 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/103.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   greens_in_regulation_pct = `%`),
          
          # Scrambling  
          read_csv(paste0('tourstats/stats_csv_years/', y, '/130.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   scrambling_pct = `%`),
          
          # One-Putt Percentage 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/413.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   one_putt_pct = `%`),
          
          # Rough Tendency  
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02435.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   rough_tendency = `%`),
          
          # Fairway Bunker Tendency 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/01008.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   fairway_bunker_tendency = `%`),
          
          # 3-Putt Avoidance    
          read_csv(paste0('tourstats/stats_csv_years/', y, '/426.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   three_putt_avoidance = `%`),
          
          # 3-Putts per Round   
          read_csv(paste0('tourstats/stats_csv_years/', y, '/400.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   three_putts_per_round = AVG),
          
          
          ## "Strokes Gained" metrics
          
          # SG: Tee-to-Green    
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02674.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   sg_tee_to_green = AVERAGE),
          
          # SG: Putting 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02564.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   sg_putting = AVERAGE),
          
          # SG: Total   
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02675.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   sg_total = AVERAGE),
          
          # SG: Off-the-Tee 
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02567.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   sg_off_the_tee = AVERAGE),
          
          # SG: Approach-the-Green  
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02568.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   sg_approach_the_green = AVERAGE),
          
          # SG: Around-the-Green    
          read_csv(paste0('tourstats/stats_csv_years/', y, '/02569.csv')) %>%
            distinct(`PLAYER NAME`, .keep_all = TRUE) %>%
            select(name = `PLAYER NAME`,
                   sg_around_the_green = AVERAGE)
          
          # End of list, reduce
        ))
        # End of cbind        
  )
  # End of lapply, bind_rows
}))

# Store the variables we are using as "outcomes" and as "metrics" (predictors).
outcomes <- c('ranking', 'money_per_event', 'total_money', 'top_10_prop', 'victories_prop')

metrics <- golfers %>%
  select(-name, -year, -!!outcomes) %>%
  names

In this analysis there are fifteen years of PGA tour player data (2004-2018) with 22 variables. In addition to player name and tour year, there are variables that we will be using as “outcomes” (response) and others as “metrics” (predictors). For each year we only look at players who were ranked for at least five events and only those having complete data. Even with those limits, there are a good number of golfers per year to analyze. Here is a count of golfers per year in the result:

golfers %>%
  with(table(year))
## year
## 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 
##  164  162  161  159  163  156  165  143  157  146  149  150  148  160  160

Because there are many different variables to choose from (having already selected only a subset of the available variables from PGA.com), there are probably more than we really need. We can look at the correlation matrix to get a sense of what kind of colinearities we may be facing, at least in terms of pairwise colinearity. Showing the whole matrix directly would be a lot of numbers, so instead let’s only examine the highest (in absolute value) pairings.

golfers %>%
  select(!!metrics) %>%
  cor %>%
  abs %>%
  replace(upper.tri(., TRUE), NA) %>%
  as_tibble(rownames = 'var1') %>%
  gather(key = 'var2', value = 'cor', ... = -var1, na.rm = TRUE) %>%
  arrange(-cor) %>%
  top_n(10, cor) %>%
  kable
var1 var2 cor
three_putts_per_round three_putt_avoidance 0.9947672
rough_tendency driving_accuracy_pct 0.9090466
sg_total sg_tee_to_green 0.8510743
sg_approach_the_green sg_tee_to_green 0.8130344
sg_approach_the_green sg_total 0.6943186
sg_off_the_tee sg_tee_to_green 0.6561241
sg_putting three_putt_avoidance 0.6284127
sg_putting three_putts_per_round 0.6217183
sg_off_the_tee driving_distance 0.6057743
driving_accuracy_pct driving_distance 0.6032331

What we see are a few different ways that variables may be similar. First, three_putts_per_round and three_putt_avoidance are really just two different ways of measuring the same thing, and there is no surprise that rough_tendency is related to driving_accuracy_pct, so again there is some redundancy in the way different metrics measure the same thing.

We also have metrics that are directly related by definition. The strokes gained measures count the drive, approach, and putting in two different ways, both adding up to the same total (sg_total).

One formulation is:

sg_tee_to_green + sg_putting = sg_total

A somewhat newer version breaks down tee-to-green into more categories:

sg_off_the_tee + sg_approach_the_green + sg_around_the_green + sg_putting = sg_total

This newer formulation has been applied to the older data so we have this for all of the years in our data set. Having examined areas where there may be redundancies, let’s select two groups of variables to compare in some models. First, there are the more traditional measures, excluding some apparent redundancies as noted above.

metrics_trad <- c('driving_distance',
                  'driving_accuracy_pct',
                  'greens_in_regulation_pct',
                  'scrambling_pct',
                  'one_putt_pct',
                  'rough_tendency',
                  'fairway_bunker_tendency',
                  'three_putt_avoidance')
writeLines(metrics_trad)
## driving_distance
## driving_accuracy_pct
## greens_in_regulation_pct
## scrambling_pct
## one_putt_pct
## rough_tendency
## fairway_bunker_tendency
## three_putt_avoidance

Then there are the strokes gained measures, using the more nuanced newer version and excluding the total.

metrics_sg <- c('sg_off_the_tee',
                'sg_approach_the_green',
                'sg_around_the_green',
                'sg_putting')
writeLines(metrics_sg)
## sg_off_the_tee
## sg_approach_the_green
## sg_around_the_green
## sg_putting

Let’s try some classic linear regression models on these sets of variables and see how well they perform. If a golfer is skilled in more of these areas, one would expect them to be a top 10 finisher more often, so let’s try to predict that.

These are the performance metrics of a linear model using the traditional measures:

golfers %>%
  select(y = top_10_prop, !!metrics_trad) %>%
  lm(y ~ ., .) %>%
  glance %>%
  kable
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.3289553 0.3266552 0.0890619 143.0198 0 9 2346.302 -4672.604 -4615.012 18.51335 2334

And these are the metrics using strokes gained measures:

golfers %>%
  select(y = top_10_prop, !!metrics_sg) %>%
  lm(y ~ ., .) %>%
  glance %>%
  kable
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.5957839 0.5950923 0.0690639 861.5087 0 5 2940.119 -5868.238 -5833.683 11.15185 2338

Among other indicators, we see lower AIC and BIC and higher R-squared for the model built on strokes gained, which suggests these measures are a better indicator of outcomes over the traditional metrics. But perhaps this is particular to top 10 finishes, what if we compare these models against other outcomes? Here we compare both models on each outcome using AIC:

map_dfr(outcomes, function(x) {
  bind_rows(
  golfers %>%
    select(y = !!x, !!metrics_trad) %>%
    lm(y ~ ., .) %>%
    glance %>%
    cbind(outcome = x,
          Predictors = 'Tradtional', 
          .,
          stringsAsFactors = FALSE),
  
  golfers %>%
    select(y = !!x, !!metrics_sg) %>%
    lm(y ~ ., .) %>%
    glance %>%
    cbind(outcome = x,
          Predictors = 'Strokes gained', 
          .,
          stringsAsFactors = FALSE)
  )
}) %>%
  select(outcome, Predictors, AIC) %>%
  ggplot(aes(x = 0, y = AIC, color = Predictors)) +
  geom_point() +
  scale_fill_brewer(palette = 'Set1') +
  theme_light() + 
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + 
  facet_wrap(. ~outcome, scales = 'free_y')

It appears that the strokes gained metrics consistently outperform the traditional.

The strokes gained metrics already have a relatively small number of predictors (four), but there are eight predictors in the traditional metrics. Maybe a third set of predictors created by running principal components analysis on the traditional metrics would do better.

pca_trad <- golfers %>%
  select(!!metrics_trad) %>%
  prcomp()

summary(pca_trad)
## Importance of components:
##                            PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     10.0393 5.0153 3.33630 2.33694 1.32929 1.31127
## Proportion of Variance  0.6848 0.1709 0.07563 0.03711 0.01201 0.01168
## Cumulative Proportion   0.6848 0.8557 0.93133 0.96844 0.98045 0.99213
##                            PC7     PC8
## Standard deviation     0.90017 0.59000
## Proportion of Variance 0.00551 0.00237
## Cumulative Proportion  0.99763 1.00000

It looks like we can capture over ninety percent of the variability with just three components. Taking only three components also allows us to try an interesting visualization using a selection of famous golfers:

u <- as.data.frame(pca_trad$x)
names(u) <- paste0('PC', 1:ncol(u))
golfers <- cbind(golfers, u)

famous_golfers <- c('Dustin Johnson',
                    'Jordan Spieth',
                    'Phil Mickelson',
                    'Rickie Fowler',
                    'Rory McIlroy',
                    'Tiger Woods')

golfers %>%
  filter(name %in% famous_golfers) %>%
  plot_ly(x = ~PC1, y = ~PC2, z = ~PC3,
          color = ~name,
          size = ~top_10_prop,
          hoverinfo = 'text',
          text = ~paste0(.$name, ', ', .$year)) %>%
  add_markers()

Although it is more interesting if we use variables that are more meaningful:

golfers %>%
  filter(name %in% famous_golfers) %>%
  plot_ly(x = ~driving_distance, y = ~driving_accuracy_pct, z = ~one_putt_pct,
          color = ~name,
          size = 1,
          hoverinfo = 'text',
          text = ~paste0(.$name, ', ', .$year)) %>%
  add_markers() %>% 
  layout(scene = list(camera = list(eye = list(x = -1.9, y = -2.2, z = 0.3))))

When looking for a model to predict success from performance metrics, we should also consider using something other than linear regression. One possible alternative is a classification and regression tree (CART) based on analysis of variance (ANOVA). The implementation of the model used below does not readily provide for measures such as AIC as the linear model before, but we can compute a measure like root mean square error (RMSE). In the following, we take the three sets of predictor variables – traditional, strokes gained, and the PCA reduction of the traditional – and build both a linear regression and CART model for each. Prior to model building we split the data into training and testing sets to make sure we aren’t overfitting.

set.seed(-1)
train_set <- sample(nrow(golfers), .8 * nrow(golfers))
# Rescale values so RMSE is consistently scaled
train <- golfers[train_set,] %>%
  mutate_if(is_double, scale)
test <- golfers[-train_set,] %>%
  mutate_if(is_double, scale)

map_dfr(outcomes, function(x) {
  
  
  lm_trad <- lm(y ~ .,
                select(train, y = !!x, !!metrics_trad))
  
  lm_sg <- lm(y ~ .,
              select(train, y = !!x, !!metrics_sg))
  
  lm_pca <- lm(y ~ .,
               select(train, y = !!x, PC1, PC2, PC3))
  
  cart_trad <- rpart(y ~ .,
                     data = select(train, y = !!x, !!metrics_trad),
                     method = 'anova')
  
  cart_sg <- rpart(y ~ .,
                   data = select(train, y = !!x, !!metrics_sg),
                   method = 'anova')
  
  cart_pca <- rpart(y ~ .,
                    data = select(train, y = !!x, PC1, PC2, PC3),
                    method = 'anova')
  
  data.frame(ttp = test$top_10_prop,
             ltp = predict(lm_trad, test),
             lsp = predict(lm_sg, test),
             lpp = predict(lm_pca, test),
             ctp = predict(cart_trad, test),
             csp = predict(cart_sg, test),
             cpp = predict(cart_pca, test)) %>%
    mutate(lte = (ttp - ltp) ^ 2,
           lse = (ttp - lsp) ^ 2,
           lpe = (ttp - lpp) ^ 2,
           cte = (ttp - ctp) ^ 2,
           cse = (ttp - csp) ^ 2,
           cpe = (ttp - cpp) ^ 2) %>%
    summarise(outcome = x,
              `Traditional, Linear` = sqrt(mean(lte)),
              `Strokes gained, Linear` = sqrt(mean(lse)),
              `PCA, Linear` = sqrt(mean(lpe)),
              `Traditional, CART` = sqrt(mean(cte)),
              `Strokes gained, CART` = sqrt(mean(cse)),
              `PCA, CART` = sqrt(mean(cpe)))
}) %>%
  gather('model', 'RMSE', ... = -outcome) %>%
  ggplot(aes(x = 0, y = RMSE, fill = model)) +
  geom_col(position = 'dodge') +
  scale_fill_brewer(palette = 'Paired') +
  theme_light() + 
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + 
  facet_wrap(. ~outcome)

It is interesting to note the outlier that ranking is here. In most cases the linear models actually perform slightly better than CART, with the strokes gained metrics performing better in general. With ranking, however, the linear strokes gained model performs the worst of all.

Overall though it appears that a linear regression model of the strokes gained metrics will have the best performance in predicting success.

Another interesting question is how the coefficients of the predictors have changed over time. This next step will create a linear model for each year of the data, collect the coefficients, and plot their changes over time. For an outcome measure let’s predict the proportion of top 10 finishes top_10_prop. The following plot shows the actual values overlayed with a smoothing curve.

lin_mod_coeffs <- function(d) {
  this_mod <- lm(top_10_prop ~ ., select(d, top_10_prop, !!metrics_sg))
  tidy(this_mod)
}

golfers %>%
  filter(ranking <= 25) %>%
  mutate_if(is_double, scale) %>%
  group_by(year) %>%
  nest %>%
  mutate(coeffs = map(data, lin_mod_coeffs)) %>%
  unnest(coeffs) %>%
  filter(term != '(Intercept)') %>%
  select(Year = year, term, Coefficient = estimate) %>%
  mutate(Metric = factor(term,
                       levels = c('sg_off_the_tee',
                                  'sg_approach_the_green',
                                  'sg_around_the_green',
                                  'sg_putting'),
                       labels = c('Off the tee',
                                  'Approach the green',
                                  'Around the green',
                                  'Putting'))) %>%
  ggplot(aes(x = Year, color = Metric, y = Coefficient)) +
  geom_line(alpha = 0.4, linetyp = 'dashed') +
  geom_smooth(method = 'loess', se = FALSE, size = .8) +
  scale_y_continuous(limits = c(0, 1))

It is interesting to observe the rise and fall of importance of different aspects over time. In this time span the off the tee play and putting tend to rise and fall together. In some recent years both of these measures are rising, but there is a more recent decline and lately a slow rise in the short game around the green. Perhaps good scrambling play is the differentiator of the future.

Finally, let’s see if we can use the strokes gained performance metrics to form clusters of similar golfers. First we consider only golfers who were ever in the top five ranking at some point in this data set, and for each golfer we take the mean of their scores over the years available. Performing complete linkage hierarchical clustering provides an interesting picture. The number of clusters chosen is somewhat arbitrary, but it lends well to description as we shall see below.

golfers_mean_sg <- golfers %>%
  group_by(name) %>%
  filter(min(ranking) <= 5) %>%
  summarise_at(metrics_sg, mean)

gclusters <- golfers_mean_sg %>%
  column_to_rownames('name') %>%
  dist %>%
  hclust

plot(gclusters, axes = FALSE, ann = FALSE)
rect.hclust(gclusters, 5)

Now let’s repeat that clustering, this time keeping all golfers and taking a look at the characteristics of each cluster. Also, let’s give a name to each cluster by finding the top earning player in each. From here on, the golfer names represent clusters of many golfers (with one exception), with the one name being a representative of that group. Here are a count of the members of each group.

golfers_mean_sg <- golfers %>%
  group_by(name) %>%
  summarise_at(metrics_sg, mean)

gclusters <- golfers_mean_sg %>%
  column_to_rownames('name') %>%
  dist %>%
  hclust

golfers_w_cluster <- golfers_mean_sg %>%
  mutate(cluster = cutree(gclusters, 5)) %>%
  select(name, cluster) %>%
  merge(golfers, by = 'name')
golfers_w_cluster <- golfers_w_cluster %>%
  group_by(name, cluster) %>%
  summarise(max_top_10 = max(top_10_prop)) %>%
  group_by(cluster) %>%
  slice(which.max(max_top_10)) %>%
  select(cluster_name = name,
         cluster) %>%
  inner_join(golfers_w_cluster, by = 'cluster')

golfers_w_cluster %>%
  group_by(cluster_name) %>%
  summarise(count = n_distinct(name))
## # A tibble: 5 x 2
##   cluster_name count
##   <chr>        <int>
## 1 Jason Dufner    79
## 2 Luke Donald    173
## 3 Rory McIlroy   201
## 4 Tiger Woods      1
## 5 Tommy Gainey    43

What does each cluster look like on the metrics?

golfers_w_cluster %>%
  group_by(cluster_name) %>%
  summarise_at(metrics_sg, mean) %>%
  gather(... = -cluster_name) %>%
  mutate(Metric = factor(key,
                         levels = c('sg_off_the_tee',
                                    'sg_approach_the_green',
                                    'sg_around_the_green',
                                    'sg_putting'),
                         labels = c('Off the tee',
                                    'Approach the green',
                                    'Around the green',
                                    'Putting'))) %>%
  ggplot(aes(x = cluster_name, y = value, fill = Metric)) +
  geom_col(position = 'dodge') +
  xlab('Cluster name') +
  ylab('Strokes gained') +
  scale_y_continuous(limits = c(-0.5, 1.5)) +
  theme_light()

We could summarize the groups like so:

Comparing a few other metrics by cluster:

golfers_w_cluster %>%
  group_by(cluster_name) %>%
  ggplot(aes(x = cluster_name, y = ranking)) +
  geom_boxplot() +
  xlab('Cluster name') +
  ylab('Yearly ranking') +
  theme_light()

golfers_w_cluster %>%
  group_by(cluster_name) %>%
  ggplot(aes(x = cluster_name, y = money_per_event)) +
  geom_boxplot() +
  xlab('Cluster name') +
  ylab('Year average money earned per event') +
  theme_light()

golfers_w_cluster %>%
  group_by(cluster_name) %>%
  ggplot(aes(x = cluster_name, y = top_10_prop)) +
  geom_boxplot() +
  xlab('Cluster name') +
  ylab('Year average proportion of events in the top 10') +
  theme_light()

And lastly, returning to an earlier idea, now coloring by cluster. Here shown are averages by player over all of the years, and plotting on metrics that were used in the clustering, minus the approach shot, since we only have three dimensions to work with! Zooming on the plot is useful on this one.

golfers_w_cluster %>%
  group_by(name, cluster_name) %>%
  summarise_at(metrics_sg, mean) %>%
  plot_ly(x = ~sg_off_the_tee, y = ~sg_around_the_green, z = ~sg_putting,
          color = ~cluster_name,
          size = 1,
          hoverinfo = 'text',
          text = ~name) %>%
  add_markers()

  1. https://www.pgatour.com/news/2016/05/31/strokes-gained-defined.html

  2. https://bigishdata.com/2016/03/05/gather-all-the-pga-tour-stats/