dat<- fread("/Users/claire/Desktop/ASA All PGA Raw Data - Tourn Level.csv", header=T, data.table=F)


dat$finish2<- ifelse(dat$Finish=="CUT", 999, ifelse(dat$Finish=="DQ" | dat$Finish=="MDF" | dat$Finish=="WD" | dat$Finish=="W/D", NA, dat$Finish))

# that code creates a new variable (finish2) that counts CUT as 999, and anyone we don't care about as NA. now we can get rid of the "T" (stands for a tie) and convert the variable to numeric

dat$finish2<- gsub(pattern = 'T', x = dat$finish2, replacement = '', fixed = T)
dat$finish2<- as.numeric(dat$finish2)

dat$finish_cat<- ifelse(dat$finish2==1, "first", ifelse(dat$finish2==999, "cut", ifelse(dat$finish2>1, "not cut", NA)))

table(dat$finish_cat)
## 
##     cut   first not cut 
##   12741     251   17075

look at total number of wins and mean strokes gained by player

### select anyone who has won at least once since 2015


winners <- unique(dat$Player_initial_last[ which(dat$finish_cat=="first")])

length(winners) ### 128 who have won at least once
## [1] 128
sg<- dat %>% filter(Player_initial_last %in% winners) %>%
  select(Player_initial_last, sg_total, finish_cat) %>%
  group_by(Player_initial_last) %>%
  mutate(avg_sg=mean(sg_total,na.rm=T),
         win=ifelse(finish_cat=="first", 1, 0),
         total_wins=sum(win, na.rm=T), 
         appearance=1,
         total_appearances=sum(appearance, na.rm=T)) %>%
  slice(1)

### of those who have won at least once, who has appeared the most?

ggplot(sg, aes(x = reorder(Player_initial_last, -total_appearances), y=total_appearances))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Number of appearances by players who have won at least once since 2015",x="Player", y = "Total appearances")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

DJ has the most wins and also also a pretty high strokes gained since 2015

who has the highest number of wins over number of appearances?

sg$efficiency<- sg$total_wins/sg$total_appearances

sg<- sg %>% arrange(-efficiency) %>%
  mutate(eff_pct= (round(efficiency*100,1)),
         eff_pct=paste0(eff_pct,"%"))
sg<- sg[1:30,]

ggplot(sg, aes(x = reorder(Player_initial_last, -efficiency), y=avg_sg))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Mean strokes gained by players who have won at least once since 2015 \norganized by highest efficiency (overlayed in white)",x="Player", y = "Mean total strokes gained")+
  theme_minimal()+
  geom_text(
    aes(label = eff_pct),
    colour = "white", size = 2,
    vjust = 1.5, position = position_dodge(.9)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

ggplot(sg, aes(x = reorder(Player_initial_last, -total_wins), y=avg_sg))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Mean strokes gained by players who have won at least once since 2015 \norganized by total wins (overlayed in white)",x="Player", y = "Mean total strokes gained")+
  theme_minimal()+
  geom_text(
    aes(label = total_wins),
    colour = "white", size = 2,
    vjust = 1.5, position = position_dodge(.9)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

DJ has been the most efficiency at 8.4%, and has also won the most (12 wins since 2015)

just looking at the masters

masters<- dat %>% filter(`tournament name`=="Masters Tournament", season=="2022")

for some reason there is only strokes gained data for the masters from 2022!

colnames(masters)
##  [1] "Player_initial_last" "tournament id"       "player id"          
##  [4] "hole_par"            "strokes"             "hole_DKP"           
##  [7] "hole_FDP"            "hole_SDP"            "streak_DKP"         
## [10] "streak_FDP"          "streak_SDP"          "n_rounds"           
## [13] "made_cut"            "pos"                 "finish_DKP"         
## [16] "finish_FDP"          "finish_SDP"          "total_DKP"          
## [19] "total_FDP"           "total_SDP"           "player"             
## [22] "Unnamed: 2"          "Unnamed: 3"          "Unnamed: 4"         
## [25] "tournament name"     "course"              "date"               
## [28] "purse"               "season"              "no_cut"             
## [31] "Finish"              "sg_putt"             "sg_arg"             
## [34] "sg_app"              "sg_ott"              "sg_t2g"             
## [37] "sg_total"            "finish2"             "finish_cat"
head(masters)
##   Player_initial_last tournament id player id hole_par strokes hole_DKP
## 1            A. Ancer     401353232      9261      144     151     23.0
## 2            A. Scott     401353232       388      288     302     43.5
## 3       B. DeChambeau     401353232     10046      144     156     12.0
## 4           B. Harman     401353232      1225      144     149     27.0
## 5         B. Horschel     401353232      1651      288     296     49.5
## 6           B. Koepka     401353232      6798      144     150     16.5
##   hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP n_rounds made_cut pos
## 1     13.7       18          0        6.4          0        2        0  NA
## 2     28.4       38          0        1.2          0        4        1  48
## 3      1.8       12          0        0.2          0        2        0  NA
## 4     18.5       22          0        0.4          0        2        0  NA
## 5     38.9       43          0        8.6          0        4        1  43
## 6     12.3       21          0        0.6          0        2        0  NA
##   finish_DKP finish_FDP finish_SDP total_DKP total_FDP total_SDP
## 1          0          0          0      23.0      20.1        18
## 2          1          0          0      44.5      29.6        38
## 3          0          0          0      12.0       2.0        12
## 4          0          0          0      27.0      18.9        22
## 5          1          0          0      50.5      47.5        43
## 6          0          0          0      16.5      12.9        21
##              player Unnamed: 2 Unnamed: 3 Unnamed: 4    tournament name
## 1     Abraham Ancer         NA         NA         NA Masters Tournament
## 2        Adam Scott         NA         NA         NA Masters Tournament
## 3 Bryson DeChambeau         NA         NA         NA Masters Tournament
## 4      Brian Harman         NA         NA         NA Masters Tournament
## 5    Billy Horschel         NA         NA         NA Masters Tournament
## 6     Brooks Koepka         NA         NA         NA Masters Tournament
##                                     course       date purse season no_cut
## 1 Augusta National Golf Club - Augusta, GA 2022-04-10    15   2022      0
## 2 Augusta National Golf Club - Augusta, GA 2022-04-10    15   2022      0
## 3 Augusta National Golf Club - Augusta, GA 2022-04-10    15   2022      0
## 4 Augusta National Golf Club - Augusta, GA 2022-04-10    15   2022      0
## 5 Augusta National Golf Club - Augusta, GA 2022-04-10    15   2022      0
## 6 Augusta National Golf Club - Augusta, GA 2022-04-10    15   2022      0
##   Finish sg_putt sg_arg sg_app sg_ott sg_t2g sg_total finish2 finish_cat
## 1    CUT    2.72  -2.87  -1.39   0.25  -4.00    -1.28     999        cut
## 2    T48   -0.11  -0.63  -0.60  -0.31  -1.54    -1.65      48    not cut
## 3   <NA>      NA     NA     NA     NA     NA       NA      NA       <NA>
## 4    CUT    0.24   0.79  -1.07  -0.25  -0.52    -0.28     999        cut
## 5     43    0.06  -0.57   0.83  -0.46  -0.20    -0.14      43    not cut
## 6    CUT   -2.60   1.70  -0.06   0.17   1.82    -0.78     999        cut
masters<- masters %>% select("Player_initial_last", "player id", "made_cut", "finish2", "sg_putt", "sg_arg", "sg_app", "sg_ott", "sg_t2g", "sg_total", "finish_cat")

str(masters$finish2)
##  num [1:84] 999 48 NA 999 43 999 999 39 44 10 ...
table(masters$finish_cat)
## 
##     cut   first not cut 
##      28       1      48

what strokes predict place the most?

ggpairs(masters[,c(4:11)])

masters<- na.omit(masters)
masters$`player id`<- as.numeric(masters$`player id`)
sg<- lm(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=masters)
tab_model(sg)
  finish 2
Predictors Estimates CI p
(Intercept) 350.28 270.08 – 430.48 <0.001
sg putt -132.92 -207.72 – -58.13 0.001
sg arg -122.09 -225.96 – -18.23 0.022
sg app -217.18 -295.71 – -138.65 <0.001
sg ott -139.98 -242.65 – -37.31 0.008
Observations 77
R2 / R2 adjusted 0.473 / 0.444

strokes gained on the approach and off the tee seem to make the biggest impact on finish.

now, only looking at those who made the cut

madecut<- masters %>% filter(!finish_cat=="cut") %>%
  mutate(player=ifelse(finish2<10, Player_initial_last, NA),
         scottie=ifelse(finish2==1, Player_initial_last, NA))

SG total

ggplot(madecut, aes(x=sg_total, y=finish2, color=finish2)) + 
    geom_point() +
  ylab("Finish") +
  xlab("Strokes gained total")+
  geom_label(label=madecut$scottie)+
  scale_colour_gradientn(colours = terrain.colors(10))
## Warning: Removed 48 rows containing missing values (`geom_label()`).

SG putting

ggplot(madecut, aes(x=sg_putt, y=finish2, color=finish2)) + 
    geom_point() +
  ylab("Finish") +
  xlab("Strokes gained putting")+
  geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25, 
    check_overlap = T)+
  scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).

SG around green

ggplot(madecut, aes(x=sg_arg, y=finish2, color=finish2)) + 
    geom_point() +
  ylab("Finish") +
  xlab("Strokes gained around the green")+
  geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25, 
    check_overlap = T)+
  scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).

SG approach

ggplot(madecut, aes(x=sg_app, y=finish2, color=finish2)) + 
    geom_point() +
  ylab("Finish") +
  xlab("Strokes gained approach")+
  geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25, 
    check_overlap = T)+
  scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).

SG on the tee

ggplot(madecut, aes(x=sg_ott, y=finish2, color=finish2)) + 
    geom_point() +
  ylab("Finish") +
  xlab("Strokes gained on the tee")+
  geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25, 
    check_overlap = T)+
  scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).

by looking at last year’s data, it seems like strokes gained putting and on approach are the most important at Augusta. so who over the last 3 years has been good at that?

this graph plots the top 30 total strokes gained off the tee in the last 3 seasons – for those who have appeared at least 15 times.

recent<- dat %>% filter(season=="2020" | season=="2021" | season=="2022") %>%
  group_by(Player_initial_last) %>%
  mutate(total_sg_ott=sum(sg_ott, na.rm=T),
         appearance=1,
         total_appearances=sum(appearance, na.rm=T)) %>%
  filter(total_appearances>15) %>%
  distinct(Player_initial_last, total_sg_ott) %>%
  select(Player_initial_last, total_sg_ott) %>%
  arrange(-total_sg_ott)

recent<- recent[1:30,]

ggplot(recent, aes(reorder(Player_initial_last, -total_sg_ott), y=total_sg_ott))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Strokes gained off the tee across appearances (2020-2022)\n for those who appeared > 15 times",x="Player", y = "Total strokes gained")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

this graph plots the top 30 total strokes gained on approach in the last 3 seasons – for those who have appeared at least 15 times.

recent<- dat %>% filter(season=="2020" | season=="2021" | season=="2022") %>%
  group_by(Player_initial_last) %>%
  mutate(total_sg_app=sum(sg_app, na.rm=T),
         appearance=1,
         total_appearances=sum(appearance, na.rm=T)) %>%
  filter(total_appearances>15) %>%
  distinct(Player_initial_last, total_sg_app) %>%
  select(Player_initial_last, total_sg_app) %>%
  arrange(-total_sg_app)

recent<- recent[1:30,]

ggplot(recent, aes(x = reorder(Player_initial_last, -total_sg_app), y=total_sg_app))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Strokes gained on approach across appearances (2020-2022)\n for those who appeared > 15 times",x="Player", y = "Total strokes gained")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

players tend to struggle a lot more with gaining strokes off the tee.. which maybe is what makes Augsta a hard course to play.

winners<- c("S. Scheffler", "H. Matsuyama", "D. Johnson", "T. Woods", "P. Reed", "S. Garcia", "D. Willett", "J. Spieth", "B. Watson")

recent_winners<- dat %>% filter(`Player_initial_last` %in% winners) %>%
  select("Player_initial_last", "made_cut", "finish2", "sg_putt", "sg_arg", "sg_app", "sg_ott", "sg_t2g", "sg_total", "finish_cat", season)

### profiles of recent winnners

clusVARS<-colnames(recent_winners[c(2,4:7)])

clusITEMS<-recent_winners[clusVARS]

set.seed(13)
clusITEMS<- na.omit(clusITEMS)
kclu1<-kclustering(clusITEMS)
plot(kclu1)

clu_dat<-recent_winners[c("Player_initial_last",clusVARS)]
clu_dat<- na.omit(clu_dat)
kclu2<-kclustering(clusITEMS, labels = clu_dat$Player_initial_last, k=6)

plot(kclu2)

clusters 3,4 and 6 show profiles when players made the cut. nothing clear jumps out – to make the cut you have to gain strokes in at least one category it seems but not necessarily all. maybe to do well you need to putt well – or in the case of cluster 1, you need to play well in every other aspect (e.g. make up more) if you’re not putting well.

put data in long form and look at all strokes gained by recent winners

pga_long<- melt(recent_winners, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 36 × 6
##    Player_initial_last position variable           n   mean    sd
##    <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
##  1 B. Watson           sg_putt  strokes_gained   106 -0.214 0.918
##  2 B. Watson           sg_arg   strokes_gained   106 -0.177 0.777
##  3 B. Watson           sg_app   strokes_gained   106  0.028 1.08 
##  4 B. Watson           sg_ott   strokes_gained   106  0.69  0.632
##  5 D. Johnson          sg_putt  strokes_gained   110  0.169 1.08 
##  6 D. Johnson          sg_arg   strokes_gained   110  0.055 0.551
##  7 D. Johnson          sg_app   strokes_gained   110  0.426 1.13 
##  8 D. Johnson          sg_ott   strokes_gained   110  0.73  0.659
##  9 D. Willett          sg_putt  strokes_gained    70 -0.214 1.13 
## 10 D. Willett          sg_arg   strokes_gained    70  0.061 0.914
## # … with 26 more rows
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  facet_wrap(~Player_initial_last)+
  labs(title="Average strokes gained at various positions for previous Masters winners (2015-2022)",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

golfers who have won the Masters since 2015 tend to average positive strokes gained, as expected. Bubba and Tiger seem like anomalies since they have 2-3 negative strokes gained averages, though from 2015-2018 Tiger wasn’t playing great? Interestingly Scottie, Hideki, Bubba, Danny and Sergio all had negative strokes gained putting. that being said, they all average positive strokes gained elsewhere except Bubba and Danny….

if we look at each player’s stats for the year we won we can see if they were just have exceptional years that year:

Masters Winners List

-2022 — Scottie Scheffler

-2021 — Hideki Matsuyama

-2020 — Dustin Johnson

-2019 — Tiger Woods

-2018 — Patrick Reed

-2017 — Sergio Garcia

-2016 — Danny Willett

-2015 — Jordan Spieth

scottie

scottie<- dat %>% filter(season=="2022" & Player_initial_last=="S. Scheffler")

pga_long<- melt(scottie, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n  mean    sd
##   <chr>               <fct>    <fct>          <dbl> <dbl> <dbl>
## 1 S. Scheffler        sg_putt  strokes_gained    21 0.147 0.761
## 2 S. Scheffler        sg_arg   strokes_gained    21 0.233 0.69 
## 3 S. Scheffler        sg_app   strokes_gained    21 0.769 1.01 
## 4 S. Scheffler        sg_ott   strokes_gained    21 0.428 0.677
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Scottie Scheffler 2022",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

Hideki

hideki<- dat %>% filter(season=="2021" & Player_initial_last=="H. Matsuyama")

pga_long<- melt(hideki, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n   mean    sd
##   <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
## 1 H. Matsuyama        sg_putt  strokes_gained    24 -0.449 1.06 
## 2 H. Matsuyama        sg_arg   strokes_gained    24  0.129 0.732
## 3 H. Matsuyama        sg_app   strokes_gained    24  0.661 1.06 
## 4 H. Matsuyama        sg_ott   strokes_gained    24  0.204 0.535
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Hideki Matsuyama 2021",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

DJ

dj<- dat %>% filter(season=="2020" & Player_initial_last=="D. Johnson")

pga_long<- melt(dj, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n   mean    sd
##   <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
## 1 D. Johnson          sg_putt  strokes_gained    14 -0.031 1.44 
## 2 D. Johnson          sg_arg   strokes_gained    14 -0.151 0.691
## 3 D. Johnson          sg_app   strokes_gained    14 -0.019 2.21 
## 4 D. Johnson          sg_ott   strokes_gained    14  0.639 0.497
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Dustin Johnson 2020",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

Tiger

tiger<- dat %>% filter(season=="2019" & Player_initial_last=="T. Woods")

pga_long<- melt(tiger, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n   mean    sd
##   <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
## 1 T. Woods            sg_putt  strokes_gained     9 -0.164 0.954
## 2 T. Woods            sg_arg   strokes_gained     9  0.064 0.707
## 3 T. Woods            sg_app   strokes_gained     9  0.459 1.23 
## 4 T. Woods            sg_ott   strokes_gained     9  0.168 0.626
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Tiger Woods 2019",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

reed

pat<- dat %>% filter(season=="2018" & Player_initial_last=="P. Reed")

pga_long<- melt(pat, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n   mean    sd
##   <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
## 1 P. Reed             sg_putt  strokes_gained    17  0.04  0.925
## 2 P. Reed             sg_arg   strokes_gained    17  0.441 0.611
## 3 P. Reed             sg_app   strokes_gained    17 -0.041 1.1  
## 4 P. Reed             sg_ott   strokes_gained    17  0.073 0.534
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Patrick Reed 2018",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

sergio

sergio<- dat %>% filter(season=="2017" & Player_initial_last=="S. Garcia")

pga_long<- melt(sergio, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n   mean    sd
##   <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
## 1 S. Garcia           sg_putt  strokes_gained     9 -0.458 0.678
## 2 S. Garcia           sg_arg   strokes_gained     9  0.214 0.411
## 3 S. Garcia           sg_app   strokes_gained     9  0.104 0.701
## 4 S. Garcia           sg_ott   strokes_gained     9  0.9   0.362
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Sergio Garcia 2017",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

danny willett

danny<- dat %>% filter(season=="2016" & Player_initial_last=="D. Willett")

pga_long<- melt(danny, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n   mean    sd
##   <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
## 1 D. Willett          sg_putt  strokes_gained     3 -0.373 0.489
## 2 D. Willett          sg_arg   strokes_gained     3 -0.16  0.815
## 3 D. Willett          sg_app   strokes_gained     3  0.18  0.476
## 4 D. Willett          sg_ott   strokes_gained     3  0.33  0.121
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Danny Willett 2016",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

jordan

jordan<- dat %>% filter(season=="2015" & Player_initial_last=="J. Spieth")

pga_long<- melt(jordan, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")

(means<- pga_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
##   Player_initial_last position variable           n  mean    sd
##   <chr>               <fct>    <fct>          <dbl> <dbl> <dbl>
## 1 J. Spieth           sg_putt  strokes_gained    16 0.399 1.01 
## 2 J. Spieth           sg_arg   strokes_gained    16 0.278 0.68 
## 3 J. Spieth           sg_app   strokes_gained    16 0.791 1.20 
## 4 J. Spieth           sg_ott   strokes_gained    16 0.418 0.551
ggplot(means, aes(y = position, x=mean, fill=position))+
  geom_bar(stat="identity", position = "dodge")+
  labs(title="Average strokes gained at various positions for Jordan Spieth 2015",x="Player", y = "Average")+
  theme_minimal()+
  geom_vline(xintercept = 0, size=1, linetype="dashed") +
  theme(legend.position = "bottom")

It’s interesting that no one seemed to be playing spectacularly well leading up to their Masters win, except Jordan and Scottie. That means I suppose it’s really anyone’s tournamnet to win, but perhaps hitting well off the tee and approach shots really help going into Augusta.

in 2022 who hit the best off the tee and approach?

top 30 players (who have played more than 5 tournaments!) with approach shots from last season + their off the tee strokes gained:

last_yr<- dat %>% filter(season=="2022")

last_yr_long<- melt(last_yr, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_app", "sg_ott"), variable.name = "position")

(means<- last_yr_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd") %>%
    filter(n>5))
## # A tibble: 444 × 6
##    Player_initial_last position variable           n   mean    sd
##    <chr>               <fct>    <fct>          <dbl>  <dbl> <dbl>
##  1 A. Ancer            sg_app   strokes_gained    16 -0.192 0.992
##  2 A. Ancer            sg_ott   strokes_gained    16  0.507 0.616
##  3 A. Baddeley         sg_app   strokes_gained    10  0.053 1.08 
##  4 A. Baddeley         sg_ott   strokes_gained    10 -1.03  1.13 
##  5 A. Cook             sg_app   strokes_gained    20 -0.357 1.12 
##  6 A. Cook             sg_ott   strokes_gained    20 -0.633 1.28 
##  7 A. Hadwin           sg_app   strokes_gained    23  0.202 0.884
##  8 A. Hadwin           sg_ott   strokes_gained    23 -0.073 0.608
##  9 A. Lahiri           sg_app   strokes_gained    22 -0.407 1.36 
## 10 A. Lahiri           sg_ott   strokes_gained    22  0.396 0.429
## # … with 434 more rows
means_app<- last_yr_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd") %>%
    filter(n>5) %>%
    filter(position=="sg_app") %>%
  arrange(-mean) %>%
    slice(1:30)


app<- means %>% filter(Player_initial_last %in% means_app$Player_initial_last)

ggplot(app, aes(x = reorder(Player_initial_last, -mean), y=mean, fill=position))+
  geom_bar(stat="identity", position="dodge")+
  labs(title="Players with most strokes gained on approach [and off the tee] 2022",x="Player", y = "Average")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

top 30 players off the tee from last season + their approach strokes gained:

(means_ott<- last_yr_long %>%
  group_by(Player_initial_last, position) %>%
  get_summary_stats(strokes_gained, type = "mean_sd") %>%
    filter(n>5) %>%
    filter(position=="sg_ott") %>%
  arrange(-mean) %>%
    slice(1:30))
## # A tibble: 30 × 6
##    Player_initial_last position variable           n  mean    sd
##    <chr>               <fct>    <fct>          <dbl> <dbl> <dbl>
##  1 J. Rahm             sg_ott   strokes_gained    17 1.05  0.649
##  2 C. Young            sg_ott   strokes_gained    23 0.76  0.762
##  3 S. Im               sg_ott   strokes_gained    22 0.717 0.491
##  4 C. Champ            sg_ott   strokes_gained    19 0.707 0.774
##  5 M. Fitzpatrick      sg_ott   strokes_gained    16 0.688 0.477
##  6 L. List             sg_ott   strokes_gained    24 0.682 0.581
##  7 C. Conners          sg_ott   strokes_gained    23 0.673 0.348
##  8 K. Mitchell         sg_ott   strokes_gained    23 0.65  0.555
##  9 W. Zalatoris        sg_ott   strokes_gained    19 0.633 0.486
## 10 B. Steele           sg_ott   strokes_gained    20 0.627 0.579
## # … with 20 more rows
ott<- means %>% filter(Player_initial_last %in% means_ott$Player_initial_last)

ott<-ott[ order(-ott$mean), ]

ggplot(ott, aes(x = reorder(Player_initial_last, -mean), y=mean, fill=position))+
  geom_bar(stat="identity", position="dodge")+
  labs(title="Players with most strokes gained off the tee [and on approach] 2022",x="Player", y = "Average")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

so ideally we would pick someone (from this season) who is hitting well off the tee and on approach shots… but going off of only 2022 data…. Will, Xander, Scottie, Tony or Colin are in the top 30 on approach and were hitting off the tee well too….

Corey Conners (just won week prior to Augusta), Matt Fitzpatrick, Rahm, Scottie, Tony, Viktor and Willy Z are in the top 30 off the tee and also hit approach shots well

finally, we can do a prediction model to see who is likely to finish where based strictly on last year’s master’s finish

set.seed(2)

#sample <- sample(c(TRUE, FALSE), nrow(masters), replace=TRUE, prob=c(0.7,0.3))
#train  <- masters[sample, ] ### select 70% of data for training
#test   <- masters[!sample, ] ### select 30% of data for testing

train<- dat %>% filter(finish_cat=="not cut" & season=="2022") %>%
  filter(`tournament name`=="U.S. Open" |
           `tournament name`=="The Open" |
           `tournament name`=="PGA Championship") %>%
  select(colnames(masters))

train$finish_cat<- NULL

train<- na.omit(train)


preProcess <- c("center","scale")
trControl <- trainControl(method = "repeatedcv",number = 10,repeats = 10)

model <- train(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=train, preProcess = preProcess, trControl=trControl) ### run the model 

test<- masters
test$finish_cat<- NULL

test<- na.omit(test)
test$finish_pred <- predict(model, test) ### predict on testing set

plot<- test %>% arrange(finish_pred)

ggplot(plot, aes(x = reorder(Player_initial_last, finish_pred), y=finish_pred))+
  geom_bar(stat="identity", fill="darkgreen")+
  labs(title="Predicted finish for 2022 Masters (overlayed in white)",x="Player", y = "predicted finish")+
  theme_minimal()+
  geom_text(
    aes(label = round(finish_pred,0)),
    colour = "white", size = 2,
    vjust = 1.5, position = position_dodge(.9)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

The prediction plot predicts scottie to win (which technically this is based on last year’s data so it is predicting last year’s masters?). confusing since we dont have the ideal amount of data. the training data set was the majors from last year, and the test was last year’s masters. so essentially, the model correctly predicted scottie to win.

other people who should’ve done well last year are Collin Morikawa, Cam Smith, Corey Conners and Will Zalatoris. Pretty accurate for last year! I wish we could have done this with this year’s data but it is what is is!