dat<- fread("/Users/claire/Desktop/ASA All PGA Raw Data - Tourn Level.csv", header=T, data.table=F)
dat$finish2<- ifelse(dat$Finish=="CUT", 999, ifelse(dat$Finish=="DQ" | dat$Finish=="MDF" | dat$Finish=="WD" | dat$Finish=="W/D", NA, dat$Finish))
# that code creates a new variable (finish2) that counts CUT as 999, and anyone we don't care about as NA. now we can get rid of the "T" (stands for a tie) and convert the variable to numeric
dat$finish2<- gsub(pattern = 'T', x = dat$finish2, replacement = '', fixed = T)
dat$finish2<- as.numeric(dat$finish2)
dat$finish_cat<- ifelse(dat$finish2==1, "first", ifelse(dat$finish2==999, "cut", ifelse(dat$finish2>1, "not cut", NA)))
table(dat$finish_cat)
##
## cut first not cut
## 12741 251 17075
look at total number of wins and mean strokes gained by player
### select anyone who has won at least once since 2015
winners <- unique(dat$Player_initial_last[ which(dat$finish_cat=="first")])
length(winners) ### 128 who have won at least once
## [1] 128
sg<- dat %>% filter(Player_initial_last %in% winners) %>%
select(Player_initial_last, sg_total, finish_cat) %>%
group_by(Player_initial_last) %>%
mutate(avg_sg=mean(sg_total,na.rm=T),
win=ifelse(finish_cat=="first", 1, 0),
total_wins=sum(win, na.rm=T),
appearance=1,
total_appearances=sum(appearance, na.rm=T)) %>%
slice(1)
### of those who have won at least once, who has appeared the most?
ggplot(sg, aes(x = reorder(Player_initial_last, -total_appearances), y=total_appearances))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Number of appearances by players who have won at least once since 2015",x="Player", y = "Total appearances")+
theme_minimal()+
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
DJ has the most wins and also also a pretty high strokes gained since 2015
who has the highest number of wins over number of appearances?
sg$efficiency<- sg$total_wins/sg$total_appearances
sg<- sg %>% arrange(-efficiency) %>%
mutate(eff_pct= (round(efficiency*100,1)),
eff_pct=paste0(eff_pct,"%"))
sg<- sg[1:30,]
ggplot(sg, aes(x = reorder(Player_initial_last, -efficiency), y=avg_sg))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Mean strokes gained by players who have won at least once since 2015 \norganized by highest efficiency (overlayed in white)",x="Player", y = "Mean total strokes gained")+
theme_minimal()+
geom_text(
aes(label = eff_pct),
colour = "white", size = 2,
vjust = 1.5, position = position_dodge(.9)) +
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
ggplot(sg, aes(x = reorder(Player_initial_last, -total_wins), y=avg_sg))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Mean strokes gained by players who have won at least once since 2015 \norganized by total wins (overlayed in white)",x="Player", y = "Mean total strokes gained")+
theme_minimal()+
geom_text(
aes(label = total_wins),
colour = "white", size = 2,
vjust = 1.5, position = position_dodge(.9)) +
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
DJ has been the most efficiency at 8.4%, and has also won the most (12 wins since 2015)
just looking at the masters
masters<- dat %>% filter(`tournament name`=="Masters Tournament", season=="2022")
for some reason there is only strokes gained data for the masters from 2022!
colnames(masters)
## [1] "Player_initial_last" "tournament id" "player id"
## [4] "hole_par" "strokes" "hole_DKP"
## [7] "hole_FDP" "hole_SDP" "streak_DKP"
## [10] "streak_FDP" "streak_SDP" "n_rounds"
## [13] "made_cut" "pos" "finish_DKP"
## [16] "finish_FDP" "finish_SDP" "total_DKP"
## [19] "total_FDP" "total_SDP" "player"
## [22] "Unnamed: 2" "Unnamed: 3" "Unnamed: 4"
## [25] "tournament name" "course" "date"
## [28] "purse" "season" "no_cut"
## [31] "Finish" "sg_putt" "sg_arg"
## [34] "sg_app" "sg_ott" "sg_t2g"
## [37] "sg_total" "finish2" "finish_cat"
head(masters)
## Player_initial_last tournament id player id hole_par strokes hole_DKP
## 1 A. Ancer 401353232 9261 144 151 23.0
## 2 A. Scott 401353232 388 288 302 43.5
## 3 B. DeChambeau 401353232 10046 144 156 12.0
## 4 B. Harman 401353232 1225 144 149 27.0
## 5 B. Horschel 401353232 1651 288 296 49.5
## 6 B. Koepka 401353232 6798 144 150 16.5
## hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP n_rounds made_cut pos
## 1 13.7 18 0 6.4 0 2 0 NA
## 2 28.4 38 0 1.2 0 4 1 48
## 3 1.8 12 0 0.2 0 2 0 NA
## 4 18.5 22 0 0.4 0 2 0 NA
## 5 38.9 43 0 8.6 0 4 1 43
## 6 12.3 21 0 0.6 0 2 0 NA
## finish_DKP finish_FDP finish_SDP total_DKP total_FDP total_SDP
## 1 0 0 0 23.0 20.1 18
## 2 1 0 0 44.5 29.6 38
## 3 0 0 0 12.0 2.0 12
## 4 0 0 0 27.0 18.9 22
## 5 1 0 0 50.5 47.5 43
## 6 0 0 0 16.5 12.9 21
## player Unnamed: 2 Unnamed: 3 Unnamed: 4 tournament name
## 1 Abraham Ancer NA NA NA Masters Tournament
## 2 Adam Scott NA NA NA Masters Tournament
## 3 Bryson DeChambeau NA NA NA Masters Tournament
## 4 Brian Harman NA NA NA Masters Tournament
## 5 Billy Horschel NA NA NA Masters Tournament
## 6 Brooks Koepka NA NA NA Masters Tournament
## course date purse season no_cut
## 1 Augusta National Golf Club - Augusta, GA 2022-04-10 15 2022 0
## 2 Augusta National Golf Club - Augusta, GA 2022-04-10 15 2022 0
## 3 Augusta National Golf Club - Augusta, GA 2022-04-10 15 2022 0
## 4 Augusta National Golf Club - Augusta, GA 2022-04-10 15 2022 0
## 5 Augusta National Golf Club - Augusta, GA 2022-04-10 15 2022 0
## 6 Augusta National Golf Club - Augusta, GA 2022-04-10 15 2022 0
## Finish sg_putt sg_arg sg_app sg_ott sg_t2g sg_total finish2 finish_cat
## 1 CUT 2.72 -2.87 -1.39 0.25 -4.00 -1.28 999 cut
## 2 T48 -0.11 -0.63 -0.60 -0.31 -1.54 -1.65 48 not cut
## 3 <NA> NA NA NA NA NA NA NA <NA>
## 4 CUT 0.24 0.79 -1.07 -0.25 -0.52 -0.28 999 cut
## 5 43 0.06 -0.57 0.83 -0.46 -0.20 -0.14 43 not cut
## 6 CUT -2.60 1.70 -0.06 0.17 1.82 -0.78 999 cut
masters<- masters %>% select("Player_initial_last", "player id", "made_cut", "finish2", "sg_putt", "sg_arg", "sg_app", "sg_ott", "sg_t2g", "sg_total", "finish_cat")
str(masters$finish2)
## num [1:84] 999 48 NA 999 43 999 999 39 44 10 ...
table(masters$finish_cat)
##
## cut first not cut
## 28 1 48
what strokes predict place the most?
ggpairs(masters[,c(4:11)])
masters<- na.omit(masters)
masters$`player id`<- as.numeric(masters$`player id`)
sg<- lm(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=masters)
tab_model(sg)
finish 2 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 350.28 | 270.08 – 430.48 | <0.001 |
sg putt | -132.92 | -207.72 – -58.13 | 0.001 |
sg arg | -122.09 | -225.96 – -18.23 | 0.022 |
sg app | -217.18 | -295.71 – -138.65 | <0.001 |
sg ott | -139.98 | -242.65 – -37.31 | 0.008 |
Observations | 77 | ||
R2 / R2 adjusted | 0.473 / 0.444 |
strokes gained on the approach and off the tee seem to make the biggest impact on finish.
now, only looking at those who made the cut
madecut<- masters %>% filter(!finish_cat=="cut") %>%
mutate(player=ifelse(finish2<10, Player_initial_last, NA),
scottie=ifelse(finish2==1, Player_initial_last, NA))
SG total
ggplot(madecut, aes(x=sg_total, y=finish2, color=finish2)) +
geom_point() +
ylab("Finish") +
xlab("Strokes gained total")+
geom_label(label=madecut$scottie)+
scale_colour_gradientn(colours = terrain.colors(10))
## Warning: Removed 48 rows containing missing values (`geom_label()`).
SG putting
ggplot(madecut, aes(x=sg_putt, y=finish2, color=finish2)) +
geom_point() +
ylab("Finish") +
xlab("Strokes gained putting")+
geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25,
check_overlap = T)+
scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).
SG around green
ggplot(madecut, aes(x=sg_arg, y=finish2, color=finish2)) +
geom_point() +
ylab("Finish") +
xlab("Strokes gained around the green")+
geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25,
check_overlap = T)+
scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).
SG approach
ggplot(madecut, aes(x=sg_app, y=finish2, color=finish2)) +
geom_point() +
ylab("Finish") +
xlab("Strokes gained approach")+
geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25,
check_overlap = T)+
scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).
SG on the tee
ggplot(madecut, aes(x=sg_ott, y=finish2, color=finish2)) +
geom_point() +
ylab("Finish") +
xlab("Strokes gained on the tee")+
geom_label(label=madecut$player, nudge_x = 0.25, nudge_y = 0.25,
check_overlap = T)+
scale_colour_gradientn(colours = terrain.colors(10))
## Warning in geom_label(label = madecut$player, nudge_x = 0.25, nudge_y = 0.25, :
## Ignoring unknown parameters: `check_overlap`
## Warning: Removed 41 rows containing missing values (`geom_label()`).
by looking at last year’s data, it seems like strokes gained putting and on approach are the most important at Augusta. so who over the last 3 years has been good at that?
this graph plots the top 30 total strokes gained off the tee in the last 3 seasons – for those who have appeared at least 15 times.
recent<- dat %>% filter(season=="2020" | season=="2021" | season=="2022") %>%
group_by(Player_initial_last) %>%
mutate(total_sg_ott=sum(sg_ott, na.rm=T),
appearance=1,
total_appearances=sum(appearance, na.rm=T)) %>%
filter(total_appearances>15) %>%
distinct(Player_initial_last, total_sg_ott) %>%
select(Player_initial_last, total_sg_ott) %>%
arrange(-total_sg_ott)
recent<- recent[1:30,]
ggplot(recent, aes(reorder(Player_initial_last, -total_sg_ott), y=total_sg_ott))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Strokes gained off the tee across appearances (2020-2022)\n for those who appeared > 15 times",x="Player", y = "Total strokes gained")+
theme_minimal()+
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
this graph plots the top 30 total strokes gained on approach in the last 3 seasons – for those who have appeared at least 15 times.
recent<- dat %>% filter(season=="2020" | season=="2021" | season=="2022") %>%
group_by(Player_initial_last) %>%
mutate(total_sg_app=sum(sg_app, na.rm=T),
appearance=1,
total_appearances=sum(appearance, na.rm=T)) %>%
filter(total_appearances>15) %>%
distinct(Player_initial_last, total_sg_app) %>%
select(Player_initial_last, total_sg_app) %>%
arrange(-total_sg_app)
recent<- recent[1:30,]
ggplot(recent, aes(x = reorder(Player_initial_last, -total_sg_app), y=total_sg_app))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Strokes gained on approach across appearances (2020-2022)\n for those who appeared > 15 times",x="Player", y = "Total strokes gained")+
theme_minimal()+
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
players tend to struggle a lot more with gaining strokes off the tee.. which maybe is what makes Augsta a hard course to play.
winners<- c("S. Scheffler", "H. Matsuyama", "D. Johnson", "T. Woods", "P. Reed", "S. Garcia", "D. Willett", "J. Spieth", "B. Watson")
recent_winners<- dat %>% filter(`Player_initial_last` %in% winners) %>%
select("Player_initial_last", "made_cut", "finish2", "sg_putt", "sg_arg", "sg_app", "sg_ott", "sg_t2g", "sg_total", "finish_cat", season)
### profiles of recent winnners
clusVARS<-colnames(recent_winners[c(2,4:7)])
clusITEMS<-recent_winners[clusVARS]
set.seed(13)
clusITEMS<- na.omit(clusITEMS)
kclu1<-kclustering(clusITEMS)
plot(kclu1)
clu_dat<-recent_winners[c("Player_initial_last",clusVARS)]
clu_dat<- na.omit(clu_dat)
kclu2<-kclustering(clusITEMS, labels = clu_dat$Player_initial_last, k=6)
plot(kclu2)
clusters 3,4 and 6 show profiles when players made the cut. nothing clear jumps out – to make the cut you have to gain strokes in at least one category it seems but not necessarily all. maybe to do well you need to putt well – or in the case of cluster 1, you need to play well in every other aspect (e.g. make up more) if you’re not putting well.
put data in long form and look at all strokes gained by recent winners
pga_long<- melt(recent_winners, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 36 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 B. Watson sg_putt strokes_gained 106 -0.214 0.918
## 2 B. Watson sg_arg strokes_gained 106 -0.177 0.777
## 3 B. Watson sg_app strokes_gained 106 0.028 1.08
## 4 B. Watson sg_ott strokes_gained 106 0.69 0.632
## 5 D. Johnson sg_putt strokes_gained 110 0.169 1.08
## 6 D. Johnson sg_arg strokes_gained 110 0.055 0.551
## 7 D. Johnson sg_app strokes_gained 110 0.426 1.13
## 8 D. Johnson sg_ott strokes_gained 110 0.73 0.659
## 9 D. Willett sg_putt strokes_gained 70 -0.214 1.13
## 10 D. Willett sg_arg strokes_gained 70 0.061 0.914
## # … with 26 more rows
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
facet_wrap(~Player_initial_last)+
labs(title="Average strokes gained at various positions for previous Masters winners (2015-2022)",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
golfers who have won the Masters since 2015 tend to average positive strokes gained, as expected. Bubba and Tiger seem like anomalies since they have 2-3 negative strokes gained averages, though from 2015-2018 Tiger wasn’t playing great? Interestingly Scottie, Hideki, Bubba, Danny and Sergio all had negative strokes gained putting. that being said, they all average positive strokes gained elsewhere except Bubba and Danny….
if we look at each player’s stats for the year we won we can see if they were just have exceptional years that year:
Masters Winners List
-2022 — Scottie Scheffler
-2021 — Hideki Matsuyama
-2020 — Dustin Johnson
-2019 — Tiger Woods
-2018 — Patrick Reed
-2017 — Sergio Garcia
-2016 — Danny Willett
-2015 — Jordan Spieth
scottie
scottie<- dat %>% filter(season=="2022" & Player_initial_last=="S. Scheffler")
pga_long<- melt(scottie, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 S. Scheffler sg_putt strokes_gained 21 0.147 0.761
## 2 S. Scheffler sg_arg strokes_gained 21 0.233 0.69
## 3 S. Scheffler sg_app strokes_gained 21 0.769 1.01
## 4 S. Scheffler sg_ott strokes_gained 21 0.428 0.677
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Scottie Scheffler 2022",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
Hideki
hideki<- dat %>% filter(season=="2021" & Player_initial_last=="H. Matsuyama")
pga_long<- melt(hideki, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 H. Matsuyama sg_putt strokes_gained 24 -0.449 1.06
## 2 H. Matsuyama sg_arg strokes_gained 24 0.129 0.732
## 3 H. Matsuyama sg_app strokes_gained 24 0.661 1.06
## 4 H. Matsuyama sg_ott strokes_gained 24 0.204 0.535
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Hideki Matsuyama 2021",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
DJ
dj<- dat %>% filter(season=="2020" & Player_initial_last=="D. Johnson")
pga_long<- melt(dj, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 D. Johnson sg_putt strokes_gained 14 -0.031 1.44
## 2 D. Johnson sg_arg strokes_gained 14 -0.151 0.691
## 3 D. Johnson sg_app strokes_gained 14 -0.019 2.21
## 4 D. Johnson sg_ott strokes_gained 14 0.639 0.497
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Dustin Johnson 2020",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
Tiger
tiger<- dat %>% filter(season=="2019" & Player_initial_last=="T. Woods")
pga_long<- melt(tiger, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 T. Woods sg_putt strokes_gained 9 -0.164 0.954
## 2 T. Woods sg_arg strokes_gained 9 0.064 0.707
## 3 T. Woods sg_app strokes_gained 9 0.459 1.23
## 4 T. Woods sg_ott strokes_gained 9 0.168 0.626
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Tiger Woods 2019",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
reed
pat<- dat %>% filter(season=="2018" & Player_initial_last=="P. Reed")
pga_long<- melt(pat, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 P. Reed sg_putt strokes_gained 17 0.04 0.925
## 2 P. Reed sg_arg strokes_gained 17 0.441 0.611
## 3 P. Reed sg_app strokes_gained 17 -0.041 1.1
## 4 P. Reed sg_ott strokes_gained 17 0.073 0.534
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Patrick Reed 2018",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
sergio
sergio<- dat %>% filter(season=="2017" & Player_initial_last=="S. Garcia")
pga_long<- melt(sergio, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 S. Garcia sg_putt strokes_gained 9 -0.458 0.678
## 2 S. Garcia sg_arg strokes_gained 9 0.214 0.411
## 3 S. Garcia sg_app strokes_gained 9 0.104 0.701
## 4 S. Garcia sg_ott strokes_gained 9 0.9 0.362
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Sergio Garcia 2017",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
danny willett
danny<- dat %>% filter(season=="2016" & Player_initial_last=="D. Willett")
pga_long<- melt(danny, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 D. Willett sg_putt strokes_gained 3 -0.373 0.489
## 2 D. Willett sg_arg strokes_gained 3 -0.16 0.815
## 3 D. Willett sg_app strokes_gained 3 0.18 0.476
## 4 D. Willett sg_ott strokes_gained 3 0.33 0.121
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Danny Willett 2016",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
jordan
jordan<- dat %>% filter(season=="2015" & Player_initial_last=="J. Spieth")
pga_long<- melt(jordan, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_putt", "sg_arg", "sg_app", "sg_ott"), variable.name = "position")
(means<- pga_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd"))
## # A tibble: 4 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 J. Spieth sg_putt strokes_gained 16 0.399 1.01
## 2 J. Spieth sg_arg strokes_gained 16 0.278 0.68
## 3 J. Spieth sg_app strokes_gained 16 0.791 1.20
## 4 J. Spieth sg_ott strokes_gained 16 0.418 0.551
ggplot(means, aes(y = position, x=mean, fill=position))+
geom_bar(stat="identity", position = "dodge")+
labs(title="Average strokes gained at various positions for Jordan Spieth 2015",x="Player", y = "Average")+
theme_minimal()+
geom_vline(xintercept = 0, size=1, linetype="dashed") +
theme(legend.position = "bottom")
It’s interesting that no one seemed to be playing spectacularly well leading up to their Masters win, except Jordan and Scottie. That means I suppose it’s really anyone’s tournamnet to win, but perhaps hitting well off the tee and approach shots really help going into Augusta.
in 2022 who hit the best off the tee and approach?
top 30 players (who have played more than 5 tournaments!) with approach shots from last season + their off the tee strokes gained:
last_yr<- dat %>% filter(season=="2022")
last_yr_long<- melt(last_yr, id.vars = "Player_initial_last", value.name = "strokes_gained", measure.vars = c("sg_app", "sg_ott"), variable.name = "position")
(means<- last_yr_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd") %>%
filter(n>5))
## # A tibble: 444 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 A. Ancer sg_app strokes_gained 16 -0.192 0.992
## 2 A. Ancer sg_ott strokes_gained 16 0.507 0.616
## 3 A. Baddeley sg_app strokes_gained 10 0.053 1.08
## 4 A. Baddeley sg_ott strokes_gained 10 -1.03 1.13
## 5 A. Cook sg_app strokes_gained 20 -0.357 1.12
## 6 A. Cook sg_ott strokes_gained 20 -0.633 1.28
## 7 A. Hadwin sg_app strokes_gained 23 0.202 0.884
## 8 A. Hadwin sg_ott strokes_gained 23 -0.073 0.608
## 9 A. Lahiri sg_app strokes_gained 22 -0.407 1.36
## 10 A. Lahiri sg_ott strokes_gained 22 0.396 0.429
## # … with 434 more rows
means_app<- last_yr_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd") %>%
filter(n>5) %>%
filter(position=="sg_app") %>%
arrange(-mean) %>%
slice(1:30)
app<- means %>% filter(Player_initial_last %in% means_app$Player_initial_last)
ggplot(app, aes(x = reorder(Player_initial_last, -mean), y=mean, fill=position))+
geom_bar(stat="identity", position="dodge")+
labs(title="Players with most strokes gained on approach [and off the tee] 2022",x="Player", y = "Average")+
theme_minimal()+
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
top 30 players off the tee from last season + their approach strokes gained:
(means_ott<- last_yr_long %>%
group_by(Player_initial_last, position) %>%
get_summary_stats(strokes_gained, type = "mean_sd") %>%
filter(n>5) %>%
filter(position=="sg_ott") %>%
arrange(-mean) %>%
slice(1:30))
## # A tibble: 30 × 6
## Player_initial_last position variable n mean sd
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 J. Rahm sg_ott strokes_gained 17 1.05 0.649
## 2 C. Young sg_ott strokes_gained 23 0.76 0.762
## 3 S. Im sg_ott strokes_gained 22 0.717 0.491
## 4 C. Champ sg_ott strokes_gained 19 0.707 0.774
## 5 M. Fitzpatrick sg_ott strokes_gained 16 0.688 0.477
## 6 L. List sg_ott strokes_gained 24 0.682 0.581
## 7 C. Conners sg_ott strokes_gained 23 0.673 0.348
## 8 K. Mitchell sg_ott strokes_gained 23 0.65 0.555
## 9 W. Zalatoris sg_ott strokes_gained 19 0.633 0.486
## 10 B. Steele sg_ott strokes_gained 20 0.627 0.579
## # … with 20 more rows
ott<- means %>% filter(Player_initial_last %in% means_ott$Player_initial_last)
ott<-ott[ order(-ott$mean), ]
ggplot(ott, aes(x = reorder(Player_initial_last, -mean), y=mean, fill=position))+
geom_bar(stat="identity", position="dodge")+
labs(title="Players with most strokes gained off the tee [and on approach] 2022",x="Player", y = "Average")+
theme_minimal()+
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
so ideally we would pick someone (from this season) who is hitting well off the tee and on approach shots… but going off of only 2022 data…. Will, Xander, Scottie, Tony or Colin are in the top 30 on approach and were hitting off the tee well too….
Corey Conners (just won week prior to Augusta), Matt Fitzpatrick, Rahm, Scottie, Tony, Viktor and Willy Z are in the top 30 off the tee and also hit approach shots well
finally, we can do a prediction model to see who is likely to finish where based strictly on last year’s master’s finish
set.seed(2)
#sample <- sample(c(TRUE, FALSE), nrow(masters), replace=TRUE, prob=c(0.7,0.3))
#train <- masters[sample, ] ### select 70% of data for training
#test <- masters[!sample, ] ### select 30% of data for testing
train<- dat %>% filter(finish_cat=="not cut" & season=="2022") %>%
filter(`tournament name`=="U.S. Open" |
`tournament name`=="The Open" |
`tournament name`=="PGA Championship") %>%
select(colnames(masters))
train$finish_cat<- NULL
train<- na.omit(train)
preProcess <- c("center","scale")
trControl <- trainControl(method = "repeatedcv",number = 10,repeats = 10)
model <- train(finish2~sg_putt+sg_arg+sg_app+sg_ott, data=train, preProcess = preProcess, trControl=trControl) ### run the model
test<- masters
test$finish_cat<- NULL
test<- na.omit(test)
test$finish_pred <- predict(model, test) ### predict on testing set
plot<- test %>% arrange(finish_pred)
ggplot(plot, aes(x = reorder(Player_initial_last, finish_pred), y=finish_pred))+
geom_bar(stat="identity", fill="darkgreen")+
labs(title="Predicted finish for 2022 Masters (overlayed in white)",x="Player", y = "predicted finish")+
theme_minimal()+
geom_text(
aes(label = round(finish_pred,0)),
colour = "white", size = 2,
vjust = 1.5, position = position_dodge(.9)) +
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
The prediction plot predicts scottie to win (which technically this is based on last year’s data so it is predicting last year’s masters?). confusing since we dont have the ideal amount of data. the training data set was the majors from last year, and the test was last year’s masters. so essentially, the model correctly predicted scottie to win.
other people who should’ve done well last year are Collin Morikawa, Cam Smith, Corey Conners and Will Zalatoris. Pretty accurate for last year! I wish we could have done this with this year’s data but it is what is is!