Packages and Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)
library(dplyr)
library(ggplot2)
library(tibble)

golf_data <- read_csv("Final Project/ASA All PGA Raw Data - Tourn Level.csv")
## Rows: 36864 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): Player_initial_last, player, tournament name, course, Finish
## dbl  (28): tournament id, player id, hole_par, strokes, hole_DKP, hole_FDP, ...
## lgl   (3): Unnamed: 2, Unnamed: 3, Unnamed: 4
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Question 1: How have the top players’ performances changed over time from 2015 to 2022?

Setup

performance_by_year <- golf_data %>%
  group_by(player, season) %>%
  summarise(avg_sg_total = mean(sg_total, na.rm = TRUE),
            total_events = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'player'. You can override using the
## `.groups` argument.
top_players1 <- performance_by_year %>%
  group_by(player) %>%
  summarise(overall_avg_sg = mean(avg_sg_total, na.rm = TRUE)) %>%
  arrange(desc(overall_avg_sg)) %>%
  slice_head(n = 5) %>%
  pull(player)

top_perf <- performance_by_year %>%
  filter(player %in% top_players1)

Visualization

top_perf %>%
  ggplot(aes(x = season, y = avg_sg_total, color = player)) + 
  geom_line(size = 1.2) + 
  geom_point(size = 2) + 
  facet_wrap(~ player, ncol = 3) +
  labs(title = "Top Players' Average Strokes Gained Over Time",
       x = "Year",
       y = "Avg Strokes Gained (Total)",
       color = "Player") + 
  theme_minimal() +
  theme(strip.text = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 16, face = "bold"),
        axis.text = element_text(size = 8))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Question 2: Which players excel in specific skill areas (driving (OTT) vs. short game (ARG))?

Setup

skill_summary <- golf_data %>%
  group_by(player) %>%
  summarize(sg_drive = mean(sg_ott, na.rm = TRUE), 
            sg_short_game = mean(sg_arg, na.rm = TRUE),
            sg_total = mean(sg_total, na.rm = TRUE),
            events_played = n()
  ) %>%
  filter(events_played >= 15)

top_players2 <- skill_summary %>%
  arrange(desc(sg_total)) %>%
  slice(1:10) %>%
  select(player, sg_drive, sg_short_game) %>%
  pivot_longer(cols = c(sg_drive, sg_short_game), names_to = "Skill", values_to = "SG")

Visualization

top_players2 %>%
  ggplot(aes(x = reorder(player, -SG), y = SG, fill = Skill)) + 
  geom_col(position = "dodge") + 
  labs(
    title = "Top 10 Players: Driving vs. Short Game",
    x = "Player", 
    y = "Avg Strokes Gained",
    fill = "Skill Area"
  ) + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Question 3: How do player performances vary by tournament category?

Setup

colnames(golf_data) <- gsub(" ", "_", colnames(golf_data))

performance_by_category <- golf_data %>%
  mutate(
    tournament_type = case_when(
      str_detect(tournament_name, "Masters") ~ "Major",
      str_detect(tournament_name, "U.S. Open") ~ "Major",
      str_detect(tournament_name, "Open Championship") ~ "Major",
      str_detect(tournament_name, "PGA Championship") ~ "Major",
      str_detect(tournament_name, "WGC|World Golf Championshop") ~ "WGC",
      str_detect(tournament_name, "Tour Championship|The Northern Trust|Dell Technologies|Deutsche Bank|The Barclays|BMW Championship") ~ "FedEx Cup",
      str_detect(tournament_name, "Arnold Palmer|Genesis Invitational|The Memorial") ~ "Invitational",
      str_detect(tournament_name, "Hero World Challenge|Sentry Tournament of Champions|Tournament of Champions|Workday Charity Open") ~ "Other",
      TRUE ~ "Regular"
  )) %>%
  group_by(tournament_type, player) %>%
  summarize(
    avg_sg_total = mean(sg_total, na.rm = TRUE),
    events_played = n()
  ) %>%
  filter(events_played >= 1)
## `summarise()` has grouped output by 'tournament_type'. You can override using
## the `.groups` argument.

Visualization

performance_by_category %>%
  ggplot(aes(x = reorder(tournament_type, avg_sg_total), y = avg_sg_total, fill = tournament_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(
    title = "Average Strokes Gained by Tournament Category",
    x = "Tournament Category",
    y = "Average Strokes Gained",
    fill = "Tournament Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12))
## Warning: Removed 141 rows containing missing values or values outside the scale range
## (`geom_bar()`).