library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)
library(dplyr)
library(ggplot2)
library(tibble)
golf_data <- read_csv("Final Project/ASA All PGA Raw Data - Tourn Level.csv")
## Rows: 36864 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Player_initial_last, player, tournament name, course, Finish
## dbl (28): tournament id, player id, hole_par, strokes, hole_DKP, hole_FDP, ...
## lgl (3): Unnamed: 2, Unnamed: 3, Unnamed: 4
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
performance_by_year <- golf_data %>%
group_by(player, season) %>%
summarise(avg_sg_total = mean(sg_total, na.rm = TRUE),
total_events = n()) %>%
ungroup()
## `summarise()` has grouped output by 'player'. You can override using the
## `.groups` argument.
top_players1 <- performance_by_year %>%
group_by(player) %>%
summarise(overall_avg_sg = mean(avg_sg_total, na.rm = TRUE)) %>%
arrange(desc(overall_avg_sg)) %>%
slice_head(n = 5) %>%
pull(player)
top_perf <- performance_by_year %>%
filter(player %in% top_players1)
top_perf %>%
ggplot(aes(x = season, y = avg_sg_total, color = player)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
facet_wrap(~ player, ncol = 3) +
labs(title = "Top Players' Average Strokes Gained Over Time",
x = "Year",
y = "Avg Strokes Gained (Total)",
color = "Player") +
theme_minimal() +
theme(strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 8))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
skill_summary <- golf_data %>%
group_by(player) %>%
summarize(sg_drive = mean(sg_ott, na.rm = TRUE),
sg_short_game = mean(sg_arg, na.rm = TRUE),
sg_total = mean(sg_total, na.rm = TRUE),
events_played = n()
) %>%
filter(events_played >= 15)
top_players2 <- skill_summary %>%
arrange(desc(sg_total)) %>%
slice(1:10) %>%
select(player, sg_drive, sg_short_game) %>%
pivot_longer(cols = c(sg_drive, sg_short_game), names_to = "Skill", values_to = "SG")
top_players2 %>%
ggplot(aes(x = reorder(player, -SG), y = SG, fill = Skill)) +
geom_col(position = "dodge") +
labs(
title = "Top 10 Players: Driving vs. Short Game",
x = "Player",
y = "Avg Strokes Gained",
fill = "Skill Area"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
colnames(golf_data) <- gsub(" ", "_", colnames(golf_data))
performance_by_category <- golf_data %>%
mutate(
tournament_type = case_when(
str_detect(tournament_name, "Masters") ~ "Major",
str_detect(tournament_name, "U.S. Open") ~ "Major",
str_detect(tournament_name, "Open Championship") ~ "Major",
str_detect(tournament_name, "PGA Championship") ~ "Major",
str_detect(tournament_name, "WGC|World Golf Championshop") ~ "WGC",
str_detect(tournament_name, "Tour Championship|The Northern Trust|Dell Technologies|Deutsche Bank|The Barclays|BMW Championship") ~ "FedEx Cup",
str_detect(tournament_name, "Arnold Palmer|Genesis Invitational|The Memorial") ~ "Invitational",
str_detect(tournament_name, "Hero World Challenge|Sentry Tournament of Champions|Tournament of Champions|Workday Charity Open") ~ "Other",
TRUE ~ "Regular"
)) %>%
group_by(tournament_type, player) %>%
summarize(
avg_sg_total = mean(sg_total, na.rm = TRUE),
events_played = n()
) %>%
filter(events_played >= 1)
## `summarise()` has grouped output by 'tournament_type'. You can override using
## the `.groups` argument.
performance_by_category %>%
ggplot(aes(x = reorder(tournament_type, avg_sg_total), y = avg_sg_total, fill = tournament_type)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
labs(
title = "Average Strokes Gained by Tournament Category",
x = "Tournament Category",
y = "Average Strokes Gained",
fill = "Tournament Category"
) +
theme_minimal() +
theme(axis.text.x = element_text(size = 12))
## Warning: Removed 141 rows containing missing values or values outside the scale range
## (`geom_bar()`).