setwd("/Users/michaelostrower/Desktop/DS4P_2025")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)
library(tidyr)
library(ggplot2)
outcomes <- read.csv("NHL Playoff Outcomes - Outcomes.csv")
games <- read.csv("NHL Game Logs - Outcome.csv")
### Creating win % tables for all games and final 20%
team_records <- games %>%
filter(Rslt %in% c("W", "L")) %>%
group_by(Season, Team) %>%
summarise(
wins = sum(Rslt == "W"),
losses = sum(Rslt == "L"),
games = wins + losses,
win_pct = wins / games,
.groups = "drop"
)
### Final 20%
team_records <- games %>%
filter(Rslt %in% c("W", "L")) %>%
group_by(Season, Team) %>%
mutate(
gtm_cutoff = quantile(Gtm, 0.80, na.rm = TRUE)
) %>%
summarise(
wins = sum(Rslt == "W"),
losses = sum(Rslt == "L"),
games = wins + losses,
win_pct = wins / games,
wins_20 = sum(Rslt == "W" & Gtm >= gtm_cutoff),
losses_20 = sum(Rslt == "L" & Gtm >= gtm_cutoff),
games_20 = wins_20 + losses_20,
win_pct_20 = if_else(games_20 > 0, wins_20 / games_20, NA_real_),
.groups = "drop"
)
### Creating playoff tables
playoffs_yes <- outcomes %>%
mutate(Year = as.integer(Year)) %>%
distinct(Year, Team) %>%
filter(Team != "League Average") %>%
filter(Team != "Team") %>%
mutate(made_playoffs = "Yes")
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Year = as.integer(Year)`.
## Caused by warning:
## ! NAs introduced by coercion
### figure out why this code chunk has NAs introduced by coercion
playoffs <- team_records %>%
rename(Year = Season) %>%
filter(Team != "League Average") %>%
filter(Team != "Team") %>%
left_join(playoffs_yes, by = c("Year", "Team")) %>%
mutate(
made_playoffs = if_else(is.na(made_playoffs), "No", made_playoffs)
) %>%
filter(!is.na(win_pct)) %>%
select(Year, Team, win_pct, made_playoffs)
### Combining all the data into one table called
library(stringr)
outcomes_rank <- outcomes %>%
mutate(
Year = str_extract(Year, "\\d{4}"),
Year = as.integer(Year)
) %>%
filter(!is.na(Year),
Team != "League Average",
Team != "Team") %>%
select(Year, Team, Rk)
team_records_clean <- team_records %>%
rename(Year = Season) %>%
filter(Team != "League Average",
Team != "Team")
analysis <- playoffs_yes %>%
left_join(team_records_clean, by = c("Year", "Team")) %>%
left_join(outcomes_rank, by = c("Year", "Team")) %>%
select(Year, Team, win_pct, win_pct_20, Rk, made_playoffs)
analysis_clean <- analysis %>%
mutate(
Rk = as.numeric(Rk)
) %>%
filter(
!is.na(Rk),
!is.na(win_pct),
!is.na(win_pct_20)
)
cor_win <- cor(analysis_clean$win_pct, analysis_clean$Rk, use = "complete.obs")
cor_win_20 <- cor(analysis_clean$win_pct_20, analysis_clean$Rk, use = "complete.obs")
model_win <- lm(Rk ~ win_pct, data = analysis_clean)
model_win_20 <- lm(Rk ~ win_pct_20, data = analysis_clean)
summary(model_win)$adj.r.squared
## [1] 0.07322491
summary(model_win_20)$adj.r.squared
## [1] 0.007208992
analysis_long <- analysis_clean %>%
pivot_longer(
cols = c(win_pct, win_pct_20),
names_to = "metric",
values_to = "value"
)
ggplot(analysis_long, aes(x = value, y = Rk)) +
geom_point(aes(color = Rk == 1), alpha = 0.6) +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
geom_smooth(method = "lm", se = FALSE, color = "purple") +
facet_wrap(~ metric, scales = "free_x") +
labs(
title = "Relationship between winning % metrics and playoff ranking",
x = "Winning percentage",
y = "Playoff rank (lower is better)",
color = "Rk = 1?"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

data.frame(
metric = c("win_pct", "win_pct_20"),
correlation = c(cor_win, cor_win_20),
adj_r2 = c(summary(model_win)$adj.r.squared,
summary(model_win_20)$adj.r.squared)
)
## metric correlation adj_r2
## 1 win_pct -0.2775026 0.073224914
## 2 win_pct_20 -0.1061188 0.007208992
### Same as above but year 2020 is excluded since there was a five+ month break between the end of the seasona and the playoffs
analysis_clean_no2020 <- analysis %>%
mutate(
Rk = as.numeric(Rk)
) %>%
filter(
Year != 2020,
!is.na(Rk),
!is.na(win_pct),
!is.na(win_pct_20)
)
cor_win_no2020 <- cor(analysis_clean_no2020$win_pct, analysis_clean_no2020$Rk, use = "complete.obs")
cor_win_20_no2020 <- cor(analysis_clean_no2020$win_pct_20, analysis_clean_no2020$Rk, use = "complete.obs")
model_win_no2020 <- lm(Rk ~ win_pct, data = analysis_clean_no2020)
model_win_20_no2020 <- lm(Rk ~ win_pct_20, data = analysis_clean_no2020)
summary(model_win_no2020)$adj.r.squared
## [1] 0.05782789
summary(model_win_20_no2020)$adj.r.squared
## [1] 0.003109032
analysis_long_no2020 <- analysis_clean_no2020 %>%
pivot_longer(
cols = c(win_pct, win_pct_20),
names_to = "metric",
values_to = "value"
)
ggplot(analysis_long_no2020, aes(x = value, y = Rk)) +
geom_point(aes(color = Rk == 1), alpha = 0.6) +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
facet_wrap(~ metric, scales = "free_x") +
labs(
title = "Relationship between winning % metrics and playoff ranking (2020 Removed)",
x = "Winning percentage",
y = "Playoff rank (lower is better)",
color = "Rk = 1?"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
