setwd("/Users/michaelostrower/Desktop/DS4P_2025")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(tidyr)
library(ggplot2)
outcomes <- read.csv("NHL Playoff Outcomes - Outcomes.csv")
games <- read.csv("NHL Game Logs - Outcome.csv")
### Creating win % tables for all games and final 20%
team_records <- games %>%
  filter(Rslt %in% c("W", "L")) %>%
  group_by(Season, Team) %>%
  summarise(
    wins   = sum(Rslt == "W"),
    losses = sum(Rslt == "L"),
    games  = wins + losses,
    win_pct = wins / games,
    .groups = "drop"
  )

### Final 20%

team_records <- games %>%
  filter(Rslt %in% c("W", "L")) %>% 
  group_by(Season, Team) %>%
  mutate(
    gtm_cutoff = quantile(Gtm, 0.80, na.rm = TRUE)
  ) %>%
  summarise(
    wins        = sum(Rslt == "W"),
    losses      = sum(Rslt == "L"),
    games       = wins + losses,
    win_pct     = wins / games,
    
    
    wins_20     = sum(Rslt == "W" & Gtm >= gtm_cutoff),
    losses_20   = sum(Rslt == "L" & Gtm >= gtm_cutoff),
    games_20    = wins_20 + losses_20,
    win_pct_20  = if_else(games_20 > 0, wins_20 / games_20, NA_real_),
    .groups = "drop"
  )
### Creating playoff tables
playoffs_yes <- outcomes %>%
  mutate(Year = as.integer(Year)) %>%
  distinct(Year, Team) %>%
  filter(Team != "League Average") %>%
  filter(Team != "Team") %>%
  mutate(made_playoffs = "Yes")
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Year = as.integer(Year)`.
## Caused by warning:
## ! NAs introduced by coercion
### figure out why this code chunk has NAs introduced by coercion

playoffs <- team_records %>%
  rename(Year = Season) %>%
  filter(Team != "League Average") %>%
  filter(Team != "Team") %>%
  left_join(playoffs_yes, by = c("Year", "Team")) %>%
  mutate(
    made_playoffs = if_else(is.na(made_playoffs), "No", made_playoffs)
  ) %>%
  filter(!is.na(win_pct)) %>%
  select(Year, Team, win_pct, made_playoffs)
### Combining all the data into one table called
library(stringr)
outcomes_rank <- outcomes %>%
  mutate(
    Year = str_extract(Year, "\\d{4}"),
    Year = as.integer(Year)
  ) %>%
  filter(!is.na(Year),
         Team != "League Average",
         Team != "Team") %>%
  select(Year, Team, Rk)

team_records_clean <- team_records %>%
  rename(Year = Season) %>%
  filter(Team != "League Average",
         Team != "Team")

analysis <- playoffs_yes %>%
  left_join(team_records_clean, by = c("Year", "Team")) %>% 
  left_join(outcomes_rank, by = c("Year", "Team")) %>%
  select(Year, Team, win_pct, win_pct_20, Rk, made_playoffs)
analysis_clean <- analysis %>%
  mutate(
    Rk = as.numeric(Rk)
  ) %>%
  filter(
    !is.na(Rk),
    !is.na(win_pct),
    !is.na(win_pct_20)
  )

cor_win <- cor(analysis_clean$win_pct,    analysis_clean$Rk, use = "complete.obs")
cor_win_20   <- cor(analysis_clean$win_pct_20, analysis_clean$Rk, use = "complete.obs")
model_win    <- lm(Rk ~ win_pct,    data = analysis_clean)
model_win_20 <- lm(Rk ~ win_pct_20, data = analysis_clean)

summary(model_win)$adj.r.squared
## [1] 0.07322491
summary(model_win_20)$adj.r.squared
## [1] 0.007208992
analysis_long <- analysis_clean %>%
  pivot_longer(
    cols = c(win_pct, win_pct_20),
    names_to = "metric",
    values_to = "value"
  )

ggplot(analysis_long, aes(x = value, y = Rk)) +
  geom_point(aes(color = Rk == 1), alpha = 0.6) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
  geom_smooth(method = "lm", se = FALSE, color = "purple") +
  facet_wrap(~ metric, scales = "free_x") +
  labs(
    title = "Relationship between winning % metrics and playoff ranking",
    x = "Winning percentage",
    y = "Playoff rank (lower is better)",
    color = "Rk = 1?"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

data.frame(
  metric = c("win_pct", "win_pct_20"),
  correlation = c(cor_win, cor_win_20),
  adj_r2 = c(summary(model_win)$adj.r.squared,
             summary(model_win_20)$adj.r.squared)
)
##       metric correlation      adj_r2
## 1    win_pct  -0.2775026 0.073224914
## 2 win_pct_20  -0.1061188 0.007208992
### Same as above but year 2020 is excluded since there was a five+ month break between the end of the seasona and the playoffs

analysis_clean_no2020 <- analysis %>%
  mutate(
    Rk = as.numeric(Rk)
  ) %>%
  filter(
    Year != 2020,
    !is.na(Rk),
    !is.na(win_pct),
    !is.na(win_pct_20)
  )

cor_win_no2020    <- cor(analysis_clean_no2020$win_pct,    analysis_clean_no2020$Rk, use = "complete.obs")
cor_win_20_no2020 <- cor(analysis_clean_no2020$win_pct_20, analysis_clean_no2020$Rk, use = "complete.obs")

model_win_no2020    <- lm(Rk ~ win_pct,    data = analysis_clean_no2020)
model_win_20_no2020 <- lm(Rk ~ win_pct_20, data = analysis_clean_no2020)

summary(model_win_no2020)$adj.r.squared
## [1] 0.05782789
summary(model_win_20_no2020)$adj.r.squared
## [1] 0.003109032
analysis_long_no2020 <- analysis_clean_no2020 %>%
  pivot_longer(
    cols = c(win_pct, win_pct_20),
    names_to = "metric",
    values_to = "value"
  )

ggplot(analysis_long_no2020, aes(x = value, y = Rk)) +
  geom_point(aes(color = Rk == 1), alpha = 0.6) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ metric, scales = "free_x") +
  labs(
    title = "Relationship between winning % metrics and playoff ranking (2020 Removed)",
    x = "Winning percentage",
    y = "Playoff rank (lower is better)",
    color = "Rk = 1?"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'