Vancouver Canucks Project

This project looks at the trends and successes of the Vancouver Canucks National Hockey League (NHL) team through time.

This report includes three parts:

Statistics through time at the team-level
Modeling Vancouver Canucks power play success through time
Random Forest analysis to identify most influential metrics in determining whether a goal is scored in a given scenario, and determine the best shooter on the Canucks

Data summary and citation:

“NHL Team Statistics.” NHL.com. Nation Hockey League, n.d. Web. July 2013. http://www.nhl.com/.

“NHL Team Stats.” Hockey Stats. Hockeyanalysis.com, 28 Apr. 2013. Web. July 2013. <http://stats.hockeyanalysis.com/teamstats.php?disp=1&db=201213&sit=5v5close&sort= PDO&sortdir=DESC>.

MoneyPuck. https://moneypuck.com/data.htm

Canucks Team Statistics

This report will be using the following metrics to measure team success through time 2008-2023): - Time on Ice (TOI) - Games Played (GP) - Corsi For = Shot attempts for at even strength: Shots + Blocks + Misses (CF) - Corsi Against = Corsi score of the opposing team (CA) - Goals For (GF) - Penalty minutes served (PENT) - Penalty minutes drawn (PEND) - Saves = blocked shot attempts for (SAVES) - Rebounds (REB) - Shots (SHOTS)

Data Wrangling

knitr::opts_chunk$set(echo = TRUE, warning= FALSE, message= FALSE)


# Packages ----
library(tidyverse)    # For data manipulation, plotting, etc. 
library(here)    # For reproducible data importation 
library(janitor)      # For text data manipulation
library(httr)         # To import data on github
library(TExPosition)  # PCA tools
library(ggrepel)      # Plotting tool for ggplot2
library(kableExtra)   # HTML table tools
library(RColorBrewer) # plotting colors
library(gridExtra)    # Plotting tools
library(dplyr)
library(magrittr)
library(corrplot)
library(tidyr)
library(ranger)
library(vip)
library(caret)
library(broom)


# Custom functions ----
# nice_table() simplifies the printing of HTML tables using kable
nice_table <- function(x){
  
  kable(x) %>%
    kable_styling(bootstrap_options = c('striped', 'hover', 'responsive', 'condensed'))
  
}

# Color palettes ----
rdgy <- brewer.pal(n = 11, name = "RdGy") # display.brewer.pal(11, "RdGy")

# ggplot2 finishings
pca_furnish <- theme_classic() +
  theme(axis.title = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank()
        )

## Import Data

canucks_2009<- read_csv(here("MP_data", "skaters_2008_2009.csv")) %>% 
  mutate(season= "2008-2009")
canucks_2010<- read_csv(here("MP_data", "skaters_2009_2010.csv")) %>% 
  mutate(season= "2009-2010")
canucks_2011<- read_csv(here("MP_data", "skaters_2010_2011.csv")) %>% 
  mutate(season= "2010-2011")
canucks_2012<- read_csv(here("MP_data", "skaters_2011_2012.csv")) %>% 
  mutate(season= "2011-2012")
canucks_2013<- read_csv(here("MP_data", "skaters_2012_2013.csv")) %>% 
  mutate(season= "2012-2013")
canucks_2014<- read_csv(here("MP_data", "skaters_2013_2014.csv")) %>% 
  mutate(season= "2013-2014")
canucks_2015<- read_csv(here("MP_data", "skaters_2014_2015.csv")) %>% 
  mutate(season= "2014-2015")
canucks_2016<- read_csv(here("MP_data", "skaters_2015_2016.csv")) %>% 
  mutate(season= "2015-2016")
canucks_2017<- read_csv(here("MP_data", "skaters_2016_2017.csv")) %>% 
  mutate(season= "2016-2017")
canucks_2018<- read_csv(here("MP_data", "skaters_2017_2018.csv")) %>% 
  mutate(season= "2017-2018")
canucks_2019<- read_csv(here("MP_data", "skaters_2018_2019.csv")) %>% 
  mutate(season= "2018-2019")
canucks_2020<- read_csv(here("MP_data", "skaters_2019_2020.csv")) %>% 
  mutate(season= "2019-2020")
canucks_2021<- read_csv(here("MP_data", "skaters_2020_2021.csv")) %>% 
  mutate(season= "2020-2021")
canucks_2022<- read_csv(here("MP_data", "skaters_2021_2022.csv")) %>% 
  mutate(season= "2021-2022")
canucks_2023<- read_csv(here("MP_data", "skaters_2022_2023.csv")) %>% 
  mutate(season= "2022-2023")



canucks_skater_data<- rbind(canucks_2009, canucks_2010, canucks_2011, canucks_2012, canucks_2013, canucks_2014, canucks_2015, canucks_2016, canucks_2017, canucks_2018, canucks_2019, canucks_2020, canucks_2021, canucks_2022, canucks_2023) %>% 
  filter(team== "VAN")
  



shots_2007<- read_csv(here("MP_data", "CAN_shots_2007.csv"))
shots_2008<- read_csv(here("MP_data", "CAN_shots_2008.csv"))
shots_2009<- read_csv(here("MP_data", "CAN_shots_2009.csv"))
shots_2010<- read_csv(here("MP_data", "CAN_shots_2010.csv"))
shots_2011<- read_csv(here("MP_data", "CAN_shots_2011.csv"))
shots_2012<- read_csv(here("MP_data", "CAN_shots_2012.csv"))
shots_2013<- read_csv(here("MP_data", "CAN_shots_2013.csv"))
shots_2014<- read_csv(here("MP_data", "CAN_shots_2014.csv"))
shots_2015<- read_csv(here("MP_data", "CAN_shots_2015.csv"))
shots_2016<- read_csv(here("MP_data", "CAN_shots_2016.csv"))
shots_2017<- read_csv(here("MP_data", "CAN_shots_2017.csv"))
shots_2018<- read_csv(here("MP_data", "CAN_shots_2018.csv"))
shots_2019<- read_csv(here("MP_data", "CAN_shots_2019.csv"))
shots_2020<- read_csv(here("MP_data", "CAN_shots_2020.csv"))[, colnames(shots_2019)]
shots_2021<- read_csv(here("MP_data", "CAN_shots_2021.csv"))
shots_2022<- read_csv(here("MP_data", "CAN_shots_2022.csv"))

shot_data<- rbind(shots_2009, shots_2010, shots_2011, shots_2012, shots_2013, shots_2014, shots_2015, shots_2016, shots_2017, shots_2018, shots_2019, shots_2020, shots_2021, shots_2022)

canucks_shot_data<- shot_data %>% 
  filter(homeTeamCode== "VAN" | awayTeamCode == "VAN")


file_path <- "/Users/maerennick/GitHub/NHL project/Vancouver_Canucks_Project/MP_data/teams_2007_2016.csv"

teams_2007_2016<- read.csv(file_path)

file_path_2 <- "/Users/maerennick/GitHub/NHL project/Vancouver_Canucks_Project/MP_data/teams_2017.csv"

teams_2017_2023<- read.csv(file_path_2)

nhl_team_data <- rbind(teams_2007_2016, teams_2017_2023)

## Data Preparation

canucks_data <-  nhl_team_data %>%
  clean_names() %>% 
  dplyr::filter(player_team == "VAN") %>% ## include only Vancouver (for and against)
  select(team:situation, corsi_percentage, ice_time, shots_on_goal_for:rebound_goals_for, penalties_for, penality_minutes_for, x_on_goal_for) %>% 
  rename(penalties_served= penalties_for) %>%
  rename(penality_minutes_served= penality_minutes_for) %>% 
  select(-opposing_team)


against_canucks_data<- nhl_team_data %>%
  clean_names() %>% 
  dplyr::filter(opposing_team == "VAN") %>% 
    select(team, game_id, situation, corsi_percentage, ice_time, shots_on_goal_for:rebound_goals_for, penalties_for, penality_minutes_for, x_on_goal_for) %>% 
  rename(opposing_team= team) %>% 
  rename(opposing_corsi_percentage= corsi_percentage) %>% 
  rename(opposing_ice_time= ice_time) %>% 
  rename(opposing_shots_on_goal= shots_on_goal_for) %>%
  rename(opposing_missed_shots= missed_shots_for) %>%
  rename(opposing_blocked_shot_attempts= blocked_shot_attempts_for) %>%
  rename(opposing_shot_attempts= shot_attempts_for) %>%
  rename(opposing_goals= goals_for) %>%
  rename(opposing_rebounds= rebounds_for) %>%
  rename(opposing_rebound_goals= rebound_goals_for) %>%
  rename(penalties_drawn= penalties_for) %>%
  rename(penality_minutes_drawn= penality_minutes_for) %>% 
  rename(opposing_x_on_goal= x_on_goal_for)
  

canucks_team_data<- canucks_data %>% 
  right_join(against_canucks_data, by=c("game_id","situation")) %>% 
  select(-opposing_shot_attempts) %>% 
  select(-opposing_rebounds) %>%
  select(-opposing_x_on_goal) %>%
  select(-opposing_blocked_shot_attempts) %>%
  select(-opposing_missed_shots) %>% 
  filter(situation== "5on5") %>% 
  group_by(season) %>% 
  mutate(GP= n_distinct(game_id)) %>% # games played
  rename(TOI= ice_time) %>% 
  rename(CF= corsi_percentage) %>% 
  rename(CA= opposing_corsi_percentage) %>% 
  rename(GF=goals_for) %>% 
  rename(GA=opposing_goals) %>% 
  rename(PENT= penality_minutes_served) %>% 
  rename(PEND= penality_minutes_drawn) %>% 
  rename(SHOTS=shots_on_goal_for) %>% 
  rename(SAVES= blocked_shot_attempts_for) %>% 
  rename(REB= rebounds_for)


canucks_team_data_avg<- canucks_team_data %>% 
  group_by(season) %>% 
  summarise(across(where(is.numeric), mean, na.rm = TRUE))

# Print data 
#kable(canucks_team_data) %>%
    #kable_styling(bootstrap_options = c('striped', 'hover', 'responsive', 'condensed'))

Calculate Z-Scores

Adjust all scores by the number of games played due to a shorted 2012-2013 and 2020-2021 seasons
Compute z-scores of all measures to facilitate comparisons

canucks_team_data$GP <- as.numeric(canucks_team_data$GP)
canucks_team_data$season <- as.numeric(canucks_team_data$season)

# Pre-processes, adjusts, and z-scores canucks data ----
canucks_data_long <- canucks_team_data_avg %>% 
  dplyr::select( -game_date, -game_id, -opposing_ice_time, -missed_shots_for, -opposing_shots_on_goal,-rebound_goals_for, -missed_shots_for, -x_on_goal_for, -penalties_served, -penalties_drawn, -opposing_rebound_goals, -shot_attempts_for) %>% 
  gather(Meas, Val, -season, -GP) %>%
  group_by(Meas) %>%
  mutate(Val_Adj = Val/GP,               # adjusts based on games played
         Val_Zscore = scale(Val_Adj)     # computes z-scores
         ) %>%
  ungroup() %>%
  mutate(sig = factor(ifelse(abs(Val_Zscore) > 1.96, "p < .05", "p > .05"))) # z score > 1.96

Performance across seasons

ggplot(canucks_data_long, aes(factor(season), Val_Zscore)) +
  geom_path(aes(group = 1), color = rdgy[8]) +
  geom_point(aes(color = sig), size = 1.75) +
  scale_color_manual(values = c(rdgy[3], rdgy[10]), name = "Z-Score") +
  scale_y_continuous(breaks = c(-2, 0, 2), minor_breaks = NULL) +
  coord_cartesian(ylim = c(-3,3)) +
  theme_minimal() + 
  labs(x = "\n Season", 
       y = "\n Measurement (Z-Score)",
       title = "Canucks Performance 2007-2023"
       ) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom"
        )+
  facet_wrap(~Meas, nrow = 5)

For the Vancouver Canucks, several performance metrics peaked in 2012 season. The 2012–13 Vancouver Canucks season was the team’s 43rd season in the National Hockey League and the Canucks won their fifth-straight Northwest Division title but finished third in the Western Conference. Corsi scores against and the amount of penalties served additionally peaked that year suggesting that they were facing strong competition. The 2020 season was additionally significant. This season was unconventional due to the COVID-19 shutdowns, which is likely why there is no peak in time on ice and could result in false peask due to the few numbers of games that were played.

Correlations

# Converts back to wide format

canucks_data_wide <- canucks_team_data %>% 
  dplyr::select(-team, -name, -player_team, -home_or_away, -game_date, -position, -situation, -opposing_team, -opposing_ice_time, -missed_shots_for, -opposing_shots_on_goal,-rebound_goals_for, -missed_shots_for, -x_on_goal_for, -penalties_served, -penalties_drawn, -opposing_rebound_goals, -shot_attempts_for, -game_id) %>% 
  group_by(season) %>% 
  mutate(CF= mean(CF)) %>% 
  mutate(TOI= mean(TOI)) %>% 
  mutate(SHOTS= mean(SHOTS)) %>% 
  mutate(SAVES= mean(SAVES)) %>% 
  mutate(GF= mean(GF)) %>% 
  mutate(REB= mean(REB)) %>% 
  mutate(PENT= mean(PENT)) %>% 
  mutate(CA= mean(CA)) %>% 
  mutate(GA= mean(GA)) %>% 
  mutate(PEND= mean(PEND)) %>% 
  mutate(GP= mean(GP)) %>% 
  gather(Meas, Val, -season, -GP) %>%
  group_by(Meas) %>%
  mutate(Val_Adj = Val/GP,               # adjusts based on games played
         Val_Zscore = scale(Val_Adj)     # computes z-scores
         ) %>%
  ungroup() %>%
  mutate(sig = factor(ifelse(abs(Val_Zscore) > 1.96, "p < .05", "p > .05"))) %>%  # z score > 1.96 
  select(season, Meas, Val_Zscore) %>%
  unique()%>% 
  spread(Meas, Val_Zscore)

# Computes correlations
canucks_cors <- cor(canucks_data_wide)

# Correlations to long format for plotting
canucks_cors_long <- canucks_cors %>%
  reshape2::melt() %>%
  arrange(Var1, Var2)

# Correlation heatmap
ggplot(canucks_cors_long, aes(x = Var1, y = Var2, fill = value)) +
  geom_raster() + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_blank()
        ) +
  scale_fill_distiller(palette = "GnBu", 
                       limits = c(-1, 1), 
                       name = "Correlation"
                       )

Season has a negative correlation with the number of penalties both drawn and received as well as corsi scores for. This suggests that through time, the Vancouver Canucks have steered away from play that would result in penalties. However, it also suggests that through time, the Canucks are taking less shots on goal and exhibiting less shot strength. There is a noticeable positive correlation between corsi scores and goals, which we would expect to see and time on ice with several of these metrics which we would also expect to see. Noticeably, there is no correlation between penalties taken or drawn and the amount of goals scored, which could explain why Vancouver may have strategically diverted from a rougher style of play.

Power Play Analysis

A “power play” is a sporting term used to describe a period of play where one team has a numerical advantage in players, usually due to a rule violation by the opposing team. In Hockey, they are considered to be a prime opportunity for scoring. Particularly for the Vancouver Canucks, successful power plays, resulting in a goal, is considered to be a main reason for the team’s success.

Read about: Five key Canucks are contributing to power play success

To take a deeper look into the success and performance of the Canucks team through time, we will analyze power plays and how goal scoring during power play opportunities have changed through time.

This is an extension of the work done by Ekarin Pongpipat & Matthew J. Kmiecik, 2018

First we will look across the entire NHL:

pp<- nhl_team_data %>% 
  filter(situation == "5on4")

# Initializing variables to store team names
pacific <- c('S.J', 'CGY', 'L.A', 'ANA', 'EDM', 'VAN', 'ARI')
central <- c('ATL.WPG', 'NSH', 'STL', 'DAL', 'COL', 'MIN', 'CHI')
metropolitan <- c('WSH', 'N.J', 'PHI', 'CBJ', 'PIT', 'NYR', 'NYI', 'CAR')
atlantic <- c('T.B', 'BOS', 'TOR', 'DET', 'MTL', 'FLA', 'OTT', 'BUF')

# Creating a column to identify each team's division
pp <- pp %>%
  mutate(
    division = case_when(
      pp$team %in% pacific ~ 'Pacific',
      pp$team %in% central ~ 'Central',
      pp$team %in% metropolitan ~ 'Metropolitan',
      pp$team %in% atlantic ~ 'Atlantic')
    )

divisions <- 
  data.frame(
    Pacific = c(pacific, ''),
    Central = c(central, ''),
    Metropolitan = metropolitan,
    Atlantic = atlantic
    )

NHL Divisions

# Prints table
kable(divisions) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'responsive'))

Pacific	Central	Metropolitan	Atlantic
S.J	ATL.WPG	WSH	T.B
CGY	NSH	N.J	BOS
L.A	STL	PHI	TOR
ANA	DAL	CBJ	DET
EDM	COL	PIT	MTL
VAN	MIN	NYR	FLA
ARI	CHI	NYI	OTT
		CAR	BUF

pp_season<- pp %>% 
  group_by(team, season) %>% 
  summarise(games_played= n_distinct(gameId), division, goalsFor= sum(goalsFor))

Average power play goals across divisions

ggplot(pp_season, aes(season, goalsFor/games_played, group = 1)) +
  geom_line(aes(group = team, color = division), alpha = 2/3) + 
  geom_line(stat = 'summary', fun.y = 'mean', size = .9, color = 'red') +
  scale_color_brewer(palette = 'Blues', direction = -1) +
  labs(x = 'Season (Start Year)',
       y = 'Season Power Play Goals (5v4)/ Games Played',
       caption = 'Each team is a separate line; red line is the average') +
  guides(color = guide_legend(title = 'Division')) +
  scale_x_continuous(limits = c(2008, 2022), breaks = c(2008:2022)) +
  theme_minimal()

There is a lot of variation across power play success through time across teams and across divisions. On average, success appears to be relatively consistent through time with the exception of the steep dip in the first few seasons. This could be due to improved team strategy, defense and/or goalie ability through time.

Model power play success through time

This model ignores the conference and division distinction. Each team will be modeled individually using a linear regression to predict power play goals per games played as a function of time (across 10 years). There are 30 teams in this dataset. Therefore, 30 distinct linear regressions will be performed; one for each team in the NHL. This allows us account for the variability of power plays within each team.

We’ll use these omnibus estimates to examine all regression models simultaneously via R2 estimates. These allow us to see how much variability in power play goals per games played was explained by time:

modTeam <-  
  pp_season %>%
  ungroup() %>% 
  nest_by(team) %>%
  mutate(level1 = list(lm((goalsFor/games_played) ~ season, data = data)))

level1Omni <- 
  modTeam %>% 
  summarise(broom::glance(level1)) %>% 
  ungroup() %>% 
  mutate(sig = p.value < .05) %>% 
  filter(team != "LAK", team != "NJD", team != "SEA", team != "SJS", team != "TBL") ## omitting teams that do not have enough data

Significance of Regressions

# Color palette
sigColorPal <- brewer.pal(11,'RdGy') # display.brewer.pal(11,'RdGy')

# R^2 Plot
ggplot(level1Omni, aes(r.squared, reorder(team, r.squared), color = sig)) +
  geom_point(size = 2) +
  scale_color_manual(values = sigColorPal[c(9,2)]) +
  labs(x = 'R-squared', y = 'Team') +
  guides(color = guide_legend(title = 'p < .05')) +
  theme_minimal()

## all but 2 are significantly different from 0

All but two of the regressions are significant, with Atlanta having the strongest relationship between power play success and time. While significant, the Vancouver cnaucks have a lower R^2 value suggesting there is a lot of variability in the data that cannot be explained or predicted by the regression.

Directionality of trends in power play success through time across teams

# Extracting level 1 coefficients
level1Coef <- 
  modTeam %>% 
  summarise(broom::tidy(level1)) %>% 
  ungroup() %>%
  filter(term == 'season') %>%   # Facilitates plotting
  mutate(sig = p.value < .05) %>% 
  filter(team != "LAK", team != "NJD", team != "SEA", team != "SJS", team != "TBL") ## omitting teams that do not have enough data


ggplot(level1Coef, aes(estimate, reorder(team, -1*estimate), color = sig)) +
  geom_point(size = 2) +
  geom_errorbarh(
    aes(xmin = estimate - std.error, xmax = estimate + std.error),
    alpha = 1/2
    ) +
  scale_color_manual(values = sigColorPal[c(9,2)]) +
  labs(
    x = 'Estimate (Yearly Change in Power Play Goals/Game)', 
    y = 'Team',
    caption = 'SEM error bars'
    ) +
  guides(color = guide_legend(title = 'p < .05')) +
  theme_minimal()

This figure looks at the directionality of power play success through time. Atlanta had the best fitting model and additionally has the most negative correlation, suggesting that there power play success has gone down significantly through time. The Vancouver Canucks have one of the shallower rates of change, but exhibit a negative trajectory in power play success through time.

Trends in Canucks power play success through time

model_data <- pp_season %>%
  filter(team == "VAN") %>% 
  mutate(goalsFor_per_game = goalsFor / games_played) %>%
  lm(goalsFor_per_game ~ season, data = .)

model_data <- model_data %>%
  augment() %>%
  mutate(fitted_values = .fitted)

ggplot(model_data, aes(x = season, y = goalsFor_per_game)) +
  geom_point(color = "blue") +  # Data points
  geom_line(aes(y = fitted_values), color = "darkgreen") +  # Fitted regression line
  labs(
       x = "Season",
       y = "Average Power Play Goals per Game") +
  theme_minimal()

As predicted by the R^2 value, there is a lot of variation in the data that is not well described by the model. However, we do see a decline in power play success on average through time.

Random Forest: scoring for the Vancouver Canucks

Random Forest is a powerful machine learning algorithm that can be used to predict outcomes, such as whether a goal is scored or not, based on a set of input features or metrics.

Determining the accuracy of the random forest

can_shot_data<- shot_data %>% 
  filter(homeTeamCode=="VAN") %>% 
  select(goal, season, isHomeTeam, shooterName, period, homeTeamGoals, awayTeamGoals, shotAngleAdjusted, xCordAdjusted, yCordAdjusted, shotDistance, shotType, shotOnEmptyNet, playerPositionThatDidEvent, shooterTimeOnIce)

# Check for missing values in each column
missing_values <- colSums(is.na(can_shot_data))

# Print columns with missing values
#print(missing_values[missing_values > 0])

# Remove rows with missing values
can_shot_data <- na.omit(can_shot_data) ## remvoing NAs --> there is enough data to make up for it

goal_rf<- randomForest::randomForest(as.factor(goal) ~season + isHomeTeam+ period+ homeTeamGoals+ awayTeamGoals+ shotAngleAdjusted+ xCordAdjusted+ yCordAdjusted+ shotDistance+ shotType+ shotOnEmptyNet+ playerPositionThatDidEvent+ shooterTimeOnIce, data= can_shot_data)

goal_rf

## 
## Call:
##  randomForest(formula = as.factor(goal) ~ season + isHomeTeam +      period + homeTeamGoals + awayTeamGoals + shotAngleAdjusted +      xCordAdjusted + yCordAdjusted + shotDistance + shotType +      shotOnEmptyNet + playerPositionThatDidEvent + shooterTimeOnIce,      data = can_shot_data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 6.55%
## Confusion matrix:
##       0   1 class.error
## 0 44188  61 0.001378562
## 1  3046 119 0.962401264

Importance of parameters in determining goal success

vip_goal_rf<- vip(goal_rf)

vip(goal_rf)

Interestingly, time on ice was the most important determinor in goal success, followed by distance of shot. This suggests that the stamina and/or fatigue of the palyer is even more important than where they are on the ice and how far away they are from the net.

actual_values <- as.numeric(as.character(can_shot_data$goal))
predicted_values <- as.numeric(as.character(predict(goal_rf, can_shot_data)))

# Create a data frame to combine actual and predicted values
data_plot <- data.frame(
  Values = c("Actual", "Predicted"),
  Goals = c(sum(actual_values), sum(predicted_values))
)

# Create the ggplot figure as a grouped bar plot
ggplot(data_plot, aes(x = Values, y = Goals, fill = Values)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Actual vs. Predicted - Random Forest",
       x = "Values",
       y = "Number of Goals") +
  scale_fill_manual(values = c("Actual" = "steelblue", "Predicted" = "orange")) +
  theme_minimal()

The amount of actual goals is well reflected by the predictions of the random forrest algorithm.

Best Shooters on the Canucks Ranked

goal_preds<- predict(goal_rf, type= "prob")

goal_preds_joined <- cbind(can_shot_data, goal_preds)

rankings<- goal_preds_joined %>% 
  mutate(goal_oe = goal- `1`) %>% #goals over expected
  group_by(shooterName) %>% 
  summarize(goals_oe_made = sum(goal_oe)) %>% 
  arrange(-goals_oe_made) %>% 
  rename("Goals Made Over Expectation" = goals_oe_made) %>% 
  rename("Shooter Name" = shooterName)

kable(head(rankings)) %>%
    kable_styling(bootstrap_options = c('striped', 'hover', 'responsive', 'condensed'))

Shooter Name	Goals Made Over Expectation
Elias Pettersson	10.058796
Jannik Hansen	9.473178
Mikael Samuelsson	6.983476
Andrei Kuzmenko	6.565507
Sven Baertschi	6.251878
Brandon Sutter	5.560945

Pythagorean Expectation for the 2022 season

## What we need in the data frame: 

# identity of team 
# points scored
# poitns scored against them
# aggregate of thos values across the entire season 

pe_nhl<- shot_data %>% 
  filter(season == 2022) %>% 
  clean_names() %>% 
  select(home_team_code, away_team_code, home_team_goals, away_team_goals, game_id)

## variable where 1 --> if home team won the game and 0 --> if away team didnt win the game 

pe_nhl_score<- pe_nhl %>% 
  group_by(game_id) %>% 
  mutate(hwin = case_when (sum(home_team_goals)> sum(away_team_goals) ~ 1,
                           sum(away_team_goals)>sum(home_team_goals)  ~ 0)) %>% 
  mutate(awin = case_when (sum(home_team_goals)< sum(away_team_goals) ~ 1,
                           sum(away_team_goals)<sum(home_team_goals)  ~ 0)) %>% 
  mutate(count =1)


### aggregation 

nhl_home<- pe_nhl_score %>% ### record of the team as a home team
group_by(home_team_code) %>% 
  summarise(hwin = sum(hwin),
            home_team_goals = sum(home_team_goals),
            away_team_goals = sum(away_team_goals),
            count = sum(count)) %>%
  rename(team = home_team_code, away_team_goals_h = away_team_goals, home_team_goals_h = home_team_goals, Gh = count)

nhl_away<- pe_nhl_score %>% ### record of the team as an away team 
  group_by(away_team_code) %>% 
  summarise(awin = sum(awin),
            home_team_goals = sum(home_team_goals),
            away_team_goals = sum(away_team_goals),
            count = sum(count)) %>%
  rename(team = away_team_code, away_team_goals_a = away_team_goals, home_team_goals_a = home_team_goals, Ga = count)


NHL22 <- merge(nhl_home, nhl_away, by = "team")


# Now we create the total wins, games, played, runs scored and run conceded by summing the totals as home team and away team


NHL22$W <- NHL22$hwin + NHL22$awin
NHL22$G <- NHL22$Gh + NHL22$Ga
NHL22$R <- NHL22$home_team_goals_h + NHL22$away_team_goals_a
NHL22$RA <- NHL22$away_team_goals_h + NHL22$home_team_goals_a
NHL22

##    team hwin home_team_goals_h away_team_goals_h   Gh awin home_team_goals_a
## 1   ANA    0               123               306  176   83               347
## 2   ARI   97               290               354  177   84                75
## 3   BOS   83               201                91   83   84                11
## 4   BUF    0               175               297   90   84                34
## 5   CAR   90               168                72   90   79                67
## 6   CBJ    0                99               177   83    0               221
## 7   CGY   77               323               285  171    0               395
## 8   CHI    0                62                66   80    0               217
## 9   COL    0               149               151   80  182               143
## 10  DAL    0               226               371  160    0               194
## 11  DET   78               223                58   78   75                50
## 12  EDM   89               244               301  168   75                46
## 13  FLA   95               242               168   95   90                36
## 14  LAK  144               148                79  144   73               226
## 15  MIN    0               160               175   87  183                63
## 16  MTL   80               251                33   80   84                95
## 17  NJD   83               166               154   83   86                40
## 18  NSH   90               202               111   90    0               432
## 19  NYI   87               231               218   87   81                69
## 20  NYR   65               155                82   65   72               171
## 21  OTT   96               164               141   96    0               167
## 22  PHI    0                84               143   83    0               178
## 23  PIT   87               266               245   87    0               143
## 24  SEA   81               396               208  159  177               340
## 25  SJS    0               297               367  176    0               407
## 26  STL  184               403               218  184   89                37
## 27  TBL   95               222               152   95   87                27
## 28  TOR   91               153               152   91    0                79
## 29  VAN 1336              4576              5553 3415 1438              6402
## 30  VGK    0                 5               273   92  163               201
## 31  WPG  168               389               289  168   67                 5
## 32  WSH   82               185               172   82   81                60
##    away_team_goals_a   Ga    W    G     R    RA
## 1                265  175   83  351   388   653
## 2                117   84  181  261   407   429
## 3                105   84  167  167   306   102
## 4                163   84   84  174   338   331
## 5                110   79  169  169   278   139
## 6                 83   82    0  165   182   398
## 7                210  190   77  361   533   680
## 8                 92  169    0  249   154   283
## 9                287  182  182  262   436   294
## 10                78   74    0  234   304   565
## 11               203   75  153  153   426   108
## 12               168   75  164  243   412   347
## 13               224   90  185  185   466   204
## 14               152  159  217  303   300   305
## 15               257  183  183  270   417   238
## 16               267   84  164  164   518   128
## 17               243   86  169  169   409   194
## 18               244  180   90  270   446   543
## 19               114   81  168  168   345   287
## 20               258   72  137  137   413   253
## 21                15   80   96  176   179   308
## 22                93   83    0  166   177   321
## 23                35   81   87  168   301   388
## 24               462  177  258  336   858   548
## 25               119  138    0  314   416   774
## 26               143   89  273  273   546   255
## 27               280   87  182  182   502   179
## 28                26   81   91  172   179   231
## 29              5909 3480 2774 6895 10485 11955
## 30               345  163  163  255   350   474
## 31               173   67  235  235   562   294
## 32               222   81  163  163   407   232

#define win percentage and the Pythagorean Expectation

NHL22$wpc <- NHL22$W / NHL22$G
NHL22$pyth <- NHL22$R^2 / (NHL22$R^2 + NHL22$RA^2)
NHL22

##    team hwin home_team_goals_h away_team_goals_h   Gh awin home_team_goals_a
## 1   ANA    0               123               306  176   83               347
## 2   ARI   97               290               354  177   84                75
## 3   BOS   83               201                91   83   84                11
## 4   BUF    0               175               297   90   84                34
## 5   CAR   90               168                72   90   79                67
## 6   CBJ    0                99               177   83    0               221
## 7   CGY   77               323               285  171    0               395
## 8   CHI    0                62                66   80    0               217
## 9   COL    0               149               151   80  182               143
## 10  DAL    0               226               371  160    0               194
## 11  DET   78               223                58   78   75                50
## 12  EDM   89               244               301  168   75                46
## 13  FLA   95               242               168   95   90                36
## 14  LAK  144               148                79  144   73               226
## 15  MIN    0               160               175   87  183                63
## 16  MTL   80               251                33   80   84                95
## 17  NJD   83               166               154   83   86                40
## 18  NSH   90               202               111   90    0               432
## 19  NYI   87               231               218   87   81                69
## 20  NYR   65               155                82   65   72               171
## 21  OTT   96               164               141   96    0               167
## 22  PHI    0                84               143   83    0               178
## 23  PIT   87               266               245   87    0               143
## 24  SEA   81               396               208  159  177               340
## 25  SJS    0               297               367  176    0               407
## 26  STL  184               403               218  184   89                37
## 27  TBL   95               222               152   95   87                27
## 28  TOR   91               153               152   91    0                79
## 29  VAN 1336              4576              5553 3415 1438              6402
## 30  VGK    0                 5               273   92  163               201
## 31  WPG  168               389               289  168   67                 5
## 32  WSH   82               185               172   82   81                60
##    away_team_goals_a   Ga    W    G     R    RA       wpc      pyth
## 1                265  175   83  351   388   653 0.2364672 0.2609294
## 2                117   84  181  261   407   429 0.6934866 0.4737024
## 3                105   84  167  167   306   102 1.0000000 0.9000000
## 4                163   84   84  174   338   331 0.4827586 0.5104622
## 5                110   79  169  169   278   139 1.0000000 0.8000000
## 6                 83   82    0  165   182   398 0.0000000 0.1729460
## 7                210  190   77  361   533   680 0.2132964 0.3805669
## 8                 92  169    0  249   154   283 0.0000000 0.2284668
## 9                287  182  182  262   436   294 0.6946565 0.6874286
## 10                78   74    0  234   304   565 0.0000000 0.2245063
## 11               203   75  153  153   426   108 1.0000000 0.9396086
## 12               168   75  164  243   412   347 0.6748971 0.5850155
## 13               224   90  185  185   466   204 1.0000000 0.8391789
## 14               152  159  217  303   300   305 0.7161716 0.4917361
## 15               257  183  183  270   417   238 0.6777778 0.7542911
## 16               267   84  164  164   518   128 1.0000000 0.9424533
## 17               243   86  169  169   409   194 1.0000000 0.8163354
## 18               244  180   90  270   446   543 0.3333333 0.4028556
## 19               114   81  168  168   345   287 1.0000000 0.5910057
## 20               258   72  137  137   413   253 1.0000000 0.7271313
## 21                15   80   96  176   179   308 0.5454545 0.2524802
## 22                93   83    0  166   177   321 0.0000000 0.2331547
## 23                35   81   87  168   301   388 0.5178571 0.3757117
## 24               462  177  258  336   858   548 0.7678571 0.7102622
## 25               119  138    0  314   416   774 0.0000000 0.2241275
## 26               143   89  273  273   546   255 1.0000000 0.8209373
## 27               280   87  182  182   502   179 1.0000000 0.8871975
## 28                26   81   91  172   179   231 0.5290698 0.3751786
## 29              5909 3480 2774 6895 10485 11955 0.4023205 0.4347719
## 30               345  163  163  255   350   474 0.6392157 0.3528470
## 31               173   67  235  235   562   294 1.0000000 0.7851347
## 32               222   81  163  163   407   232 1.0000000 0.7547580

## regression

pyth_lm <- lm(wpc ~ pyth, data = NHL22)
summary(pyth_lm)

## 
## Call:
## lm(formula = wpc ~ pyth, data = NHL22)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.20828 -0.11902 -0.02924  0.11852  0.33060 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.11477    0.06938  -1.654    0.109    
## pyth         1.32684    0.11328  11.713 1.02e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1582 on 30 degrees of freedom
## Multiple R-squared:  0.8206, Adjusted R-squared:  0.8146 
## F-statistic: 137.2 on 1 and 30 DF,  p-value: 1.02e-12

wpc = Intercept + coef x pyth

For every one unit increase in pyth, the value of wpc goes up by 1.33
Pythagorean Expectation can account for 81.5% of the variation in win percentage