This project looks at the trends and successes of the Vancouver Canucks National Hockey League (NHL) team through time.
This report includes three parts:
Statistics through time at the team-level
Modeling Vancouver Canucks power play success through time
Random Forest analysis to identify most influential metrics in determining whether a goal is scored in a given scenario, and determine the best shooter on the Canucks
Data summary and citation:
“NHL Team Statistics.” NHL.com. Nation Hockey League, n.d. Web. July 2013. http://www.nhl.com/.
“NHL Team Stats.” Hockey Stats. Hockeyanalysis.com, 28 Apr. 2013. Web. July 2013. <http://stats.hockeyanalysis.com/teamstats.php?disp=1&db=201213&sit=5v5close&sort= PDO&sortdir=DESC>.
MoneyPuck. https://moneypuck.com/data.htm
This report will be using the following metrics to measure team success through time 2008-2023): - Time on Ice (TOI) - Games Played (GP) - Corsi For = Shot attempts for at even strength: Shots + Blocks + Misses (CF) - Corsi Against = Corsi score of the opposing team (CA) - Goals For (GF) - Penalty minutes served (PENT) - Penalty minutes drawn (PEND) - Saves = blocked shot attempts for (SAVES) - Rebounds (REB) - Shots (SHOTS)
knitr::opts_chunk$set(echo = TRUE, warning= FALSE, message= FALSE)
# Packages ----
library(tidyverse) # For data manipulation, plotting, etc.
library(here) # For reproducible data importation
library(janitor) # For text data manipulation
library(httr) # To import data on github
library(TExPosition) # PCA tools
library(ggrepel) # Plotting tool for ggplot2
library(kableExtra) # HTML table tools
library(RColorBrewer) # plotting colors
library(gridExtra) # Plotting tools
library(dplyr)
library(magrittr)
library(corrplot)
library(tidyr)
library(ranger)
library(vip)
library(caret)
library(broom)
# Custom functions ----
# nice_table() simplifies the printing of HTML tables using kable
nice_table <- function(x){
kable(x) %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'responsive', 'condensed'))
}
# Color palettes ----
rdgy <- brewer.pal(n = 11, name = "RdGy") # display.brewer.pal(11, "RdGy")
# ggplot2 finishings
pca_furnish <- theme_classic() +
theme(axis.title = element_blank(),
axis.ticks = element_blank(),
axis.line = element_blank()
)## Import Data
canucks_2009<- read_csv(here("MP_data", "skaters_2008_2009.csv")) %>%
mutate(season= "2008-2009")
canucks_2010<- read_csv(here("MP_data", "skaters_2009_2010.csv")) %>%
mutate(season= "2009-2010")
canucks_2011<- read_csv(here("MP_data", "skaters_2010_2011.csv")) %>%
mutate(season= "2010-2011")
canucks_2012<- read_csv(here("MP_data", "skaters_2011_2012.csv")) %>%
mutate(season= "2011-2012")
canucks_2013<- read_csv(here("MP_data", "skaters_2012_2013.csv")) %>%
mutate(season= "2012-2013")
canucks_2014<- read_csv(here("MP_data", "skaters_2013_2014.csv")) %>%
mutate(season= "2013-2014")
canucks_2015<- read_csv(here("MP_data", "skaters_2014_2015.csv")) %>%
mutate(season= "2014-2015")
canucks_2016<- read_csv(here("MP_data", "skaters_2015_2016.csv")) %>%
mutate(season= "2015-2016")
canucks_2017<- read_csv(here("MP_data", "skaters_2016_2017.csv")) %>%
mutate(season= "2016-2017")
canucks_2018<- read_csv(here("MP_data", "skaters_2017_2018.csv")) %>%
mutate(season= "2017-2018")
canucks_2019<- read_csv(here("MP_data", "skaters_2018_2019.csv")) %>%
mutate(season= "2018-2019")
canucks_2020<- read_csv(here("MP_data", "skaters_2019_2020.csv")) %>%
mutate(season= "2019-2020")
canucks_2021<- read_csv(here("MP_data", "skaters_2020_2021.csv")) %>%
mutate(season= "2020-2021")
canucks_2022<- read_csv(here("MP_data", "skaters_2021_2022.csv")) %>%
mutate(season= "2021-2022")
canucks_2023<- read_csv(here("MP_data", "skaters_2022_2023.csv")) %>%
mutate(season= "2022-2023")
canucks_skater_data<- rbind(canucks_2009, canucks_2010, canucks_2011, canucks_2012, canucks_2013, canucks_2014, canucks_2015, canucks_2016, canucks_2017, canucks_2018, canucks_2019, canucks_2020, canucks_2021, canucks_2022, canucks_2023) %>%
filter(team== "VAN")
shots_2007<- read_csv(here("MP_data", "CAN_shots_2007.csv"))
shots_2008<- read_csv(here("MP_data", "CAN_shots_2008.csv"))
shots_2009<- read_csv(here("MP_data", "CAN_shots_2009.csv"))
shots_2010<- read_csv(here("MP_data", "CAN_shots_2010.csv"))
shots_2011<- read_csv(here("MP_data", "CAN_shots_2011.csv"))
shots_2012<- read_csv(here("MP_data", "CAN_shots_2012.csv"))
shots_2013<- read_csv(here("MP_data", "CAN_shots_2013.csv"))
shots_2014<- read_csv(here("MP_data", "CAN_shots_2014.csv"))
shots_2015<- read_csv(here("MP_data", "CAN_shots_2015.csv"))
shots_2016<- read_csv(here("MP_data", "CAN_shots_2016.csv"))
shots_2017<- read_csv(here("MP_data", "CAN_shots_2017.csv"))
shots_2018<- read_csv(here("MP_data", "CAN_shots_2018.csv"))
shots_2019<- read_csv(here("MP_data", "CAN_shots_2019.csv"))
shots_2020<- read_csv(here("MP_data", "CAN_shots_2020.csv"))[, colnames(shots_2019)]
shots_2021<- read_csv(here("MP_data", "CAN_shots_2021.csv"))
shots_2022<- read_csv(here("MP_data", "CAN_shots_2022.csv"))
shot_data<- rbind(shots_2009, shots_2010, shots_2011, shots_2012, shots_2013, shots_2014, shots_2015, shots_2016, shots_2017, shots_2018, shots_2019, shots_2020, shots_2021, shots_2022)
canucks_shot_data<- shot_data %>%
filter(homeTeamCode== "VAN" | awayTeamCode == "VAN")
file_path <- "/Users/maerennick/GitHub/NHL project/Vancouver_Canucks_Project/MP_data/teams_2007_2016.csv"
teams_2007_2016<- read.csv(file_path)
file_path_2 <- "/Users/maerennick/GitHub/NHL project/Vancouver_Canucks_Project/MP_data/teams_2017.csv"
teams_2017_2023<- read.csv(file_path_2)
nhl_team_data <- rbind(teams_2007_2016, teams_2017_2023)## Data Preparation
canucks_data <- nhl_team_data %>%
clean_names() %>%
dplyr::filter(player_team == "VAN") %>% ## include only Vancouver (for and against)
select(team:situation, corsi_percentage, ice_time, shots_on_goal_for:rebound_goals_for, penalties_for, penality_minutes_for, x_on_goal_for) %>%
rename(penalties_served= penalties_for) %>%
rename(penality_minutes_served= penality_minutes_for) %>%
select(-opposing_team)
against_canucks_data<- nhl_team_data %>%
clean_names() %>%
dplyr::filter(opposing_team == "VAN") %>%
select(team, game_id, situation, corsi_percentage, ice_time, shots_on_goal_for:rebound_goals_for, penalties_for, penality_minutes_for, x_on_goal_for) %>%
rename(opposing_team= team) %>%
rename(opposing_corsi_percentage= corsi_percentage) %>%
rename(opposing_ice_time= ice_time) %>%
rename(opposing_shots_on_goal= shots_on_goal_for) %>%
rename(opposing_missed_shots= missed_shots_for) %>%
rename(opposing_blocked_shot_attempts= blocked_shot_attempts_for) %>%
rename(opposing_shot_attempts= shot_attempts_for) %>%
rename(opposing_goals= goals_for) %>%
rename(opposing_rebounds= rebounds_for) %>%
rename(opposing_rebound_goals= rebound_goals_for) %>%
rename(penalties_drawn= penalties_for) %>%
rename(penality_minutes_drawn= penality_minutes_for) %>%
rename(opposing_x_on_goal= x_on_goal_for)
canucks_team_data<- canucks_data %>%
right_join(against_canucks_data, by=c("game_id","situation")) %>%
select(-opposing_shot_attempts) %>%
select(-opposing_rebounds) %>%
select(-opposing_x_on_goal) %>%
select(-opposing_blocked_shot_attempts) %>%
select(-opposing_missed_shots) %>%
filter(situation== "5on5") %>%
group_by(season) %>%
mutate(GP= n_distinct(game_id)) %>% # games played
rename(TOI= ice_time) %>%
rename(CF= corsi_percentage) %>%
rename(CA= opposing_corsi_percentage) %>%
rename(GF=goals_for) %>%
rename(GA=opposing_goals) %>%
rename(PENT= penality_minutes_served) %>%
rename(PEND= penality_minutes_drawn) %>%
rename(SHOTS=shots_on_goal_for) %>%
rename(SAVES= blocked_shot_attempts_for) %>%
rename(REB= rebounds_for)
canucks_team_data_avg<- canucks_team_data %>%
group_by(season) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE))
# Print data
#kable(canucks_team_data) %>%
#kable_styling(bootstrap_options = c('striped', 'hover', 'responsive', 'condensed'))Adjust all scores by the number of games played due to a shorted 2012-2013 and 2020-2021 seasons
Compute z-scores of all measures to facilitate comparisons
canucks_team_data$GP <- as.numeric(canucks_team_data$GP)
canucks_team_data$season <- as.numeric(canucks_team_data$season)
# Pre-processes, adjusts, and z-scores canucks data ----
canucks_data_long <- canucks_team_data_avg %>%
dplyr::select( -game_date, -game_id, -opposing_ice_time, -missed_shots_for, -opposing_shots_on_goal,-rebound_goals_for, -missed_shots_for, -x_on_goal_for, -penalties_served, -penalties_drawn, -opposing_rebound_goals, -shot_attempts_for) %>%
gather(Meas, Val, -season, -GP) %>%
group_by(Meas) %>%
mutate(Val_Adj = Val/GP, # adjusts based on games played
Val_Zscore = scale(Val_Adj) # computes z-scores
) %>%
ungroup() %>%
mutate(sig = factor(ifelse(abs(Val_Zscore) > 1.96, "p < .05", "p > .05"))) # z score > 1.96ggplot(canucks_data_long, aes(factor(season), Val_Zscore)) +
geom_path(aes(group = 1), color = rdgy[8]) +
geom_point(aes(color = sig), size = 1.75) +
scale_color_manual(values = c(rdgy[3], rdgy[10]), name = "Z-Score") +
scale_y_continuous(breaks = c(-2, 0, 2), minor_breaks = NULL) +
coord_cartesian(ylim = c(-3,3)) +
theme_minimal() +
labs(x = "\n Season",
y = "\n Measurement (Z-Score)",
title = "Canucks Performance 2007-2023"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
)+
facet_wrap(~Meas, nrow = 5)For the Vancouver Canucks, several performance metrics peaked in 2012 season. The 2012–13 Vancouver Canucks season was the team’s 43rd season in the National Hockey League and the Canucks won their fifth-straight Northwest Division title but finished third in the Western Conference. Corsi scores against and the amount of penalties served additionally peaked that year suggesting that they were facing strong competition. The 2020 season was additionally significant. This season was unconventional due to the COVID-19 shutdowns, which is likely why there is no peak in time on ice and could result in false peask due to the few numbers of games that were played.
# Converts back to wide format
canucks_data_wide <- canucks_team_data %>%
dplyr::select(-team, -name, -player_team, -home_or_away, -game_date, -position, -situation, -opposing_team, -opposing_ice_time, -missed_shots_for, -opposing_shots_on_goal,-rebound_goals_for, -missed_shots_for, -x_on_goal_for, -penalties_served, -penalties_drawn, -opposing_rebound_goals, -shot_attempts_for, -game_id) %>%
group_by(season) %>%
mutate(CF= mean(CF)) %>%
mutate(TOI= mean(TOI)) %>%
mutate(SHOTS= mean(SHOTS)) %>%
mutate(SAVES= mean(SAVES)) %>%
mutate(GF= mean(GF)) %>%
mutate(REB= mean(REB)) %>%
mutate(PENT= mean(PENT)) %>%
mutate(CA= mean(CA)) %>%
mutate(GA= mean(GA)) %>%
mutate(PEND= mean(PEND)) %>%
mutate(GP= mean(GP)) %>%
gather(Meas, Val, -season, -GP) %>%
group_by(Meas) %>%
mutate(Val_Adj = Val/GP, # adjusts based on games played
Val_Zscore = scale(Val_Adj) # computes z-scores
) %>%
ungroup() %>%
mutate(sig = factor(ifelse(abs(Val_Zscore) > 1.96, "p < .05", "p > .05"))) %>% # z score > 1.96
select(season, Meas, Val_Zscore) %>%
unique()%>%
spread(Meas, Val_Zscore)
# Computes correlations
canucks_cors <- cor(canucks_data_wide)
# Correlations to long format for plotting
canucks_cors_long <- canucks_cors %>%
reshape2::melt() %>%
arrange(Var1, Var2)
# Correlation heatmap
ggplot(canucks_cors_long, aes(x = Var1, y = Var2, fill = value)) +
geom_raster() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_blank()
) +
scale_fill_distiller(palette = "GnBu",
limits = c(-1, 1),
name = "Correlation"
)Season has a negative correlation with the number of penalties both drawn and received as well as corsi scores for. This suggests that through time, the Vancouver Canucks have steered away from play that would result in penalties. However, it also suggests that through time, the Canucks are taking less shots on goal and exhibiting less shot strength. There is a noticeable positive correlation between corsi scores and goals, which we would expect to see and time on ice with several of these metrics which we would also expect to see. Noticeably, there is no correlation between penalties taken or drawn and the amount of goals scored, which could explain why Vancouver may have strategically diverted from a rougher style of play.
A “power play” is a sporting term used to describe a period of play where one team has a numerical advantage in players, usually due to a rule violation by the opposing team. In Hockey, they are considered to be a prime opportunity for scoring. Particularly for the Vancouver Canucks, successful power plays, resulting in a goal, is considered to be a main reason for the team’s success.
To take a deeper look into the success and performance of the Canucks team through time, we will analyze power plays and how goal scoring during power play opportunities have changed through time.
This is an extension of the work done by Ekarin Pongpipat & Matthew J. Kmiecik, 2018
First we will look across the entire NHL:
pp<- nhl_team_data %>%
filter(situation == "5on4")
# Initializing variables to store team names
pacific <- c('S.J', 'CGY', 'L.A', 'ANA', 'EDM', 'VAN', 'ARI')
central <- c('ATL.WPG', 'NSH', 'STL', 'DAL', 'COL', 'MIN', 'CHI')
metropolitan <- c('WSH', 'N.J', 'PHI', 'CBJ', 'PIT', 'NYR', 'NYI', 'CAR')
atlantic <- c('T.B', 'BOS', 'TOR', 'DET', 'MTL', 'FLA', 'OTT', 'BUF')
# Creating a column to identify each team's division
pp <- pp %>%
mutate(
division = case_when(
pp$team %in% pacific ~ 'Pacific',
pp$team %in% central ~ 'Central',
pp$team %in% metropolitan ~ 'Metropolitan',
pp$team %in% atlantic ~ 'Atlantic')
)
divisions <-
data.frame(
Pacific = c(pacific, ''),
Central = c(central, ''),
Metropolitan = metropolitan,
Atlantic = atlantic
)# Prints table
kable(divisions) %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'responsive'))| Pacific | Central | Metropolitan | Atlantic |
|---|---|---|---|
| S.J | ATL.WPG | WSH | T.B |
| CGY | NSH | N.J | BOS |
| L.A | STL | PHI | TOR |
| ANA | DAL | CBJ | DET |
| EDM | COL | PIT | MTL |
| VAN | MIN | NYR | FLA |
| ARI | CHI | NYI | OTT |
| CAR | BUF |
pp_season<- pp %>%
group_by(team, season) %>%
summarise(games_played= n_distinct(gameId), division, goalsFor= sum(goalsFor))ggplot(pp_season, aes(season, goalsFor/games_played, group = 1)) +
geom_line(aes(group = team, color = division), alpha = 2/3) +
geom_line(stat = 'summary', fun.y = 'mean', size = .9, color = 'red') +
scale_color_brewer(palette = 'Blues', direction = -1) +
labs(x = 'Season (Start Year)',
y = 'Season Power Play Goals (5v4)/ Games Played',
caption = 'Each team is a separate line; red line is the average') +
guides(color = guide_legend(title = 'Division')) +
scale_x_continuous(limits = c(2008, 2022), breaks = c(2008:2022)) +
theme_minimal()There is a lot of variation across power play success through time across teams and across divisions. On average, success appears to be relatively consistent through time with the exception of the steep dip in the first few seasons. This could be due to improved team strategy, defense and/or goalie ability through time.
This model ignores the conference and division distinction. Each team will be modeled individually using a linear regression to predict power play goals per games played as a function of time (across 10 years). There are 30 teams in this dataset. Therefore, 30 distinct linear regressions will be performed; one for each team in the NHL. This allows us account for the variability of power plays within each team.
We’ll use these omnibus estimates to examine all regression models simultaneously via R2 estimates. These allow us to see how much variability in power play goals per games played was explained by time:
modTeam <-
pp_season %>%
ungroup() %>%
nest_by(team) %>%
mutate(level1 = list(lm((goalsFor/games_played) ~ season, data = data)))
level1Omni <-
modTeam %>%
summarise(broom::glance(level1)) %>%
ungroup() %>%
mutate(sig = p.value < .05) %>%
filter(team != "LAK", team != "NJD", team != "SEA", team != "SJS", team != "TBL") ## omitting teams that do not have enough data# Color palette
sigColorPal <- brewer.pal(11,'RdGy') # display.brewer.pal(11,'RdGy')
# R^2 Plot
ggplot(level1Omni, aes(r.squared, reorder(team, r.squared), color = sig)) +
geom_point(size = 2) +
scale_color_manual(values = sigColorPal[c(9,2)]) +
labs(x = 'R-squared', y = 'Team') +
guides(color = guide_legend(title = 'p < .05')) +
theme_minimal() ## all but 2 are significantly different from 0All but two of the regressions are significant, with Atlanta having the strongest relationship between power play success and time. While significant, the Vancouver cnaucks have a lower R^2 value suggesting there is a lot of variability in the data that cannot be explained or predicted by the regression.
# Extracting level 1 coefficients
level1Coef <-
modTeam %>%
summarise(broom::tidy(level1)) %>%
ungroup() %>%
filter(term == 'season') %>% # Facilitates plotting
mutate(sig = p.value < .05) %>%
filter(team != "LAK", team != "NJD", team != "SEA", team != "SJS", team != "TBL") ## omitting teams that do not have enough data
ggplot(level1Coef, aes(estimate, reorder(team, -1*estimate), color = sig)) +
geom_point(size = 2) +
geom_errorbarh(
aes(xmin = estimate - std.error, xmax = estimate + std.error),
alpha = 1/2
) +
scale_color_manual(values = sigColorPal[c(9,2)]) +
labs(
x = 'Estimate (Yearly Change in Power Play Goals/Game)',
y = 'Team',
caption = 'SEM error bars'
) +
guides(color = guide_legend(title = 'p < .05')) +
theme_minimal()This figure looks at the directionality of power play success through time. Atlanta had the best fitting model and additionally has the most negative correlation, suggesting that there power play success has gone down significantly through time. The Vancouver Canucks have one of the shallower rates of change, but exhibit a negative trajectory in power play success through time.
model_data <- pp_season %>%
filter(team == "VAN") %>%
mutate(goalsFor_per_game = goalsFor / games_played) %>%
lm(goalsFor_per_game ~ season, data = .)
model_data <- model_data %>%
augment() %>%
mutate(fitted_values = .fitted)
ggplot(model_data, aes(x = season, y = goalsFor_per_game)) +
geom_point(color = "blue") + # Data points
geom_line(aes(y = fitted_values), color = "darkgreen") + # Fitted regression line
labs(
x = "Season",
y = "Average Power Play Goals per Game") +
theme_minimal()As predicted by the R^2 value, there is a lot of variation in the data that is not well described by the model. However, we do see a decline in power play success on average through time.
Random Forest is a powerful machine learning algorithm that can be used to predict outcomes, such as whether a goal is scored or not, based on a set of input features or metrics.
can_shot_data<- shot_data %>%
filter(homeTeamCode=="VAN") %>%
select(goal, season, isHomeTeam, shooterName, period, homeTeamGoals, awayTeamGoals, shotAngleAdjusted, xCordAdjusted, yCordAdjusted, shotDistance, shotType, shotOnEmptyNet, playerPositionThatDidEvent, shooterTimeOnIce)
# Check for missing values in each column
missing_values <- colSums(is.na(can_shot_data))
# Print columns with missing values
#print(missing_values[missing_values > 0])
# Remove rows with missing values
can_shot_data <- na.omit(can_shot_data) ## remvoing NAs --> there is enough data to make up for it
goal_rf<- randomForest::randomForest(as.factor(goal) ~season + isHomeTeam+ period+ homeTeamGoals+ awayTeamGoals+ shotAngleAdjusted+ xCordAdjusted+ yCordAdjusted+ shotDistance+ shotType+ shotOnEmptyNet+ playerPositionThatDidEvent+ shooterTimeOnIce, data= can_shot_data)
goal_rf##
## Call:
## randomForest(formula = as.factor(goal) ~ season + isHomeTeam + period + homeTeamGoals + awayTeamGoals + shotAngleAdjusted + xCordAdjusted + yCordAdjusted + shotDistance + shotType + shotOnEmptyNet + playerPositionThatDidEvent + shooterTimeOnIce, data = can_shot_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 6.55%
## Confusion matrix:
## 0 1 class.error
## 0 44188 61 0.001378562
## 1 3046 119 0.962401264
vip_goal_rf<- vip(goal_rf)
vip(goal_rf)Interestingly, time on ice was the most important determinor in goal success, followed by distance of shot. This suggests that the stamina and/or fatigue of the palyer is even more important than where they are on the ice and how far away they are from the net.
actual_values <- as.numeric(as.character(can_shot_data$goal))
predicted_values <- as.numeric(as.character(predict(goal_rf, can_shot_data)))
# Create a data frame to combine actual and predicted values
data_plot <- data.frame(
Values = c("Actual", "Predicted"),
Goals = c(sum(actual_values), sum(predicted_values))
)
# Create the ggplot figure as a grouped bar plot
ggplot(data_plot, aes(x = Values, y = Goals, fill = Values)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = "Actual vs. Predicted - Random Forest",
x = "Values",
y = "Number of Goals") +
scale_fill_manual(values = c("Actual" = "steelblue", "Predicted" = "orange")) +
theme_minimal()The amount of actual goals is well reflected by the predictions of the random forrest algorithm.
goal_preds<- predict(goal_rf, type= "prob")
goal_preds_joined <- cbind(can_shot_data, goal_preds)
rankings<- goal_preds_joined %>%
mutate(goal_oe = goal- `1`) %>% #goals over expected
group_by(shooterName) %>%
summarize(goals_oe_made = sum(goal_oe)) %>%
arrange(-goals_oe_made) %>%
rename("Goals Made Over Expectation" = goals_oe_made) %>%
rename("Shooter Name" = shooterName)
kable(head(rankings)) %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'responsive', 'condensed'))| Shooter Name | Goals Made Over Expectation |
|---|---|
| Elias Pettersson | 10.058796 |
| Jannik Hansen | 9.473178 |
| Mikael Samuelsson | 6.983476 |
| Andrei Kuzmenko | 6.565507 |
| Sven Baertschi | 6.251878 |
| Brandon Sutter | 5.560945 |
## What we need in the data frame:
# identity of team
# points scored
# poitns scored against them
# aggregate of thos values across the entire season
pe_nhl<- shot_data %>%
filter(season == 2022) %>%
clean_names() %>%
select(home_team_code, away_team_code, home_team_goals, away_team_goals, game_id)
## variable where 1 --> if home team won the game and 0 --> if away team didnt win the game
pe_nhl_score<- pe_nhl %>%
group_by(game_id) %>%
mutate(hwin = case_when (sum(home_team_goals)> sum(away_team_goals) ~ 1,
sum(away_team_goals)>sum(home_team_goals) ~ 0)) %>%
mutate(awin = case_when (sum(home_team_goals)< sum(away_team_goals) ~ 1,
sum(away_team_goals)<sum(home_team_goals) ~ 0)) %>%
mutate(count =1)
### aggregation
nhl_home<- pe_nhl_score %>% ### record of the team as a home team
group_by(home_team_code) %>%
summarise(hwin = sum(hwin),
home_team_goals = sum(home_team_goals),
away_team_goals = sum(away_team_goals),
count = sum(count)) %>%
rename(team = home_team_code, away_team_goals_h = away_team_goals, home_team_goals_h = home_team_goals, Gh = count)
nhl_away<- pe_nhl_score %>% ### record of the team as an away team
group_by(away_team_code) %>%
summarise(awin = sum(awin),
home_team_goals = sum(home_team_goals),
away_team_goals = sum(away_team_goals),
count = sum(count)) %>%
rename(team = away_team_code, away_team_goals_a = away_team_goals, home_team_goals_a = home_team_goals, Ga = count)
NHL22 <- merge(nhl_home, nhl_away, by = "team")
# Now we create the total wins, games, played, runs scored and run conceded by summing the totals as home team and away team
NHL22$W <- NHL22$hwin + NHL22$awin
NHL22$G <- NHL22$Gh + NHL22$Ga
NHL22$R <- NHL22$home_team_goals_h + NHL22$away_team_goals_a
NHL22$RA <- NHL22$away_team_goals_h + NHL22$home_team_goals_a
NHL22## team hwin home_team_goals_h away_team_goals_h Gh awin home_team_goals_a
## 1 ANA 0 123 306 176 83 347
## 2 ARI 97 290 354 177 84 75
## 3 BOS 83 201 91 83 84 11
## 4 BUF 0 175 297 90 84 34
## 5 CAR 90 168 72 90 79 67
## 6 CBJ 0 99 177 83 0 221
## 7 CGY 77 323 285 171 0 395
## 8 CHI 0 62 66 80 0 217
## 9 COL 0 149 151 80 182 143
## 10 DAL 0 226 371 160 0 194
## 11 DET 78 223 58 78 75 50
## 12 EDM 89 244 301 168 75 46
## 13 FLA 95 242 168 95 90 36
## 14 LAK 144 148 79 144 73 226
## 15 MIN 0 160 175 87 183 63
## 16 MTL 80 251 33 80 84 95
## 17 NJD 83 166 154 83 86 40
## 18 NSH 90 202 111 90 0 432
## 19 NYI 87 231 218 87 81 69
## 20 NYR 65 155 82 65 72 171
## 21 OTT 96 164 141 96 0 167
## 22 PHI 0 84 143 83 0 178
## 23 PIT 87 266 245 87 0 143
## 24 SEA 81 396 208 159 177 340
## 25 SJS 0 297 367 176 0 407
## 26 STL 184 403 218 184 89 37
## 27 TBL 95 222 152 95 87 27
## 28 TOR 91 153 152 91 0 79
## 29 VAN 1336 4576 5553 3415 1438 6402
## 30 VGK 0 5 273 92 163 201
## 31 WPG 168 389 289 168 67 5
## 32 WSH 82 185 172 82 81 60
## away_team_goals_a Ga W G R RA
## 1 265 175 83 351 388 653
## 2 117 84 181 261 407 429
## 3 105 84 167 167 306 102
## 4 163 84 84 174 338 331
## 5 110 79 169 169 278 139
## 6 83 82 0 165 182 398
## 7 210 190 77 361 533 680
## 8 92 169 0 249 154 283
## 9 287 182 182 262 436 294
## 10 78 74 0 234 304 565
## 11 203 75 153 153 426 108
## 12 168 75 164 243 412 347
## 13 224 90 185 185 466 204
## 14 152 159 217 303 300 305
## 15 257 183 183 270 417 238
## 16 267 84 164 164 518 128
## 17 243 86 169 169 409 194
## 18 244 180 90 270 446 543
## 19 114 81 168 168 345 287
## 20 258 72 137 137 413 253
## 21 15 80 96 176 179 308
## 22 93 83 0 166 177 321
## 23 35 81 87 168 301 388
## 24 462 177 258 336 858 548
## 25 119 138 0 314 416 774
## 26 143 89 273 273 546 255
## 27 280 87 182 182 502 179
## 28 26 81 91 172 179 231
## 29 5909 3480 2774 6895 10485 11955
## 30 345 163 163 255 350 474
## 31 173 67 235 235 562 294
## 32 222 81 163 163 407 232
#define win percentage and the Pythagorean Expectation
NHL22$wpc <- NHL22$W / NHL22$G
NHL22$pyth <- NHL22$R^2 / (NHL22$R^2 + NHL22$RA^2)
NHL22## team hwin home_team_goals_h away_team_goals_h Gh awin home_team_goals_a
## 1 ANA 0 123 306 176 83 347
## 2 ARI 97 290 354 177 84 75
## 3 BOS 83 201 91 83 84 11
## 4 BUF 0 175 297 90 84 34
## 5 CAR 90 168 72 90 79 67
## 6 CBJ 0 99 177 83 0 221
## 7 CGY 77 323 285 171 0 395
## 8 CHI 0 62 66 80 0 217
## 9 COL 0 149 151 80 182 143
## 10 DAL 0 226 371 160 0 194
## 11 DET 78 223 58 78 75 50
## 12 EDM 89 244 301 168 75 46
## 13 FLA 95 242 168 95 90 36
## 14 LAK 144 148 79 144 73 226
## 15 MIN 0 160 175 87 183 63
## 16 MTL 80 251 33 80 84 95
## 17 NJD 83 166 154 83 86 40
## 18 NSH 90 202 111 90 0 432
## 19 NYI 87 231 218 87 81 69
## 20 NYR 65 155 82 65 72 171
## 21 OTT 96 164 141 96 0 167
## 22 PHI 0 84 143 83 0 178
## 23 PIT 87 266 245 87 0 143
## 24 SEA 81 396 208 159 177 340
## 25 SJS 0 297 367 176 0 407
## 26 STL 184 403 218 184 89 37
## 27 TBL 95 222 152 95 87 27
## 28 TOR 91 153 152 91 0 79
## 29 VAN 1336 4576 5553 3415 1438 6402
## 30 VGK 0 5 273 92 163 201
## 31 WPG 168 389 289 168 67 5
## 32 WSH 82 185 172 82 81 60
## away_team_goals_a Ga W G R RA wpc pyth
## 1 265 175 83 351 388 653 0.2364672 0.2609294
## 2 117 84 181 261 407 429 0.6934866 0.4737024
## 3 105 84 167 167 306 102 1.0000000 0.9000000
## 4 163 84 84 174 338 331 0.4827586 0.5104622
## 5 110 79 169 169 278 139 1.0000000 0.8000000
## 6 83 82 0 165 182 398 0.0000000 0.1729460
## 7 210 190 77 361 533 680 0.2132964 0.3805669
## 8 92 169 0 249 154 283 0.0000000 0.2284668
## 9 287 182 182 262 436 294 0.6946565 0.6874286
## 10 78 74 0 234 304 565 0.0000000 0.2245063
## 11 203 75 153 153 426 108 1.0000000 0.9396086
## 12 168 75 164 243 412 347 0.6748971 0.5850155
## 13 224 90 185 185 466 204 1.0000000 0.8391789
## 14 152 159 217 303 300 305 0.7161716 0.4917361
## 15 257 183 183 270 417 238 0.6777778 0.7542911
## 16 267 84 164 164 518 128 1.0000000 0.9424533
## 17 243 86 169 169 409 194 1.0000000 0.8163354
## 18 244 180 90 270 446 543 0.3333333 0.4028556
## 19 114 81 168 168 345 287 1.0000000 0.5910057
## 20 258 72 137 137 413 253 1.0000000 0.7271313
## 21 15 80 96 176 179 308 0.5454545 0.2524802
## 22 93 83 0 166 177 321 0.0000000 0.2331547
## 23 35 81 87 168 301 388 0.5178571 0.3757117
## 24 462 177 258 336 858 548 0.7678571 0.7102622
## 25 119 138 0 314 416 774 0.0000000 0.2241275
## 26 143 89 273 273 546 255 1.0000000 0.8209373
## 27 280 87 182 182 502 179 1.0000000 0.8871975
## 28 26 81 91 172 179 231 0.5290698 0.3751786
## 29 5909 3480 2774 6895 10485 11955 0.4023205 0.4347719
## 30 345 163 163 255 350 474 0.6392157 0.3528470
## 31 173 67 235 235 562 294 1.0000000 0.7851347
## 32 222 81 163 163 407 232 1.0000000 0.7547580
## regression
pyth_lm <- lm(wpc ~ pyth, data = NHL22)
summary(pyth_lm)##
## Call:
## lm(formula = wpc ~ pyth, data = NHL22)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.20828 -0.11902 -0.02924 0.11852 0.33060
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.11477 0.06938 -1.654 0.109
## pyth 1.32684 0.11328 11.713 1.02e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1582 on 30 degrees of freedom
## Multiple R-squared: 0.8206, Adjusted R-squared: 0.8146
## F-statistic: 137.2 on 1 and 30 DF, p-value: 1.02e-12
wpc = Intercept + coef x pyth