Project NHL MCMC Simulation

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(sportyR)
library(dplyr)
library(markovchain)

## Package:  markovchain
## Version:  0.9.5
## Date:     2023-09-24 09:20:02 UTC
## BugReport: https://github.com/spedygiorgio/markovchain/issues
## 
## 
## Attaching package: 'markovchain'
## 
## The following object is masked from 'package:lubridate':
## 
##     period

library(openxlsx)
library(stringr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggthemes)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Reading 2022-23 season play by play NHL games

NHL_2023 <- load_pbp(2023)

Make sure number of columns are same across all the seasons

NHL_2023 <- NHL_2023 |>
  mutate(points_outcome = ifelse(event=='Goal',1,0))

Work flow:

2. Add the possession_retained column

Add_PossesionRetained <- function(Data) {
  df_test <- Data |>
    add_column(PossessionRetained = 0)|>
    select(game_id,event_id,event_team_abbr,event,description,PossessionRetained,points_outcome
           ,home_abbreviation,away_abbreviation,ordinal_num,home_score,away_score
           ,event_player_1_name,event_player_1_type,event_player_2_name,event_player_2_type
           ,event_player_3_name,event_player_3_type,period_seconds,period_time,x,y,x_fixed,y_fixed,shot_distance, shot_angle,date_time) 
  
  df1 <- df_test |>
    mutate(PossessionRetained = case_when(
      event %in% c('Faceoff', 'Takeaway') ~ 1,  # Possession retained
      event %in% c('Giveaway', 'Penalty','Goal') ~ 0,  # Possession lost
      TRUE ~ NA_real_  # Handle other cases with NA or a default value
    ))
  
  # Create a shifted version of event_team_abbr and event for the next row
  df1 <- df1 |>
    mutate(next_event_team_abbr = lead(event_team_abbr),
           next_event = lead(event))
  
  # Update PossessionRetained based on the next event using vectorized conditions
  df1 <- df1 |>
    mutate(PossessionRetained = case_when(
      event %in% c('Blocked Shot', 'Shot', 'Missed Shot', 'Hit') ~ 
        case_when(
          # Handle case where next_event_team_abbr is NA
          is.na(next_event_team_abbr) ~ 0,  # Default to 0 if next_event_team_abbr is NA
          next_event_team_abbr == event_team_abbr ~
            case_when(
              next_event %in% c("Missed Shot", "Shot", "Goal", "Giveaway", 
                                "Failed Shot Attempt","Blocked Shot","Penalty") ~ 1,
              next_event %in% c( "Hit", "Faceoff", "Takeaway",
                                "Period End") ~ 0,
              TRUE ~ NA_real_  # For any unexpected events, return NA
            ),
          next_event_team_abbr != event_team_abbr ~
            case_when(
              next_event %in% c("Missed Shot", "Shot", "Goal", "Giveaway", 
                                "Failed Shot Attempt", "Period End","Blocked Shot"
                                ,"Faceoff") ~ 0,
              next_event %in% c("Hit", "Takeaway", "Penalty") ~ 1,
              TRUE ~ NA_real_  # For any unexpected events, return NA
            ),
          TRUE ~ NA_real_  # Catch any unhandled cases
        ),
      TRUE ~ PossessionRetained  # Retain existing values for other events
    ))
  
  # Clean up temporary columns
  df1 <- df1 |>
    select(-next_event_team_abbr, -next_event)
  return(df1)
  
}

events_to_keep <- c(
  "Faceoff", "Hit", "Shot", "Missed Shot", 
  "Giveaway", "Blocked Shot", "Goal", 
  "Takeaway", "Penalty", "Failed Shot Attempt"
)
NHL_2023 <- NHL_2023 |> filter(event %in% events_to_keep)

NHL_2023_Possessions <- Add_PossesionRetained(NHL_2023)
NHL_2023_Possessions <- NHL_2023_Possessions |>
  filter(!is.na(NHL_2023_Possessions$PossessionRetained))

2. Select Home and Away Teams

home_team <- 'EDM'
away_team <- 'VGK'

3. For each game create a Markov Chain probabilities.

Summary

The markov_chain_cleansing function processes a dataset of events to build a comprehensive Markov chain model representing transitions between various event states across all teams. It then isolates and extracts a transition matrix specific to a given team by:

Concatenating relevant event information to define unique states.
Fitting a Markov chain to determine transition probabilities.
Filtering the transition matrix to include only states and transitions pertinent to the specified team.

markov_chain_cleansing <- function(data,team){
  df <- data
  concat_df <- paste(df$event_team_abbr,df$event,df$PossessionRetained,df$points_outcome,sep = ';')
  markov_chain_matrix <- markovchainFit(concat_df)$estimate@transitionMatrix

  row_names <- rownames(markov_chain_matrix)
  col_names <- colnames(markov_chain_matrix)
  
  first_part_rows <- sapply(str_split(row_names, ";"), "[", 1)
  first_part_cols <- sapply(str_split(col_names, ";"), "[", 1)
  
  row_index <- which(first_part_rows == team)
  col_index <- which(first_part_cols == team)
  
  markov_chain_matrix_cleansed <- markov_chain_matrix[row_index, col_index]

  return(markov_chain_matrix_cleansed)
}

4. Create a markov chain matrices for each team (home and away) of all games.

Home_team_markov_matrices <- function(data, team) {
  # Filter the data for the specified home team
  Home_team_data <- data |> 
    filter(home_abbreviation == team)
  
  # Split the data by gameid
  split_data <- split(Home_team_data, Home_team_data$game_id)
  
  # Apply the markov_chain_cleansing function to each game
  markov_chain_list <- lapply(split_data, function(game_data) {
    markov_chain_cleansing(game_data, team)
  })
  
  return(markov_chain_list)
}

Away_team_markov_matrices <- function(data, team) {
  # Filter the data for the specified home team
  Away_team_data <- data |> 
    filter(away_abbreviation == team)
  
  # Split the data by gameid
  split_data <- split(Away_team_data, Away_team_data$game_id)
  
  # Apply the markov_chain_cleansing function to each game
  markov_chain_list <- lapply(split_data, function(game_data) {
    markov_chain_cleansing(game_data, team)
  })
  
  return(markov_chain_list)
}

List of markov chain matrices for all the games of home and away teams.

Home_team_markov_matrices <- Home_team_markov_matrices(NHL_2023_Possessions, home_team)
Away_team_markov_matrices <- Away_team_markov_matrices(NHL_2023_Possessions, away_team)

Simplify them to remove all sum 0 rows and columns

simplify_matrix <- function(markov_chain_matrices){
  xx <- markov_chain_matrices[as.logical(rowSums(markov_chain_matrices !=0,na.rm = TRUE)), ]
  xx <- xx[,as.logical(colSums(xx!=0,na.rm=TRUE))]
  return(xx)
}

Home_team_markov_matrices <- lapply(Home_team_markov_matrices, simplify_matrix)
Away_team_markov_matrices <- lapply(Away_team_markov_matrices, simplify_matrix)

Home Team Work Flow.

5(a). Combine the Markov Chains or each team into one large matrix.

5.1 Create master matrix that contains all options and assign 0’s.

5.2 Function extract the dates of all the games played by home.

5.3 Assign the weights for each game inversely proportional to date of the game.

5.4 Multiply the weight for each game markov matrix.

5.5 Add all markov chain matrices to create a final markov chain matrix.

chome <- unique(unlist(lapply(Home_team_markov_matrices, colnames)))
rhome <- unique(unlist(lapply(Home_team_markov_matrices, rownames)))

temp_home_master_markov <- matrix(0,nrow = length(rhome), ncol = length(chome), dimnames = list(rhome,chome))
home_master_markov <- rep(list(temp_home_master_markov),length(Home_team_markov_matrices))

temp_index_home_list <- lapply(Home_team_markov_matrices, function(x) outer(rownames(x), colnames(x), paste))
index_home_list <- lapply(temp_index_home_list, function(x) outer(rhome, chome, paste) %in% x)

home_team_game_dates <- function(data, team) {
  # Filter the data for the specified home team
  Home_team_data <- data |> 
    filter(home_abbreviation == team ,ordinal_num == '1st')
  
  
  Home_team_data <- Home_team_data |>
    mutate(date_only = as.Date(date_time)) |>
    select(game_id,date_only) |>
    group_by(game_id)
  
  
  return(unique(Home_team_data$date_only))
}

today <- Sys.Date()
home_team_game_dates <- home_team_game_dates(NHL_2023_Possessions,home_team)
home_team_game_weights <- exp(as.numeric(home_team_game_dates-today))/sum(exp(as.numeric(home_team_game_dates-today)))

for (element in 1:length(home_master_markov)) {
  for (row in 1:nrow(Home_team_markov_matrices[[element]])) {
    row_name <- rownames(Home_team_markov_matrices[[element]])[row]
    column_names <- names(which(Home_team_markov_matrices[[element]][row,] != 0))
    for (column in 1:length(column_names)) {
      home_master_markov[[element]][which(rownames(home_master_markov[[element]])==row_name)
                                    ,which(colnames(home_master_markov[[element]])==column_names[column])] <-
        Home_team_markov_matrices[[element]][row,which(colnames(Home_team_markov_matrices[[element]])==column_names[column])] * home_team_game_weights[element] 
      
    }
    
  }
  
}

home_final_markov_chain <- Reduce("+",home_master_markov )

home_final_markov_chain <- simplify_matrix(home_final_markov_chain)

Away Team Work Flow.

5(b). Combine the Markov Chains or each team into one large matrix.

5.1 Create master matrix that contains all options and assign 0’s.

5.2 Function extract the dates of all the games played by away.

5.3 Assign the weights for each game inversely proportional to date of the game.

5.4 Multiply the weight for each game markov matrix.

5.5 Add all markov chain matrices to create a final markov chain matrix.

caway <- unique(unlist(lapply(Away_team_markov_matrices, colnames)))
raway <- unique(unlist(lapply(Away_team_markov_matrices, rownames)))

temp_away_master_markov <- matrix(0,nrow = length(raway), ncol = length(caway), dimnames = list(raway,caway))
away_master_markov <- rep(list(temp_away_master_markov),length(Away_team_markov_matrices))

temp_index_away_list <- lapply(Away_team_markov_matrices, function(x) outer(rownames(x), colnames(x), paste))
index_away_list <- lapply(temp_index_away_list, function(x) outer(raway, caway, paste) %in% x)

away_team_game_dates <- function(data, team) {
  # Filter the data for the specified home team
  Away_team_data <- data |> 
    filter(away_abbreviation == team ,ordinal_num == '1st')
  
  
  Away_team_data <- Away_team_data |>
    mutate(date_only = as.Date(date_time)) |>
    select(game_id,date_only) |>
    group_by(game_id)
  
  
  return(unique(Away_team_data$date_only))
}

away_team_game_dates <- away_team_game_dates(NHL_2023_Possessions,away_team)
away_team_game_weights <- exp(as.numeric(away_team_game_dates-today))/sum(exp(as.numeric(away_team_game_dates-today)))

for (element in 1:length(away_master_markov)) {
  for (row in 1:nrow(Away_team_markov_matrices[[element]])) {
    row_name <- rownames(Away_team_markov_matrices[[element]])[row]
    column_names <- names(which(Away_team_markov_matrices[[element]][row,] != 0))
    for (column in 1:length(column_names)) { 
      away_master_markov[[element]][which(rownames(away_master_markov[[element]])==row_name)
                                    ,which(colnames(away_master_markov[[element]])==column_names[column])] <-
        Away_team_markov_matrices[[element]][row,which(colnames(Away_team_markov_matrices[[element]])==column_names[column])] *away_team_game_weights[element] 
    }
    
  }
}

away_final_markov_chain <- Reduce("+",away_master_markov)

away_final_markov_chain <- simplify_matrix(away_final_markov_chain)

all_home_colnames <- unlist(lapply(Home_team_markov_matrices, rownames))
home_colnames_df <- data.frame(column_name = all_home_colnames, stringsAsFactors = FALSE)

home_final_table <- home_colnames_df |>
  count(column_name, name = "freq") 
home_final_table <- home_final_table[home_final_table$column_name %in% rownames(home_final_markov_chain),]
home_final_table <- home_final_table |>
  arrange(desc(freq)) |>
  mutate(prob = freq/sum(freq),
         cumprob = cumsum(prob))

all_away_colnames <- unlist(lapply(Away_team_markov_matrices, rownames))
away_colnames_df <- data.frame(column_name = all_away_colnames, stringsAsFactors = FALSE)

away_final_table <- away_colnames_df |>
  count(column_name, name = "freq") 
away_final_table <- away_final_table[away_final_table$column_name %in% rownames(away_final_markov_chain),]
away_final_table <- away_final_table |>
  arrange(desc(freq)) |>
  mutate(prob = freq/sum(freq),
         cumprob = cumsum(prob))

Do these updates:
1. After goal the next event should be faceoff.

ss <- function(home_final_markov_chain, away_final_markov_chain, home_final_table, away_final_table, away_team, home_team){
  
  simulated_final_game <- data.frame(
    home_team = character(),
    away_team = character(),
    event_team = character(),
    event = character(),
    PossessionRetained = integer(),
    points_outcome = integer(),
    stringsAsFactors = FALSE
  )
  
  # Function to select and add a faceoff play
  add_faceoff <- function(sim_game, home_table, away_table, home_team, away_team){
    options <- c('home','away')
    select_faceoff_team <- sample(options, 1)
    if(select_faceoff_team == 'home'){
      faceoff_row <- home_table[grepl("faceoff", home_table$column_name, ignore.case = TRUE), ]
      first_play_parsed <- c(home_team, away_team, 
                             unlist(str_split(as.character(faceoff_row$column_name), ';')))
    }
    else{
      faceoff_row <- away_table[grepl("faceoff", away_table$column_name, ignore.case = TRUE), ]
      first_play_parsed <- c(home_team, away_team, 
                             unlist(str_split(as.character(faceoff_row$column_name), ';')))
    }
    if(length(first_play_parsed) != ncol(sim_game)){
      stop("Mismatch in first_play_parsed length")
    }
    first_play_df <- t(as.data.frame(first_play_parsed, stringsAsFactors = FALSE))
    colnames(first_play_df) <- colnames(sim_game)
    rownames(first_play_df) <- NULL
    sim_game <- rbind(sim_game, first_play_df)
    return(sim_game)
  }
  
  # Initialize the game with a faceoff
  simulated_final_game <- add_faceoff(simulated_final_game, home_final_table, away_final_table,
                                      home_team, away_team)
  
  while (nrow(simulated_final_game) <= 200) {
    # Find last row
    last_row <- simulated_final_game[nrow(simulated_final_game), ]
    
    # Check if the last event was a goal
    if(tolower(last_row$event) == "goal"){
      # Start from faceoff
      simulated_final_game <- add_faceoff(simulated_final_game, home_final_table,
                                          away_final_table, home_team, away_team)
      
      # Proceed to next iteration
      next
    }
    
    # Concatenate the relevant columns to form the state
    last_row_concatenated <- paste(as.character(last_row[3:ncol(last_row)]), sep=';', 
                                   collapse = ';')
    
    # Determine the next play based on possession and event team
    if(last_row$PossessionRetained == 1 & last_row$event_team == home_team){
      # Home team retains possession
      markov_chain <- home_final_markov_chain
      current_markov <- "home"
    } 
    else if(last_row$PossessionRetained == 0 & last_row$event_team == home_team) {
      # Possession changes to away team
      current_table <- away_final_table
    }
    else if(last_row$PossessionRetained == 1 & last_row$event_team == away_team) {
      # Away team retains possession
      markov_chain <- away_final_markov_chain
      current_markov <- "away"
    } 
    else if(last_row$PossessionRetained == 0 & last_row$event_team == away_team) {
      # Possession changes to home team
      current_table <- home_final_table
    }
    
    # Proceed only if last event was not a goal
    if(tolower(last_row$event) != "goal"){
      if(last_row$PossessionRetained == 1){
        # Find probability of the next event from the current markov chain
        row_to_inspect <- which(rownames(markov_chain) == last_row_concatenated)
        if(length(row_to_inspect) == 0){
          stop(paste("State not found in", current_markov, "markov chain:", last_row_concatenated))
        }
        options_to_inspect <- as.data.frame(markov_chain[row_to_inspect, ], stringsAsFactors = FALSE)
        options_to_inspect <- options_to_inspect[which(markov_chain[row_to_inspect, ] > 0), , drop = FALSE]
        colnames(options_to_inspect) <- c('freq')
        options_to_inspect$prob <- options_to_inspect$freq / sum(options_to_inspect$freq)
        options_to_inspect <- options_to_inspect[order(-options_to_inspect$prob), ]
        options_to_inspect$cumprob <- round(cumsum(options_to_inspect$prob), 6)
        
        # Randomly select next play
        next_play <- runif(1)
        selected_state <- rownames(options_to_inspect)[min(which(options_to_inspect$cumprob >= next_play))]
        next_play_parsed <- c(home_team, away_team, unlist(str_split(as.character(selected_state), ';')))
        
        # Add the selected play to the simulation
        next_play_df <- t(as.data.frame(next_play_parsed, stringsAsFactors = FALSE))
        colnames(next_play_df) <- colnames(simulated_final_game)
        rownames(next_play_df) <- NULL
        simulated_final_game <- rbind(simulated_final_game, next_play_df)
      } 
      else {
        # Possession changes to the opponent team
        # Select next play based on the opponent's table cumulative probabilities
        next_play <- runif(1)
        selected_row <- which(next_play <= current_table$cumprob)[1]
        if(is.na(selected_row)){
          stop("No valid play found for the opponent's cumulative probability.")
        }
        next_play_parsed <- c(home_team, away_team, unlist(str_split(as.character(current_table$column_name[selected_row]), ';')))
        
        # Add the selected play to the simulation
        next_play_df <- t(as.data.frame(next_play_parsed, stringsAsFactors = FALSE))
        colnames(next_play_df) <- colnames(simulated_final_game)
        rownames(next_play_df) <- NULL
        simulated_final_game <- rbind(simulated_final_game, next_play_df)
      }
    }
  }
  
  Home_Shots_taken <- simulated_final_game |>
    filter(event_team == home_team & 
             event %in% c("Shot", "Missed Shot", "Blocked Shot", "Goal"))
  Away_Shots_taken <- simulated_final_game |>
    filter(event_team == away_team & 
             event %in% c("Shot", "Missed Shot", "Blocked Shot", "Goal"))
  
  Home_goals <- sum(as.numeric(simulated_final_game$points_outcome[
    which(simulated_final_game$event_team == home_team)]))
  Away_goals <- sum(as.numeric(simulated_final_game$points_outcome[
    which(simulated_final_game$event_team == away_team)]))
          
  goal_diff <- Home_goals - Away_goals
  
  # Return Data frame 
  return(data.frame(
  goal_diff = goal_diff,
  Home_goals = Home_goals,
  Away_goals = Away_goals,
  Home_shots = nrow(Home_Shots_taken),
  Away_shots = nrow(Away_Shots_taken),
  stringsAsFactors = FALSE
))
  
}

Key Points to Note

Faceoff Selection: After a goal, the simulation resets by selecting a faceoff play, mimicking real-game scenarios where play resumes from a faceoff after a goal.
Possession Logic: The possession logic remains intact. If possession is retained or changes, the simulation continues based on the Markov chains.
Flexibility: The helper function add_faceoff allows for easy modifications in the future, such as changing how faceoffs are handled or adding more complexity.
Error Handling: Proper error messages help in debugging issues related to state mismatches or invalid probability selections.

Simulate match ups 100x

results_list <- replicate(
  n = 100,
  expr = ss(
    home_final_markov_chain = home_final_markov_chain,
    away_final_markov_chain = away_final_markov_chain,
    home_final_table = home_final_table,
    away_final_table = away_final_table,
    away_team = away_team,
    home_team = home_team
  ),
  simplify = FALSE
)

Results_df <- function(result,n){
  results_df <- bind_rows(result)
  results_df <- results_df %>%
    mutate(simulation = 1:n)
  results_df <- results_df %>%
    select(simulation, everything())
    return(results_df)
}

Result_100 <- Results_df(results_list,100)

Result_100 <- Result_100 |>
  mutate(result = ifelse(goal_diff > 0, 'win',
                         ifelse(goal_diff == 0, 'tie', 'loss')))

data <- Result_100

# 1. Distribution of Goal Difference
# Create a histogram for goal_diff
ggplot(data, aes(x = goal_diff)) +
  geom_histogram(binwidth = 1, fill = "saddlebrown", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Goal Difference - 100x (NSH vs SJS)",
       x = "Goal Difference",
       y = "Frequency") +
  theme_classic()

# 2. Pie Chart for Result Variable
# Create a table for result variable
result_table <- data %>% 
  count(result) %>% 
  mutate(percentage = n / sum(n))

# Create the pie chart
library(ggplot2)
library(scales)

# Define a custom brown color palette
brown_palette <- c("win" = "#8B4513",   # SaddleBrown
                   "tie" = "#A0522D",   # Sienna
                   "loss" = "#D2B48C")  # Tan

# Create the pie chart with the brown palette
ggplot(result_table, aes(x = "", y = percentage, fill = as.factor(result))) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_y_continuous(labels = percent) +
  geom_text(aes(label = scales::percent(percentage, accuracy = 0.1)), 
            position = position_stack(vjust = 0.5), 
            color = "white", 
            size = 5) +
  scale_fill_manual(values = brown_palette) +
  labs(title = "Pie Chart of Results",
       fill = "Result") +
  theme_classic() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid  = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

# 3. Shot Conversion Rate Bar Chart
# Calculate shot conversion rates
conversion_rates <- data %>%
  summarise(
    Home_Conversion = mean(Home_goals) / mean(Home_shots),
    Away_Conversion = mean(Away_goals) / mean(Away_shots)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Team", values_to = "ConversionRate")
brown_palette <- c("Home_Conversion" = "#8B4513",   # SaddleBrown
                   "Away_Conversion" = "#D2B48C")

# Create the bar chart
ggplot(conversion_rates, aes(x = Team, y = ConversionRate, fill = Team)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_text(aes(label = scales::percent(ConversionRate, accuracy = 0.01)), vjust = -0.5) +
  labs(title = "Shot Conversion Rate for Home and Away Teams - 100x",
       x = "Team",
       y = "Conversion Rate") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.01)) +
  scale_fill_manual(values = brown_palette) +  # Apply the custom brown palette

  theme_classic()

Simulation 500x

results_list_500 <- replicate(
  n = 500,
  expr = ss(
    home_final_markov_chain = home_final_markov_chain,
    away_final_markov_chain = away_final_markov_chain,
    home_final_table = home_final_table,
    away_final_table = away_final_table,
    away_team = away_team,
    home_team = home_team
  ),
  simplify = FALSE
)
Result_500 <- Results_df(results_list_500,500)

Result_500 <- Result_500 |>
  mutate(result = ifelse(goal_diff > 0, 'win',
                         ifelse(goal_diff == 0, 'tie', 'loss')))

data <- Result_500

# 1. Distribution of Goal Difference
# Create a histogram for goal_diff
ggplot(data, aes(x = goal_diff)) +
  geom_histogram(binwidth = 1, fill = "saddlebrown", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Goal Difference - 500x (NSH vs SJS)",
       x = "Goal Difference",
       y = "Frequency") +
  theme_classic()

# 2. Pie Chart for Result Variable
# Create a table for result variable
result_table <- data %>% 
  count(result) %>% 
  mutate(percentage = n / sum(n))

# Create the pie chart
library(ggplot2)
library(scales)

# Define a custom brown color palette
brown_palette <- c("win" = "#8B4513",   # SaddleBrown
                   "tie" = "#A0522D",   # Sienna
                   "loss" = "#D2B48C")  # Tan

# Create the pie chart with the brown palette
ggplot(result_table, aes(x = "", y = percentage, fill = as.factor(result))) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_y_continuous(labels = percent) +
  geom_text(aes(label = scales::percent(percentage, accuracy = 0.1)), 
            position = position_stack(vjust = 0.5), 
            color = "white", 
            size = 5) +
  scale_fill_manual(values = brown_palette) +
  labs(title = "Pie Chart of Results",
       fill = "Result") +
  theme_classic() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid  = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

# 3. Shot Conversion Rate Bar Chart
# Calculate shot conversion rates
conversion_rates <- data %>%
  summarise(
    Home_Conversion = mean(Home_goals) / mean(Home_shots),
    Away_Conversion = mean(Away_goals) / mean(Away_shots)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Team", values_to = "ConversionRate")
brown_palette <- c("Home_Conversion" = "#8B4513",   # SaddleBrown
                   "Away_Conversion" = "#D2B48C")

# Create the bar chart
ggplot(conversion_rates, aes(x = Team, y = ConversionRate, fill = Team)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_text(aes(label = scales::percent(ConversionRate, accuracy = 0.01)), vjust = -0.5) +
  labs(title = "Shot Conversion Rate for Home and Away Teams - 500x",
       x = "Team",
       y = "Conversion Rate") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.01)) +
  scale_fill_manual(values = brown_palette) +  # Apply the custom brown palette

  theme_classic()

Simulation 1000x

results_list_1000 <- replicate(
  n = 1000,
  expr = ss(
    home_final_markov_chain = home_final_markov_chain,
    away_final_markov_chain = away_final_markov_chain,
    home_final_table = home_final_table,
    away_final_table = away_final_table,
    away_team = away_team,
    home_team = home_team
  ),
  simplify = FALSE
)
Result_1000 <- Results_df(results_list_1000,1000)

Result_1000 <- Result_1000 |>
  mutate(result = ifelse(goal_diff > 0, 'win',
                         ifelse(goal_diff == 0, 'tie', 'loss')))

data <- Result_1000


# 1. Distribution of Goal Difference
# Create a histogram for goal_diff
ggplot(data, aes(x = goal_diff)) +
  geom_histogram(binwidth = 1, fill = "saddlebrown", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Goal Difference - 1000x (NSH vs SJS)",
       x = "Goal Difference",
       y = "Frequency") +
  theme_classic()

# 2. Pie Chart for Result Variable
# Create a table for result variable
result_table <- data %>% 
  count(result) %>% 
  mutate(percentage = n / sum(n))

# Create the pie chart
library(ggplot2)
library(scales)

# Define a custom brown color palette
brown_palette <- c("win" = "#8B4513",   # SaddleBrown
                   "tie" = "#A0522D",   # Sienna
                   "loss" = "#D2B48C")  # Tan

# Create the pie chart with the brown palette
ggplot(result_table, aes(x = "", y = percentage, fill = as.factor(result))) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_y_continuous(labels = percent) +
  geom_text(aes(label = scales::percent(percentage, accuracy = 0.1)), 
            position = position_stack(vjust = 0.5), 
            color = "white", 
            size = 5) +
  scale_fill_manual(values = brown_palette) +
  labs(title = "Pie Chart of Results",
       fill = "Result") +
  theme_classic() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid  = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

# 3. Shot Conversion Rate Bar Chart
# Calculate shot conversion rates
conversion_rates <- data %>%
  summarise(
    Home_Conversion = mean(Home_goals) / mean(Home_shots),
    Away_Conversion = mean(Away_goals) / mean(Away_shots)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Team", values_to = "ConversionRate")
brown_palette <- c("Home_Conversion" = "#8B4513",   # SaddleBrown
                   "Away_Conversion" = "#D2B48C")

# Create the bar chart
ggplot(conversion_rates, aes(x = Team, y = ConversionRate, fill = Team)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_text(aes(label = scales::percent(ConversionRate, accuracy = 0.01)), vjust = -0.5) +
  labs(title = "Shot Conversion Rate for Home and Away Teams - 1000x",
       x = "Team",
       y = "Conversion Rate") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.01)) +
  scale_fill_manual(values = brown_palette) +  # Apply the custom brown palette

  theme_classic()

results_list_5000 <- replicate(
  n = 5000,
  expr = ss(
    home_final_markov_chain = home_final_markov_chain,
    away_final_markov_chain = away_final_markov_chain,
    home_final_table = home_final_table,
    away_final_table = away_final_table,
    away_team = away_team,
    home_team = home_team
  ),
  simplify = FALSE
)
Result_5000 <- Results_df(results_list_5000,5000)

Result_5000 <- Result_5000 |>
  mutate(result = ifelse(goal_diff > 0, 'win',
                         ifelse(goal_diff == 0, 'tie', 'loss')))

data <- Result_5000


# 1. Distribution of Goal Difference
# Create a histogram for goal_diff
ggplot(data, aes(x = goal_diff)) +
  geom_histogram(binwidth = 1, fill = "saddlebrown", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Goal Difference - 5000x (NSH vs SJS)",
       x = "Goal Difference",
       y = "Frequency") +
  theme_classic()

# 2. Pie Chart for Result Variable
# Create a table for result variable
result_table <- data %>% 
  count(result) %>% 
  mutate(percentage = n / sum(n))

# Create the pie chart
library(ggplot2)
library(scales)

# Define a custom brown color palette
brown_palette <- c("win" = "#8B4513",   # SaddleBrown
                   "tie" = "#A0522D",   # Sienna
                   "loss" = "#D2B48C")  # Tan

# Create the pie chart with the brown palette
ggplot(result_table, aes(x = "", y = percentage, fill = as.factor(result))) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_y_continuous(labels = percent) +
  geom_text(aes(label = scales::percent(percentage, accuracy = 0.1)), 
            position = position_stack(vjust = 0.5), 
            color = "white", 
            size = 5) +
  scale_fill_manual(values = brown_palette) +
  labs(title = "Pie Chart of Results",
       fill = "Result") +
  theme_classic() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid  = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

# 3. Shot Conversion Rate Bar Chart
# Calculate shot conversion rates
conversion_rates <- data %>%
  summarise(
    Home_Conversion = mean(Home_goals) / mean(Home_shots),
    Away_Conversion = mean(Away_goals) / mean(Away_shots)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Team", values_to = "ConversionRate")
brown_palette <- c("Home_Conversion" = "#8B4513",   # SaddleBrown
                   "Away_Conversion" = "#D2B48C")

# Create the bar chart
ggplot(conversion_rates, aes(x = Team, y = ConversionRate, fill = Team)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_text(aes(label = scales::percent(ConversionRate, accuracy = 0.01)), vjust = -0.5) +
  labs(title = "Shot Conversion Rate for Home and Away Teams - 5000x",
       x = "Team",
       y = "Conversion Rate") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.01)) +
  scale_fill_manual(values = brown_palette) +  # Apply the custom brown palette

  theme_classic()

NHL - Markov Chain

Anurag

2024-09-24

Project NHL MCMC Simulation

Contents:

1. Importing required libraries

2. Load NHL play by play season (last 5 seasons excluding 2020 and 2021 seasons)

3. Making sure all seasons have similar column names.

Loading required libraries

Reading 2022-23 season play by play NHL games

Make sure number of columns are same across all the seasons

Work flow:

2. Add the possession_retained column

2. Select Home and Away Teams

3. For each game create a Markov Chain probabilities.

Summary

4. Create a markov chain matrices for each team (home and away) of all games.

List of markov chain matrices for all the games of home and away teams.

Simplify them to remove all sum 0 rows and columns

Home Team Work Flow.

5(a). Combine the Markov Chains or each team into one large matrix.

5.1 Create master matrix that contains all options and assign 0’s.

5.2 Function extract the dates of all the games played by home.

5.3 Assign the weights for each game inversely proportional to date of the game.

5.4 Multiply the weight for each game markov matrix.

5.5 Add all markov chain matrices to create a final markov chain matrix.

Away Team Work Flow.

5(b). Combine the Markov Chains or each team into one large matrix.

5.1 Create master matrix that contains all options and assign 0’s.

5.2 Function extract the dates of all the games played by away.

5.3 Assign the weights for each game inversely proportional to date of the game.

5.4 Multiply the weight for each game markov matrix.

5.5 Add all markov chain matrices to create a final markov chain matrix.

Key Points to Note

Simulate match ups 100x

Simulation 500x

Simulation 1000x