#Package Loading
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(feasts)
## Warning: package 'feasts' was built under R version 4.4.3
## Loading required package: fabletools
## Warning: package 'fabletools' was built under R version 4.4.3
library(fable)
## Warning: package 'fable' was built under R version 4.4.3
library(lubridate)
#Loading the MoneyPuck Shot Dataset
mpd = read.csv("C:/Users/Logan/Downloads/shots_2024_1/shots_2024.csv")
#adding descriptors to dataframe
#Load the data dictionary (update with your file path)
#data_dict <- read.csv("C:/Users/Logan/Downloads/MoneyPuck_Shot_Data_Dictionary (1) (1).csv")
#Iterate through the data dictionary and assign labels (from ChatGPT -- QOL Step)
#for (i in 1:nrow(data_dict)) {
#column_name <- data_dict$Variable[i]
#description <- data_dict$Definition[i]
#if (column_name %in% colnames(mpd)) {
#label(mpd[[column_name]]) <- description
#}
#}
For this analysis, we will look at when goals occur, using time and goal!
# Convert 'goal' to numeric (1 for goal, 0 for no goal)
mpd_processed <- mpd %>%
mutate(is_goal = as.numeric(goal))
# Aggregate by 30-second chunks
mpd_aggregated_30sec <- mpd_processed %>%
mutate(half_minute = floor(time / 30)) %>% # Convert seconds to 30-second intervals
group_by(half_minute) %>%
summarise(
total_shots = n(),
total_goals = sum(is_goal)
) %>%
ungroup()
# Create a tsibble object
hockey_goals_30sec_ts <- as_tsibble(mpd_aggregated_30sec, index = half_minute)
# Calculate approximate period breaks in 30-second intervals for regulation time
period_break_1_30sec <- (20 * 60) / 30 # 40 intervals
period_break_2_30sec <- (40 * 60) / 30 # 80 intervals
max_half_minute_reg <- floor(3600 / 30) - 1 # Last 30-second interval in regulation
ggplot(mpd_aggregated_30sec, aes(x = half_minute)) +
geom_line(aes(y = total_shots, color = "Total Shots")) +
geom_area(aes(y = total_goals * (max(mpd_aggregated_30sec$total_shots) / max(mpd_aggregated_30sec$total_goals, na.rm = TRUE)), fill = "Total Goals (Scaled)"), alpha = 0.3) +
geom_vline(xintercept = period_break_1_30sec + 0.5, linetype = "dashed", color = "grey50") +
geom_vline(xintercept = period_break_2_30sec + 0.5, linetype = "dashed", color = "grey50") +
scale_color_manual(values = c("Total Shots" = "blue")) +
scale_fill_manual(values = c("Total Goals (Scaled)" = "red")) +
scale_y_continuous(
name = "Total Shots per 30 Seconds",
sec.axis = sec_axis(~ . * (max(mpd_aggregated_30sec$total_goals, na.rm = TRUE) / max(mpd_aggregated_30sec$total_shots, na.rm = TRUE)), name = "Total Goals per 30 Seconds")
) +
labs(title = "Goals as Shaded Function of Shots (Regulation Time)",
x = "Time (30-Second Intervals from Game Start)",
color = "Metric",
fill = "Metric") +
annotate("text", x = period_break_1_30sec / 2, y = max(mpd_aggregated_30sec$total_shots) * 0.9, label = "Period 1", color = "black") +
annotate("text", x = period_break_1_30sec + (period_break_2_30sec - period_break_1_30sec) / 2, y = max(mpd_aggregated_30sec$total_shots) * 0.9, label = "Period 2", color = "black") +
annotate("text", x = period_break_2_30sec + (max_half_minute_reg - period_break_2_30sec) / 2, y = max(mpd_aggregated_30sec$total_shots) * 0.9, label = "Period 3", color = "black") +
xlim(0, max_half_minute_reg + 1) + # Limit x-axis to regulation time
theme_minimal()
## Warning: Use of `mpd_aggregated_30sec$total_shots` is discouraged.
## ℹ Use `total_shots` instead.
## Warning: Use of `mpd_aggregated_30sec$total_goals` is discouraged.
## ℹ Use `total_goals` instead.
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_line()`).
Goals Tend to Occur During Shot Bursts: While not a perfect correlation, the shaded red areas (goals) often appear during or shortly after peaks in the blue shot line. This makes intuitive sense, as more shots generally increase the opportunity for goals.
Potential Decrease in Shot Volume Over Time: Visually, the peak height of the blue shot line seems to be somewhat higher in the first period compared to the third period, suggesting a possible trend of decreasing shot volume as the game progresses. Additionally, within periods, shots tend to decrease over the course of a period. However, this needs to be confirmed with the linear regression analysis.
Empty Net Presence: Empty net goals are clearly represented on the graph around the 4 minutes left mark of the third period, where there is a huge spike in goals.
###Linear Regression
# Extract the index as a numeric vector
time_index_vector <- hockey_goals_30sec_ts %>% pull(half_minute)
# Add it as a new column
hockey_goals_30sec_ts <- hockey_goals_30sec_ts %>%
mutate(time_index = time_index_vector)
# Add it as a new column using tibble's as_tibble to ensure compatibility
hockey_goals_30sec_ts <- hockey_goals_30sec_ts %>%
as_tibble() %>%
mutate(time_index = as.numeric(time_index_vector)) %>%
as_tsibble(index = half_minute) # Convert back to tsibble, keeping the original index
# Linear regression for total shots
trend_shots <- lm(total_shots ~ time_index, data = hockey_goals_30sec_ts)
summary(trend_shots)
##
## Call:
## lm(formula = total_shots ~ time_index, data = hockey_goals_30sec_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -192.98 -22.55 15.68 38.03 138.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 322.1046 11.1933 28.776 < 2e-16 ***
## time_index -0.8199 0.1500 -5.466 2.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.18 on 128 degrees of freedom
## Multiple R-squared: 0.1892, Adjusted R-squared: 0.1829
## F-statistic: 29.87 on 1 and 128 DF, p-value: 2.318e-07
# Linear regression for total goals
trend_goals <- lm(total_goals ~ time_index, data = hockey_goals_30sec_ts)
summary(trend_goals)
##
## Call:
## lm(formula = total_goals ~ time_index, data = hockey_goals_30sec_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.790 -3.617 -0.194 2.485 43.281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.51850 1.38598 13.361 <2e-16 ***
## time_index 0.01009 0.01857 0.543 0.588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.947 on 128 degrees of freedom
## Multiple R-squared: 0.0023, Adjusted R-squared: -0.005495
## F-statistic: 0.2951 on 1 and 128 DF, p-value: 0.5879
# Plot with trend lines (same as before)
ggplot(hockey_goals_30sec_ts, aes(x = half_minute)) +
geom_point(aes(y = total_shots, color = "Total Shots"), alpha = 0.5) +
geom_smooth(aes(y = total_shots, color = "Total Shots", x = half_minute), method = "lm", se = FALSE) +
geom_point(aes(y = total_goals * (max(hockey_goals_30sec_ts$total_shots) / max(hockey_goals_30sec_ts$total_goals, na.rm = TRUE)), color = "Total Goals (Scaled)"), alpha = 0.5) +
geom_smooth(aes(y = total_goals * (max(hockey_goals_30sec_ts$total_shots) / max(hockey_goals_30sec_ts$total_goals, na.rm = TRUE)), color = "Total Goals (Scaled)", x = half_minute), method = "lm", se = FALSE) +
scale_color_manual(values = c("Total Shots" = "blue", "Total Goals (Scaled)" = "red")) +
scale_y_continuous(
name = "Total Shots per 30 Seconds",
sec.axis = sec_axis(~ . * (max(hockey_goals_30sec_ts$total_goals, na.rm = TRUE) / max(hockey_goals_30sec_ts$total_shots, na.rm = TRUE)), name = "Total Goals per 30 Seconds")
) +
labs(title = "Trends in Shots and Goals Over Regulation Time",
x = "Time (30-Second Intervals from Game Start)",
color = "Metric") +
theme_minimal()
## Warning: Use of `hockey_goals_30sec_ts$total_shots` is discouraged.
## ℹ Use `total_shots` instead.
## Warning: Use of `hockey_goals_30sec_ts$total_goals` is discouraged.
## ℹ Use `total_goals` instead.
## Warning: Use of `hockey_goals_30sec_ts$total_shots` is discouraged.
## ℹ Use `total_shots` instead.
## Warning: Use of `hockey_goals_30sec_ts$total_goals` is discouraged.
## ℹ Use `total_goals` instead.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Downward Trend in Total Shots: The blue line, representing the linear trend for “Total Shots per 30 Seconds,” clearly slopes downwards. This suggests that, on average, the number of shots taken per 30-second interval tends to decrease as regulation time progresses.
Slight Upward Trend in Total Goals (Scaled): The red line, representing the linear trend for “Total Goals (Scaled) per 30 Seconds,” shows a slight upward slope. This suggests that, on average (after scaling), the number of goals per 30-second interval tends to slightly increase as regulation time progresses. However, given the sparsity of goals, this trend might be less pronounced or statistically significant than the trend in shots.
summary(trend_shots)
##
## Call:
## lm(formula = total_shots ~ time_index, data = hockey_goals_30sec_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -192.98 -22.55 15.68 38.03 138.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 322.1046 11.1933 28.776 < 2e-16 ***
## time_index -0.8199 0.1500 -5.466 2.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.18 on 128 degrees of freedom
## Multiple R-squared: 0.1892, Adjusted R-squared: 0.1829
## F-statistic: 29.87 on 1 and 128 DF, p-value: 2.318e-07
summary(trend_goals)
##
## Call:
## lm(formula = total_goals ~ time_index, data = hockey_goals_30sec_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.790 -3.617 -0.194 2.485 43.281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.51850 1.38598 13.361 <2e-16 ***
## time_index 0.01009 0.01857 0.543 0.588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.947 on 128 degrees of freedom
## Multiple R-squared: 0.0023, Adjusted R-squared: -0.005495
## F-statistic: 0.2951 on 1 and 128 DF, p-value: 0.5879
Linear Regression for Total Shots:
Estimate for time_index: -0.8199. This indicates a negative slope, meaning that for each unit increase in time_index (each subsequent 30-second interval), the average number of total shots decreases by approximately 0.82 shots. This confirms the downward trend we observed visually.
Pr(>|t|) for time_index: 2.32e-07. This p-value is very small (much less than 0.05), indicating that the downward trend in total shots over time is statistically significant. It’s highly unlikely that this trend occurred by random chance.
Adjusted R-squared: 0.1829. This value suggests that approximately 18.3% of the variation in the total number of shots per 30-second interval can be explained by the linear trend over time. While statistically significant, the R-squared indicates that there’s still a lot of variability in shot counts that isn’t captured by a simple linear trend.
Linear Regression for Total Goals:
Estimate for time_index: 0.01009. This indicates a positive slope, suggesting a very slight upward trend in the average number of total goals per 30-second interval as time progresses.
Pr(>|t|) for time_index: 0.588. This p-value is much higher than 0.05, indicating that the slight upward trend in total goals is not statistically significant. We cannot conclude that there is a meaningful linear trend in the number of goals over regulation time based on this analysis. The observed upward slope could easily be due to random variation.
Adjusted R-squared: -0.005495. The negative adjusted R-squared suggests that the linear model does not explain the variance in total goals any better than a horizontal line (a model with no trend). This further supports the conclusion that there isn’t a significant linear trend in goal scoring over regulation time.
# Time series plot of total shots
hockey_goals_30sec_ts %>%
autoplot(total_shots) +
labs(title = "Total Shots Over Time (30-Second Intervals)")
# ACF and PACF of total shots
hockey_goals_30sec_ts %>%
ACF(total_shots, lag_max = 120) %>%
autoplot() +
labs(title = "ACF of Total Shots (30-Second Intervals)")
hockey_goals_30sec_ts %>%
PACF(total_shots, lag_max = 120) %>%
autoplot() +
labs(title = "PACF of Total Shots (30-Second Intervals)")
# Time series plot of total goals
hockey_goals_30sec_ts %>%
autoplot(total_goals) +
labs(title = "Total Goals Over Time (30-Second Intervals)")
# ACF and PACF of total goals
hockey_goals_30sec_ts %>%
ACF(total_goals, lag_max = 120) %>%
autoplot() +
labs(title = "ACF of Total Goals (30-Second Intervals)")
hockey_goals_30sec_ts %>%
PACF(total_goals, lag_max = 120) %>%
autoplot() +
labs(title = "PACF of Total Goals (30-Second Intervals)")
The analysis of total shots over 30-second intervals reveals a highly variable process throughout regulation time, characterized by short bursts of activity and an overall downward trend as the game progresses. Autocorrelation analysis indicates a strong immediate dependence in shot frequency, with a tendency for clusters of shots in consecutive intervals. However, no clear cyclical pattern emerges in the ACF or PACF that aligns with the 20-minute periods, suggesting that shot generation is driven more by immediate game dynamics than a predictable periodicity related to the game’s structure.
Goal scoring, a much less frequent event, exhibits a spiky temporal distribution, with a notable increase towards the end of regulation likely influenced by strategic decisions like pulling the goalie. Autocorrelation analysis of goal occurrences suggests a short-term clustering effect, where one goal increases the probability of another in the immediate aftermath, potentially followed by a brief period of reduced scoring. Similar to shots, there is no strong evidence of a cyclical pattern in goal scoring that directly corresponds to the game’s periods in the ACF or PACF, indicating that these critical events are largely driven by specific in-game situations rather than a predictable rhythm across the structured segments of play.