#Package Loading
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
#Loading the MoneyPuck Shot Dataset
mpd = read.csv('./shots_2024.csv')

#adding descriptors to dataframe

#Load the data dictionary (update with your file path)
data_dict <- read.csv("C:/Users/Logan/Downloads/MoneyPuck_Shot_Data_Dictionary (1) (1).csv")

#Iterate through the data dictionary and assign labels (from ChatGPT -- QOL Step)

for (i in 1:nrow(data_dict)) {
  column_name <- data_dict$Variable[i]
  description <- data_dict$Definition[i]
  
  if (column_name %in% colnames(mpd)) {
    label(mpd[[column_name]]) <- description
  }
}

mpd <- mpd %>% 
  filter(playerPositionThatDidEvent != "" & 
         !is.na(playerPositionThatDidEvent) & 
         playerPositionThatDidEvent != "G")

Variable Selection

Continuous Response: XGoal Categorical: Shooter Position Continuous Explanatory: Arena Adj. Shot Angle

ANOVA Test: xGoal vs Shooter Position

H0: There is no significant difference in expected goals (XGoal) across shooter positions.

H1: At least one shooter position has a significantly different XGoal value.

# Run ANOVA test
anova_model <- aov(xGoal ~ playerPositionThatDidEvent, data = mpd)
summary(anova_model)
##                               Df Sum Sq Mean Sq F value Pr(>F)    
## playerPositionThatDidEvent     3   35.6   11.86    1157 <2e-16 ***
## Residuals                  58547  599.9    0.01                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(mpd, aes(x = playerPositionThatDidEvent, y = xGoal)) +
  geom_boxplot() +
  labs(title = "XGoal Distribution by Shooter Position", 
       x = "Shooter Position", 
       y = "Expected Goals") +
  theme_minimal()

The ANOVA test results reveal a significant difference in expected goals (xGoal) across different shooter positions, with a very low p-value (< 2e-16), indicating that shooter position influences the number of expected goals. Box plots suggest that defensemen likely contribute significantly lower xGoal values compared to other positions, such as forwards. This variation is likely due to the many factors influenced by position, such as the types of shots taken, shooting locations, and defensive responsibilities. While the ANOVA results provide strong evidence of this difference, follow-up testing, such as Tukey’s HSD, is necessary to confirm exactly which pairs of positions are significantly different from each other. Further investigation into shot types and locations could offer additional insights into these differences in expected goals across positions.

Regression: Shot Distance and xGoal

# Run linear regression model
lm_model <- lm(xGoal ~ mpd$shotDistance, data = mpd)
summary(lm_model)
## 
## Call:
## lm(formula = xGoal ~ mpd$shotDistance, data = mpd)
## 
## Residuals:
## The probability the shot will be a goal. Also known as "Expected Goals" 
##      Min       1Q   Median       3Q      Max 
## -0.13774 -0.04094 -0.01844  0.01179  0.86497 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.530e-01  7.685e-04   199.1   <2e-16 ***
## mpd$shotDistance -2.353e-03  1.939e-05  -121.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09313 on 58549 degrees of freedom
## Multiple R-squared:  0.2009, Adjusted R-squared:  0.2009 
## F-statistic: 1.472e+04 on 1 and 58549 DF,  p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(mpd, aes(x = shotDistance, y = xGoal)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "XGoal vs. Arena Adjusted Shot Angle", 
       x = "Shot Angle", 
       y = "Expected Goals") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

The analysis of shot distance as a predictor for expected goals (xGoal) reveals a statistically significant relationship, with a p-value less than 2e-16 for both the intercept and shot distance coefficient. The coefficient for shot distance is -0.002353, suggesting that as the shot distance increases, the expected probability of a goal slightly decreases. The R-squared value of 0.2009 indicates that shot distance accounts for only about 20% of the variability in xGoal, pointing to a weak linear relationship. This suggests that while shot distance has some influence on expected goals, it is not a strong enough predictor to explain much of the variation in the dataset. Furthermore, the lack of a strong linear relationship across other continuous variables in the dataset implies that expected goals are likely driven by a combination of factors beyond just shot distance, indicating that the dataset may require more complex modeling techniques, such as incorporating non-linear relationships or exploring other influential factors, to better understand the predictors of xGoal.