# NHL Data Science Project – Part 2
```{r nhl_part2, echo=TRUE, message=FALSE, warning=FALSE}
# Exploratory analysis of team-level NHL standings data:
# load and derive features, summarise, inspect distributions,
# compute correlations, and fit a simple linear model PTS ~ GD.

# Packages ----
# If you don't have these installed yet, run these once in the Console
# (not in the .Rmd):
# install.packages("tidyverse")
# install.packages("skimr")
# install.packages("summarytools")
# install.packages("corrplot")
library(tidyverse)
library(skimr)
library(summarytools)
library(corrplot)
library(ggplot2)

# Load & Process Data ----
# NOTE(review): the team-name column is expected to be literally named
# "Unnamed: 0" in NHL.csv -- confirm against the file header.
nhl <- read_csv("NHL.csv") |>
  rename(Team = `Unnamed: 0`) |>
  mutate(
    Team = as.factor(Team),
    GD = GF - GA,            # Goal Differential
    PTS_per_game = PTS / GP  # Points per Game
  )

# Dimensions & structure
dim(nhl)
glimpse(nhl)

# Missing values check (count of NA per column)
colSums(is.na(nhl))

# Summary Statistics ----
# Overall summaries
skim(nhl)
dfSummary(nhl)

# Focus numeric variables
nhl_numeric <- nhl |>
  select(
    AvAge, GP, W, L, OL, PTS, PTS_per_game,
    GF, GA, GD, SOW, SOL, SRS, SOS
  )
summary(nhl_numeric)

# Example detailed stats
fivenum(nhl$PTS)
sd(nhl$PTS)
fivenum(nhl$GD)
sd(nhl$GD)
summary(nhl$AvAge)
sd(nhl$AvAge)

# Categorical variable: Team
table(nhl$Team)
prop.table(table(nhl$Team))

# Distributions & Normality (with explicit print) ----
p_pts_hist <- ggplot(nhl, aes(x = PTS)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Distribution of Standings Points (PTS)",
    x = "Points",
    y = "Count"
  )
print(p_pts_hist)

p_gd_hist <- ggplot(nhl, aes(x = GD)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Distribution of Goal Differential (GD)",
    x = "Goal Differential",
    y = "Count"
  )
print(p_gd_hist)

p_pts_pg_hist <- ggplot(nhl, aes(x = PTS_per_game)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Distribution of Points per Game",
    x = "Points per Game",
    y = "Count"
  )
print(p_pts_pg_hist)

p_srs_hist <- ggplot(nhl, aes(x = SRS)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Distribution of SRS",
    x = "SRS",
    y = "Count"
  )
print(p_srs_hist)

# QQ plots
p_qq_pts <- ggplot(nhl, aes(sample = PTS)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "QQ Plot for PTS")
print(p_qq_pts)

p_qq_gd <- ggplot(nhl, aes(sample = GD)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "QQ Plot for GD")
print(p_qq_gd)

p_qq_srs <- ggplot(nhl, aes(sample = SRS)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "QQ Plot for SRS")
print(p_qq_srs)

# Correlation Analysis ----
nhl_corr <- nhl |>
  select(PTS, GF, GA, GD, AvAge, SRS, SOS, W, L, PTS_per_game)

# Pairwise deletion: each correlation uses all rows complete for that pair.
cor_mat <- cor(nhl_corr, use = "pairwise.complete.obs")
cor_mat           # full matrix
cor_mat["PTS", ]  # correlations with PTS

corrplot(
  cor_mat,
  method = "color",
  type = "upper",
  tl.cex = 0.7,
  number.cex = 0.7
)

# Linear Model: PTS ~ GD ----
lm_pts_gd <- lm(PTS ~ GD, data = nhl)
summary(lm_pts_gd)

# Diagnostics (base plots) on a 2 x 2 grid; par() returns the previous
# settings, which are restored afterwards instead of hard-coding c(1, 1).
old_par <- par(mfrow = c(2, 2))
plot(lm_pts_gd)
par(old_par)

# Scatterplot with regression line
p_scatter <- ggplot(nhl, aes(x = GD, y = PTS)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Relationship between Goal Differential (GD) and Points (PTS)",
    x = "Goal Differential (GF - GA)",
    y = "Standings Points (PTS)"
  )
print(p_scatter)
```