library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the NBA game data from CSV into a tibble.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
nba_data <- readr::read_csv(file = "C:/Statistics/nba.csv")
## Rows: 1703 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): bbrID, Tm, Opp, Season
## dbl (12): TRB, AST, STL, BLK, PTS, GmSc, Year, GameIndex, GmScMovingZ, GmSc...
## lgl (1): Playoffs
## date (2): Date, Date2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure of the loaded data: column names, types, first values.
utils::str(nba_data)
## spc_tbl_ [1,703 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ bbrID : chr [1:1703] "abdelal01" "abdulma02" "abdulta01" "abdursh01" ...
## $ Date : Date[1:1703], format: "1993-03-16" "1991-04-02" ...
## $ Tm : chr [1:1703] "BOS" "DEN" "SAC" "ATL" ...
## $ Opp : chr [1:1703] "GSW" "DAL" "VAN" "DET" ...
## $ TRB : num [1:1703] 10 2 2 12 2 13 10 14 2 10 ...
## $ AST : num [1:1703] 2 6 3 5 0 3 1 1 8 3 ...
## $ STL : num [1:1703] 0 4 1 2 0 0 0 1 5 1 ...
## $ BLK : num [1:1703] 0 0 0 1 0 1 0 0 0 3 ...
## $ PTS : num [1:1703] 25 30 31 50 25 17 18 19 31 17 ...
## $ GmSc : num [1:1703] 22.7 29.7 26.4 46 17.1 16.9 19.2 20.7 33.2 20.6 ...
## $ Season : chr [1:1703] "1992-93" "1990-91" "1997-98" "2001-02" ...
## $ Playoffs : logi [1:1703] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Year : num [1:1703] 1993 1991 1998 2002 2019 ...
## $ GameIndex : num [1:1703] 181 64 58 386 160 8 236 124 100 4 ...
## $ GmScMovingZ : num [1:1703] 4.13 3.82 4.11 4.06 3.37 2.58 4.27 4.15 3.16 4.68 ...
## $ GmScMovingZTop2Delta: num [1:1703] 0.24 0.64 1.67 0.84 0.18 0.05 0.02 0.93 0.22 1.16 ...
## $ Date2 : Date[1:1703], format: "1991-12-04" "1995-12-07" ...
## $ GmSc2 : num [1:1703] 18.6 40.1 16.9 34.3 16.6 16.8 19.6 18.5 42.3 29.5 ...
## $ GmScMovingZ2 : num [1:1703] 3.89 3.18 2.44 3.22 3.19 2.53 4.25 3.22 2.94 3.52 ...
## - attr(*, "spec")=
## .. cols(
## .. bbrID = col_character(),
## .. Date = col_date(format = ""),
## .. Tm = col_character(),
## .. Opp = col_character(),
## .. TRB = col_double(),
## .. AST = col_double(),
## .. STL = col_double(),
## .. BLK = col_double(),
## .. PTS = col_double(),
## .. GmSc = col_double(),
## .. Season = col_character(),
## .. Playoffs = col_logical(),
## .. Year = col_double(),
## .. GameIndex = col_double(),
## .. GmScMovingZ = col_double(),
## .. GmScMovingZTop2Delta = col_double(),
## .. Date2 = col_date(format = ""),
## .. GmSc2 = col_double(),
## .. GmScMovingZ2 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert the categorical columns to factors. Playoffs enters the model below
# as a two-level factor (its coefficient is reported as PlayoffsTRUE); Season
# is converted here but is not used in this model's formula.
nba_data$Playoffs <- as.factor(nba_data$Playoffs)
nba_data$Season <- as.factor(nba_data$Season)
# Fit the revised linear model: GmSc regressed on PTS, STL, BLK and the binary
# Playoffs indicator. The formula contains main effects only; there is no
# interaction term.
model <- lm(GmSc ~ PTS + STL + BLK + Playoffs, data = nba_data)
# Print the coefficient table and fit statistics for the model.
summary(model)
##
## Call:
## lm(formula = GmSc ~ PTS + STL + BLK + Playoffs, data = nba_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6976 -1.9043 -0.1271 1.7509 11.4281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.210982 0.213869 15.014 <2e-16 ***
## PTS 0.745476 0.006927 107.619 <2e-16 ***
## STL 1.066850 0.048563 21.968 <2e-16 ***
## BLK 0.801871 0.054617 14.682 <2e-16 ***
## PlayoffsTRUE -0.004059 0.431223 -0.009 0.992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.926 on 1698 degrees of freedom
## Multiple R-squared: 0.8809, Adjusted R-squared: 0.8806
## F-statistic: 3140 on 4 and 1698 DF, p-value: < 2.2e-16
# Re-check multicollinearity: variance inflation factors for the fitted model.
car::vif(model)
## PTS STL BLK Playoffs
## 1.014410 1.004968 1.003802 1.013460
# Draw the lm diagnostic plots for the model in a 2 x 2 grid.
# par() returns the previous settings, so keep them and restore the layout
# afterwards; otherwise every later plot in the session inherits the grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)
## R-squared Output
# Extract the adjusted R-squared from the model summary and report it rounded
# to four decimal places.
adj_r2_revised <- summary(model)[["adj.r.squared"]]
cat("Adjusted R² - Revised model:", round(adj_r2_revised, digits = 4), "\n")
## Adjusted R² - Revised model: 0.8806
Residuals vs Fitted: The plot shows a fairly random scatter with slight curvature. There is mild evidence of non-linearity, suggesting we might explore interactions or polynomial terms.
Normal Q-Q Plot: Residuals largely follow the diagonal line, indicating that the normality assumption is reasonably satisfied. A few deviations at the tails hint at minor outliers.
Scale-Location Plot: There is a slight upward trend, suggesting non-constant variance. A log transformation or weighted least squares may help stabilize the variance.
Residuals vs Leverage: No points exceed critical thresholds for leverage or Cook’s Distance, indicating no overly influential observations. The model seems stable.
The adjusted R-squared remains stable after removing the high-VIF variables, indicating that we reduced multicollinearity without sacrificing explanatory power.
The original model included multiple variables with high multicollinearity, as shown by VIF > 5.
After removing those predictors, the filtered model retained a strong adjusted R-squared value and passed most regression diagnostics.
The residual plots for the updated model show mild non-linearity and potential heteroscedasticity, which could be addressed in future models via transformations or interaction terms.
This step improves model interpretability and ensures assumptions are better met.