rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 626197 33.5 1431310 76.5 NA 715648 38.3
## Vcells 1186707 9.1 8388608 64.0 16384 2010356 15.4
cat("\f") # Clear the console
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(wooldridge)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
Data Prep
stats <- read.csv("/Users/elsablease/HeteroskedasticityEGB/all_seasons_2 3.csv")
salary <- read.csv("/Users/elsablease/HeteroskedasticityEGB/NBA_Player_Salaries.csv")
names(stats)
## [1] "X" "player_name" "team_abbreviation"
## [4] "age" "player_height" "player_weight"
## [7] "college" "country" "draft_year"
## [10] "draft_round" "draft_number" "gp"
## [13] "pts" "reb" "ast"
## [16] "net_rating" "oreb_pct" "dreb_pct"
## [19] "usg_pct" "ts_pct" "ast_pct"
## [22] "season"
names(salary)
## [1] "Player.Id" "Player.Name" "X2022.2023" "X2023.2024" "X2024.2025"
## [6] "X2024.2025.1"
salary$Salary_2024_25 <- as.numeric(gsub("[\\$,]", "", salary$X2024.2025))
sum(is.na(salary$Salary_2024_25)) # check that no salaries failed to parse
## [1] 0
merged <- merge(
stats,
salary[, c("Player.Name", "Salary_2024_25")],
by.x = "player_name",
by.y = "Player.Name",
all.x = TRUE
)
summary(merged$Salary_2024_25)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 0 0 7590446 9000000 55761217 10101
sum(is.na(merged$Salary_2024_25))
## [1] 10101
head(merged[, c("player_name", "Salary_2024_25")])
## player_name Salary_2024_25
## 1 A.C. Green NA
## 2 A.C. Green NA
## 3 A.C. Green NA
## 4 A.C. Green NA
## 5 A.C. Green NA
## 6 A.J. Bramlett NA
Many player names fail to match exactly, leaving 10,101 rows without a salary, so I standardized the names in both data sets and re-merged.
library(magrittr)
stats$player_clean <- stats$player_name %>%
tolower() %>%
gsub("[^a-z ]", "", .) %>%
trimws()
salary$player_clean <- salary$Player.Name %>%
tolower() %>%
gsub("[^a-z ]", "", .) %>%
trimws()
salary$Salary_2024_25 <- salary$X2024.2025 %>%
gsub("[\\$,]", "", .) %>% # remove $ and commas
as.numeric()
salary_small <- salary[, c("player_clean", "Salary_2024_25")]
merged <- merge(
stats,
salary_small,
by = "player_clean",
all.x = TRUE
)
merged_clean <- merged %>%
filter(
!is.na(Salary_2024_25),
Salary_2024_25 > 0,
gp > 0,
!is.na(pts),
!is.na(ast),
!is.na(age)
)
nrow(merged_clean)
## [1] 1008
summary(merged_clean$Salary_2024_25)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 268032 7577163 16500000 21255224 34848340 55761217
head(merged_clean[, c("player_name", "pts", "ast", "age", "Salary_2024_25")])
## player_name pts ast age Salary_2024_25
## 1 Aaron Gordon 16.3 3.0 27 22841455
## 2 Aaron Gordon 9.2 1.6 20 22841455
## 3 Aaron Gordon 12.4 3.2 25 22841455
## 4 Aaron Gordon 5.2 0.7 19 22841455
## 5 Aaron Gordon 12.7 1.9 21 22841455
## 6 Aaron Gordon 17.6 2.3 22 22841455
ISSUE SUMMARY
- What is “heteroskedasticity”, and the econometric issue it causes (affects point estimates or standard errors)? Do not confuse heteroskedasticity with other terms like multicollinearity, serial correlation, et cetera (2-3 sentences in your own words, e.g. do not copy/paste directly from the web.)
Heteroskedasticity occurs when the variance of the regression errors
is not constant across observations. Some observations have much larger
or smaller error spreads than others. While heteroskedasticity does not
bias the OLS coefficient estimates, it does make the standard errors
incorrect, which means hypothesis tests and confidence intervals can
become misleading or unreliable. As a result, we may conclude that a variable is statistically significant when it is not, or vice versa.
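As a quick sanity check (a simulated sketch, separate from the NBA data, assuming the sandwich package is installed), the code below generates errors whose spread grows with x; OLS still estimates the slope well, but the naive standard error differs noticeably from the heteroskedasticity-robust one:
set.seed(42)
n <- 500
x <- runif(n, 0, 10)
y <- 2 + 3 * x + rnorm(n, sd = 0.5 * x) # error sd grows with x
fit <- lm(y ~ x)
coef(summary(fit))["x", "Std. Error"] # naive OLS standard error
sqrt(diag(sandwich::vcovHC(fit, type = "HC1")))["x"] # robust standard error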
- What is the null and alternative hypothesis in the BP or White test? The hypothesis is the same, but the auxiliary regression specification is slightly different. Do you agree with the test logic? (2-3 sentences)
For both the Breusch-Pagan test and the White test, the null
hypothesis is that the regression errors are homoskedastic, meaning they
have constant variance. The alternative hypothesis is that the variance
of the errors depends on one or more regressors (heteroskedasticity).
The tests differ in how they model that dependence, as BP assumes a
linear relationship between the error variance and the regressors, while
White allows a more flexible form including squares and interactions.
The logic behind these tests is reasonable, because if the residual
variance systematically changes with predictors, it will show up in the
auxiliary regression and signal a violation of the homoskedasticity
assumption.
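To make the difference concrete, for a model with regressors x1 and x2 the two auxiliary regressions of the squared residuals would look like the schematic formulas below (u2, x1, and x2 are hypothetical names; both tests then compare n times the auxiliary R-squared to a chi-squared distribution):
# Breusch-Pagan: squared residuals regressed on the original regressors only
bp_aux <- u2 ~ x1 + x2
# White: squares and interactions added to capture more general variance patterns
white_aux <- u2 ~ x1 + x2 + I(x1^2) + I(x2^2) + x1:x2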
CODING
Data
I used two data sets from Kaggle: NBA - Player Stats - Season 24/25 (https://www.kaggle.com/datasets/eduardopalmieri/nba-player-stats-season-2425) and 🏀💰NBA Salaries: Hoops Fortune (2020-2025) (https://www.kaggle.com/datasets/omarsobhy14/nba-players-salaries). I merged them before working on the regression.
In the merged dataset, each row is a player-season from the stats file matched to that player’s 2024-25 salary, and I focused on three key explanatory variables: points per game, assists per game, and age. Before estimating the model, I removed observations with missing values, zero salaries, or players who did not appear in any games, so that the final sample reflects active players with meaningful performance statistics.
The fitted regression shows a positive relationship between performance and salary: players who score more points or record more assists tend to earn higher salaries. However, when visually inspecting the residuals and scatterplots, I observed that the variability of salary increases for players with higher performance statistics. High-scoring and highly experienced players show much larger spreads around the regression line than lower-scoring players. This pattern indicates heteroskedasticity, meaning the variance of the errors is not constant across observations.
Since OLS standard errors assume homoskedasticity, relying on them without adjustment can result in misleading inferences about statistical significance. In the presence of heteroskedasticity, robust standard errors provide more reliable inference by correcting the estimated variability of the coefficients. These robust errors allow me to conduct hypothesis tests and construct confidence intervals that remain valid even when the error variance is not constant.
Estimate a Multivariate Regression
My model examines how NBA player salary varies with performance and
age. The dependent variable is the player’s salary for the 2024-25
season, and the independent variables are points per game, assists per
game, and player’s age.
\[Salary_i = \beta_0 + \beta_1(pts_i) + \beta_2(ast_i) + \beta_3(age_i) + \epsilon_i\]
where:
\(Salary_i\) = 2024-2025 NBA salary for player \(i\)
\(pts_i\) = points per game
\(ast_i\) = assists per game
\(age_i\) = age in years
\(\epsilon_i\) = error term
library(ggplot2)
ggplot(merged_clean, aes(x = pts, y = Salary_2024_25)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = "Points per game",
y = "Salary (2024–25 season)",
title = "NBA Salary vs Points per Game"
)
## `geom_smooth()` using formula = 'y ~ x'
[Figure: NBA Salary vs Points per Game, scatterplot with fitted OLS line]
Robust standard errors
lm.mod <- lm(
formula = Salary_2024_25 ~ pts + ast + age,
data = merged_clean
)
summary(lm.mod)
##
## Call:
## lm(formula = Salary_2024_25 ~ pts + ast + age, data = merged_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33370995 -5981199 -993754 5257096 45275435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2201378 0.484 0.6287
## pts 1598287 59014 27.083 <2e-16 ***
## ast 331296 190520 1.739 0.0824 .
## age -81666 90951 -0.898 0.3694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10330000 on 1004 degrees of freedom
## Multiple R-squared: 0.6002, Adjusted R-squared: 0.599
## F-statistic: 502.4 on 3 and 1004 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(lm.mod)
[Figure: lm diagnostic plots, 2 x 2 panel]
par(mfrow = c(1, 1))
t test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
coeftest(
lm.mod,
vcov = vcovHC(lm.mod, type = "HC1")
)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2216651 0.4804 0.63102
## pts 1598287 53143 30.0751 < 2e-16 ***
## ast 331296 196256 1.6881 0.09171 .
## age -81666 87974 -0.9283 0.35348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- merged_clean
lm_model <- lm(
data = df,
formula = Salary_2024_25 ~ pts + ast + age
)
robust_se <- sqrt(diag(vcovHC(lm_model, type = "HC3")))
print(robust_se)
## (Intercept) pts ast age
## 2223287.33 53378.76 197429.52 88280.87
coeftest(lm_model, vcov = vcovHC(lm_model, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2223287 0.4790 0.63204
## pts 1598287 53379 29.9424 < 2e-16 ***
## ast 331296 197430 1.6780 0.09365 .
## age -81666 88281 -0.9251 0.35515
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
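For comparison, the common HC variants give very similar answers on this model (a quick sketch; "const" is the classical OLS covariance, HC1 is the small-sample-corrected version used above, HC3 is the most conservative):
hc_types <- c("const", "HC0", "HC1", "HC2", "HC3")
se_table <- sapply(hc_types, function(t) sqrt(diag(vcovHC(lm_model, type = t))))
round(se_table, 0) # one column of standard errors per HC type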
F test
model_unres <- lm(Salary_2024_25 ~ pts + ast + age, data = merged_clean)
model_res <- lm(Salary_2024_25 ~ pts, data = merged_clean)
anova(model_res, model_unres)
## Analysis of Variance Table
##
## Model 1: Salary_2024_25 ~ pts
## Model 2: Salary_2024_25 ~ pts + ast + age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1006 1.0744e+17
## 2 1004 1.0707e+17 2 3.7047e+14 1.7369 0.1766
waldtest(model_res, model_unres, vcov = vcovHC(model_unres, type = "HC0"))
## Wald test
##
## Model 1: Salary_2024_25 ~ pts
## Model 2: Salary_2024_25 ~ pts + ast + age
## Res.Df Df F Pr(>F)
## 1 1006
## 2 1004 2 1.5956 0.2033
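Both the classical F test (p = 0.1766) and the robust Wald test (p = 0.2033) fail to reject the joint null that the ast and age coefficients are zero, so points per game carries most of the explanatory power. The same robust joint test can also be run with linearHypothesis (assuming the car package is installed); it should reproduce the robust Wald F of about 1.60 above:
library(car)
linearHypothesis(model_unres, c("ast = 0", "age = 0"),
vcov. = vcovHC(model_unres, type = "HC0")) # robust joint test of ast and age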
Main Regression Model Specification
\(Salary_{i,2024\text{-}25} = \beta_0 + \beta_1 PointsPerGame_i + \beta_2 AssistsPerGame_i + \beta_3 Age_i + \epsilon_i\)
where \(i\) indexes individual NBA
players in the 2024–2025 season.
The dependent variable is the player’s 2024-2025 NBA salary (Salary_2024_25), measured in U.S. dollars.
The independent variables are:
- Points per game (pts): a measure of scoring productivity
- Assists per game (ast): a measure of playmaking
- Age (age): a proxy for experience and maturity
This model examines how a player’s performance and experience help
explain variation in salaries across NBA players in the 2024–25
season.
subset_NBA <- merged_clean[, c("Salary_2024_25", "pts", "ast", "age")]
lm.mod <- lm(
formula = Salary_2024_25 ~ . ,
data = subset_NBA
)
summary(lm.mod)
##
## Call:
## lm(formula = Salary_2024_25 ~ ., data = subset_NBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33370995 -5981199 -993754 5257096 45275435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2201378 0.484 0.6287
## pts 1598287 59014 27.083 <2e-16 ***
## ast 331296 190520 1.739 0.0824 .
## age -81666 90951 -0.898 0.3694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10330000 on 1004 degrees of freedom
## Multiple R-squared: 0.6002, Adjusted R-squared: 0.599
## F-statistic: 502.4 on 3 and 1004 DF, p-value: < 2.2e-16
Heteroskedasticity
Residual Analysis
Eyeballing the residual plots suggests that heteroskedasticity is
likely present in my NBA salary model. The spread of the residuals
increases at higher fitted salary values, indicating that the variance
of the errors is not constant across players. To formally assess this, I
used statistical tests such as the Breusch–Pagan and White tests.
plot(lm.mod, which = 1)
[Figure: residuals vs. fitted values for lm.mod]
par(mfrow = c(2, 2))
plot(lm.mod)
[Figure: full set of lm diagnostic plots, 2 x 2 panel]
par(mfrow = c(1, 1))
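The Breusch-Pagan test mentioned above is one line with lmtest (already loaded); by default bptest regresses the squared residuals on the model’s own regressors, and supplying an auxiliary formula with squares and interactions gives a White-style version that should reproduce the statistic computed in the next section:
bptest(lm.mod) # Breusch-Pagan test
bptest(lm.mod, ~ pts * ast + pts * age + ast * age +
I(pts^2) + I(ast^2) + I(age^2), data = merged_clean) # White-style variant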
WHITE TEST
library(skedastic)
white_test_result <- skedastic::white(lm.mod, interactions = TRUE)
white_test_result
## # A tibble: 1 × 5
## statistic p.value parameter method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 68.3 3.35e-11 9 White's Test greater
The White test statistic for my NBA salary regression is 68.25, with
a p-value of 3.35e-11. Because the p-value is less than 0.05, we reject
the null hypothesis of homoskedasticity.
Rejecting the null means that heteroskedasticity is present in the
NBA salary regression. The variance of the regression errors is not
constant across different levels of player performance. In practical
terms, salary becomes increasingly variable for players with higher
performance metrics (e.g., more points, assists, etc.).
Manually Test with “Auxiliary Regression”
The auxiliary regression results indicate that the variance of the
error term is systematically related to the independent variables in my
model. In practical terms, this means that salary dispersion increases
as player performance changes, which violates the classical assumption
of constant error variance.
This finding confirms that:
- heteroskedasticity is present in my NBA salary regression
- standard OLS standard errors are unreliable
- robust (HC) standard errors are needed for valid inference
Therefore, any hypothesis testing or confidence intervals based on
the non-robust OLS results should not be used without correcting for
heteroskedasticity.
White test for heteroskedasticity
subset_NBA$residuals <- resid(object = lm.mod)
summary(subset_NBA$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33370995 -5981199 -993754 0 5257096 45275435
subset_NBA$squared_residuals <- (subset_NBA$residuals)^2
summary(subset_NBA$squared_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.626e+07 6.030e+12 3.223e+13 1.062e+14 1.101e+14 2.050e+15
white_auxillary_reg <- lm(
formula = squared_residuals ~ pts + ast + age +
I(pts^2) + I(ast^2) + I(age^2) +
pts:ast + pts:age + ast:age,
data = subset_NBA
)
white_auxillary_reg_summary <- summary(white_auxillary_reg)
white_auxillary_reg_summary
##
## Call:
## lm(formula = squared_residuals ~ pts + ast + age + I(pts^2) +
## I(ast^2) + I(age^2) + pts:ast + pts:age + ast:age, data = subset_NBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.094e+14 -9.069e+13 -4.749e+13 9.750e+12 2.011e+15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.065e+14 2.385e+14 2.963 0.00312 **
## pts 6.285e+12 7.797e+12 0.806 0.42036
## ast 4.749e+13 2.423e+13 1.960 0.05025 .
## age -4.797e+13 1.848e+13 -2.596 0.00958 **
## I(pts^2) -6.882e+11 1.641e+11 -4.193 3e-05 ***
## I(ast^2) -3.514e+12 1.413e+12 -2.487 0.01304 *
## I(age^2) 7.049e+11 3.596e+11 1.960 0.05023 .
## pts:ast 9.682e+11 7.282e+11 1.330 0.18399
## pts:age 4.492e+11 3.072e+11 1.463 0.14391
## ast:age -9.161e+11 9.052e+11 -1.012 0.31180
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.954e+14 on 998 degrees of freedom
## Multiple R-squared: 0.06771, Adjusted R-squared: 0.0593
## F-statistic: 8.054 on 9 and 998 DF, p-value: 1.382e-11
white_LM_stat <- white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
chisq_p_value <- pchisq(
q = white_LM_stat,
df = 9,
lower.tail = FALSE
)
chisq_p_value
## [1] 3.348535e-11
This reproduces the exact p-value obtained from the skedastic package above.
white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
## [1] 68.25225
The test statistic also matches the packaged result exactly.
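For reuse, the by-hand calculation can be wrapped in a small helper (a hypothetical convenience function, not from any package):
white_lm_manual <- function(aux_fit) {
stat <- summary(aux_fit)$r.squared * nobs(aux_fit) # LM statistic = n * R-squared
df <- length(coef(aux_fit)) - 1 # auxiliary regressors, excluding the intercept
c(statistic = stat, p.value = pchisq(stat, df = df, lower.tail = FALSE))
}
white_lm_manual(white_auxillary_reg) # should reproduce 68.25 and 3.35e-11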
critical_value <- qchisq(
p = 0.05,
df = 9,
lower.tail = FALSE
)
critical_value
## [1] 16.91898
test_statistic <- white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
test_statistic
## [1] 68.25225
test_statistic > critical_value # the statistic falls well inside the rejection region
## [1] TRUE
- What low/high R-squared values suggest about heteroskedasticity
In the context of the White test, the R² from the auxiliary regression measures how strongly the squared residuals (the error variance) are related to the regressors, their squares, and their interactions.
My auxiliary R² is about 0.068 (6.8%). That may look modest, but the test statistic is n times this value, so with n = 1008 it yields an LM statistic of 68.25, high enough to indicate that the squared residuals are meaningfully related to the regressors, suggesting heteroskedasticity.
Test statistic: T = 68.25
5% chi-squared critical value: 16.92
The test statistic falls far into the rejection region, meaning it is
“more extreme” than the chi-square cutoff.
The White test p-value is \(p = 3.35 \times 10^{-11}\), far below 0.05, providing strong evidence against the null hypothesis of homoskedasticity.
Since the test statistic exceeds the critical value and the p-value
is extremely small, we reject the null hypothesis of
homoskedasticity.