rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 626197 33.5 1431310 76.5 NA 715648 38.3
## Vcells 1186707 9.1 8388608 64.0 16384 2010356 15.4
cat("\f") # Clear the console
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(wooldridge)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
Data Prep
stats <- read.csv("/Users/elsablease/HeteroskedasticityEGB/all_seasons_2 3.csv")
salary <- read.csv("/Users/elsablease/HeteroskedasticityEGB/NBA_Player_Salaries.csv")
names(stats)
## [1] "X" "player_name" "team_abbreviation"
## [4] "age" "player_height" "player_weight"
## [7] "college" "country" "draft_year"
## [10] "draft_round" "draft_number" "gp"
## [13] "pts" "reb" "ast"
## [16] "net_rating" "oreb_pct" "dreb_pct"
## [19] "usg_pct" "ts_pct" "ast_pct"
## [22] "season"
names(salary)
## [1] "Player.Id" "Player.Name" "X2022.2023" "X2023.2024" "X2024.2025"
## [6] "X2024.2025.1"
salary$Salary_2024_25 <- as.numeric(gsub("[\\$,]", "", salary$X2024.2025))
sum(is.na(salary$Salary_2024_25)) # check that no salaries failed to parse
## [1] 0
merged <- merge(
stats,
salary[, c("Player.Name", "Salary_2024_25")],
by.x = "player_name",
by.y = "Player.Name",
all.x = TRUE
)
summary(merged$Salary_2024_25)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 0 0 7590446 9000000 55761217 10101
sum(is.na(merged$Salary_2024_25))
## [1] 10101
head(merged[, c("player_name", "Salary_2024_25")])
## player_name Salary_2024_25
## 1 A.C. Green NA
## 2 A.C. Green NA
## 3 A.C. Green NA
## 4 A.C. Green NA
## 5 A.C. Green NA
## 6 A.J. Bramlett NA
Many player names fail to match exactly, leaving 10,101 rows without a salary, so I standardized the names in both data sets and re-merged.
library(magrittr)
stats$player_clean <- stats$player_name %>%
tolower() %>%
gsub("[^a-z ]", "", .) %>%
trimws()
salary$player_clean <- salary$Player.Name %>%
tolower() %>%
gsub("[^a-z ]", "", .) %>%
trimws()
salary$Salary_2024_25 <- salary$X2024.2025 %>%
gsub("[\\$,]", "", .) %>% # remove $ and commas
as.numeric()
salary_small <- salary[, c("player_clean", "Salary_2024_25")]
merged <- merge(
stats,
salary_small,
by = "player_clean",
all.x = TRUE
)
merged_clean <- merged %>%
filter(
!is.na(Salary_2024_25),
Salary_2024_25 > 0,
gp > 0,
!is.na(pts),
!is.na(ast),
!is.na(age)
)
nrow(merged_clean)
## [1] 1008
summary(merged_clean$Salary_2024_25)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 268032 7577163 16500000 21255224 34848340 55761217
head(merged_clean[, c("player_name", "pts", "ast", "age", "Salary_2024_25")])
## player_name pts ast age Salary_2024_25
## 1 Aaron Gordon 16.3 3.0 27 22841455
## 2 Aaron Gordon 9.2 1.6 20 22841455
## 3 Aaron Gordon 12.4 3.2 25 22841455
## 4 Aaron Gordon 5.2 0.7 19 22841455
## 5 Aaron Gordon 12.7 1.9 21 22841455
## 6 Aaron Gordon 17.6 2.3 22 22841455
ISSUE SUMMARY
- What is “heteroskedasticity”, and the econometric issue it causes (affects point estimates or standard errors)? Do not confuse heteroskedasticity with other terms like multicollinearity, serial correlation, et cetera (2-3 sentences in your own words, e.g. do not copy/paste directly from the web.)
Heteroskedasticity occurs when the variance of the regression errors
is not constant across observations. Some observations have much larger
or smaller error spreads than others. While heteroskedasticity does not
bias the OLS coefficient estimates, it does make the standard errors
incorrect, which means hypothesis tests and confidence intervals can
become misleading or unreliable. As a result, we may conclude that a variable is statistically significant when it is not, or vice versa.
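As a quick sanity check (a simulated sketch, separate from the NBA data, assuming the sandwich package is installed), the code below generates errors whose spread grows with x; OLS still estimates the slope well, but the naive standard error differs noticeably from the heteroskedasticity-robust one:
set.seed(42)
n <- 500
x <- runif(n, 0, 10)
y <- 2 + 3 * x + rnorm(n, sd = 0.5 * x) # error sd grows with x
fit <- lm(y ~ x)
coef(summary(fit))["x", "Std. Error"] # naive OLS standard error
sqrt(diag(sandwich::vcovHC(fit, type = "HC1")))["x"] # robust standard error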
- What is the null and alternative hypothesis in the BP or White test? The hypothesis is the same, but the auxiliary regression specification is slightly different. Do you agree with the test logic? (2-3 sentences)
For both the Breusch-Pagan test and the White test, the null
hypothesis is that the regression errors are homoskedastic, meaning they
have constant variance. The alternative hypothesis is that the variance
of the errors depends on one or more regressors (heteroskedasticity).
The tests differ in how they model that dependence, as BP assumes a
linear relationship between the error variance and the regressors, while
White allows a more flexible form including squares and interactions.
The logic behind these tests is reasonable, because if the residual
variance systematically changes with predictors, it will show up in the
auxiliary regression and signal a violation of the homoskedasticity
assumption.
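To make the difference concrete, for a model with regressors x1 and x2 the two auxiliary regressions of the squared residuals would look like the schematic formulas below (u2, x1, and x2 are hypothetical names; both tests then compare n times the auxiliary R-squared to a chi-squared distribution):
# Breusch-Pagan: squared residuals regressed on the original regressors only
bp_aux <- u2 ~ x1 + x2
# White: squares and interactions added to capture more general variance patterns
white_aux <- u2 ~ x1 + x2 + I(x1^2) + I(x2^2) + x1:x2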
CODING
Data
I used two data sets from Kaggle: NBA - Player Stats - Season 24/25 (https://www.kaggle.com/datasets/eduardopalmieri/nba-player-stats-season-2425) and 🏀💰NBA Salaries: Hoops Fortune (2020-2025) (https://www.kaggle.com/datasets/omarsobhy14/nba-players-salaries). I merged them before working on the regression.
In the merged dataset, each row is a player-season from the stats file matched to that player’s 2024-25 salary, and I focused on three key explanatory variables: points per game, assists per game, and age. Before estimating the model, I removed observations with missing values, zero salaries, or players who did not appear in any games, so that the final sample reflects active players with meaningful performance statistics.
The fitted regression shows a positive relationship between performance and salary: players who score more points or record more assists tend to earn higher salaries. However, when visually inspecting the residuals and scatterplots, I observed that the variability of salary increases for players with higher performance statistics. High-scoring and highly experienced players show much larger spreads around the regression line than lower-scoring players. This pattern indicates heteroskedasticity, meaning the variance of the errors is not constant across observations.
Since OLS standard errors assume homoskedasticity, relying on them without adjustment can result in misleading inferences about statistical significance. In the presence of heteroskedasticity, robust standard errors provide more reliable inference by correcting the estimated variability of the coefficients. These robust errors allow me to conduct hypothesis tests and construct confidence intervals that remain valid even when the error variance is not constant.
Estimate a Multivariate Regression
My model examines how NBA player salary varies with performance and
age. The dependent variable is the player’s salary for the 2024-25
season, and the independent variables are points per game, assists per
game, and player’s age.
\[Salary_i = \beta_0 + \beta_1(pts_i) + \beta_2(ast_i) + \beta_3(age_i) + \epsilon_i\]
where:
\(Salary_i\) = 2024-2025 NBA salary for player \(i\)
\(pts_i\) = points per game
\(ast_i\) = assists per game
\(age_i\) = age in years
\(\epsilon_i\) = error term
library(ggplot2)
ggplot(merged_clean, aes(x = pts, y = Salary_2024_25)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = "Points per game",
y = "Salary (2024–25 season)",
title = "NBA Salary vs Points per Game"
)
## `geom_smooth()` using formula = 'y ~ x'
[Figure: NBA Salary vs Points per Game, scatterplot with fitted OLS line]
Robust standard errors
lm.mod <- lm(
formula = Salary_2024_25 ~ pts + ast + age,
data = merged_clean
)
summary(lm.mod)
##
## Call:
## lm(formula = Salary_2024_25 ~ pts + ast + age, data = merged_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33370995 -5981199 -993754 5257096 45275435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2201378 0.484 0.6287
## pts 1598287 59014 27.083 <2e-16 ***
## ast 331296 190520 1.739 0.0824 .
## age -81666 90951 -0.898 0.3694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10330000 on 1004 degrees of freedom
## Multiple R-squared: 0.6002, Adjusted R-squared: 0.599
## F-statistic: 502.4 on 3 and 1004 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(lm.mod)
[Figure: lm diagnostic plots, 2 x 2 panel]
par(mfrow = c(1, 1))
t test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
coeftest(
lm.mod,
vcov = vcovHC(lm.mod, type = "HC1")
)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2216651 0.4804 0.63102
## pts 1598287 53143 30.0751 < 2e-16 ***
## ast 331296 196256 1.6881 0.09171 .
## age -81666 87974 -0.9283 0.35348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- merged_clean
lm_model <- lm(
data = df,
formula = Salary_2024_25 ~ pts + ast + age
)
robust_se <- sqrt(diag(vcovHC(lm_model, type = "HC3")))
print(robust_se)
## (Intercept) pts ast age
## 2223287.33 53378.76 197429.52 88280.87
coeftest(lm_model, vcov = vcovHC(lm_model, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2223287 0.4790 0.63204
## pts 1598287 53379 29.9424 < 2e-16 ***
## ast 331296 197430 1.6780 0.09365 .
## age -81666 88281 -0.9251 0.35515
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
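For comparison, the common HC variants give very similar answers on this model (a quick sketch; "const" is the classical OLS covariance, HC1 is the small-sample-corrected version used above, HC3 is the most conservative):
hc_types <- c("const", "HC0", "HC1", "HC2", "HC3")
se_table <- sapply(hc_types, function(t) sqrt(diag(vcovHC(lm_model, type = t))))
round(se_table, 0) # one column of standard errors per HC type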
F test
model_unres <- lm(Salary_2024_25 ~ pts + ast + age, data = merged_clean)
model_res <- lm(Salary_2024_25 ~ pts, data = merged_clean)
anova(model_res, model_unres)
## Analysis of Variance Table
##
## Model 1: Salary_2024_25 ~ pts
## Model 2: Salary_2024_25 ~ pts + ast + age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1006 1.0744e+17
## 2 1004 1.0707e+17 2 3.7047e+14 1.7369 0.1766
waldtest(model_res, model_unres, vcov = vcovHC(model_unres, type = "HC0"))
## Wald test
##
## Model 1: Salary_2024_25 ~ pts
## Model 2: Salary_2024_25 ~ pts + ast + age
## Res.Df Df F Pr(>F)
## 1 1006
## 2 1004 2 1.5956 0.2033
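Both the classical F test (p = 0.1766) and the robust Wald test (p = 0.2033) fail to reject the joint null that the ast and age coefficients are zero, so points per game carries most of the explanatory power. The same robust joint test can also be run with linearHypothesis (assuming the car package is installed); it should reproduce the robust Wald F of about 1.60 above:
library(car)
linearHypothesis(model_unres, c("ast = 0", "age = 0"),
vcov. = vcovHC(model_unres, type = "HC0")) # robust joint test of ast and age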
Main Regression Model Specification
\(Salary_{i,2024\text{-}25} = \beta_0 + \beta_1 PointsPerGame_i + \beta_2 AssistsPerGame_i + \beta_3 Age_i + \epsilon_i\)
where \(i\) indexes individual NBA
players in the 2024–2025 season.
The dependent variable is the player’s 2024-2025 NBA salary (Salary_2024_25), measured in U.S. dollars.
The independent variables are:
- Points per game (pts): a measure of scoring productivity
- Assists per game (ast): a measure of playmaking
- Age (age): a proxy for experience and maturity
This model examines how a player’s performance and experience help
explain variation in salaries across NBA players in the 2024–25
season.
subset_NBA <- merged_clean[, c("Salary_2024_25", "pts", "ast", "age")]
lm.mod <- lm(
formula = Salary_2024_25 ~ . ,
data = subset_NBA
)
summary(lm.mod)
##
## Call:
## lm(formula = Salary_2024_25 ~ ., data = subset_NBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33370995 -5981199 -993754 5257096 45275435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1064963 2201378 0.484 0.6287
## pts 1598287 59014 27.083 <2e-16 ***
## ast 331296 190520 1.739 0.0824 .
## age -81666 90951 -0.898 0.3694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10330000 on 1004 degrees of freedom
## Multiple R-squared: 0.6002, Adjusted R-squared: 0.599
## F-statistic: 502.4 on 3 and 1004 DF, p-value: < 2.2e-16
Heteroskedasticity
Residual Analysis
Eyeballing the residual plots suggests that heteroskedasticity is
likely present in my NBA salary model. The spread of the residuals
increases at higher fitted salary values, indicating that the variance
of the errors is not constant across players. To formally assess this, I
used statistical tests such as the Breusch–Pagan and White tests.
plot(lm.mod, which = 1)
[Figure: residuals vs. fitted values for lm.mod]
par(mfrow = c(2, 2))
plot(lm.mod)
[Figure: full set of lm diagnostic plots, 2 x 2 panel]
par(mfrow = c(1, 1))
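The Breusch-Pagan test mentioned above is one line with lmtest (already loaded); by default bptest regresses the squared residuals on the model’s own regressors, and supplying an auxiliary formula with squares and interactions gives a White-style version that should reproduce the statistic computed in the next section:
bptest(lm.mod) # Breusch-Pagan test
bptest(lm.mod, ~ pts * ast + pts * age + ast * age +
I(pts^2) + I(ast^2) + I(age^2), data = merged_clean) # White-style variant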
WHITE TEST
library(skedastic)
white_test_result <- skedastic::white(lm.mod, interactions = TRUE)
white_test_result
## # A tibble: 1 × 5
## statistic p.value parameter method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 68.3 3.35e-11 9 White's Test greater
The White test statistic for my NBA salary regression is 68.25, with
a p-value of 3.35e-11. Because the p-value is less than 0.05, we reject
the null hypothesis of homoskedasticity.
Rejecting the null means that heteroskedasticity is present in the
NBA salary regression. The variance of the regression errors is not
constant across different levels of player performance. In practical
terms, salary becomes increasingly variable for players with higher
performance metrics (e.g., more points, assists, etc.).
Manually Test with “Auxiliary Regression”
The auxiliary regression results indicate that the variance of the
error term is systematically related to the independent variables in my
model. In practical terms, this means that salary dispersion increases
as player performance changes, which violates the classical assumption
of constant error variance.
This finding confirms that:
- heteroskedasticity is present in my NBA salary regression
- standard OLS standard errors are unreliable
- robust (HC) standard errors are needed for valid inference
Therefore, any hypothesis testing or confidence intervals based on
the non-robust OLS results should not be used without correcting for
heteroskedasticity.
White test for heteroskedasticity
subset_NBA$residuals <- resid(object = lm.mod)
summary(subset_NBA$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33370995 -5981199 -993754 0 5257096 45275435
subset_NBA$squared_residuals <- (subset_NBA$residuals)^2
summary(subset_NBA$squared_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.626e+07 6.030e+12 3.223e+13 1.062e+14 1.101e+14 2.050e+15
white_auxillary_reg <- lm(
formula = squared_residuals ~ pts + ast + age +
I(pts^2) + I(ast^2) + I(age^2) +
pts:ast + pts:age + ast:age,
data = subset_NBA
)
white_auxillary_reg_summary <- summary(white_auxillary_reg)
white_auxillary_reg_summary
##
## Call:
## lm(formula = squared_residuals ~ pts + ast + age + I(pts^2) +
## I(ast^2) + I(age^2) + pts:ast + pts:age + ast:age, data = subset_NBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.094e+14 -9.069e+13 -4.749e+13 9.750e+12 2.011e+15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.065e+14 2.385e+14 2.963 0.00312 **
## pts 6.285e+12 7.797e+12 0.806 0.42036
## ast 4.749e+13 2.423e+13 1.960 0.05025 .
## age -4.797e+13 1.848e+13 -2.596 0.00958 **
## I(pts^2) -6.882e+11 1.641e+11 -4.193 3e-05 ***
## I(ast^2) -3.514e+12 1.413e+12 -2.487 0.01304 *
## I(age^2) 7.049e+11 3.596e+11 1.960 0.05023 .
## pts:ast 9.682e+11 7.282e+11 1.330 0.18399
## pts:age 4.492e+11 3.072e+11 1.463 0.14391
## ast:age -9.161e+11 9.052e+11 -1.012 0.31180
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.954e+14 on 998 degrees of freedom
## Multiple R-squared: 0.06771, Adjusted R-squared: 0.0593
## F-statistic: 8.054 on 9 and 998 DF, p-value: 1.382e-11
white_LM_stat <- white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
chisq_p_value <- pchisq(
q = white_LM_stat,
df = 9,
lower.tail = FALSE
)
chisq_p_value
## [1] 3.348535e-11
This reproduces the exact p-value obtained from the skedastic package above.
white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
## [1] 68.25225
The test statistic also matches the packaged result exactly.
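For reuse, the by-hand calculation can be wrapped in a small helper (a hypothetical convenience function, not from any package):
white_lm_manual <- function(aux_fit) {
stat <- summary(aux_fit)$r.squared * nobs(aux_fit) # LM statistic = n * R-squared
df <- length(coef(aux_fit)) - 1 # auxiliary regressors, excluding the intercept
c(statistic = stat, p.value = pchisq(stat, df = df, lower.tail = FALSE))
}
white_lm_manual(white_auxillary_reg) # should reproduce 68.25 and 3.35e-11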
critical_value <- qchisq(
p = 0.05,
df = 9,
lower.tail = FALSE
)
critical_value
## [1] 16.91898
test_statistic <- white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
test_statistic
## [1] 68.25225
test_statistic > critical_value # the statistic falls well inside the rejection region
## [1] TRUE
- What low/high R-squared values suggest about heteroskedasticity
In the context of the White test, the R² from the auxiliary regression measures how strongly the squared residuals (the error variance) are related to the regressors, their squares, and their interactions.
My auxiliary R² is about 0.068 (6.8%). That may look modest, but the test statistic is n times this value, so with n = 1008 it yields an LM statistic of 68.25, high enough to indicate that the squared residuals are meaningfully related to the regressors, suggesting heteroskedasticity.
Test statistic: T = 68.25
5% chi-squared critical value: 16.92
The test statistic falls far into the rejection region, meaning it is
“more extreme” than the chi-square cutoff.
The White test p-value is \(p = 3.35 \times 10^{-11}\), far below 0.05, providing strong evidence against the null hypothesis of homoskedasticity.
Since the test statistic exceeds the critical value and the p-value
is extremely small, we reject the null hypothesis of
homoskedasticity.