#install.packages("xfun")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#install.packages("skedastic")
library("skedastic")
library(xfun)
##
## Attaching package: 'xfun'
##
## The following objects are masked from 'package:base':
##
## attr, isFALSE
Issue Summary
Heteroskedasticity is a term to describe a situation where the
spread of residuals in a regression model changes across
observations.
Standard Errors measures how much the coefficient is expected to
vary if repeating the experiment by multiple times. Under
heteroskedasticity, the standard errors could underestimate or
overestimate their true value. Also, the misestimation of standard
errors leads to unreliable confidence intervals and hypothesis tests. To
be more specific, the CI may be too narrow or wide.
Point estimates are actual values calculated by regression, like
slope and intercept. Under heteroskedasticity, the coefficient of
variable is not stable.
The null hypothesis in the white test homoskedasticity(H0 = 0).
The variance of residuals is constant across all levels of X
variables.
The alternative hypothesis in the white test is
heteroskedasticity, which indicates the variance of the residuals is not
constant and varies with levels of X variables.
Coding
Load the Dataset
data("mtcars")
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
Main Regression Model
Specification
\[
log(mpg) =\beta_0\ + \beta_1wt\ + \beta_2 disp\ + \beta_3gear\ +
\epsilon\
\]
lm_1 <- lm(data = mtcars,
log(mpg) ~ wt + disp + gear)
summary(lm_1)
##
## Call:
## lm(formula = log(mpg) ~ wt + disp + gear, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16208 -0.09601 -0.03072 0.06928 0.28506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8453697 0.2111409 18.212 < 2e-16 ***
## wt -0.1694092 0.0519452 -3.261 0.00291 **
## disp -0.0010233 0.0004006 -2.554 0.01636 *
## gear -0.0289398 0.0381019 -0.760 0.45388
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1265 on 28 degrees of freedom
## Multiple R-squared: 0.837, Adjusted R-squared: 0.8196
## F-statistic: 47.93 on 3 and 28 DF, p-value: 3.723e-11
Heteroskedasticity
Manually Test with
“Auxillary Regression”
Create the residual
squared variable
mtcars$residuals <- resid(object = lm_1)
summary(mtcars$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.16208 -0.09601 -0.03072 0.00000 0.06928 0.28506
mtcars$squared_residuals <- (mtcars$residuals)^2
summary(mtcars$squared_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.958e-05 1.937e-03 7.789e-03 1.400e-02 1.694e-02 8.126e-02
Create the X
variables
white_auxillary_reg <- lm(formula = squared_residuals ~ wt + disp + gear +
I(wt^2) + I(disp^2) + I(gear^2) +
wt:disp + disp:gear + gear:wt,
data = mtcars)
white_auxillary_reg_summary <- summary(white_auxillary_reg)
white_auxillary_reg_summary
##
## Call:
## lm(formula = squared_residuals ~ wt + disp + gear + I(wt^2) +
## I(disp^2) + I(gear^2) + wt:disp + disp:gear + gear:wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.031648 -0.007493 -0.001899 0.005509 0.057323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.810e-01 3.979e-01 0.706 0.487
## wt -7.227e-02 1.677e-01 -0.431 0.671
## disp 2.208e-04 9.279e-04 0.238 0.814
## gear -9.280e-02 1.382e-01 -0.672 0.509
## I(wt^2) 1.031e-02 2.502e-02 0.412 0.684
## I(disp^2) 1.424e-06 1.305e-06 1.091 0.287
## I(gear^2) 8.209e-03 1.410e-02 0.582 0.566
## wt:disp -1.842e-04 3.069e-04 -0.600 0.555
## disp:gear -8.724e-05 1.184e-04 -0.737 0.469
## wt:gear 1.339e-02 2.164e-02 0.619 0.542
##
## Residual standard error: 0.01956 on 22 degrees of freedom
## Multiple R-squared: 0.2287, Adjusted R-squared: -0.0868
## F-statistic: 0.7249 on 9 and 22 DF, p-value: 0.6818
Test with Chi-Squared
Distribution
chisq_p_value <- pchisq(q = white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg),
df = 9, # degrees of freedom is the number of parameters estimated in the model minus 1 (for constant term) i.e. equal to the number of variables in the auxillary regression
lower.tail = FALSE )
chisq_p_value
## [1] 0.6039407
Compare the results
skedastic_package_white
## # A tibble: 1 × 5
## statistic p.value parameter method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 7.32 0.604 9 White's Test greater
# TEST CRITICAL VALUE
white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
## [1] 7.319037