#install.packages("xfun")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#install.packages("skedastic")
library("skedastic")
library(xfun)
## 
## Attaching package: 'xfun'
## 
## The following objects are masked from 'package:base':
## 
##     attr, isFALSE

1 Issue Summary

  1. Heteroskedasticity is a term to describe a situation where the spread of residuals in a regression model changes across observations.

  2. Standard Errors measures how much the coefficient is expected to vary if repeating the experiment by multiple times. Under heteroskedasticity, the standard errors could underestimate or overestimate their true value. Also, the misestimation of standard errors leads to unreliable confidence intervals and hypothesis tests. To be more specific, the CI may be too narrow or wide.

  3. Point estimates are actual values calculated by regression, like slope and intercept. Under heteroskedasticity, the coefficient of variable is not stable.

  4. The null hypothesis in the white test homoskedasticity(H0 = 0). The variance of residuals is constant across all levels of X variables.

  5. The alternative hypothesis in the white test is heteroskedasticity, which indicates the variance of the residuals is not constant and varies with levels of X variables.

2 Coding

2.1 Load the Dataset

data("mtcars")
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

2.2 Main Regression Model Specification

\[ log(mpg) =\beta_0\ + \beta_1wt\ + \beta_2 disp\ + \beta_3gear\ + \epsilon\ \]

lm_1 <- lm(data = mtcars,
                 log(mpg) ~ wt + disp + gear)
summary(lm_1)
## 
## Call:
## lm(formula = log(mpg) ~ wt + disp + gear, data = mtcars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16208 -0.09601 -0.03072  0.06928  0.28506 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.8453697  0.2111409  18.212  < 2e-16 ***
## wt          -0.1694092  0.0519452  -3.261  0.00291 ** 
## disp        -0.0010233  0.0004006  -2.554  0.01636 *  
## gear        -0.0289398  0.0381019  -0.760  0.45388    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1265 on 28 degrees of freedom
## Multiple R-squared:  0.837,  Adjusted R-squared:  0.8196 
## F-statistic: 47.93 on 3 and 28 DF,  p-value: 3.723e-11

3 Heteroskedasticity

3.1 Residual Plot

plot(lm_1)

3.2 Formal Test with Packages

# Conduct White test for heteroscedasticity
skedastic_package_white  <- white(mainlm =  lm_1, 
                                  interactions = TRUE
                                  )         

# View the test results
skedastic_package_white
## # A tibble: 1 × 5
##   statistic p.value parameter method       alternative
##       <dbl>   <dbl>     <dbl> <chr>        <chr>      
## 1      7.32   0.604         9 White's Test greater

Based on the results, the test statistics has a value of 7.3, follows a chi - squared distribution with 9 degrees of freedom. Moreover, the p - value is 0.604 which is greater than 0.05. Thus, we fail to reject the null hypothesis of homoskedasticity.

3.3 Manually Test with “Auxillary Regression”

3.3.1 Create the residual squared variable

mtcars$residuals <- resid(object = lm_1)
summary(mtcars$residuals)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.16208 -0.09601 -0.03072  0.00000  0.06928  0.28506
mtcars$squared_residuals <- (mtcars$residuals)^2 
summary(mtcars$squared_residuals) 
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 3.958e-05 1.937e-03 7.789e-03 1.400e-02 1.694e-02 8.126e-02

3.3.2 Create the X variables

white_auxillary_reg <- lm(formula = squared_residuals ~ wt + disp + gear +
                            I(wt^2)  + I(disp^2) + I(gear^2) +
                            wt:disp + disp:gear + gear:wt,
                          data = mtcars)
white_auxillary_reg_summary <- summary(white_auxillary_reg)
white_auxillary_reg_summary
## 
## Call:
## lm(formula = squared_residuals ~ wt + disp + gear + I(wt^2) + 
##     I(disp^2) + I(gear^2) + wt:disp + disp:gear + gear:wt, data = mtcars)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.031648 -0.007493 -0.001899  0.005509  0.057323 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  2.810e-01  3.979e-01   0.706    0.487
## wt          -7.227e-02  1.677e-01  -0.431    0.671
## disp         2.208e-04  9.279e-04   0.238    0.814
## gear        -9.280e-02  1.382e-01  -0.672    0.509
## I(wt^2)      1.031e-02  2.502e-02   0.412    0.684
## I(disp^2)    1.424e-06  1.305e-06   1.091    0.287
## I(gear^2)    8.209e-03  1.410e-02   0.582    0.566
## wt:disp     -1.842e-04  3.069e-04  -0.600    0.555
## disp:gear   -8.724e-05  1.184e-04  -0.737    0.469
## wt:gear      1.339e-02  2.164e-02   0.619    0.542
## 
## Residual standard error: 0.01956 on 22 degrees of freedom
## Multiple R-squared:  0.2287, Adjusted R-squared:  -0.0868 
## F-statistic: 0.7249 on 9 and 22 DF,  p-value: 0.6818

3.4 Test with Chi-Squared Distribution

chisq_p_value <-  pchisq(q = white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg), 
                         df = 9,       # degrees of freedom is the number of parameters estimated in the model minus 1 (for constant term) i.e.  equal to the number of variables in the auxillary regression
                         lower.tail = FALSE  )
chisq_p_value
## [1] 0.6039407

Compare the results

skedastic_package_white
## # A tibble: 1 × 5
##   statistic p.value parameter method       alternative
##       <dbl>   <dbl>     <dbl> <chr>        <chr>      
## 1      7.32   0.604         9 White's Test greater
# TEST CRITICAL VALUE 
white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
## [1] 7.319037