#Clearing up the environment
rm(list = ls())

1 1. Issue

Heteroskedasticity occurs when the variance of the error term in a regression model is not constant across all obvservations. This makes the standard errors unreliable, leading to incorrect conclusions about statistical significance.

In the Breusch-Pagan or White test, the null hypothesis is that the variance of the errors is constant (homoskedasticity), while the alternative hypothesis is that the variance changes systematically with one or more independent variables (heteroskedasticity). The test logic is sound because it uses an auxiliary regression to detect patterns in the squared residuals, effectively checking if they are related to the explanatory variables or their transformations.

2 2. Estimate a Multivariate Regression

2.1 Data

## Warning: package 'AER' was built under R version 4.3.3
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Explore the dataset
head(df)       
##   district municipality expreg expspecial expbil expocc exptot scratio special
## 1        1     Abington   4201    7375.69      0      0   4646    16.6    14.6
## 2        2        Acton   4129    8573.99      0      0   4930     5.7    17.4
## 3        3     Acushnet   3627    8081.72      0      0   4281     7.5    12.1
## 4        5       Agawam   4015    8181.37      0      0   4826     8.6    21.1
## 5        7     Amesbury   4273    7037.22      0      0   4824     6.1    16.8
## 6        8      Amherst   5183   10595.80   6235      0   6454     7.7    17.2
##   lunch stratio income score4 score8  salary   english
## 1  11.8    19.0 16.379    714    691 34.3600 0.0000000
## 2   2.5    22.6 25.792    731     NA 38.0630 1.2461059
## 3  14.1    19.3 14.040    704    693 32.4910 0.0000000
## 4  12.1    17.9 16.111    704    691 33.1060 0.3225806
## 5  17.4    17.5 15.423    701    699 34.4365 0.0000000
## 6  26.8    15.7 11.144    714     NA      NA 3.9215686
str(df)       
## 'data.frame':    220 obs. of  16 variables:
##  $ district    : chr  "1" "2" "3" "5" ...
##  $ municipality: chr  "Abington" "Acton" "Acushnet" "Agawam" ...
##  $ expreg      : int  4201 4129 3627 4015 4273 5183 4685 5518 5009 3823 ...
##  $ expspecial  : num  7376 8574 8082 8181 7037 ...
##  $ expbil      : int  0 0 0 0 0 6235 0 0 0 12943 ...
##  $ expocc      : int  0 0 0 0 0 0 0 0 0 11519 ...
##  $ exptot      : int  4646 4930 4281 4826 4824 6454 5537 6405 5649 4814 ...
##  $ scratio     : num  16.6 5.7 7.5 8.6 6.1 ...
##  $ special     : num  14.6 17.4 12.1 21.1 16.8 ...
##  $ lunch       : num  11.8 2.5 14.1 12.1 17.4 ...
##  $ stratio     : num  19 22.6 19.3 17.9 17.5 ...
##  $ income      : num  16.4 25.8 14 16.1 15.4 ...
##  $ score4      : num  714 731 704 704 701 714 725 717 702 701 ...
##  $ score8      : num  691 NA 693 691 699 NA 728 715 705 688 ...
##  $ salary      : num  34.4 38.1 32.5 33.1 34.4 ...
##  $ english     : num  0 1.246 0 0.323 0 ...
stargazer(df, type = "text", title = "Summary of Dataset", summary = TRUE) 
## 
## Summary of Dataset
## ========================================================
## Statistic   N    Mean     St. Dev.     Min       Max    
## --------------------------------------------------------
## expreg     220 4,605.464  880.252     2,905     8,759   
## expspecial 220 8,900.727 3,511.696  3,832.230 53,569.240
## expbil     220 3,037.309 20,259.260     0      295,140  
## expocc     220 1,104.209 2,732.449      0       15,088  
## exptot     220 5,370.250  977.040     3,465     9,868   
## scratio    211   8.107     2.836      2.300     18.400  
## special    220  15.968     3.538      8.100     34.300  
## lunch      220  15.316     15.060     0.400     76.200  
## stratio    220  17.344     2.277     11.400     27.000  
## income     220  18.747     5.808      9.686     46.855  
## score4     220  709.827    15.126      658       740    
## score8     180  698.411    21.053      641       747    
## salary     195  35.993     3.191     24.965     44.494  
## english    220   1.118     2.901      0.000     24.494  
## --------------------------------------------------------
# Visualize missing values in the dataset
options(repr.plot.width = 6, repr.plot.height = 3)
vis_miss(df)

#Remove null values
df <- na.omit(df)
stargazer(df, type = "text", title = "Summary of Cleaned Dataset", summary = TRUE)
## 
## Summary of Cleaned Dataset
## ========================================================
## Statistic   N    Mean     St. Dev.     Min       Max    
## --------------------------------------------------------
## expreg     155 4,637.394  768.997     3,023     7,944   
## expspecial 155 8,592.224 1,709.364  3,832.230 15,740.580
## expbil     155 4,118.619 24,050.970     0      295,140  
## expocc     155 1,318.368 2,905.513      0       15,088  
## exptot     155 5,389.645  861.020     3,868     8,623   
## scratio    155   8.174     2.656      2.600     16.600  
## special    155  15.957     3.266     10.400     26.000  
## lunch      155  16.706     16.605     0.500     76.200  
## stratio    155  17.370     2.178     11.400     27.000  
## income     155  18.350     5.378      9.686     46.855  
## score4     155  708.335    16.078      658       740    
## score8     155  697.626    21.471      641       747    
## salary     155  36.169     3.227     24.965     44.494  
## english    155   1.346     3.257      0.000     24.494  
## --------------------------------------------------------

2.2 Main Regression Model Specification

Specify your linear regression

\(\text{8th grade score} = \beta_0 + \beta_1 \cdot \text{Income per capita}_i + \beta_2 \cdot \text{Student-teacher ratio}_i + \beta_3 \cdot \text{Total expenditure per pupil}_i + \epsilon_i\)

The goal is to predict the 8th-grade score based on factors such as income, student-teacher ratio, and expenditures. The hypothesis is that districts with higher income & expenditure with a lower student-teacher ratio are likely to have better resources, which may lead to higher test scores.

2.3 Estimation

Estimate your model.

subset_df <- df[, c("income", "stratio", "score8", "exptot")]

# Fit the linear regression model
model <- lm(score8 ~ income + stratio + exptot, data = subset_df)

# Display the summary of the regression model
summary(model)
## 
## Call:
## lm(formula = score8 ~ income + stratio + exptot, data = subset_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.703  -7.634  -0.295   6.848  47.139 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 714.023990  12.519185  57.034  < 2e-16 ***
## income        3.486270   0.211337  16.496  < 2e-16 ***
## stratio      -2.207935   0.484608  -4.556 1.07e-05 ***
## exptot       -0.007796   0.001369  -5.696 6.23e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.12 on 151 degrees of freedom
## Multiple R-squared:  0.6875, Adjusted R-squared:  0.6813 
## F-statistic: 110.7 on 3 and 151 DF,  p-value: < 2.2e-16
  • Income (3.49): For each one-unit increase in income (per capita), the predicted score8 increases by 3.49 points, holding other variables constant. This makes sense, as wealthier districts are likely to have better resources and facilities, contributing to higher test scores.

  • Student-teacher ratio (stratio) (-2.21): For each one-unit increase in the student-teacher ratio, the predicted score8 decreases by 2.21 points, holding other variables constant. This is logical because a higher student-teacher ratio usually means less individual attention for students, leading to lower test scores.

  • Total expenditures per pupil (exptot) (-0.0078): For each one-unit increase in total expenditures per pupil, the predicted score8 decreases by 0.0078 points. This result is counterintuitive because I would expect that higher spending would lead to better test scores.

vif(model)
##   income  stratio   exptot 
## 1.353946 1.167968 1.455779

3 Heteroskedasticty

3.1 Residual Analysis

# Create a residual plot
par(mfrow = c(2,2))
plot(model)

3.2 Formal Tests with packages

library("skedastic")
require("lmtest")

3.2.1 White test for heteroskedasticity

# Conduct White test for heteroscedasticity
skedastic_package_white  <- white(mainlm =  model, 
                                  interactions = TRUE
                                  )         
# View the test results
skedastic_package_white
## # A tibble: 1 × 5
##   statistic p.value parameter method       alternative
##       <dbl>   <dbl>     <dbl> <chr>        <chr>      
## 1      22.4 0.00764         9 White's Test greater

It has a p-value of 0.0076 - this suggests the presence of heteroscedasticity!

3.3 Manually Test with “Auxillary regression”

We will model the squared error terms as a function of predictors and the squared terms of the predictor. This is how you can create the terms -

3.3.1 White test for heteroskedasticity

3.3.1.1 Create the residual squared variable i.e. (y or dependent variable)

subset_df$residuals <- resid(object = model)

  summary(subset_df$residuals) # mean should be zero
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -32.7026  -7.6342  -0.2946   0.0000   6.8483  47.1394
subset_df$squared_residuals <- (subset_df$residuals)^2 

  summary(subset_df$squared_residuals) # should not have any negative values
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0036    8.4852   53.0399  143.1485  173.4406 2222.1204

3.3.1.2 Create the X variables

white_auxillary_reg <- lm(formula = squared_residuals ~ exptot    +   stratio   +  income  +
                                                     I(exptot^2)  + I(stratio^2)+ I(income^2) +
                                               exptot:stratio + stratio:income + income:exptot,
                             data = subset_df
                   ) 

white_auxillary_reg_summary <- summary(white_auxillary_reg)
white_auxillary_reg_summary
## 
## Call:
## lm(formula = squared_residuals ~ exptot + stratio + income + 
##     I(exptot^2) + I(stratio^2) + I(income^2) + exptot:stratio + 
##     stratio:income + income:exptot, data = subset_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -285.58 -113.40  -59.40   29.77 2053.91 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    -3.422e+02  1.609e+03  -0.213   0.8319  
## exptot         -5.696e-01  3.938e-01  -1.446   0.1502  
## stratio         2.355e+02  1.213e+02   1.941   0.0542 .
## income         -1.399e+01  6.685e+01  -0.209   0.8346  
## I(exptot^2)     6.251e-05  2.979e-05   2.098   0.0376 *
## I(stratio^2)   -8.169e+00  3.210e+00  -2.544   0.0120 *
## I(income^2)     1.826e+00  7.139e-01   2.558   0.0116 *
## exptot:stratio  9.549e-03  9.710e-03   0.983   0.3270  
## stratio:income  1.017e-01  2.895e+00   0.035   0.9720  
## exptot:income  -1.209e-02  7.475e-03  -1.618   0.1079  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248.6 on 145 degrees of freedom
## Multiple R-squared:  0.1446, Adjusted R-squared:  0.09154 
## F-statistic: 2.724 on 9 and 145 DF,  p-value: 0.005785
chisq_p_value <-  pchisq(q = white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg), 
                         df = 9,      
                         lower.tail = FALSE  )
chisq_p_value
## [1] 0.007642894

We get the exact same pvalue from the package skedastic and by hand above.

Lets also ensure that we can replicate the test statistic.

skedastic_package_white
## # A tibble: 1 × 5
##   statistic p.value parameter method       alternative
##       <dbl>   <dbl>     <dbl> <chr>        <chr>      
## 1      22.4 0.00764         9 White's Test greater
# TEST CRITICAL VALUE 
white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)
## [1] 22.41859
test_stat <- white_auxillary_reg_summary$r.squared * nobs(white_auxillary_reg)

critical_value <- qchisq(p = 0.05, df = 9, lower.tail = FALSE)
if (test_stat > critical_value) {
  print("Reject the null hypothesis of homoskedasticity (evidence of heteroskedasticity).")
} else {
  print("Fail to reject the null hypothesis of homoskedasticity.")
}
## [1] "Reject the null hypothesis of homoskedasticity (evidence of heteroskedasticity)."

4 Extra: BP test

4.1 Breusch-Pagan test for heteroskedasticity

Same conclusion, which is usually the case.

The Breusch-Pagan test fits a linear regression model to the residuals of a linear regression model (by default the same explanatory variables are taken as in the main regression model) and rejects if too much of the variance is explained by the additional explanatory variables.

Under \(H_0\) the test statistic of the Breusch-Pagan test follows a chi-squared distribution with parameter (the number of regressors without the constant in the model) degrees of freedom.

lmtest_package_bp <- bptest(formula = model)
                      
# View the test results
lmtest_package_bp
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 7.0772, df = 3, p-value = 0.06948
skedastic_package_bp <- skedastic::breusch_pagan(model)

lmtest_package_bp
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 7.0772, df = 3, p-value = 0.06948
skedastic_package_bp
## # A tibble: 1 × 5
##   statistic p.value parameter method                alternative
##       <dbl>   <dbl>     <dbl> <chr>                 <chr>      
## 1      7.08  0.0695         3 Koenker (studentised) greater

lmtest::bptest gives the exact same outpout as skedastic::breusch_pagan, as expected.

The test statistic value is 7.07, follows a chi-squared distribution with parameter (the number of regressors without the constant in the model) degrees of freedom, and it has a p-value of 0.069- thus we reject the null of homoskedasticity !