#Heteroskedasticity refers to unequal variance of the errors (residuals) in a regression model across the range of the independent variable(s). The spread of the residuals is not constant; while OLS coefficient estimates remain unbiased, the usual standard errors are biased, which invalidates the associated t-tests and confidence intervals. It is important to distinguish heteroskedasticity from other econometric issues such as multicollinearity and serial correlation. Multicollinearity occurs when two or more predictor variables are highly correlated with each other, which makes it difficult to estimate the unique contribution of each predictor to the dependent variable. Serial correlation (also known as autocorrelation) occurs when the errors are correlated across time, which violates the assumption of independence between the errors.
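#A minimal simulated illustration (not part of the analysis below): errors whose spread grows with x are heteroskedastic, and the resulting fan shape shows up in a residuals-vs-fitted plot.
set.seed(123)
x <- runif(100, 1, 10)
y <- 2 + 3 * x + rnorm(100, sd = 0.5 * x)  # error standard deviation grows with x
sim_fit <- lm(y ~ x)
plot(fitted(sim_fit), resid(sim_fit),
     xlab = "Fitted values", ylab = "Residuals",
     main = "Simulated heteroskedastic errors")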
#The Breusch-Pagan test estimates a regression of the squared residuals on the independent variables and then tests whether the coefficients of those variables are jointly statistically significant. The White test estimates a regression of the squared residuals on the independent variables, their squared terms, and their cross-products. In both tests the null hypothesis is that the error variance is constant; the alternative hypothesis is that it is not, indicating heteroskedasticity. I find this logic reasonable.
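#A minimal sketch of the Breusch-Pagan logic described above, using hypothetical names (fit for a fitted lm, x1/x2 for its predictors, dat for the data): regress the squared residuals on the predictors, then form the LM statistic n * R^2, which follows a chi-squared distribution under the null.
# aux_bp <- lm(resid(fit)^2 ~ x1 + x2, data = dat)
# bp_stat <- nobs(aux_bp) * summary(aux_bp)$r.squared
# pchisq(bp_stat, df = 2, lower.tail = FALSE)  # df = number of auxiliary regressors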
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.3
str(road)
## 'data.frame': 26 obs. of 6 variables:
## $ deaths : int 968 43 588 640 4743 566 325 118 115 1545 ...
## $ drivers: int 158 11 91 92 952 109 167 30 35 298 ...
## $ popden : num 64 0.4 12 34 100 ...
## $ rural : num 66 5.9 33 73 118 73 5.1 3.4 0 57 ...
## $ temp : int 62 30 64 51 65 42 37 41 44 67 ...
## $ fuel : num 119 6.2 65 74 105 78 95 20 23 216 ...
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
glimpse(road)
## Rows: 26
## Columns: 6
## $ deaths <int> 968, 43, 588, 640, 4743, 566, 325, 118, 115, 1545, 1302, 262, …
## $ drivers <int> 158, 11, 91, 92, 952, 109, 167, 30, 35, 298, 203, 41, 544, 254…
## $ popden <dbl> 64.0, 0.4, 12.0, 34.0, 100.0, 17.0, 518.0, 226.0, 12524.0, 91.…
## $ rural <dbl> 66.0, 5.9, 33.0, 73.0, 118.0, 73.0, 5.1, 3.4, 0.0, 57.0, 83.0,…
## $ temp <int> 62, 30, 64, 51, 65, 42, 37, 41, 44, 67, 54, 36, 33, 37, 30, 42…
## $ fuel <dbl> 119.0, 6.2, 65.0, 74.0, 105.0, 78.0, 95.0, 20.0, 23.0, 216.0, …
road <- MASS::road
#I chose “deaths” as my dependent variable, and “fuel”, “rural”, and “drivers” as my independent variables.
subset <- road[, c("deaths", "drivers", "rural", "fuel")]  # note: this name shadows base::subset()
# Fit a linear model with deaths as the dependent variable and drivers, rural, and fuel as the independent variables
lm_model <- lm(deaths ~ drivers + rural + fuel, data = subset)
lm_model
##
## Call:
## lm(formula = deaths ~ drivers + rural + fuel, data = subset)
##
## Coefficients:
## (Intercept) drivers rural fuel
## 91.515 4.605 2.526 -1.081
# Print a summary of the linear model
summary(lm_model)
##
## Call:
## lm(formula = deaths ~ drivers + rural + fuel, data = subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -445.67 -144.03 -31.76 107.98 884.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.5148 111.2220 0.823 0.419
## drivers 4.6046 0.3746 12.293 2.5e-11 ***
## rural 2.5261 1.8052 1.399 0.176
## fuel -1.0811 0.8660 -1.248 0.225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 280.9 on 22 degrees of freedom
## Multiple R-squared: 0.9226, Adjusted R-squared: 0.912
## F-statistic: 87.37 on 3 and 22 DF, p-value: 2.239e-12
# Create a plot of residuals vs. fitted values
plot(lm_model$fitted.values, lm_model$residuals, xlab = "Fitted Values", ylab = "Residuals")
# Add a horizontal reference line at y = 0 (abline works with base graphics; geom_hline is a ggplot2 layer and has no effect on a base plot)
abline(h = 0, col = "red")
# Create a normal probability plot of residuals
qqnorm(lm_model$residuals, main = "Normal Probability Plot of Residuals")
# Add a reference line for comparison
qqline(lm_model$residuals, col = "red")
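# As an additional visual check, base R's built-in scale-location plot shows
# sqrt(|standardized residuals|) against fitted values; a roughly flat trend
# suggests constant error variance
plot(lm_model, which = 3)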
# Create a categorical variable from drivers; the breaks must span the full range
# of the variable (which runs into the hundreds here), otherwise out-of-range
# values become NA and are dropped from the plot
road$drivers_group <- cut(road$drivers,
                          breaks = c(0, 50, 100, 200, 400, 1000),
                          include.lowest = TRUE)
# Create a scatter plot of deaths against fuel, rural, and drivers
ggplot(road, aes(x=fuel, y=deaths, color=rural, shape=drivers_group)) +
geom_point() +
labs(x = "Fuel consumption", y = "Number of deaths", color = "Rural",
shape = "Drivers") +
scale_shape_manual(values = c(1, 2, 3, 4, 5)) +
theme_minimal()
library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(skedastic)
## Warning: package 'skedastic' was built under R version 4.2.3
white_test <- white(mainlm = lm_model, interactions = TRUE)
white_test
## # A tibble: 1 × 5
## statistic p.value parameter method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 2.62 0.978 9 White's Test greater
#With a p-value of 0.978, we fail to reject the null hypothesis of homoskedasticity.
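# As a cross-check (a quick sketch using lmtest, loaded above; its output is not reproduced here), the Breusch-Pagan test can be run directly on the fitted model:
bptest(lm_model)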
# Calculate the residuals and squared residuals
subset$residuals <- resid(lm_model)
subset$squared_residuals <- subset$residuals^2
# Fit the auxiliary regression model
white_auxiliary_reg <- lm(squared_residuals ~ drivers + rural + fuel +
I(drivers^2) + I(rural^2) + I(fuel^2) +
drivers:rural + drivers:fuel + rural:fuel,
data = subset)
# Print the summary of the auxiliary regression
summary(white_auxiliary_reg)
##
## Call:
## lm(formula = squared_residuals ~ drivers + rural + fuel + I(drivers^2) +
## I(rural^2) + I(fuel^2) + drivers:rural + drivers:fuel + rural:fuel,
## data = subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -153569 -46792 -11106 3858 654242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163752.825 113599.860 1.441 0.169
## drivers -4221.136 7973.821 -0.529 0.604
## rural -4374.329 5233.598 -0.836 0.416
## fuel 6475.396 12224.267 0.530 0.604
## I(drivers^2) 1.564 12.132 0.129 0.899
## I(rural^2) 33.180 49.730 0.667 0.514
## I(fuel^2) -8.337 33.748 -0.247 0.808
## drivers:rural 16.540 109.764 0.151 0.882
## drivers:fuel 4.773 16.429 0.291 0.775
## rural:fuel -34.284 157.600 -0.218 0.831
##
## Residual standard error: 181600 on 16 degrees of freedom
## Multiple R-squared: 0.1006, Adjusted R-squared: -0.4053
## F-statistic: 0.1989 on 9 and 16 DF, p-value: 0.9907
# Calculate the test statistic and p-value
white_test_stat <- summary(white_auxiliary_reg)$r.squared * nrow(subset)
p_value <- 1 - pchisq(white_test_stat, df = 9)
# Print the test statistic and p-value
cat("White's test statistic:", white_test_stat, "\n")
## White's test statistic: 2.615834
cat("p-value:", p_value, "\n")
## p-value: 0.9776043
#We obtain the same p-value, 0.9776043, as the white() function from skedastic.
summary(white_auxiliary_reg)$r.squared * nobs(white_auxiliary_reg)  # same statistic computed via nobs()
## [1] 2.615834
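# For comparison, the 5% critical value of the chi-squared distribution with 9 degrees of freedom:
qchisq(0.95, df = 9)
## [1] 16.91898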
#The test statistic (2.62) is well below the 5% critical value (16.92), so we fail to reject the null hypothesis of homoskedasticity.