#1.1
#Heteroskedasticity refers to unequal variance of the errors (residuals) in a regression model
#across the range of the independent variable(s) (predictor(s)). This means that the spread of the
#residuals is not constant. Under OLS the coefficient estimates remain unbiased, but heteroskedasticity
#biases the usual standard errors, so hypothesis tests and confidence intervals can be misleading and
#the estimator is no longer efficient.
#It's important to distinguish between heteroskedasticity and other econometric issues like multicollinearity
#or serial correlation. Multicollinearity occurs when two or more predictor variables in a regression model are
#highly correlated with each other, which can make it difficult to estimate the unique contribution of each predictor
#variable to the dependent variable. Serial correlation (also known as autocorrelation) occurs when the errors in a regression
#model are correlated across time, which violates the assumption of independence between the errors.
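# A minimal simulation sketch (my own illustration, not part of the assignment data): when the error
# standard deviation grows with x, the residuals fan out, which is exactly the pattern heteroskedasticity describes.
set.seed(123)
x <- runif(200, 0, 10)
y_homo <- 2 + 0.5 * x + rnorm(200, sd = 1)          # constant error variance
y_hetero <- 2 + 0.5 * x + rnorm(200, sd = 0.3 * x)  # error variance increases with x
par(mfrow = c(1, 2))
plot(x, resid(lm(y_homo ~ x)), main = "Homoskedastic", ylab = "Residuals")
plot(x, resid(lm(y_hetero ~ x)), main = "Heteroskedastic", ylab = "Residuals")
par(mfrow = c(1, 1))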
#1.2
#The Breusch-Pagan test estimates a regression model of the squared residuals on the
#independent variables, and then tests whether the coefficients of the independent variables
#in this regression are jointly statistically significant
#The White test estimates a regression model of the squared residuals on the independent variables,
#their squared terms, and their cross-products
#In both tests the null hypothesis is that the error variance is constant (homoskedasticity);
#the alternative hypothesis is that the variance of the errors is not constant, indicating heteroskedasticity
# I find the logic to be reasonable
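# As a quick preview of that logic, both tests can be run with lmtest::bptest() (the lmtest package is
# my addition here, not part of the assignment); supplying a varformula with squares and cross-products
# turns the Breusch-Pagan auxiliary regression into White's version.
library(lmtest)
m <- lm(mpg ~ disp + hp + wt, data = mtcars)
bptest(m)  # Breusch-Pagan: squared residuals regressed on the original predictors
bptest(m, ~ disp * hp + disp * wt + hp * wt + I(disp^2) + I(hp^2) + I(wt^2),
       data = mtcars)  # White-style: add squares and cross-products to the auxiliary regression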
datasets::mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
#Performing Regression
# Load the mtcars dataset
data(mtcars)
# Fit a linear regression model with mpg as the response variable and disp, hp, and wt as the predictor variables
model <- lm(mpg ~ disp + hp + wt, data = mtcars)
# Print the fitted model (coefficients only); the full summary follows below
model
##
## Call:
## lm(formula = mpg ~ disp + hp + wt, data = mtcars)
##
## Coefficients:
## (Intercept) disp hp wt
## 37.105505 -0.000937 -0.031157 -3.800891
summary(model)
##
## Call:
## lm(formula = mpg ~ disp + hp + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.891 -1.640 -0.172 1.061 5.861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.105505 2.110815 17.579 < 2e-16 ***
## disp -0.000937 0.010350 -0.091 0.92851
## hp -0.031157 0.011436 -2.724 0.01097 *
## wt -3.800891 1.066191 -3.565 0.00133 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8083
## F-statistic: 44.57 on 3 and 28 DF, p-value: 8.65e-11
# Fit a linear regression model with mpg as the response variable and disp, hp, and wt as the predictor variables
model <- lm(mpg ~ disp + hp + wt, data = mtcars)
# Generate a residual plot
plot(model, which = 1)
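# A scale-location plot (also from base R's plot.lm) is another informal check: if the smoothed trend
# line slopes upward, the residual spread grows with the fitted values, suggesting heteroskedasticity.
plot(model, which = 3)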
# Load the skedastic package
library(skedastic)
## Warning: package 'skedastic' was built under R version 4.2.3

# Load the mtcars dataset
data(mtcars)
# Fit a linear regression model with mpg as the response variable and disp, hp, and wt as the predictor variables
model <- lm(mpg ~ disp + hp + wt, data = mtcars)
# Extract the residuals from the fitted model
residuals <- residuals(model)
# Square the residuals to obtain the dependent variable for the auxiliary regression
squared_residuals <- residuals^2
# White's auxiliary regression: squared residuals on the predictors, their squares, and their cross-products
aux_white <- lm(squared_residuals ~ disp + hp + wt +
                  I(disp^2) + I(hp^2) + I(wt^2) +
                  disp:hp + disp:wt + hp:wt, data = mtcars)
# The White test statistic is n * R-squared of the auxiliary regression, which follows a chi-squared
# distribution with degrees of freedom equal to the number of auxiliary regressors (excluding the intercept)
n <- nrow(mtcars)
r2_white <- summary(aux_white)$r.squared
df <- length(coef(aux_white)) - 1
test_stat <- n * r2_white
# Calculate the p-value using the chi-squared distribution
p_value <- pchisq(test_stat, df, lower.tail = FALSE)
# Print the test statistic and p-value
cat("White's test statistic:", test_stat, "\n")
cat("p-value:", p_value, "\n")
#If the p-value is less than the significance level (usually 0.05), we reject
#the null hypothesis of homoskedasticity and conclude that there is evidence of heteroskedasticity in the model
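# For comparison, the skedastic package loaded above ships ready-made versions of these tests. This is
# only a hedged sketch: the function names below are taken from the package documentation as I recall it
# (the White test lives in white_lm() in older releases and white() in newer ones), so verify them
# against the installed version before relying on the output.
skedastic::breusch_pagan(model)
skedastic::white_lm(model, interactions = TRUE)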
#2.3
# Fit a linear regression model with mpg as the response variable and disp, hp, and wt as the predictor variables
model <- lm(mpg ~ disp + hp + wt, data = mtcars)
# Obtain the residuals from the main regression
residuals <- residuals(model)
# Square the residuals to obtain the dependent variable for the auxiliary regression
squared_residuals <- residuals^2
# Fit an auxiliary regression with the squared residuals as the response variable and the original predictors as the independent variables
aux_model <- lm(squared_residuals ~ disp + hp + wt, data = mtcars)
# Obtain the R-squared value of the auxiliary regression
r_squared <- summary(aux_model)$r.squared
# Print the R-squared value
print(r_squared)
## [1] 0.02955931
# Print the R-squared value and interpret it
cat("R-squared value of the auxiliary regression:", r_squared, "\n")
## R-squared value of the auxiliary regression: 0.02955931
if (r_squared < 0.2) {
cat("The low R-squared value suggests that the original predictors explain little of the variance in the squared residuals, so this auxiliary regression provides little evidence of heteroscedasticity.\n")
} else {
cat("The high R-squared value suggests that the original predictors explain a substantial portion of the variance in the squared residuals, which points to heteroscedasticity being present.\n")
}
## The low R-squared value suggests that the original predictors explain little of the variance in the squared residuals, so this auxiliary regression provides little evidence of heteroscedasticity.
# Conduct a chi-squared test on n * R-squared (the Breusch-Pagan LM statistic) to formally test the significance of the R-squared value
n <- nrow(mtcars)
df <- 3 # number of predictors in the auxiliary regression
p_value <- 1 - pchisq(n * r_squared, df)
# Print the results of the chi-squared test and interpret it
cat("Chi-squared test for the significance of R-squared:\n")
## Chi-squared test for the significance of R-squared:
cat("Test statistic:", n * r_squared, "\n")
## Test statistic: 0.945898
cat("Degrees of freedom:", df, "\n")
## Degrees of freedom: 3
cat("p-value:", p_value, "\n")
## p-value: 0.8143398
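# Worked check: LM = n * R-squared = 32 * 0.0296 = 0.946, well below the 5% critical value of a
# chi-squared distribution with 3 degrees of freedom (about 7.81), which is consistent with the large p-value above.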
if (p_value < 0.05) {
cat("The p-value is less than 0.05, so we reject the null hypothesis of homoskedasticity: the squared residuals are significantly related to the predictors, indicating heteroscedasticity.\n")
} else {
cat("The p-value is greater than 0.05, so we fail to reject the null hypothesis of homoskedasticity: there is no significant evidence of heteroscedasticity in this model.\n")
}
## The p-value is greater than 0.05, so we fail to reject the null hypothesis of homoskedasticity: there is no significant evidence of heteroscedasticity in this model.