I have used database wooldridge package with database hprice1, p. 73

library(wooldridge)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
library(outliers)

Heteroskedasticity

Heteroskedasticity means violation of the constant error dispersion assumption.

Wooldridge database of the hprice1 - prices of the houses

# Load the longley dataset
data(hprice1)

# Estimate a linear regression model of GNP on all other variables
model1 <<- lm(price ~ +1 + lotsize   + sqrft + bdrms, data = hprice1)
summary(model1)
## 
## Call:
## lm(formula = price ~ +1 + lotsize + sqrft + bdrms, data = hprice1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -120.026  -38.530   -6.555   32.323  209.376 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.177e+01  2.948e+01  -0.739  0.46221    
## lotsize      2.068e-03  6.421e-04   3.220  0.00182 ** 
## sqrft        1.228e-01  1.324e-02   9.275 1.66e-14 ***
## bdrms        1.385e+01  9.010e+00   1.537  0.12795    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.83 on 84 degrees of freedom
## Multiple R-squared:  0.6724, Adjusted R-squared:  0.6607 
## F-statistic: 57.46 on 3 and 84 DF,  p-value: < 2.2e-16
plot(model1)     # various pictures describing the behaviour of the errors

bptest(model1)   # Breusch-Pagan test of the heteroskedasticity
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 14.092, df = 3, p-value = 0.002782

In the previous pictures, there are some problems with the outliers. The scale-location picture depict also some aoutliers, which can cause the heteroskedasticity problem. The Breusch - Pagan test proved the presence of the heteroskedasticity on the 10 percent significance level.

Justification of the model - normalization by dividing all the variables by the variable causing the heteroskedasticity

attach(hprice1)
plot(lotsize^2,model1$residuals^2)

gqtest(model1, order.by = lotsize )
## 
##  Goldfeld-Quandt test
## 
## data:  model1
## GQ = 1.6343, df1 = 40, df2 = 40, p-value = 0.06225
## alternative hypothesis: variance increases from segment 1 to 2

Based on the Goldfeld Quandt test we assume the lotsize variable can cause the heteroskedasticity. That is,

# Estimate a linear regression model on all other variables
model2 <<- lm(I(price/lotsize) ~ +1 + I(1/lotsize)   + I(sqrft/lotsize) + I(bdrms/lotsize), data = hprice1)
summary(model2)
## 
## Call:
## lm(formula = I(price/lotsize) ~ +1 + I(1/lotsize) + I(sqrft/lotsize) + 
##     I(bdrms/lotsize), data = hprice1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.024877 -0.003984 -0.000638  0.003628  0.037068 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.007358   0.001722   4.274 5.04e-05 ***
## I(1/lotsize)     21.904577  30.415953   0.720    0.473    
## I(sqrft/lotsize)  0.097292   0.008917  10.911  < 2e-16 ***
## I(bdrms/lotsize)  3.837127   7.042928   0.545    0.587    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007588 on 84 degrees of freedom
## Multiple R-squared:  0.945,  Adjusted R-squared:  0.943 
## F-statistic: 480.8 on 3 and 84 DF,  p-value: < 2.2e-16
plot(model2)

bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 5.9642, df = 3, p-value = 0.1134

Dealing the heteroskedasticity problem with elimination outliers

plot(model1)

bptest(model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 14.092, df = 3, p-value = 0.002782

The 77th observation seems to be the outlier, which signifficantly damages the estimations. let us exclude it.

udaje <- hprice1[-77,]
model3 <<- lm(price ~ +1 + lotsize   + sqrft + bdrms, data = udaje)
summary(model3)
## 
## Call:
## lm(formula = price ~ +1 + lotsize + sqrft + bdrms, data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.072  -30.340   -0.739   31.913  211.138 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -28.301170  25.149383  -1.125  0.26370    
## lotsize       0.009195   0.001363   6.748 1.88e-09 ***
## sqrft         0.086495   0.012949   6.680 2.55e-09 ***
## bdrms        20.481920   7.767117   2.637  0.00998 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51 on 83 degrees of freedom
## Multiple R-squared:  0.7646, Adjusted R-squared:  0.7561 
## F-statistic: 89.89 on 3 and 83 DF,  p-value: < 2.2e-16
plot(model3)

bptest(model3)
## 
##  studentized Breusch-Pagan test
## 
## data:  model3
## BP = 2.6144, df = 3, p-value = 0.455

Excluding the variable, the heteroskedasticity dismissed.plot(sqrft2,model1$residuals2) gqtest(model1, order.by = sqrft )

Treating the original model heteroskedasticity with logarithmic transformation

# Estimate a linear regression model of GNP on all other variables
model2 <<- lm(I(log(price)) ~ +1 + I(log(lotsize))   + I(log(sqrft)) + I(log(bdrms)), data = hprice1)
summary(model2)
## 
## Call:
## lm(formula = I(log(price)) ~ +1 + I(log(lotsize)) + I(log(sqrft)) + 
##     I(log(bdrms)), data = hprice1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67739 -0.09737 -0.00452  0.10663  0.66739 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.42882    0.64003  -2.232   0.0282 *  
## I(log(lotsize))  0.16954    0.03849   4.405 3.10e-05 ***
## I(log(sqrft))    0.71687    0.09345   7.671 2.75e-11 ***
## I(log(bdrms))    0.09925    0.10201   0.973   0.3334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1855 on 84 degrees of freedom
## Multiple R-squared:  0.6394, Adjusted R-squared:  0.6265 
## F-statistic: 49.64 on 3 and 84 DF,  p-value: < 2.2e-16
plot(model2)

bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 4.9071, df = 3, p-value = 0.1787

Treating the problem with the White’s heteroskedasticity consistent matrix

White’s heteroskedasticity consistent matrix.

\[(X^TX)^{-1}(X^T diag(e_1^2,e_2^2, \dots, e_n^2)X)(X^TX)^{-1}\]

# Load libraries
library("lmtest")
library("sandwich")

# Robust t test
coeftest(model1, vcov = vcovHC(model1, type = "HC0"))
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept) -21.7703081  36.2843444 -0.6000   0.55013    
## lotsize       0.0020677   0.0012227  1.6912   0.09451 .  
## sqrft         0.1227782   0.0173178  7.0897 3.883e-10 ***
## bdrms        13.8525217   8.2836880  1.6723   0.09819 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1