Include knit_hooks$set(webgl = hook_webgl) in the setup chunk for rgl graphics.

In this document, we are going to examine the consequences of correlated explanatory variables on statistical models.

We are going to work with the mtcars data, available in base R. For our purposes, we are only going to use the following variables:

Variable
mpg Miles per gallon
hp horse power
wt weight

for educational purposes, we are going to create very correlated hp and wt variables.

1 Exploratory Data Analysis

plot(data)

cor(data) %>% round(2) %>% kable
mpg hp wt
mpg 1.00 -0.78 -0.76
hp -0.78 1.00 0.97
wt -0.76 0.97 1.00
# library(rgl)
# plot3d(data)
# plot_ly(data, x=~hp, y=~wt, z=~mpg, type="scatter3d", mode="markers")

2 Modelling

2.1 Simple linear Regression

par(mfrow=c(1,2))
summary(mod1 <- lm(mpg~hp, data=data))
## 
## Call:
## lm(formula = mpg ~ hp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7121 -2.1122 -0.8854  1.5819  8.2360 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.09886    1.63392  18.421  < 2e-16 ***
## hp          -0.06823    0.01012  -6.742 1.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
## F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07
plot(data$hp, data$mpg)
abline(mod1)

summary(mod2 <- lm(mpg~wt, data=data))
## 
## Call:
## lm(formula = mpg ~ wt, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2033 -2.5261 -0.6577  2.0508  8.8873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.7940     2.1169  15.491 7.42e-16 ***
## wt           -4.4219     0.6946  -6.366 5.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.996 on 30 degrees of freedom
## Multiple R-squared:  0.5746, Adjusted R-squared:  0.5604 
## F-statistic: 40.52 on 1 and 30 DF,  p-value: 5.04e-07
plot(data$wt, data$mpg)
abline(mod2)

summary(mod3 <- lm(wt~hp, data=data))
## 
## Call:
## lm(formula = wt ~ hp, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41663 -0.19074 -0.05022  0.19443  0.61574 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.734510   0.112542   6.527 3.23e-07 ***
## hp          0.014577   0.000697  20.914  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2661 on 30 degrees of freedom
## Multiple R-squared:  0.9358, Adjusted R-squared:  0.9337 
## F-statistic: 437.4 on 1 and 30 DF,  p-value: < 2.2e-16
plot(data$hp, data$wt)
abline(mod3)

On the last figure, we can see that the two explanatory variables hp and wt are quite strongly correlated.

2.2 Multiple linear Regression

summary(mod4 <- lm(mpg~wt+hp, data=data))
## 
## Call:
## lm(formula = mpg ~ wt + hp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8997 -2.2392 -0.8272  1.6953  8.2867 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.57817    2.58255  11.840 1.25e-12 ***
## wt          -0.65255    2.69325  -0.242    0.810    
## hp          -0.05872    0.04058  -1.447    0.159    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.925 on 29 degrees of freedom
## Multiple R-squared:  0.6032, Adjusted R-squared:  0.5759 
## F-statistic: 22.05 on 2 and 29 DF,  p-value: 1.509e-06
summary(mod5 <- lm(mpg~I(wt-hp) + hp, data=data))
## 
## Call:
## lm(formula = mpg ~ I(wt - hp) + hp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8997 -2.2392 -0.8272  1.6953  8.2867 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.5782     2.5825  11.840 1.25e-12 ***
## I(wt - hp)   -0.6526     2.6933  -0.242    0.810    
## hp           -0.7113     2.6540  -0.268    0.791    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.925 on 29 degrees of freedom
## Multiple R-squared:  0.6032, Adjusted R-squared:  0.5759 
## F-statistic: 22.05 on 2 and 29 DF,  p-value: 1.509e-06

Note that both variables are not significant anymore!

library(reshape2)
hp <- seq(min(data$hp),max(data$hp),len=30)
wt <- seq(min(data$wt),max(data$wt),len=30)

carsfit <- lm(mpg~hp+wt,data)
plot.cars <- expand.grid(hp = hp,wt = wt)
plot.cars$mpg.pred <- predict(carsfit,newdata=plot.cars)
carsheight <- dcast(plot.cars,hp~wt,value.var="mpg.pred")[-1]

par3d(scale=c(0.025,1.8,0.3),cex=.5)
points3d(data$hp,data$wt,data$mpg)
surface3d(hp,wt,as.matrix(carsheight),col="blue",alpha=.5)
axes3d()
title3d(xlab="HP",ylab="Weight",zlab="MPG")

You must enable Javascript to view this page properly.

2.3 MSE

mods <- list(mod1,mod2,mod4,mod5)
MSE <- NULL
for(i in 1:4){
   MSE[i] <- mean((predict(mods[[i]],data) - data$mpg)^2)
}
data.frame(Model = c("mod1","mod2","mod4","mod5"), MSE = MSE %>% round(2)) %>% kable
Model MSE
mod1 13.99
mod2 14.97
mod4 13.96
mod5 13.96

3 Conclusion

Be aware that explanatory variables in a linear regression have to be interpreted ceteris paribus. Therefore, including correlated explanatory variables in a linear regression reduces the information available and therefore the predicting power of the model.