There is a very small data set ‘mileage.csv’ that shows car mileage versus car speed for a random sample of vehicles. Load the data set, conduct some exploratory analysis, obtain a least squares line for the data and check some associated diagnostics.

cars <- read.csv("mileage.csv", header=TRUE)
cars
##    case mpg spd
## 1     1  22  35
## 2     2  20  35
## 3     3  28  40
## 4     4  31  40
## 5     5  37  45
## 6     6  38  45
## 7     7  41  50
## 8     8  39  50
## 9     9  34  55
## 10   10  37  55
## 11   11  27  60
## 12   12  30  60
plot(cars$mpg ~ cars$spd)
fit <- lm(cars$mpg ~ cars$spd)
summary(fit)
## 
## Call:
## lm(formula = cars$mpg ~ cars$spd)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.143 -5.929  0.500  5.914  8.171 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  16.2571    10.4963   1.549    0.152
## cars$spd      0.3314     0.2175   1.524    0.159
## 
## Residual standard error: 6.433 on 10 degrees of freedom
## Multiple R-squared:  0.1885, Adjusted R-squared:  0.1073 
## F-statistic: 2.322 on 1 and 10 DF,  p-value: 0.1585
plot(cars$mpg ~ cars$spd)
abline(fit)

plot(fit, which = 1:2)

Try quadratic model

cars <- read.csv("mileage.csv", header=TRUE)

cars$spd_squared <- cars$spd^2
cars
##    case mpg spd spd_squared
## 1     1  22  35        1225
## 2     2  20  35        1225
## 3     3  28  40        1600
## 4     4  31  40        1600
## 5     5  37  45        2025
## 6     6  38  45        2025
## 7     7  41  50        2500
## 8     8  39  50        2500
## 9     9  34  55        3025
## 10   10  37  55        3025
## 11   11  27  60        3600
## 12   12  30  60        3600
quad_fit <- lm(mpg ~ spd + spd_squared, data = cars)

summary(quad_fit)
## 
## Call:
## lm(formula = mpg ~ spd + spd_squared, data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.03214 -0.58929  0.03393  1.10893  2.10000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.826e+02  1.768e+01  -10.33 2.73e-06 ***
## spd          8.983e+00  7.616e-01   11.80 8.91e-07 ***
## spd_squared -9.107e-02  7.993e-03  -11.39 1.20e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.727 on 9 degrees of freedom
## Multiple R-squared:  0.9474, Adjusted R-squared:  0.9357 
## F-statistic: 81.03 on 2 and 9 DF,  p-value: 1.757e-06
plot(quad_fit, which = 1:2)

library("car")

vif(quad_fit)
##         spd spd_squared 
##    170.2187    170.2187

Plot of Quadratic regression line with data points.

# Create a set of x-values for the line plot and use these to make new data for plotting our line.
xvalues <- seq(35, 65, 0.1)

# Have to use the same variable names as original data (is this a bug?) This gives us the y-hats to plot our line.
newcars = data.frame(spd=xvalues, spd_squared=xvalues^2)
predmpgs <- predict(quad_fit, newcars)

#Now plot the points and the line
plot(cars$mpg ~ cars$spd, xlab = "Speed (MPH)", ylab = "MPG")
lines(xvalues, predmpgs)

#Diagnostic plots
plot(quad_fit, which = 1:2)

VIF Calculation for quad_fit

# Could Collinearity be a problem
library("car")
vif(quad_fit)
##         spd spd_squared 
##    170.2187    170.2187

Center the predictor and obtain a new model

# Could Collinearity be a problem
xbar <- mean(cars$spd)
xbar
## [1] 47.5
cars$spdc <- cars$spd - xbar
cars$spdc2 <- cars$spdc^2
cars
##    case mpg spd spd_squared  spdc  spdc2
## 1     1  22  35        1225 -12.5 156.25
## 2     2  20  35        1225 -12.5 156.25
## 3     3  28  40        1600  -7.5  56.25
## 4     4  31  40        1600  -7.5  56.25
## 5     5  37  45        2025  -2.5   6.25
## 6     6  38  45        2025  -2.5   6.25
## 7     7  41  50        2500   2.5   6.25
## 8     8  39  50        2500   2.5   6.25
## 9     9  34  55        3025   7.5  56.25
## 10   10  37  55        3025   7.5  56.25
## 11   11  27  60        3600  12.5 156.25
## 12   12  30  60        3600  12.5 156.25
quad_fit_c <- lm(mpg ~ spdc + spdc2, data = cars)

Output a summary of the model and plot points along with new curve. You should verify that the two models (centered and uncentered) are the same.

summary(quad_fit_c)
## 
## Call:
## lm(formula = mpg ~ spdc + spdc2, data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.03214 -0.58929  0.03393  1.10893  2.10000 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.640625   0.766888  50.386  2.4e-12 ***
## spdc         0.331429   0.058372   5.678 0.000303 ***
## spdc2       -0.091071   0.007993 -11.394  1.2e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.727 on 9 degrees of freedom
## Multiple R-squared:  0.9474, Adjusted R-squared:  0.9357 
## F-statistic: 81.03 on 2 and 9 DF,  p-value: 1.757e-06
# get new y-hats for lots of x-values to make our curve
# again, we have to use the same variable names used for quad_fit_c
newcars_c = data.frame(spdc=xvalues-xbar, spdc2=(xvalues-xbar)^2)

#y-hats for these x-values
predmpgs_c <- predict(quad_fit_c, newcars_c)
head(predmpgs_c)
##        1        2        3        4        5        6 
## 20.26786 20.52777 20.78586 21.04213 21.29657 21.54920
#Plot points and the new curve
plot(cars$mpg ~ cars$spd, xlab = "Speed (MPH)", ylab = "MPG")
lines(xvalues, predmpgs_c)

As you can see it is exactly the same curve! So why go to the trouble?

Check VIF with centered data

# Could Collinearity be a problem
vif(quad_fit_c)
##  spdc spdc2 
##     1     1