There is a very small data set ‘mileage.csv’ that shows car mileage versus car speed for a random sample of vehicles. Load the data set, conduct some exploratory analysis, obtain a least squares line for the data and check some associated diagnostics.
cars <- read.csv("mileage.csv", header=TRUE)
cars
## case mpg spd
## 1 1 22 35
## 2 2 20 35
## 3 3 28 40
## 4 4 31 40
## 5 5 37 45
## 6 6 38 45
## 7 7 41 50
## 8 8 39 50
## 9 9 34 55
## 10 10 37 55
## 11 11 27 60
## 12 12 30 60
plot(cars$mpg ~ cars$spd)
fit <- lm(cars$mpg ~ cars$spd)
summary(fit)
##
## Call:
## lm(formula = cars$mpg ~ cars$spd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.143 -5.929 0.500 5.914 8.171
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.2571 10.4963 1.549 0.152
## cars$spd 0.3314 0.2175 1.524 0.159
##
## Residual standard error: 6.433 on 10 degrees of freedom
## Multiple R-squared: 0.1885, Adjusted R-squared: 0.1073
## F-statistic: 2.322 on 1 and 10 DF, p-value: 0.1585
plot(cars$mpg ~ cars$spd)
abline(fit)
plot(fit, which = 1:2)
cars <- read.csv("mileage.csv", header=TRUE)
cars$spd_squared <- cars$spd^2
cars
## case mpg spd spd_squared
## 1 1 22 35 1225
## 2 2 20 35 1225
## 3 3 28 40 1600
## 4 4 31 40 1600
## 5 5 37 45 2025
## 6 6 38 45 2025
## 7 7 41 50 2500
## 8 8 39 50 2500
## 9 9 34 55 3025
## 10 10 37 55 3025
## 11 11 27 60 3600
## 12 12 30 60 3600
quad_fit <- lm(mpg ~ spd + spd_squared, data = cars)
summary(quad_fit)
##
## Call:
## lm(formula = mpg ~ spd + spd_squared, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.03214 -0.58929 0.03393 1.10893 2.10000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.826e+02 1.768e+01 -10.33 2.73e-06 ***
## spd 8.983e+00 7.616e-01 11.80 8.91e-07 ***
## spd_squared -9.107e-02 7.993e-03 -11.39 1.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.727 on 9 degrees of freedom
## Multiple R-squared: 0.9474, Adjusted R-squared: 0.9357
## F-statistic: 81.03 on 2 and 9 DF, p-value: 1.757e-06
plot(quad_fit, which = 1:2)
library("car")
vif(quad_fit)
## spd spd_squared
## 170.2187 170.2187
# Create a set of x-values for the line plot and use these to make new data for plotting our line.
xvalues <- seq(35, 65, 0.1)
# Have to use the same variable names as original data (is this a bug?) This gives us the y-hats to plot our line.
newcars = data.frame(spd=xvalues, spd_squared=xvalues^2)
predmpgs <- predict(quad_fit, newcars)
#Now plot the points and the line
plot(cars$mpg ~ cars$spd, xlab = "Speed (MPH)", ylab = "MPG")
lines(xvalues, predmpgs)
#Diagnostic plots
plot(quad_fit, which = 1:2)
# Could Collinearity be a problem
library("car")
vif(quad_fit)
## spd spd_squared
## 170.2187 170.2187
# Could Collinearity be a problem
xbar <- mean(cars$spd)
xbar
## [1] 47.5
cars$spdc <- cars$spd - xbar
cars$spdc2 <- cars$spdc^2
cars
## case mpg spd spd_squared spdc spdc2
## 1 1 22 35 1225 -12.5 156.25
## 2 2 20 35 1225 -12.5 156.25
## 3 3 28 40 1600 -7.5 56.25
## 4 4 31 40 1600 -7.5 56.25
## 5 5 37 45 2025 -2.5 6.25
## 6 6 38 45 2025 -2.5 6.25
## 7 7 41 50 2500 2.5 6.25
## 8 8 39 50 2500 2.5 6.25
## 9 9 34 55 3025 7.5 56.25
## 10 10 37 55 3025 7.5 56.25
## 11 11 27 60 3600 12.5 156.25
## 12 12 30 60 3600 12.5 156.25
quad_fit_c <- lm(mpg ~ spdc + spdc2, data = cars)
summary(quad_fit_c)
##
## Call:
## lm(formula = mpg ~ spdc + spdc2, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.03214 -0.58929 0.03393 1.10893 2.10000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.640625 0.766888 50.386 2.4e-12 ***
## spdc 0.331429 0.058372 5.678 0.000303 ***
## spdc2 -0.091071 0.007993 -11.394 1.2e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.727 on 9 degrees of freedom
## Multiple R-squared: 0.9474, Adjusted R-squared: 0.9357
## F-statistic: 81.03 on 2 and 9 DF, p-value: 1.757e-06
# get new y-hats for lots of x-values to make our curve
# again, we have to use the same variable names used for quad_fit_c
newcars_c = data.frame(spdc=xvalues-xbar, spdc2=(xvalues-xbar)^2)
#y-hats for these x-values
predmpgs_c <- predict(quad_fit_c, newcars_c)
head(predmpgs_c)
## 1 2 3 4 5 6
## 20.26786 20.52777 20.78586 21.04213 21.29657 21.54920
#Plot points and the new curve
plot(cars$mpg ~ cars$spd, xlab = "Speed (MPH)", ylab = "MPG")
lines(xvalues, predmpgs_c)
As you can see it is exactly the same curve! So why go to the trouble?
# Could Collinearity be a problem
vif(quad_fit_c)
## spdc spdc2
## 1 1