There is a very small data set ‘centering_2.csv’ containing a response y and a predictor x whose values are large and tightly clustered (100000 to 100010). Load the data set, fit a quadratic least squares model in x, and check the associated collinearity diagnostics before and after centering x at its mean.
# Load the centering example; the file's first row supplies the column names.
center <- read.csv(file = "centering_2.csv", header = TRUE)
# Print the whole data frame for a quick look at the raw values.
center
## case y x y.1
## 1 1 19999995 100000 19999992
## 2 2 20000002 100000 20000043
## 3 3 19200352 100002 19200413
## 4 4 19200365 100002 19200416
## 5 5 18400780 100004 18400785
## 6 6 18400761 100004 18400783
## 7 7 17601123 100006 17601116
## 8 8 17601115 100006 17601084
## 9 9 16801463 100008 16801469
## 10 10 16801485 100008 16801450
## 11 11 16001814 100010 16001790
## 12 12 16001853 100010 16001794
# Store the squared (uncentered) predictor as its own column so it can be
# used as the quadratic term in the model formula.
center[["x2"]] <- center[["x"]]^2
center
## case y x y.1 x2
## 1 1 19999995 100000 19999992 10000000000
## 2 2 20000002 100000 20000043 10000000000
## 3 3 19200352 100002 19200413 10000400004
## 4 4 19200365 100002 19200416 10000400004
## 5 5 18400780 100004 18400785 10000800016
## 6 6 18400761 100004 18400783 10000800016
## 7 7 17601123 100006 17601116 10001200036
## 8 8 17601115 100006 17601084 10001200036
## 9 9 16801463 100008 16801469 10001600064
## 10 10 16801485 100008 16801450 10001600064
## 11 11 16001814 100010 16001790 10002000100
## 12 12 16001853 100010 16001794 10002000100
# Quadratic fit on the raw predictor. The summary below reports x2 as
# "not defined because of singularities" (NA row), i.e. lm() could not
# separate x2 from the intercept and x at this scale.
quad_fit <- lm(formula = y ~ x + x2, data = center)
summary(quad_fit)
##
## Call:
## lm(formula = y ~ x + x2, data = center)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.809 -12.121 0.475 9.374 38.523
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.000e+10 1.701e+05 235159 <2e-16 ***
## x -3.998e+05 1.701e+00 -235053 <2e-16 ***
## x2 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.13 on 10 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 5.525e+10 on 1 and 10 DF, p-value: < 2.2e-16
#library("car")
#vif(quad_fit)
# Could collinearity be a problem?
# NOTE(review): vif() is presumably disabled here because x2 is aliased (NA)
# in quad_fit above -- confirm before re-enabling.
# Sample mean of the predictor, used as the centering constant.
xbar <- mean(center[["x"]])
xbar
## [1] 100005
# xc is the predictor shifted by its mean; xc2 is the square of that
# centered value (not the centered version of x2).
center[["xc"]] <- center[["x"]] - xbar
center[["xc2"]] <- center[["xc"]]^2
center
## case y x y.1 x2 xc xc2
## 1 1 19999995 100000 19999992 10000000000 -5 25
## 2 2 20000002 100000 20000043 10000000000 -5 25
## 3 3 19200352 100002 19200413 10000400004 -3 9
## 4 4 19200365 100002 19200416 10000400004 -3 9
## 5 5 18400780 100004 18400785 10000800016 -1 1
## 6 6 18400761 100004 18400783 10000800016 -1 1
## 7 7 17601123 100006 17601116 10001200036 1 1
## 8 8 17601115 100006 17601084 10001200036 1 1
## 9 9 16801463 100008 16801469 10001600064 3 9
## 10 10 16801485 100008 16801450 10001600064 3 9
## 11 11 16001814 100010 16001790 10002000100 5 25
## 12 12 16001853 100010 16001794 10002000100 5 25
# Same quadratic model, now expressed in the centered predictor and its
# square; the summary below shows an estimate for every term.
quad_fit_c <- lm(formula = y ~ xc + xc2, data = center)
summary(quad_fit_c)
##
## Call:
## lm(formula = y ~ xc + xc2, data = center)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.857 -13.108 2.477 8.215 27.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.800e+07 7.801e+00 2.307e+06 <2e-16 ***
## xc -3.998e+05 1.484e+00 -2.693e+05 <2e-16 ***
## xc2 -1.033e+00 5.082e-01 -2.032e+00 0.0727 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.56 on 9 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.627e+10 on 2 and 9 DF, p-value: < 2.2e-16
# Attach car for vif(), then report variance inflation factors for the
# centered fit (both print as 1 below).
library(car)
## Warning: package 'car' was built under R version 3.2.5
vif(quad_fit_c)
## xc xc2
## 1 1
```