## Downloading github repo hadley/fueleconomy@master
## Installing fueleconomy
## '/Library/Frameworks/R.framework/Resources/bin/R' --vanilla CMD INSTALL \
## '/private/var/folders/hm/8t_hdfyd4pnbr0lg32923mkw0000gn/T/Rtmpap3axv/devtools6056471b567/hadley-fueleconomy-188fa25' \
## --library='/Library/Frameworks/R.framework/Versions/3.1/Resources/library' \
## --install-tests
Before I proceed with analyzing this data set, I want to remove any entries (rows) that contain ‘na’ values:
v = vehicles[,c('year','displ','hwy')]
v = v[complete.cases(v),]
I also want to subtract 1984 from the year values, so that our intercept values will make more sense during the interpretation of the model statistics. The year values will now mean “years since 1984”.
v$year = v$year - 1984
plot(v,pch=19, cex=1, col="#00000010")
The only obvious outliers here are where we seem to have data points where engine displacement = 0. This cannot be right, so I’ll remove these entries and replot the scattergram.
v = v[v$displ > 0,]
plot(v,pch=19, cex=1, col="#00000010")
cor(v)
## year displ hwy
## year 1.00000000 0.06278627 0.2075190
## displ 0.06278627 1.00000000 -0.7186687
## hwy 0.20751902 -0.71866866 1.0000000
I’ve chosen car model year and engine displacement as independent variables because these are vehicle specifications that cannot be changed. The highway fuel economy, on the other hand, is likely a function of a variety of factors. These likely include (but are not limited to) the model year and engine displacement.
mpg.lm_H1 <- lm(v$hwy ~ v$displ)
summary(mpg.lm_H1)
##
## Call:
## lm(formula = v$hwy ~ v$displ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5284 -2.5284 -0.3967 2.3138 30.4187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.60769 0.05799 579.6 <2e-16 ***
## v$displ -3.02642 0.01603 -188.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.981 on 33380 degrees of freedom
## Multiple R-squared: 0.5165, Adjusted R-squared: 0.5165
## F-statistic: 3.566e+04 on 1 and 33380 DF, p-value: < 2.2e-16
mpg.lm_H2 <- lm(v$hwy ~ v$displ + v$year)
summary(mpg.lm_H2)
##
## Call:
## lm(formula = v$hwy ~ v$displ + v$year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6082 -2.3083 -0.3834 1.8613 30.1202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.49420 0.06159 511.38 <2e-16 ***
## v$displ -3.09348 0.01496 -206.82 <2e-16 ***
## v$year 0.15494 0.00217 71.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.708 on 33379 degrees of freedom
## Multiple R-squared: 0.5806, Adjusted R-squared: 0.5805
## F-statistic: 2.31e+04 on 2 and 33379 DF, p-value: < 2.2e-16
Here is the 3D scatterplot showing how well our model (red plane with 95% confidence intervals shown in red) is representing our data.
#install.packages("scatterplot3d")
library(scatterplot3d)
H2_3d <- scatterplot3d(v$displ, v$year, v$hwy, pch=19, color="#00000010", main = "Highway fuel economy vs. engine displacement and model year", bg = 'ivory4', xlab = "Displacement (L)", ylab = "Model year", zlab = "Highway fuel economy (mpg)", axis=TRUE)
H2_3d$plane3d(mpg.lm_H2, col = 'blue')
H2_3d$plane3d(confint(mpg.lm_H2)[,1],col="red")
H2_3d$plane3d(confint(mpg.lm_H2)[,2],col="red")
mpg.res = resid(mpg.lm_H2)
mpg.stdres = rstandard(mpg.lm_H2)
par(mfrow=c(2,2))
plot(fitted(mpg.lm_H2), mpg.stdres, xlab="Fitted values", ylab="Standardized residual", main="(a) Std. residuals vs. fitted values")
abline(0,0,col='red')
hist(mpg.stdres,breaks=80, xlab="Standardized residual", main="(b) Std. residuals distribution")
boxplot(mpg.stdres, xlab="MPH ~ Displacement + Year", ylab="Standardized residual", main="(c) Std. residuals boxplot")
qqnorm(mpg.stdres, xlab="Normal quantiles", ylab="Standardized residual quantiles", main="(d) QQ plot: std. residuals vs. normal")
qqline(mpg.stdres,col='red')
par(mfrow=c(1,1))
par(mfrow=c(1,2))
plot(v$displ, mpg.stdres, xlab="Index", ylab="Standardized residual", main="Std. residuals vs. engine displacement", pch=19, col="#00000010")
abline(0,0,col='red')
plot(v$year, mpg.stdres, xlab="Index", ylab="Standardized residual", main="Std. residuals vs. model year", pch=19, col="#00000010")
abline(0,0,col='red')
par(mfrow=c(1,1))
plot(fitted(mpg.lm_H2), mpg.stdres, xlab="Fitted values", ylab="Standardized residual", main="(a) Std. residuals vs. fitted values")
abline(0,0,col='red',lwd=5)
lines(smooth.spline(fitted(mpg.lm_H2), mpg.stdres,df=15),col='blue',lwd=5)
This shows us that the means of the residuals are non-zero for a significant chunk of the fitted values. This likely means that our linear model is attempting to model a non-linear phenomenon.
summary(mpg.lm_H2)
##
## Call:
## lm(formula = v$hwy ~ v$displ + v$year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6082 -2.3083 -0.3834 1.8613 30.1202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.49420 0.06159 511.38 <2e-16 ***
## v$displ -3.09348 0.01496 -206.82 <2e-16 ***
## v$year 0.15494 0.00217 71.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.708 on 33379 degrees of freedom
## Multiple R-squared: 0.5806, Adjusted R-squared: 0.5805
## F-statistic: 2.31e+04 on 2 and 33379 DF, p-value: < 2.2e-16
dataset <- data.frame(y=v$hwy, x=v$displ)
model1 <- VAR(dataset, p = 1)
whites.htest(model1)
##
## White's Test for Heteroskedasticity:
## ====================================
##
## No Cross Terms
##
## H0: Homoskedasticity
## H1: Heteroskedasticity
##
## Test Statistic:
## 6201.3419
##
## Degrees of Freedom:
## 12
##
## P-value:
## 0.0000
dataset <- data.frame(y=v$hwy, x=v$year)
model1 <- VAR(dataset, p = 1)
whites.htest(model1)
##
## White's Test for Heteroskedasticity:
## ====================================
##
## No Cross Terms
##
## H0: Homoskedasticity
## H1: Heteroskedasticity
##
## Test Statistic:
## 4233.3897
##
## Degrees of Freedom:
## 12
##
## P-value:
## 0.0000