## Downloading github repo hadley/fueleconomy@master
## Installing fueleconomy
## '/Library/Frameworks/R.framework/Resources/bin/R' --vanilla CMD INSTALL \
## '/private/var/folders/hm/8t_hdfyd4pnbr0lg32923mkw0000gn/T/RtmpD8HfKV/devtools62d14cbe295/hadley-fueleconomy-188fa25' \
## --library='/Library/Frameworks/R.framework/Versions/3.1/Resources/library' \
## --install-tests
Before I proceed with analyzing this data set, I want to remove any entries (rows) that contain ‘na’ values:
v = vehicles[,c('year','displ','hwy')]
v = v[complete.cases(v),]
I also want to subtract 1984 from the year values, so that our intercept values will make more sense during the interpretation of the model statistics. The year values will now mean “years since 1984”.
v$year = v$year - 1984
plot(v,pch=19, cex=1, col="#00000010")
The only obvious outliers here are where we seem to have data points where engine displacement = 0. This cannot be right, so I’ll remove these entries
After initially creating this model using all 33K+ cases and finding a very large effect size of 0.761 using \(R^2\)=0.5608, \(alpha\)=0.05 and \(beta\)=0.95 (see Interpretting the model: Item #4), it seems likely that we should downsample the data in order to approach a more reasonable sample size. G*Power suggested a sample size of 10, but this seems too small. Instead, I’m downsampling to 100 cases and proceeding with the analysis.
sample_size=100
vector=nrow(v)
set.seed(100)
idx=sample(vector, sample_size, replace=FALSE)
v=v[idx,]
After removing the outliers and downsampling, here is the updated scattergram:
v = v[v$displ > 0,]
plot(v,pch=19, cex=1)
cor(v)
## year displ hwy
## year 1.00000000 -0.09698428 0.3164187
## displ -0.09698428 1.00000000 -0.7530461
## hwy 0.31641873 -0.75304611 1.0000000
I’ve chosen car model year and engine displacement as independent variables because these are vehicle specifications that cannot be changed. The highway fuel economy, on the other hand, is likely a function of a variety of factors. These likely include (but are not limited to) the model year and engine displacement.
mpg.lm_H1 <- lm(v$hwy ~ v$displ)
summary(mpg.lm_H1)
##
## Call:
## lm(formula = v$hwy ~ v$displ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5108 -2.7389 -0.8823 2.4116 15.9280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.0250 1.0035 32.91 <2e-16 ***
## v$displ -3.0408 0.2684 -11.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.816 on 98 degrees of freedom
## Multiple R-squared: 0.5671, Adjusted R-squared: 0.5627
## F-statistic: 128.4 on 1 and 98 DF, p-value: < 2.2e-16
mpg.lm_H2 <- lm(v$hwy ~ v$displ + v$year)
summary(mpg.lm_H2)
##
## Call:
## lm(formula = v$hwy ~ v$displ + v$year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3239 -1.8671 -0.6823 1.5766 14.7768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.51339 1.13253 26.943 < 2e-16 ***
## v$displ -2.94457 0.25163 -11.702 < 2e-16 ***
## v$year 0.16081 0.04078 3.943 0.000152 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.561 on 97 degrees of freedom
## Multiple R-squared: 0.6269, Adjusted R-squared: 0.6192
## F-statistic: 81.48 on 2 and 97 DF, p-value: < 2.2e-16
Here is the 3D scatterplot showing how well our model (red plane with 95% confidence intervals shown in red) is representing our data.
#install.packages("scatterplot3d")
library(scatterplot3d)
H2_3d <- scatterplot3d(v$displ, v$year, v$hwy, pch=19, main = "Highway fuel economy vs. engine displacement and model year", bg = 'ivory4', xlab = "Displacement (L)", ylab = "Model year", zlab = "Highway fuel economy (mpg)", axis=TRUE)
H2_3d$plane3d(mpg.lm_H2, col = 'blue')
H2_3d$plane3d(confint(mpg.lm_H2)[,1],col="red")
H2_3d$plane3d(confint(mpg.lm_H2)[,2],col="red")
We’ll make a series of plots that will tell us whether our residuals are normally distributed:
mpg.res = resid(mpg.lm_H2)
mpg.stdres = rstandard(mpg.lm_H2)
par(mfrow=c(2,2))
plot(fitted(mpg.lm_H2), mpg.stdres, xlab="Fitted values", ylab="Standardized residual", main="(a) Std. residuals vs. fitted values")
abline(0,0,col='red')
hist(mpg.stdres,breaks=80, xlab="Standardized residual", main="(b) Std. residuals distribution")
boxplot(mpg.stdres, xlab="MPH ~ Displacement + Year", ylab="Standardized residual", main="(c) Std. residuals boxplot")
qqnorm(mpg.stdres, xlab="Normal quantiles", ylab="Standardized residual quantiles", main="(d) QQ plot: std. residuals vs. normal")
qqline(mpg.stdres,col='red')
par(mfrow=c(1,1))
This is where we look for either homo- or heteroscedasticy. We can check this visually by referring to scatterplots of the residuals plotted against each of the independent variable values.
par(mfrow=c(1,2))
plot(v$displ, mpg.stdres, xlab="Index", ylab="Standardized residual", main="Std. residuals vs. engine displacement", pch=19)
abline(0,0,col='red')
plot(v$year, mpg.stdres, xlab="Index", ylab="Standardized residual", main="Std. residuals vs. model year", pch=19)
abline(0,0,col='red')
par(mfrow=c(1,1))
plot(fitted(mpg.lm_H2), mpg.stdres, xlab="Fitted values", ylab="Standardized residual", main="(a) Std. residuals vs. fitted values")
abline(0,0,col='red',lwd=5)
lines(smooth.spline(fitted(mpg.lm_H2), mpg.stdres,df=15),col='blue',lwd=5)
This shows us that the means of the residuals are close to zero for the central fitted values, but deviate from zero at both extremes. This likely means that our linear model is attempting to model a non-linear phenomenon.
summary(mpg.lm_H2)
##
## Call:
## lm(formula = v$hwy ~ v$displ + v$year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3239 -1.8671 -0.6823 1.5766 14.7768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.51339 1.13253 26.943 < 2e-16 ***
## v$displ -2.94457 0.25163 -11.702 < 2e-16 ***
## v$year 0.16081 0.04078 3.943 0.000152 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.561 on 97 degrees of freedom
## Multiple R-squared: 0.6269, Adjusted R-squared: 0.6192
## F-statistic: 81.48 on 2 and 97 DF, p-value: < 2.2e-16
In the first case, the y value is the dependent variable ‘highway fuel economy’ and the x value is the independent variable ‘engine displacement’:
dataset <- data.frame(y=v$hwy, x=v$displ)
model1 <- VAR(dataset, p = 1)
whites.htest(model1)
##
## White's Test for Heteroskedasticity:
## ====================================
##
## No Cross Terms
##
## H0: Homoskedasticity
## H1: Heteroskedasticity
##
## Test Statistic:
## 11.6203
##
## Degrees of Freedom:
## 12
##
## P-value:
## 0.4766
dataset <- data.frame(y=v$hwy, x=v$year)
model1 <- VAR(dataset, p = 1)
whites.htest(model1)
##
## White's Test for Heteroskedasticity:
## ====================================
##
## No Cross Terms
##
## H0: Homoskedasticity
## H1: Heteroskedasticity
##
## Test Statistic:
## 11.5676
##
## Degrees of Freedom:
## 12
##
## P-value:
## 0.4810
Here’s a summary of the findings.
mean(mpg.stdres)
## [1] 0.002334032