#Access the required library
library(boot)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
Auto <- read.table("C://Users/DTY670/Desktop/STA6543/Excercise/Auto-1.txt", header = TRUE)
head(Auto)
## mpg horsepower
## 1 18 130
## 2 15 165
## 3 18 150
## 4 16 150
## 5 17 140
## 6 15 198
summary(Auto)
## mpg horsepower
## Min. : 9.00 Min. : 46.0
## 1st Qu.:17.00 1st Qu.: 75.0
## Median :22.75 Median : 93.5
## Mean :23.45 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:126.0
## Max. :46.60 Max. :230.0
#Access Auto data
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
#scatterplot
plot(horsepower, mpg,
xlab = "Horsepower",
ylab = "MPG",
main = "MPG vs Horsepower")
We observe from this scatter plot that the relationship between Horsepower and mpg appears to be negative and slightly nonlinear.
#least square regression
fitlm <-train(mpg ~ horsepower, data = Auto, method = 'lm')
summary(fitlm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
The least square regression equation is: \(\hat{mpg}\) = 39.94 - 0.158 x horsepower, indicating that every unit increase in horsepower, mpg decreases by 0.158 mpg.
# Barplot of Dvds
summary(fitlm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
The \(R^2\) value is 0.6059, indicating that the fitted lineare model explains about 60.6% of the variability in the mpg by horsepower.
# Barplot of the residuals
boxplot(residuals(fitlm), horizontal = T, ylab = "Residuals", main = "Boxplot of residuals for horsepower vs mpg")
We observe from this boxplot that the data do contain some possible outliers, represented by little circles.
# Fit a single linear model and conduct 10-fold CV to estimate the error
set.seed(1)
train_control <- trainControl(method = "cv", number = 10)
lm1Fit <- train(mpg ~ horsepower,
data = Auto,
method = "lm",
trControl = train_control)
lm1Fit
## Linear Regression
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 355, 352, 352, 353, 352, 353, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.892631 0.6197697 3.84403
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm1Fit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Draw the scatter plot with the fitted line and the scatter plot between the observed and fitted values
par(mfrow = c(1,2))
plot(horsepower, mpg, xlab = "Horsepower", ylab = "MPG")
lines(horsepower, fitted(lm1Fit), col=2, lwd=2)
plot(mpg, fitted(lm1Fit), xlim = c(10, 40), ylim=c(10, 40))
set.seed(1)
#Fit a quadratic model and conduct 10-fold CV to estimate the error
#Create the squared term of horsepower, called horsepower2
Auto$horsepower2 = Auto$horsepower^2
#sort the data in an ascending order
Auto = Auto[order(Auto[,3],decreasing=FALSE),]
set.seed(1)
lm2Fit <- train(mpg ~ horsepower + horsepower2,
data = Auto,
method = "lm",
trControl = train_control)
lm2Fit
## Linear Regression
##
## 392 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 355, 352, 352, 353, 352, 353, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.329612 0.6947287 3.273222
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm2Fit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## horsepower -0.4661896 0.0311246 -14.98 <2e-16 ***
## horsepower2 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
# Draw the scatter plot with the fitted line and the scatter plot between the observed and fitted values
par(mfrow = c(1,2))
plot(Auto$horsepower, Auto$mpg, xlab = "Horsepower", ylab = "MPG")
lines(Auto$horsepower, fitted(lm2Fit), col=2, lwd=2)
plot(Auto$mpg, fitted(lm2Fit), xlim = c(10, 40), ylim=c(10, 40))
#Fit a quadratic model and conduct 10-fold CV to estimate the error
set.seed(1)
marsFit <- train(mpg ~ horsepower,
data = Auto,
method = "earth",
tuneLength = 15,
trControl = train_control)
marsFit
## Multivariate Adaptive Regression Spline
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 355, 352, 352, 353, 352, 353, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.801547 0.6299820 3.794938
## 3 4.347131 0.6924163 3.314007
## 4 4.309786 0.6972659 3.240772
## 5 4.309786 0.6972659 3.240772
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
#Tuning parameter
plot(marsFit)
summary(marsFit)
## Call: earth(x=c(46,46,48,48,4...), y=c(26,26,43.1,44...), keepxy=TRUE,
## degree=1, nprune=4)
##
## coefficients
## (Intercept) 20.3320268
## h(103-horsepower) 0.3147486
## h(horsepower-120) -0.1780548
## h(horsepower-160) 0.1761460
##
## Selected 4 of 5 terms, and 1 of 1 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 5 terms
## Importance: horsepower
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 19.05576 RSS 7205.459 GRSq 0.6879887 RSq 0.697491
# Draw the scatter plot with the fitted line and the scatter plot between the observed and fitted values
par(mfrow = c(1,2))
plot(Auto$horsepower, Auto$mpg, xlab = "Horsepower", ylab = "MPG")
lines(Auto$horsepower,fitted(marsFit), col=2, lwd=2)
plot(Auto$mpg, fitted(marsFit), xlim = c(10, 40), ylim=c(10, 40))
# Compare the model performance using caret's postResample function
# as these models are fitted under same trControl function
postResample(pred = fitted(lm1Fit), obs = mpg) #do not use Auto$mpg as Auto$mpg has been sorted.
## RMSE Rsquared MAE
## 4.8932262 0.6059483 3.8275871
postResample(pred = fitted(lm2Fit), obs = Auto$mpg)
## RMSE Rsquared MAE
## 4.357151 0.687559 3.249487
postResample(pred = fitted(marsFit), obs = Auto$mpg)
## RMSE Rsquared MAE
## 4.287339 0.697491 3.189058
We observe from the above outputs that the MARS model performed best, with the highest value of \(R^2\) ((0.6975) and the lowest values of RMSE (4.287339) and MAE (3.189058). Thus we would prefer the MARS model over the quadratic model, which performed better than the linear model from a predictive perspective.