auto <- read.table('/Users/claire/STA 6543 – Predictive Modeling/Exercise 1/Auto-1.txt', header = T)
plot(auto$mpg ~ auto$horsepower, xlab = "Horsepower", ylab = "MPG")
The scatterplot shows a clear negative relationship between horsepower and MPG: higher horsepower is associated with lower fuel efficiency. The relationship is also nonlinear, curving in a roughly quadratic shape rather than following a straight line.
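One way to make that curvature explicit is to overlay a nonparametric smoother on the scatterplot; a quick sketch using base R's lowess (not one of the models fit below):
# Scatterplot with a lowess smoother overlaid to show the curved trend
plot(auto$mpg ~ auto$horsepower, xlab = "Horsepower", ylab = "MPG")
lines(lowess(auto$horsepower, auto$mpg), col = 2, lwd = 2)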
fit = lm(auto$mpg ~ auto$horsepower)
summary(fit)
##
## Call:
## lm(formula = auto$mpg ~ auto$horsepower)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## auto$horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
Least-squares regression equation: mpg = 39.935861 - 0.157845 * horsepower. Circled results are submitted separately on Canvas.
Multiple R-squared = 0.6059
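As a quick check on the equation, the fitted line predicts roughly 24 MPG for a car with 100 horsepower:
# Predicted MPG at horsepower = 100, straight from the fitted coefficients
coef(fit)[1] + coef(fit)[2] * 100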
boxplot(residuals(fit), ylab = "Residuals", main = "Boxplot of Residuals for MPG vs Horsepower")
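Beyond the boxplot, the standard lm diagnostic plots give a quick check on the curvature suggested by the scatterplot; a minimal sketch:
# Residuals vs. fitted, normal Q-Q, scale-location, and leverage plots for the linear fit
par(mfrow = c(2, 2))
plot(fit)
par(mfrow = c(1, 1))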
# Fit a single linear model and conduct 10-fold CV to estimate the error
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(1)
lm1Fit <- train(mpg ~ horsepower,
                data = auto,
                method = "lm",
                trControl = trainControl(method = "cv"))
lm1Fit
## Linear Regression
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 355, 352, 352, 353, 352, 353, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.892631 0.6197697 3.84403
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
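A note on the setup: trainControl(method = "cv") uses caret's default of 10 folds, so the call above is equivalent to writing trainControl(method = "cv", number = 10). The per-fold results behind the averaged RMSE, R-squared, and MAE can also be inspected:
# Hold-out performance on each of the 10 folds, before averaging
lm1Fit$resample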
summary(lm1Fit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Draw the scatter plot with the fitted line, and plot the observed versus fitted values
par(mfrow = c(1,2))
plot(auto$horsepower, auto$mpg, xlab = "Horsepower", ylab = "MPG")
lines(auto$horsepower, fitted(lm1Fit), col=2, lwd=2)
Observed = auto$mpg
Predicted = fitted(lm1Fit)
plot(Observed, Predicted, ylim=c(12, 70))
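A diagonal reference line on the observed-versus-predicted panel makes over- and under-prediction easier to see; this is an optional addition to the plot above (and to the corresponding plots for the later models):
# Add the y = x line: points above it are over-predicted, points below are under-predicted
abline(0, 1, col = "gray", lty = 2)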
# Fit a quadratic model and conduct 10-fold CV to estimate the error
# Create the squared term of horsepower, called horsepower2
auto$horsepower2 = auto$horsepower^2
# Sort the data in ascending order so the fitted quadratic curve can be drawn smoothly with lines()
auto = auto[order(auto[, 3], decreasing = FALSE), ]
set.seed(1)
lm2Fit <- train(mpg ~ horsepower + horsepower2,
                data = auto,
                method = "lm",
                trControl = trainControl(method = "cv"))
lm2Fit
## Linear Regression
##
## 392 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 355, 352, 352, 353, 352, 353, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.329612 0.6947287 3.273222
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm2Fit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## horsepower -0.4661896 0.0311246 -14.98 <2e-16 ***
## horsepower2 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
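An equivalent way to fit the quadratic model is to build the squared term inside the formula with I(), which avoids adding a horsepower2 column to the data. A sketch (lm2FitAlt is just a name chosen here; with the same seed it should reproduce essentially the same CV results):
# Same quadratic model, with the squared term constructed in the formula
set.seed(1)
lm2FitAlt <- train(mpg ~ horsepower + I(horsepower^2),
                   data = auto,
                   method = "lm",
                   trControl = trainControl(method = "cv"))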
# Draw the scatter plot with the fitted line, and plot the observed versus fitted values
par(mfrow = c(1,2))
plot(auto$horsepower, auto$mpg, xlab = "Horsepower", ylab = "MPG")
lines(auto$horsepower, fitted(lm2Fit), col=2, lwd=2)
Observed = auto$mpg
Predicted = fitted(lm2Fit)
plot(Observed, Predicted, ylim=c(12, 70))
# Fit a MARS model with chosen tuning parameters and conduct 10-fold CV to estimate the error
library(earth)
## Warning: package 'earth' was built under R version 4.5.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.5.2
set.seed(1)
marsFit <- train(mpg ~ horsepower,
                 data = auto,
                 method = "earth",
                 tuneLength = 15,
                 trControl = trainControl(method = "cv"))
marsFit
## Multivariate Adaptive Regression Spline
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 355, 352, 352, 353, 352, 353, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.801547 0.6299820 3.794938
## 3 4.347131 0.6924163 3.314007
## 4 4.309786 0.6972659 3.240772
## 5 4.309786 0.6972659 3.240772
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
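tuneLength = 15 lets caret build the nprune grid automatically; the grid can also be written out explicitly with tuneGrid, which makes the candidate values visible. A sketch (marsGrid and marsFit2 are just names chosen here):
# Explicit MARS tuning grid: additive model (degree = 1), 2 to 15 retained terms
marsGrid <- expand.grid(degree = 1, nprune = 2:15)
set.seed(1)
marsFit2 <- train(mpg ~ horsepower,
                  data = auto,
                  method = "earth",
                  tuneGrid = marsGrid,
                  trControl = trainControl(method = "cv"))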
summary(marsFit)
## Call: earth(x=c(46,46,48,48,4...), y=c(26,26,43.1,44...), keepxy=TRUE,
## degree=1, nprune=4)
##
## coefficients
## (Intercept) 20.3320268
## h(103-horsepower) 0.3147486
## h(horsepower-120) -0.1780548
## h(horsepower-160) 0.1761460
##
## Selected 4 of 5 terms, and 1 of 1 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 5 terms
## Importance: horsepower
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 19.05576 RSS 7205.459 GRSq 0.6879887 RSq 0.697491
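The h(.) terms above are hinge functions, h(u) = max(0, u), so the selected model is piecewise linear in horsepower with knots at 103, 120, and 160. As a check, the printed equation can be evaluated by hand, for example at horsepower = 90, and compared with predict():
# Reproduce the MARS prediction at horsepower = 90 from the printed coefficients
h <- function(u) pmax(0, u)
hp <- 90
20.3320268 + 0.3147486 * h(103 - hp) - 0.1780548 * h(hp - 120) + 0.1761460 * h(hp - 160)
predict(marsFit, newdata = data.frame(horsepower = 90))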
plot(marsFit)
# Draw the scatter plot with the fitted line, and plot the observed versus fitted values
par(mfrow = c(1,2))
plot(auto$horsepower, auto$mpg, xlab = "Horsepower", ylab = "MPG")
lines(auto$horsepower, fitted(marsFit), col=2, lwd=2)
Observed = auto$mpg
Predicted = fitted(marsFit)
plot(Observed, Predicted, ylim=c(12, 70))
# Get performance values via caret's postResample function
postResample(pred = fitted(lm1Fit), obs = auto$mpg)
## RMSE Rsquared MAE
## 11.3395316 0.1074093 9.2739031
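One caution about this first set of numbers: lm1Fit was trained before auto was re-sorted for the quadratic fit, so fitted(lm1Fit) is in the original row order and no longer lines up row by row with the sorted auto$mpg. That mismatch appears to be why the RMSE jumps to 11.3 here even though the model's residual standard error is about 4.9. Re-predicting on the current data frame keeps observed and predicted values aligned; a sketch:
# fitted(lm1Fit) follows the pre-sort row order; predict() on the current
# (sorted) data frame keeps observations and predictions aligned
postResample(pred = predict(lm1Fit, newdata = auto), obs = auto$mpg)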
postResample(pred = fitted(lm2Fit), obs = auto$mpg)
## RMSE Rsquared MAE
## 4.357151 0.687559 3.249487
postResample(pred = fitted(marsFit), obs = auto$mpg)
## RMSE Rsquared MAE
## 4.287339 0.697491 3.189058
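The resubstitution values above measure fit to the training data; the cross-validated results are the fairer basis for comparison, and caret's resamples() lines them up side by side. A sketch (cvResults is just a name chosen here; note the linear model's folds were created before the data were re-sorted, so its fold contents differ slightly from the other two models'):
# Collect the 10-fold CV results of the three models for a side-by-side summary
cvResults <- resamples(list(Linear = lm1Fit, Quadratic = lm2Fit, MARS = marsFit))
summary(cvResults)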
Among the three fitted models, the MARS model is preferred. It achieves the lowest cross-validated RMSE and MAE and the highest R-squared (CV RMSE 4.31, versus 4.33 for the quadratic model and 4.89 for the linear model), indicating better predictive accuracy and a better overall fit to the data than the linear and quadratic regressions.