Auto <- read.table("C://Users/xxsha/OneDrive/Desktop/R_Homework/Auto.txt", header = T)
#The relationship is non linear, as horsepower increased, mpg decreased, but the rate of decrease slows down at higher horsepower values.
plot(Auto$horsepower, Auto$mpg,
xlab = "horsepower", ylab = "mpg",
main = "mpg vs horsepower",
pch = 16, col = "red")

#I dont know how to circle the numbers in R Markdown files but the equation is:
#
#mpg = a + b*horsepower
#
#Where a is the intercept and b is the coefficient
#
#mpg = 39.935861 + (-0.157845)*horsepower
#mpg = 39.935861 - 0.157845*horsepower
#
#The values can be found in the coefficients section directly under "Estimate".
#
lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
#About 60.59% of the variation in mpg is explained with a linear relationship to horsepower
r_squared <- summary(lm_fit)$r.squared
cat("R^2 =", round(r_squared, 4), "\n")
## R^2 = 0.6059
#There are multiple large outliers, not many lower outliers. Suggesting the model underpredicts the mpg for low horsepower values
boxplot(lm_fit$residuals,
main = "Residuals boxplot",
ylab = "Residuals", col = "red")

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(67)
train_control <- trainControl(method = "cv", number = 10)
#10-fold CV
lm_cv <- train(mpg ~ horsepower, data = Auto,
method = "lm", trControl = train_control)
lm_cv
## Linear Regression
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 353, 353, 353, 352, 352, 353, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.895381 0.6155705 3.839302
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#scatter plot with regression line horsepower vs mpg
plot(Auto$horsepower, Auto$mpg, xlab = "horsepower", ylab = "mpg")
abline(lm_fit, col = "red", lwd = 2)

#observed vs fitted values
plot(fitted(lm_fit), Auto$mpg, xlab = "Fitted Values", ylab = "Observed mpg")
abline(0, 1, col = "red", lwd = 2)

Auto$horsepower2 <- Auto$horsepower^2
Auto <- Auto[order(Auto$horsepower, decreasing = FALSE), ]
#quadratic model
quad_fit <- lm(mpg ~ horsepower + horsepower2, data = Auto)
summary(quad_fit)
##
## Call:
## lm(formula = mpg ~ horsepower + horsepower2, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## horsepower -0.4661896 0.0311246 -14.98 <2e-16 ***
## horsepower2 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
#10-fold CV
set.seed(67)
quad_cv <- train(mpg ~ horsepower + horsepower2, data = Auto,
method = "lm", trControl = train_control)
quad_cv
## Linear Regression
##
## 392 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 353, 353, 353, 352, 352, 353, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.363352 0.6889964 3.275539
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#scatter plot with fitted curve
plot(Auto$horsepower, Auto$mpg, xlab = "horsepower", ylab = "mpg")
lines(Auto$horsepower, fitted(quad_fit), col = "red", lwd = 2)

#observed vs fitted
plot(fitted(quad_fit), Auto$mpg, xlab = "Fitted Values", ylab = "Observed mpg")
abline(0, 1, col = "red", lwd = 2)

library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
set.seed(67)
#tuning grid
#I can only assume that for what we are doing, the paramaters are almost irrelavent
mars_grid <- expand.grid(degree = 1:5, nprune = 2:10)
#10-fold CV
mars_cv <- train(mpg ~ horsepower, data = Auto,
method = "earth",
trControl = train_control,
tuneGrid = mars_grid)
mars_cv
## Multivariate Adaptive Regression Spline
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 353, 353, 353, 352, 352, 353, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.807370 0.6178994 3.784386
## 1 3 4.374543 0.6905749 3.324605
## 1 4 4.353928 0.6916476 3.264295
## 1 5 4.353928 0.6916476 3.264295
## 1 6 4.353928 0.6916476 3.264295
## 1 7 4.353928 0.6916476 3.264295
## 1 8 4.353928 0.6916476 3.264295
## 1 9 4.353928 0.6916476 3.264295
## 1 10 4.353928 0.6916476 3.264295
## 2 2 4.807370 0.6178994 3.784386
## 2 3 4.374543 0.6905749 3.324605
## 2 4 4.353928 0.6916476 3.264295
## 2 5 4.353928 0.6916476 3.264295
## 2 6 4.353928 0.6916476 3.264295
## 2 7 4.353928 0.6916476 3.264295
## 2 8 4.353928 0.6916476 3.264295
## 2 9 4.353928 0.6916476 3.264295
## 2 10 4.353928 0.6916476 3.264295
## 3 2 4.807370 0.6178994 3.784386
## 3 3 4.374543 0.6905749 3.324605
## 3 4 4.353928 0.6916476 3.264295
## 3 5 4.353928 0.6916476 3.264295
## 3 6 4.353928 0.6916476 3.264295
## 3 7 4.353928 0.6916476 3.264295
## 3 8 4.353928 0.6916476 3.264295
## 3 9 4.353928 0.6916476 3.264295
## 3 10 4.353928 0.6916476 3.264295
## 4 2 4.807370 0.6178994 3.784386
## 4 3 4.374543 0.6905749 3.324605
## 4 4 4.353928 0.6916476 3.264295
## 4 5 4.353928 0.6916476 3.264295
## 4 6 4.353928 0.6916476 3.264295
## 4 7 4.353928 0.6916476 3.264295
## 4 8 4.353928 0.6916476 3.264295
## 4 9 4.353928 0.6916476 3.264295
## 4 10 4.353928 0.6916476 3.264295
## 5 2 4.807370 0.6178994 3.784386
## 5 3 4.374543 0.6905749 3.324605
## 5 4 4.353928 0.6916476 3.264295
## 5 5 4.353928 0.6916476 3.264295
## 5 6 4.353928 0.6916476 3.264295
## 5 7 4.353928 0.6916476 3.264295
## 5 8 4.353928 0.6916476 3.264295
## 5 9 4.353928 0.6916476 3.264295
## 5 10 4.353928 0.6916476 3.264295
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
plot(mars_cv, main = "mars plot")

#summary
summary(mars_cv$finalModel)
## Call: earth(x=c(46,46,48,48,4...), y=c(26,26,43.1,44...), keepxy=TRUE,
## degree=1, nprune=4)
##
## coefficients
## (Intercept) 20.3320268
## h(103-horsepower) 0.3147486
## h(horsepower-120) -0.1780548
## h(horsepower-160) 0.1761460
##
## Selected 4 of 5 terms, and 1 of 1 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 5 terms
## Importance: horsepower
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 19.05576 RSS 7205.459 GRSq 0.6879887 RSq 0.697491
#scatter plot with fitted curve
mars_pred <- predict(mars_cv, newdata = Auto)
plot(Auto$horsepower, Auto$mpg, xlab = "horsepower", ylab = "mpg")
lines(Auto$horsepower, mars_pred, col = "red", lwd = 2)

#observed vs fitted
plot(mars_pred, Auto$mpg, xlab = "Fitted Values", ylab = "Observed MPG")
abline(0, 1, col = "red", lwd = 2)

#Based on both RMSE and R^2 it seems that the MARS model is the best for this data.
#Given the low RMSE and high R^2
#The quadratic model also almost just as good.
#Both of these are better than the linear model
#
results <- resamples(list(Linear = lm_cv,
Quadratic = quad_cv,
MARS = mars_cv))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear, Quadratic, MARS
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear 3.329834 3.567066 3.738487 3.839302 4.089393 4.794216 0
## Quadratic 2.926696 2.990652 3.197085 3.275539 3.536619 3.742347 0
## MARS 2.809236 3.015426 3.249490 3.264295 3.433148 3.842824 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear 4.225464 4.561580 4.908599 4.895381 5.210011 5.601132 0
## Quadratic 3.808652 3.932781 4.313219 4.363352 4.561534 5.531100 0
## MARS 3.728081 3.891303 4.363979 4.353928 4.517151 5.621601 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear 0.4861451 0.5725656 0.6343788 0.6155705 0.6571548 0.7070100 0
## Quadratic 0.4451929 0.6580901 0.6914685 0.6889964 0.7644525 0.8330747 0
## MARS 0.4386788 0.6592741 0.6977118 0.6916476 0.7724408 0.8201727 0
#box plots of RMSE and R^2
bwplot(results, metric = "RMSE", main = "RMSE Comparison")

bwplot(results, metric = "Rsquared", main = "R^2 Comparison")

#summary
cat("Linear RMSE:", round(lm_cv$results$RMSE, 3),
" | R^2:", round(lm_cv$results$Rsquared, 4), "\n")
## Linear RMSE: 4.895 | R^2: 0.6156
cat("Quadratic RMSE:", round(quad_cv$results$RMSE, 3),
" | R^2:", round(quad_cv$results$Rsquared, 4), "\n")
## Quadratic RMSE: 4.363 | R^2: 0.689
cat("MARS RMSE:", round(min(mars_cv$results$RMSE), 3),
" | R^2:", round(max(mars_cv$results$Rsquared), 4), "\n")
## MARS RMSE: 4.354 | R^2: 0.6916