Auto <- read.table("C://Users/xxsha/OneDrive/Desktop/R_Homework/Auto.txt", header = T)
#The relationship is non linear, as horsepower increased, mpg decreased, but the rate of decrease slows down at higher horsepower values.
plot(Auto$horsepower, Auto$mpg,
     xlab = "horsepower", ylab = "mpg",
     main = "mpg vs horsepower",
     pch = 16, col = "red")

#I dont know how to circle the numbers in R Markdown files but the equation is:
#
#mpg = a + b*horsepower
#
#Where a is the intercept and b is the coefficient
#
#mpg = 39.935861 + (-0.157845)*horsepower
#mpg = 39.935861 - 0.157845*horsepower
#
#The values can be found in the coefficients section directly under "Estimate".
#
lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
#About 60.59% of the variation in mpg is explained with a linear relationship to horsepower
r_squared <- summary(lm_fit)$r.squared
cat("R^2 =", round(r_squared, 4), "\n")
## R^2 = 0.6059
#There are multiple large outliers, not many lower outliers. Suggesting the model underpredicts the mpg for low horsepower values
boxplot(lm_fit$residuals,
        main = "Residuals boxplot",
        ylab = "Residuals", col = "red")

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(67)

train_control <- trainControl(method = "cv", number = 10)

#10-fold CV
lm_cv <- train(mpg ~ horsepower, data = Auto,
               method = "lm", trControl = train_control)
lm_cv
## Linear Regression 
## 
## 392 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 353, 353, 353, 352, 352, 353, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.895381  0.6155705  3.839302
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
#scatter plot with regression line horsepower vs mpg
plot(Auto$horsepower, Auto$mpg, xlab = "horsepower", ylab = "mpg")
abline(lm_fit, col = "red", lwd = 2)

#observed vs fitted values
plot(fitted(lm_fit), Auto$mpg, xlab = "Fitted Values", ylab = "Observed mpg")
abline(0, 1, col = "red", lwd = 2)

Auto$horsepower2 <- Auto$horsepower^2
Auto <- Auto[order(Auto$horsepower, decreasing = FALSE), ]

#quadratic model
quad_fit <- lm(mpg ~ horsepower + horsepower2, data = Auto)
summary(quad_fit)
## 
## Call:
## lm(formula = mpg ~ horsepower + horsepower2, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56.9000997  1.8004268   31.60   <2e-16 ***
## horsepower  -0.4661896  0.0311246  -14.98   <2e-16 ***
## horsepower2  0.0012305  0.0001221   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16
#10-fold CV
set.seed(67)
quad_cv <- train(mpg ~ horsepower + horsepower2, data = Auto,
                 method = "lm", trControl = train_control)
quad_cv
## Linear Regression 
## 
## 392 samples
##   2 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 353, 353, 353, 352, 352, 353, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.363352  0.6889964  3.275539
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
#scatter plot with fitted curve
plot(Auto$horsepower, Auto$mpg, xlab = "horsepower", ylab = "mpg")
lines(Auto$horsepower, fitted(quad_fit), col = "red", lwd = 2)

#observed vs fitted
plot(fitted(quad_fit), Auto$mpg, xlab = "Fitted Values", ylab = "Observed mpg")
abline(0, 1, col = "red", lwd = 2)

library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
set.seed(67)

#tuning grid
#I can only assume that for what we are doing, the paramaters are almost irrelavent
mars_grid <- expand.grid(degree = 1:5, nprune = 2:10)

#10-fold CV
mars_cv <- train(mpg ~ horsepower, data = Auto,
                 method  = "earth",
                 trControl = train_control,
                 tuneGrid  = mars_grid)
mars_cv
## Multivariate Adaptive Regression Spline 
## 
## 392 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 353, 353, 353, 352, 352, 353, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.807370  0.6178994  3.784386
##   1        3      4.374543  0.6905749  3.324605
##   1        4      4.353928  0.6916476  3.264295
##   1        5      4.353928  0.6916476  3.264295
##   1        6      4.353928  0.6916476  3.264295
##   1        7      4.353928  0.6916476  3.264295
##   1        8      4.353928  0.6916476  3.264295
##   1        9      4.353928  0.6916476  3.264295
##   1       10      4.353928  0.6916476  3.264295
##   2        2      4.807370  0.6178994  3.784386
##   2        3      4.374543  0.6905749  3.324605
##   2        4      4.353928  0.6916476  3.264295
##   2        5      4.353928  0.6916476  3.264295
##   2        6      4.353928  0.6916476  3.264295
##   2        7      4.353928  0.6916476  3.264295
##   2        8      4.353928  0.6916476  3.264295
##   2        9      4.353928  0.6916476  3.264295
##   2       10      4.353928  0.6916476  3.264295
##   3        2      4.807370  0.6178994  3.784386
##   3        3      4.374543  0.6905749  3.324605
##   3        4      4.353928  0.6916476  3.264295
##   3        5      4.353928  0.6916476  3.264295
##   3        6      4.353928  0.6916476  3.264295
##   3        7      4.353928  0.6916476  3.264295
##   3        8      4.353928  0.6916476  3.264295
##   3        9      4.353928  0.6916476  3.264295
##   3       10      4.353928  0.6916476  3.264295
##   4        2      4.807370  0.6178994  3.784386
##   4        3      4.374543  0.6905749  3.324605
##   4        4      4.353928  0.6916476  3.264295
##   4        5      4.353928  0.6916476  3.264295
##   4        6      4.353928  0.6916476  3.264295
##   4        7      4.353928  0.6916476  3.264295
##   4        8      4.353928  0.6916476  3.264295
##   4        9      4.353928  0.6916476  3.264295
##   4       10      4.353928  0.6916476  3.264295
##   5        2      4.807370  0.6178994  3.784386
##   5        3      4.374543  0.6905749  3.324605
##   5        4      4.353928  0.6916476  3.264295
##   5        5      4.353928  0.6916476  3.264295
##   5        6      4.353928  0.6916476  3.264295
##   5        7      4.353928  0.6916476  3.264295
##   5        8      4.353928  0.6916476  3.264295
##   5        9      4.353928  0.6916476  3.264295
##   5       10      4.353928  0.6916476  3.264295
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
plot(mars_cv, main = "mars plot")

#summary
summary(mars_cv$finalModel)
## Call: earth(x=c(46,46,48,48,4...), y=c(26,26,43.1,44...), keepxy=TRUE,
##             degree=1, nprune=4)
## 
##                   coefficients
## (Intercept)         20.3320268
## h(103-horsepower)    0.3147486
## h(horsepower-120)   -0.1780548
## h(horsepower-160)    0.1761460
## 
## Selected 4 of 5 terms, and 1 of 1 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 5 terms
## Importance: horsepower
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 19.05576    RSS 7205.459    GRSq 0.6879887    RSq 0.697491
#scatter plot with fitted curve
mars_pred <- predict(mars_cv, newdata = Auto)

plot(Auto$horsepower, Auto$mpg, xlab = "horsepower", ylab = "mpg")
lines(Auto$horsepower, mars_pred, col = "red", lwd = 2)

#observed vs fitted
plot(mars_pred, Auto$mpg, xlab = "Fitted Values", ylab = "Observed MPG")
abline(0, 1, col = "red", lwd = 2)

#Based on both RMSE and R^2 it seems that the MARS model is the best for this data.
#Given the low RMSE and high R^2
#The quadratic model also almost just as good.
#Both of these are better than the linear model
#
results <- resamples(list(Linear    = lm_cv,
                          Quadratic = quad_cv,
                          MARS      = mars_cv))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Linear, Quadratic, MARS 
## Number of resamples: 10 
## 
## MAE 
##               Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear    3.329834 3.567066 3.738487 3.839302 4.089393 4.794216    0
## Quadratic 2.926696 2.990652 3.197085 3.275539 3.536619 3.742347    0
## MARS      2.809236 3.015426 3.249490 3.264295 3.433148 3.842824    0
## 
## RMSE 
##               Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Linear    4.225464 4.561580 4.908599 4.895381 5.210011 5.601132    0
## Quadratic 3.808652 3.932781 4.313219 4.363352 4.561534 5.531100    0
## MARS      3.728081 3.891303 4.363979 4.353928 4.517151 5.621601    0
## 
## Rsquared 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Linear    0.4861451 0.5725656 0.6343788 0.6155705 0.6571548 0.7070100    0
## Quadratic 0.4451929 0.6580901 0.6914685 0.6889964 0.7644525 0.8330747    0
## MARS      0.4386788 0.6592741 0.6977118 0.6916476 0.7724408 0.8201727    0
#box plots of RMSE and R^2
bwplot(results, metric = "RMSE", main = "RMSE Comparison")

bwplot(results, metric = "Rsquared", main = "R^2 Comparison")

#summary
cat("Linear RMSE:", round(lm_cv$results$RMSE, 3),
    " | R^2:", round(lm_cv$results$Rsquared, 4), "\n")
## Linear RMSE: 4.895  | R^2: 0.6156
cat("Quadratic RMSE:", round(quad_cv$results$RMSE, 3),
    " | R^2:", round(quad_cv$results$Rsquared, 4), "\n")
## Quadratic RMSE: 4.363  | R^2: 0.689
cat("MARS RMSE:", round(min(mars_cv$results$RMSE), 3),
    " | R^2:", round(max(mars_cv$results$Rsquared), 4), "\n")
## MARS RMSE: 4.354  | R^2: 0.6916