library(ggplot2)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(caret)
## Loading required package: lattice
Auto <- read.table("C:/Users/Owner/Desktop/Predictive Modeling/Auto-1.txt", header = TRUE)
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(color = "steelblue", alpha = 0.6, size = 2) +
  labs(title = "MPG vs Horsepower",
       x = "Horsepower",
       y = "Miles per Gallon") +
  theme_bw()
The scatter plot shows a negative relationship between horsepower and mpg: more powerful engines tend to consume more fuel. The curvature of the point cloud also suggests the relationship may be non-linear.
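As a quick numeric check on this visual impression, the sketch below (assuming the Auto data frame loaded above) compares Pearson and Spearman correlations; a Spearman value noticeably stronger in magnitude than the Pearson value would support a monotone but non-linear relationship.
cor(Auto$horsepower, Auto$mpg, method = "pearson")   # strength of linear association
cor(Auto$horsepower, Auto$mpg, method = "spearman")  # strength of monotone association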
linear_model <- lm(mpg ~ horsepower, data = Auto)
cat("\n=== Linear Regression Model Summary ===\n")
##
## === Linear Regression Model Summary ===
summary(linear_model)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -13.5710  -3.2592  -0.3435   2.7630  16.9240
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
coefs <- coef(linear_model)
cat("\n=== Regression Equation ===\n")
##
## === Regression Equation ===
cat(sprintf("mpg = %.4f + (%.4f) * horsepower\n",
            coefs[1], coefs[2]))
## mpg = 39.9359 + (-0.1578) * horsepower
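To make the equation concrete, the example below scores a hypothetical 100-horsepower car (the value 100 is illustrative only), first by hand from the printed coefficients and then with predict() on the fitted model.
# By hand: 39.9359 + (-0.1578) * 100, roughly 24.15 mpg
39.935861 - 0.157845 * 100
# The same prediction from the model object, for the illustrative newdata:
predict(linear_model, newdata = data.frame(horsepower = 100))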
r_squared <- summary(linear_model)$r.squared
adj_r_squared <- summary(linear_model)$adj.r.squared
cat("\n=== R-Sqaured Values ===\n")
##
## === R-Sqaured Values ===
cat(sprintf("R-squared:%.4f (%.2f%%)\n", r_squared, r_squared * 100))
## R-squared:0.6059 (60.59%)
cat(sprintf("Adjusted R-squared: %.4f(%.2f%%)\n", adj_r_squared, adj_r_squared * 100))
## Adjusted R-squared: 0.6049(60.49%)
cat(sprintf("\nInterpretation: %.2f%% of the variation in mpg can be explained\n", r_squared * 100))
##
## Interpretation: 60.59% of the variation in mpg can be explained
cat("by the linear relationship with horsepower.\n")
## by the linear relationship with horsepower.
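To see where that 60.59% comes from, R-squared can be recomputed from first principles as 1 - SSE/SST; a minimal sketch using the fitted model above:
sse <- sum(residuals(linear_model)^2)      # unexplained (residual) variation
sst <- sum((Auto$mpg - mean(Auto$mpg))^2)  # total variation around the mean
1 - sse / sst                              # should match the 0.6059 reported above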
res_linear <- residuals(linear_model)
boxplot(res_linear,
        main = "Boxplot of Residuals",
        ylab = "Residuals",
        col = "lightblue")
grid()
outliers <- boxplot.stats(res_linear)$out
cat("\n=== Outlier Detection ===\n")
##
## === Outlier Detection ===
cat(sprintf("# of potential outliers: %d\n", length(outliers)))
## # of potential outliers: 9
if(length(outliers) > 0) {
  cat("Outlier values:\n")
  print(outliers)
}
## Outlier values:
##       116       153       154       308       321       324       328       331
##  12.36843 -13.57104 -13.57104  13.56034  16.92405  11.94069  15.23974  13.59964
##       389
##  12.27207
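The printed labels (116, 153, ...) are row names, so the flagged observations can be pulled back out of the data; a small sketch, assuming the default row names assigned by read.table:
# Observations whose residuals fall outside the boxplot whiskers
Auto[names(outliers), c("mpg", "horsepower")]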
cat("\n=== Linear model with 10-fold CV ===\n")
##
## === Linear model with 10-fold CV ===
set.seed(123)
train_control <- trainControl(method = "cv", number = 10)
linear_cv <- train(mpg ~ horsepower,
                   data = Auto,
                   method = "lm",
                   trControl = train_control)
print(linear_cv)
## Linear Regression
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 353, 352, 353, 353, 353, 354, ...
## Resampling results:
##
##   RMSE      Rsquared   MAE
##   4.857287  0.6141382  3.832307
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cat(sprintf("\nLinear Model CV RMSE: %.4f\n", linear_cv$results$RMSE))
##
## Linear Model CV RMSE: 4.8573
cat(sprintf("\nLinear Model CV R-Squared: %.4f\n", linear_cv$results$Rsquared))
##
## Linear Model CV R-Squared: 0.6141
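The CV numbers above are averages over ten folds, so it is worth checking the fold-to-fold spread; caret keeps the per-fold metrics in the trained object's resample element, as sketched below.
head(linear_cv$resample)          # one row per held-out fold: RMSE, Rsquared, MAE
summary(linear_cv$resample$RMSE)  # spread of fold-level RMSE around the reported average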
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(color = "steelblue", alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 1.2) +
  labs(title = "Linear Model: Fitted Line",
       x = "Horsepower",
       y = "Miles per Gallon") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Auto$fitted_linear <- fitted(linear_model)
ggplot(Auto, aes(x = fitted_linear, y = mpg)) +
  geom_point(color = "steelblue", alpha = 0.6, size = 2) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", linewidth = 1.2) +
  labs(title = "Linear Model: Observed vs Fitted",
       x = "Fitted Values",
       y = "Observed Values") +
  theme_minimal()
# Create the squared term of horsepower, called horsepower2
Auto$horsepower2 <- Auto$horsepower^2
# Sort the data by horsepower in ascending order (needed later for plotting fitted curves)
Auto <- Auto[order(Auto$horsepower), ]
quadratic_model <- lm(mpg ~ horsepower + horsepower2, data = Auto)
cat("\n=== Quadratic Model Summary ===\n")
##
## === Quadratic Model Summary ===
summary(quadratic_model)
##
## Call:
## lm(formula = mpg ~ horsepower + horsepower2, data = Auto)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -14.7135  -2.5943  -0.0859   2.2868  15.8961
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997  1.8004268   31.60   <2e-16 ***
## horsepower  -0.4661896  0.0311246  -14.98   <2e-16 ***
## horsepower2  0.0012305  0.0001221   10.08   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
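As an aside, the explicit horsepower2 column is not strictly required; the same quadratic fit can be written inline with I(), or with orthogonal polynomials via poly(), as sketched below. The poly() coefficients differ in parameterization, but the fitted values should agree.
# Inline squared term: identical fit to quadratic_model
quad_alt <- lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
# Orthogonal polynomials: numerically more stable coefficients, same fitted values
quad_poly <- lm(mpg ~ poly(horsepower, 2), data = Auto)
all.equal(fitted(quad_alt), fitted(quad_poly))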
set.seed(123)
quadratic_cv <- train(mpg ~ horsepower + horsepower2,
                      data = Auto,
                      method = "lm",
                      trControl = train_control)
print(quadratic_cv)
## Linear Regression
##
## 392 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 353, 352, 353, 353, 353, 354, ...
## Resampling results:
##
##   RMSE      Rsquared   MAE
##   4.332463  0.7006322  3.279161
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cat("\n=== Quadratic Model CV RMSE ===\n")
##
## === Quadratic Model CV RMSE ===
cat(sprintf("\nQuadratic Model CV RMSE: %.4f\n", quadratic_cv$results$RMSE))
##
## Quadratic Model CV RMSE: 4.3325
cat(sprintf("\nQuadratic Model CV R-Squared: %.4f\n", quadratic_cv$results$Rsquared))
##
## Quadratic Model CV R-Squared: 0.7006
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(color = "steelblue", size = 2) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE, color = "red", linewidth = 1.2) +
  labs(title = "Quadratic Model: Fitted Curve",
       x = "Horsepower",
       y = "Miles per Gallon") +
  theme_minimal()
Auto$fitted_quadratic <- fitted(quadratic_model)
ggplot(Auto, aes(x = fitted_quadratic, y = mpg)) +
  geom_point(color = "steelblue", size = 2) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", linewidth = 1.2) +
  labs(title = "Quadratic Model: Observed vs Fitted",
       x = "Fitted Values",
       y = "Observed Values") +
  theme_minimal()
cat("\n=== MARS MODEL ===\n")
##
## === MARS MODEL ===
mars_grid <- expand.grid(
  degree = 1:3,
  nprune = seq(2, 20, by = 2)
)
set.seed(123)
mars_cv <- train(mpg ~ horsepower,
                 data = Auto,
                 method = "earth",
                 trControl = train_control,
                 tuneGrid = mars_grid)
print(mars_cv)
## Multivariate Adaptive Regression Spline
##
## 392 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 353, 352, 353, 353, 353, 354, ...
## Resampling results across tuning parameters:
##
##   degree  nprune  RMSE      Rsquared   MAE
##   1        2      4.840831  0.6257285  3.818300
##   1        4      4.298322  0.7058160  3.259427
##   1        6      4.298322  0.7058160  3.259427
##   1        8      4.298322  0.7058160  3.259427
##   1       10      4.298322  0.7058160  3.259427
##   1       12      4.298322  0.7058160  3.259427
##   1       14      4.298322  0.7058160  3.259427
##   1       16      4.298322  0.7058160  3.259427
##   1       18      4.298322  0.7058160  3.259427
##   1       20      4.298322  0.7058160  3.259427
##   2        2      4.840831  0.6257285  3.818300
##   2        4      4.298322  0.7058160  3.259427
##   2        6      4.298322  0.7058160  3.259427
##   2        8      4.298322  0.7058160  3.259427
##   2       10      4.298322  0.7058160  3.259427
##   2       12      4.298322  0.7058160  3.259427
##   2       14      4.298322  0.7058160  3.259427
##   2       16      4.298322  0.7058160  3.259427
##   2       18      4.298322  0.7058160  3.259427
##   2       20      4.298322  0.7058160  3.259427
##   3        2      4.840831  0.6257285  3.818300
##   3        4      4.298322  0.7058160  3.259427
##   3        6      4.298322  0.7058160  3.259427
##   3        8      4.298322  0.7058160  3.259427
##   3       10      4.298322  0.7058160  3.259427
##   3       12      4.298322  0.7058160  3.259427
##   3       14      4.298322  0.7058160  3.259427
##   3       16      4.298322  0.7058160  3.259427
##   3       18      4.298322  0.7058160  3.259427
##   3       20      4.298322  0.7058160  3.259427
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
cat(sprintf("Degree: %d\n", mars_cv$bestTune$degree))
## Degree: 1
cat(sprintf("Number of terms(nrpune): %d\n", mars_cv$bestTune$nprune))
## Number of terms(nrpune): 4
cat(sprintf("\nMars Model CV RMSE: %.4f\n", linear_cv$results$RMSE))
##
## Mars Model CV RMSE: 4.8573
cat(sprintf("\nMars Model CV R-Squared: %.4f\n", linear_cv$results$Rsquared))
##
## Mars Model CV R-Squared: 0.6141
mars_model <- earth(mpg ~ horsepower,
                    data = Auto,
                    degree = mars_cv$bestTune$degree,
                    nprune = mars_cv$bestTune$nprune)
cat("\n=== MARS MODEL Summary ===\n")
##
## === MARS MODEL Summary ===
print(summary(mars_model))
## Call: earth(formula=mpg~horsepower, data=Auto, degree=mars_cv$bestTune$degree,
## nprune=mars_cv$bestTune$nprune)
##
##                   coefficients
## (Intercept)         20.3320268
## h(103-horsepower)    0.3147486
## h(horsepower-120)   -0.1780548
## h(horsepower-160)    0.1761460
##
## Selected 4 of 5 terms, and 1 of 1 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 5 terms
## Importance: horsepower
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 19.05576 RSS 7205.459 GRSq 0.6879887 RSq 0.697491
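The h(.) terms above are hinge functions, h(x) = max(0, x), so the MARS fit is piecewise linear with knots near 103, 120, and 160 horsepower. The sketch below evaluates the fitted equation by hand at a hypothetical 100-horsepower car (an illustrative value): only the first hinge is active there, giving about 21.28 mpg.
h <- function(x) pmax(0, x)  # hinge function: max(0, x), vectorized
hp <- 100
20.3320268 + 0.3147486 * h(103 - hp) +
  (-0.1780548) * h(hp - 120) + 0.1761460 * h(hp - 160)
# Should agree with predict(mars_model, data.frame(horsepower = 100))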
Auto$fitted_mars <- fitted(mars_model)
Auto_sorted <- Auto[order(Auto$horsepower), ]
ggplot(Auto_sorted, aes(x = horsepower, y = mpg)) +
  geom_point(color = "steelblue", size = 2) +
  geom_line(aes(y = fitted_mars), color = "red", linewidth = 1.3) +
  labs(title = "MARS Model: Fitted Line",
       x = "Horsepower",
       y = "Miles per Gallon") +
  theme_minimal()
ggplot(Auto, aes(x = fitted_mars, y = mpg)) +
  geom_point(color = "steelblue", size = 2) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", linewidth = 1.2) +
  labs(title = "MARS Model: Observed vs Fitted",
       x = "Fitted Values",
       y = "Observed Values") +
  theme_minimal()
cat("\n=== MODEL Comparison ===\n")
##
## === MODEL Comparison ===
comparison_table <- data.frame(
  Model = c("Linear", "Quadratic", "MARS"),
  RMSE = c(linear_cv$results$RMSE,
           quadratic_cv$results$RMSE,
           min(mars_cv$results$RMSE)),
  R_squared = c(linear_cv$results$Rsquared,
                quadratic_cv$results$Rsquared,
                max(mars_cv$results$Rsquared)),
  MAE = c(linear_cv$results$MAE,
          quadratic_cv$results$MAE,
          min(mars_cv$results$MAE))
)
print(comparison_table)
##       Model     RMSE R_squared      MAE
## 1    Linear 4.857287 0.6141382 3.832307
## 2 Quadratic 4.332463 0.7006322 3.279161
## 3      MARS 4.298322 0.7058160 3.259427
best_rmse <- which.min(comparison_table$RMSE)
best_r2 <- which.max(comparison_table$R_squared)
cat(sprintf("Best model by RMSE : %s (RMSE = %.4f)\n", comparison_table$Model[best_rmse], comparison_table$RMSE[best_rmse]))
## Best model by RMSE : MARS (RMSE = 4.2983)
cat(sprintf("Best model by R-squared : %s (R^2 = %.4f)\n", comparison_table$Model[best_r2], comparison_table$R_squared[best_r2]))
## Best model by R-squared : MARS (R^2 = 0.7058)
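Because each model was trained with the same seed and the same trainControl object, the fold assignments should match, and caret's resamples() can compare the three fits on paired folds; a sketch:
cv_results <- resamples(list(Linear = linear_cv,
                             Quadratic = quadratic_cv,
                             MARS = mars_cv))
summary(cv_results)  # distribution of RMSE, Rsquared, MAE across the ten folds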
The quadratic model outperforms the linear model because the relationship between horsepower and mpg is non-linear. The MARS model may capture the relationship even more accurately, particularly where the trend changes across the horsepower range, since its hinge functions adapt to local structure. In conclusion, choose the model with the lowest CV RMSE and highest CV R-squared, which balances predictive accuracy against complexity.