Write a program where
setting our seed for repeatability and setting up our x’s and y’s as
well as splitting our train and test data set 80/20 (100)
preped_data <- preparing_data(auto_data, target = "mpg")
X <- preped_data$X
y <- preped_data$y
#splitting into training and testing sets (80/20 split)
set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
y_train <- y[train_index]
X_test <- X[-train_index, ]
y_test <- y[-train_index]
Choosing our top 5 overall models for MPG
best_subset_selection <- function(X, y, num_models = 5) {
train_df <- as.data.frame(cbind(y, X))
form <- as.formula("y ~ .")
regfit <- regsubsets(form, data = train_df, nvmax = ncol(X), method = "exhaustive")
reg_summary <- summary(regfit)
cp_vals <- reg_summary$cp
valid_idx <- which(!is.na(cp_vals))
best_indices <- valid_idx[order(cp_vals[valid_idx])][1:num_models]
best_formulas <- list()
for(i in best_indices) {
included <- reg_summary$which[i, ]
predictors <- names(included)[included][-1] #excludes intercept
if(length(predictors) == 0) next #skipping nulls
best_formulas[[paste0("Model_", i)]] <- as.formula(paste("y ~", paste(predictors, collapse = " + ")))
}
list(best_formulas = best_formulas,
cp_vals = cp_vals[best_indices])
}
best_lars_model <- function(X_train, y_train) {
# Fit the LARS model with type "lasso"
lars_model <- lars(X_train, y_train, type = "lasso")
# Use cross-validation to select the best step
cv_model <- cv.lars(X_train, y_train, type = "lasso")
best_step <- cv_model$index[which.min(cv_model$cv)]
# Identify nonzero coefficients at the best step
nonzero_coefs <- which(lars_model$beta[best_step, ] != 0)
cat("Non-zero coefficients in LARS model at best step (step", best_step, "):",
paste(nonzero_coefs, collapse = ", "), "\n")
list(lars_model = lars_model,
best_step = best_step,
nonzero_coefs = nonzero_coefs)
}
bs_results <- best_subset_selection(X_train, y_train, num_models = 5)
best_formulas1 <- bs_results$best_formulas
cat("Top candidate models by Cp:\n")
## Top candidate models by Cp:
print(best_formulas1)
## $Model_9
## y ~ displacement + horsepower + weight + year + `cylinders:acceleration` +
## `displacement:weight` + `displacement:acceleration` + `displacement:year` +
## `horsepower:year`
## <environment: 0x00000151ceb111e8>
##
## $Model_10
## y ~ displacement + horsepower + weight + acceleration + `cylinders:displacement` +
## `cylinders:horsepower` + `displacement:weight` + `displacement:year` +
## `horsepower:year` + `acceleration:year`
## <environment: 0x00000151ceb111e8>
##
## $Model_11
## y ~ displacement + horsepower + weight + acceleration + year +
## `cylinders:acceleration` + `displacement:weight` + `displacement:acceleration` +
## `displacement:year` + `horsepower:year` + `acceleration:year`
## <environment: 0x00000151ceb111e8>
##
## $Model_13
## y ~ cylinders + displacement + horsepower + weight + acceleration +
## year + `cylinders:acceleration` + `cylinders:year` + `displacement:weight` +
## `displacement:acceleration` + `displacement:year` + `horsepower:year` +
## `acceleration:year`
## <environment: 0x00000151ceb111e8>
##
## $Model_12
## y ~ displacement + horsepower + weight + acceleration + year +
## `cylinders:displacement` + `cylinders:weight` + `displacement:weight` +
## `displacement:acceleration` + `displacement:year` + `horsepower:year` +
## `acceleration:year`
## <environment: 0x00000151ceb111e8>
cat("Cp values for these models:\n")
## Cp values for these models:
print(bs_results$cp_vals)
## [1] 8.708209 9.295648 9.484580 9.715710 9.934279
Creating press Calculator
calc_press <- function(model) {
resid <- residuals(model)
hat_vals <- hatvalues(model)
sum((resid / (1 - hat_vals))^2)
}
Obtaining our press values
train_df <- as.data.frame(cbind(y = y_train, X_train))
press_values <- sapply(best_formulas1, function(form) {
mod <- lm(form, data = train_df)
calc_press(mod)
})
cat("PRESS values for top models:\n")
## PRESS values for top models:
print(press_values)
## Model_9 Model_10 Model_11 Model_13 Model_12
## 2385.696 2393.448 2395.702 2392.663 2403.257
Top model chosen with min press value
best_model_name <- names(which.min(press_values))
best_formula1 <- best_formulas1[[best_model_name]]
cat("Selected model by min PRESS:\n")
## Selected model by min PRESS:
print(best_formula1)
## y ~ displacement + horsepower + weight + year + `cylinders:acceleration` +
## `displacement:weight` + `displacement:acceleration` + `displacement:year` +
## `horsepower:year`
## <environment: 0x00000151ceb111e8>
final_model <- lm(best_formula1, data = train_df)
cat("Final model summary:\n")
## Final model summary:
print(summary(final_model))
##
## Call:
## lm(formula = best_formula1, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7608 -1.4214 -0.0954 1.2451 11.3314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.348e+01 1.107e+01 -8.448 1.23e-15 ***
## displacement -3.053e-01 7.706e-02 -3.962 9.27e-05 ***
## horsepower 1.282e+00 2.007e-01 6.385 6.38e-10 ***
## weight -7.204e-03 9.036e-04 -7.972 3.16e-14 ***
## year 1.936e+00 1.407e-01 13.760 < 2e-16 ***
## `cylinders:acceleration` 3.768e-02 1.883e-02 2.001 0.046283 *
## `displacement:weight` 1.719e-05 2.345e-06 7.333 2.04e-12 ***
## `displacement:acceleration` -2.537e-03 7.324e-04 -3.464 0.000609 ***
## `displacement:year` 3.668e-03 1.053e-03 3.483 0.000568 ***
## `horsepower:year` -1.830e-02 2.724e-03 -6.715 9.17e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.702 on 305 degrees of freedom
## Multiple R-squared: 0.8847, Adjusted R-squared: 0.8813
## F-statistic: 260 on 9 and 305 DF, p-value: < 2.2e-16
Evaluating our test data by comparing it to our predicted
values
test_df <- as.data.frame(cbind(y = y_test, X_test))
preds <- predict(final_model, newdata = test_df)
results_df <- data.frame(actual = y_test, predicted = preds)
cat("Comparison of actual vs. predicted on test data (1st few rows):\n")
## Comparison of actual vs. predicted on test data (1st few rows):
print(head(results_df))
## actual predicted
## 2 15 14.10196
## 13 15 14.77870
## 16 22 17.48390
## 17 18 17.67007
## 26 10 11.54983
## 34 19 18.66122
only here to make knit possible into html and thus into a pdf, makes
by origin possible
lars_results <- best_lars_model(X_train, y_train)

## Non-zero coefficients in LARS model at best step (step 0.5555556 ):
lars_model <- lars_results$lars_model
best_step <- lars_results$best_step
Generating our predictions for Leaps and Lars in order to plot
#generating predictions for Leaps model
preds_leaps <- predict(final_model, newdata = test_df)
corr_leaps <- cor(y_test, preds_leaps)
#generating predictions for LARS model
preds_lars <- predict(lars_model, newx = X_test, s = best_step, type = "fit", mode = "fraction")$fit
corr_lars <- cor(y_test, preds_lars)
par(mfrow = c(1, 2))
# Leaps plot
plot(y_test, preds_leaps,
main = paste0("Leaps Model Predictions\nCorrelation: ", round(corr_leaps, 3)),
xlab = "Actual MPG", ylab = "Predicted MPG")
abline(0, 1, col = "red")
# Lars plot
plot(y_test, preds_lars,
main = paste0("LARS Model Predictions\nCorrelation: ", round(corr_lars, 3)),
xlab = "Actual MPG", ylab = "Predicted MPG")
abline(0, 1, col = "red")

par(mfrow = c(1, 1))
Separating data into 3 sets by origin 1, 2, 3
auto_data1 <- Auto[Auto$origin == 1, ]
auto_data1 <- auto_data1[, !(names(auto_data1) %in% c("origin", "name"))]
auto_data2 <- Auto[Auto$origin == 2, ]
auto_data2 <- auto_data2[, !(names(auto_data2) %in% c("origin", "name"))]
auto_data3 <- Auto[Auto$origin == 3, ]
auto_data3 <- auto_data3[, !(names(auto_data3) %in% c("origin", "name"))]
setting up our x’s and y’s accordingly to origins
preped_data1 <- preparing_data(auto_data1, target = "mpg")
X1 <- preped_data1$X
y1 <- preped_data1$y
set.seed(42)
train_index1 <- createDataPartition(y1, p = 0.8, list = FALSE)
X1_train <- X1[train_index1, ]
y1_train <- y1[train_index1]
X1_test <- X1[-train_index1, ]
y1_test <- y1[-train_index1]
preped_data2 <- preparing_data(auto_data2, target = "mpg")
X2 <- preped_data2$X
y2 <- preped_data2$y
set.seed(42)
train_index2 <- createDataPartition(y2, p = 0.8, list = FALSE)
X2_train <- X2[train_index2, ]
y2_train <- y2[train_index2]
X2_test <- X2[-train_index2, ]
y2_test <- y2[-train_index2]
preped_data3 <- preparing_data(auto_data3, target = "mpg")
X3 <- preped_data3$X
y3 <- preped_data3$y
set.seed(42)
train_index3 <- createDataPartition(y2, p = 0.8, list = FALSE)
X3_train <- X3[train_index3, ]
y3_train <- y3[train_index3]
X3_test <- X3[-train_index3, ]
y3_test <- y3[-train_index3]
All of the above reused together in order to compare origins
datasets <- list(
origin1 = list(X_train = X1_train, y_train = y1_train,
X_test = X1_test, y_test = y1_test),
origin2 = list(X_train = X2_train, y_train = y2_train,
X_test = X2_test, y_test = y2_test),
origin3 = list(X_train = X3_train, y_train = y3_train,
X_test = X3_test, y_test = y3_test)
)
for(origin in names(datasets)) {
data <- datasets[[origin]]
bs_results <- best_subset_selection(data$X_train, data$y_train, num_models = 5)
cat(sprintf("Top candidate models by Cp for country %s:\n", origin))
print(bs_results$best_formulas)
cat("Cp values for these models:\n")
print(bs_results$cp_vals)
cat("\n")
train_df <- data.frame(y = data$y_train, data$X_train, check.names = FALSE)
press_values <- sapply(bs_results$best_formulas, function(form) {
mod <- lm(form, data = train_df)
calc_press(mod)
})
cat("PRESS values for top models:\n")
print(press_values)
cat("\n")
best_model_name <- names(which.min(press_values))
best_formula <- bs_results$best_formulas[[best_model_name]]
cat("Selected model by min PRESS:\n")
print(best_formula)
# Fit the final model using the selected formula
final_model <- lm(best_formula, data = train_df)
cat("Final model summary:\n")
print(summary(final_model))
cat("\n")
test_df <- data.frame(y = data$y_test, data$X_test, check.names = FALSE)
preds <- predict(final_model, newdata = test_df)
results_df <- data.frame(actual = data$y_test, predicted = preds)
cat("Comparison of actual vs. predicted on test data (1st few rows):\n")
print(head(results_df))
preds_leaps <- predict(final_model, newdata = test_df)
corr_leaps <- cor(data$y_test, preds_leaps)
preds_lars <- predict(lars_model, newx = data$X_test, s = best_step, type = "fit", mode = "fraction")$fit
corr_lars <- cor(data$y_test, preds_lars)
par(mfrow = c(1, 2))
#Leaps plot
plot(data$y_test, preds_leaps,
main = paste0("Leaps Model Predictions for ", origin, "\nCorrelation: ", round(corr_leaps, 3)),
xlab = "Actual MPG", ylab = "Predicted MPG",
cex.main = 0.8, cex.lab = 0.8, cex.axis = 0.8)
abline(0, 1, col = "red")
#LARS plot
plot(data$y_test, preds_lars,
main = paste0("LARS Model Predictions for ", origin, "\nCorrelation: ", round(corr_lars, 3)),
xlab = "Actual MPG", ylab = "Predicted MPG",
cex.main = 0.8, cex.lab = 0.8, cex.axis = 0.8)
abline(0, 1, col = "red")
par(mfrow = c(1, 1))
cat("******************************************************************************************\n")
}
## Top candidate models by Cp for country origin1:
## $Model_8
## y ~ cylinders + horsepower + acceleration + `cylinders:weight` +
## `cylinders:acceleration` + `horsepower:weight` + `weight:year` +
## `acceleration:year`
## <environment: 0x00000151d9c29858>
##
## $Model_9
## y ~ cylinders + horsepower + acceleration + `cylinders:weight` +
## `cylinders:acceleration` + `cylinders:year` + `horsepower:weight` +
## `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
##
## $Model_10
## y ~ displacement + acceleration + `cylinders:weight` + `cylinders:acceleration` +
## `cylinders:year` + `displacement:year` + `horsepower:weight` +
## `horsepower:year` + `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
##
## $Model_11
## y ~ displacement + horsepower + weight + acceleration + `cylinders:weight` +
## `cylinders:acceleration` + `cylinders:year` + `displacement:horsepower` +
## `displacement:year` + `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
##
## $Model_7
## y ~ cylinders + acceleration + `cylinders:weight` + `cylinders:acceleration` +
## `horsepower:acceleration` + `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
##
## Cp values for these models:
## [1] 1.007339 2.146979 2.282387 3.156909 3.787229
##
## PRESS values for top models:
## Model_8 Model_9 Model_10 Model_11 Model_7
## 884.1332 887.9229 907.1569 952.1318 896.7424
##
## Selected model by min PRESS:
## y ~ cylinders + horsepower + acceleration + `cylinders:weight` +
## `cylinders:acceleration` + `horsepower:weight` + `weight:year` +
## `acceleration:year`
## <environment: 0x00000151d9c29858>
## Final model summary:
##
## Call:
## lm(formula = best_formula, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2002 -1.2647 0.0155 1.1076 11.8067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.156e+01 5.762e+00 15.889 < 2e-16 ***
## cylinders -5.751e+00 1.207e+00 -4.766 3.73e-06 ***
## horsepower -1.398e-01 4.346e-02 -3.216 0.00153 **
## acceleration -7.412e+00 4.475e-01 -16.564 < 2e-16 ***
## `cylinders:weight` 7.737e-04 2.705e-04 2.860 0.00471 **
## `cylinders:acceleration` 2.127e-01 4.738e-02 4.490 1.24e-05 ***
## `horsepower:weight` 2.939e-05 1.045e-05 2.813 0.00542 **
## `weight:year` -1.745e-04 1.664e-05 -10.483 < 2e-16 ***
## `acceleration:year` 7.671e-02 4.408e-03 17.402 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.056 on 189 degrees of freedom
## Multiple R-squared: 0.9008, Adjusted R-squared: 0.8966
## F-statistic: 214.5 on 8 and 189 DF, p-value: < 2.2e-16
##
##
## Comparison of actual vs. predicted on test data (1st few rows):
## actual predicted
## 1 18 15.57781
## 2 15 14.23848
## 13 15 15.27868
## 14 14 12.51643
## 25 21 19.94057
## 26 10 12.09805

## ******************************************************************************************
## Top candidate models by Cp for country origin2:
## $Model_5
## y ~ horsepower + acceleration + year + `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
##
## $Model_6
## y ~ horsepower + year + `cylinders:acceleration` + `cylinders:year` +
## `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
##
## $Model_7
## y ~ horsepower + weight + acceleration + year + `horsepower:year` +
## `weight:acceleration` + `weight:year`
## <environment: 0x00000151d1e90400>
##
## $Model_8
## y ~ cylinders + horsepower + acceleration + year + `cylinders:weight` +
## `horsepower:weight` + `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
##
## $Model_4
## y ~ horsepower + weight + year + `horsepower:year`
## <environment: 0x00000151d1e90400>
##
## Cp values for these models:
## [1] -1.7613949 -0.4831622 0.7532218 0.9017915 1.5105327
##
## PRESS values for top models:
## Model_5 Model_6 Model_7 Model_8 Model_4
## 481.2531 478.8528 481.6577 494.7330 521.8340
##
## Selected model by min PRESS:
## y ~ horsepower + year + `cylinders:acceleration` + `cylinders:year` +
## `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
## Final model summary:
##
## Call:
## lm(formula = best_formula, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7495 -1.4228 0.2323 1.2989 8.6054
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.193e+02 3.699e+01 -5.929 3.01e-07 ***
## horsepower 2.111e+00 4.714e-01 4.479 4.50e-05 ***
## year 3.521e+00 4.789e-01 7.351 1.90e-09 ***
## `cylinders:acceleration` 3.301e-01 9.244e-02 3.571 0.000809 ***
## `cylinders:year` -7.124e-02 2.337e-02 -3.048 0.003707 **
## `horsepower:year` -2.874e-02 6.280e-03 -4.576 3.25e-05 ***
## `weight:acceleration` -3.749e-04 8.085e-05 -4.637 2.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.807 on 49 degrees of freedom
## Multiple R-squared: 0.8338, Adjusted R-squared: 0.8134
## F-statistic: 40.97 on 6 and 49 DF, p-value: < 2.2e-16
##
##
## Comparison of actual vs. predicted on test data (1st few rows):
## actual predicted
## 51 28 24.16980
## 53 30 23.75635
## 78 22 23.71001
## 80 26 25.58853
## 103 26 29.91588
## 121 19 22.21875

## ******************************************************************************************
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 3 linear dependencies found
## Reordering variables and trying again:
## Top candidate models by Cp for country origin3:
## $Model_6
## y ~ year + `cylinders:displacement` + `cylinders:horsepower` +
## `displacement:acceleration` + `displacement:year` + `horsepower:year`
## <environment: 0x00000151d00c0d68>
##
## $Model_5
## y ~ weight + acceleration + `displacement:acceleration` + `displacement:year` +
## `weight:acceleration`
## <environment: 0x00000151d00c0d68>
##
## $Model_7
## y ~ displacement + weight + `cylinders:displacement` + `cylinders:horsepower` +
## `displacement:acceleration` + `horsepower:year` + `weight:year`
## <environment: 0x00000151d00c0d68>
##
## $Model_8
## y ~ displacement + weight + `cylinders:displacement` + `cylinders:horsepower` +
## `displacement:acceleration` + `displacement:year` + `horsepower:year` +
## `weight:year`
## <environment: 0x00000151d00c0d68>
##
## $Model_10
## y ~ displacement + weight + `cylinders:displacement` + `cylinders:horsepower` +
## `displacement:horsepower` + `displacement:acceleration` +
## `horsepower:weight` + `horsepower:year` + `weight:acceleration` +
## `weight:year`
## <environment: 0x00000151d00c0d68>
##
## Cp values for these models:
## [1] -0.0645841 1.2196160 1.2789209 2.4662333 3.0777192
##
## PRESS values for top models:
## Model_6 Model_5 Model_7 Model_8 Model_10
## 513.6535 598.7956 548.8406 544.5331 630.6537
##
## Selected model by min PRESS:
## y ~ year + `cylinders:displacement` + `cylinders:horsepower` +
## `displacement:acceleration` + `displacement:year` + `horsepower:year`
## <environment: 0x00000151d00c0d68>
## Final model summary:
##
## Call:
## lm(formula = best_formula, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6496 -1.6434 -0.0782 1.3417 9.5466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.249604 13.034385 -4.085 0.000163 ***
## year 1.402686 0.166081 8.446 4.01e-11 ***
## `cylinders:displacement` -0.174018 0.043717 -3.981 0.000227 ***
## `cylinders:horsepower` 0.259501 0.058034 4.472 4.61e-05 ***
## `displacement:acceleration` -0.006397 0.003254 -1.966 0.054999 .
## `displacement:year` 0.007557 0.002262 3.341 0.001603 **
## `horsepower:year` -0.013781 0.002799 -4.923 1.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.969 on 49 degrees of freedom
## Multiple R-squared: 0.7467, Adjusted R-squared: 0.7157
## F-statistic: 24.07 on 6 and 49 DF, p-value: 4.738e-13
##
##
## Comparison of actual vs. predicted on test data (1st few rows):
## actual predicted
## 55 35 30.67135
## 72 19 22.50731
## 109 20 26.16063
## 112 18 24.69484
## 124 20 26.06789
## 150 24 24.37393

## ******************************************************************************************