library(ISLR)
library(leaps)
library(lars)
## Loaded lars 1.3
library(ggplot2)
library(caret)
## Loading required package: lattice

Assignment 4 Leaps vs Lars

Write a program that:

takes a matrix or data frame as input, lets you choose a y variable and a set of x variables, and splits the rows (the units of the (X|y) matrix) at random into a training data set and a testing data set

data("Auto")

#removing unwanted columns (the name and origin columns)
auto_data <- Auto[, !names(Auto) %in% c("name", "origin")]
#using model.matrix
preparing_data <- function(df, target = "mpg") {
  X <- df[, setdiff(names(df), target)]
  y <- df[[target]]
  #making sure factors are numeric
  X[] <- lapply(X, function(col) if (is.factor(col)) as.numeric(col) else col)
  #creating 2nd order (with interactions) features
  X_poly <- model.matrix(~ .^2, data = X)[, -1]  #removing intercept column
  list(X = X_poly, y = y)
}
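To make the second-order expansion concrete, here is a minimal sketch on a made-up two-column data frame (the names `toy`, `a`, and `b` are purely illustrative and not used elsewhere): model.matrix(~ .^2) produces every main effect plus every pairwise interaction.

# toy illustration (sketch only, not used below)
toy <- data.frame(a = 1:3, b = 4:6)
colnames(model.matrix(~ .^2, data = toy))[-1]
# expected column names: "a" "b" "a:b"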

Setting our seed for repeatability, setting up our X's and y's, and splitting the data into training and testing sets (80/20)

preped_data <- preparing_data(auto_data, target = "mpg")
X <- preped_data$X
y <- preped_data$y

#splitting into training and testing sets (80/20 split)
set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)

X_train <- X[train_index, ]
y_train <- y[train_index]

X_test  <- X[-train_index, ]
y_test  <- y[-train_index]
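
Note that createDataPartition draws the split with stratification on quantile groups of y, so the distribution of mpg is roughly balanced between the two sets. If a plain simple random selection of rows were preferred instead, a base-R sketch (not used below; `train_index_srs` is a hypothetical name) would be:

# base-R alternative split, sketch only
set.seed(42)
train_index_srs <- sample(seq_along(y), size = floor(0.8 * length(y)))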

Choosing our top 5 overall models for mpg (ranked by Mallows' Cp)

best_subset_selection <- function(X, y, num_models = 5) {
  
  train_df <- as.data.frame(cbind(y, X))
  form <- as.formula("y ~ .")
  
  # exhaustive best-subset search over all candidate predictors
  regfit <- regsubsets(form, data = train_df, nvmax = ncol(X), method = "exhaustive")
  reg_summary <- summary(regfit)
  
  # rank the subset sizes by Mallows' Cp and keep the num_models lowest
  cp_vals <- reg_summary$cp
  valid_idx <- which(!is.na(cp_vals))
  best_indices <- valid_idx[order(cp_vals[valid_idx])][1:num_models]
  
  best_formulas <- list()
  for(i in best_indices) {
    included <- reg_summary$which[i, ]
    
    predictors <- names(included)[included][-1] #excludes intercept
    if(length(predictors) == 0) next #skipping nulls
    best_formulas[[paste0("Model_", i)]] <- as.formula(paste("y ~", paste(predictors, collapse = " + ")))
  }
  list(best_formulas = best_formulas,
       cp_vals = cp_vals[best_indices])
}
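
The candidates are ranked by Mallows' Cp, which summary() of a regsubsets fit reports for each subset size. For a model with p coefficients (intercept included), the usual definition is

$$C_p = \frac{\mathrm{RSS}_p}{\hat{\sigma}^2} - n + 2p$$

where $\hat{\sigma}^2$ is the residual variance estimate from the largest model; values near p suggest the subset is not badly biased, and smaller is better.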

best_lars_model <- function(X_train, y_train) {
  # Fit the LARS model with type "lasso"
  lars_model <- lars(X_train, y_train, type = "lasso")
  
  # Use cross-validation to select the best tuning value; note that cv.lars indexes
  # the path by the fraction of the final L1 norm (mode "fraction"), not by integer step
  cv_model <- cv.lars(X_train, y_train, type = "lasso")
  best_step <- cv_model$index[which.min(cv_model$cv)]
  
  # Identify nonzero coefficients at the CV-selected fraction
  best_coefs <- predict(lars_model, s = best_step, type = "coefficients", mode = "fraction")$coefficients
  nonzero_coefs <- which(best_coefs != 0)
  cat("Non-zero coefficients in LARS model at best step (step", best_step, "):", 
      paste(names(nonzero_coefs), collapse = ", "), "\n")
  
  list(lars_model = lars_model,
       best_step = best_step,
       nonzero_coefs = nonzero_coefs)
}

bs_results <- best_subset_selection(X_train, y_train, num_models = 5)
best_formulas1 <- bs_results$best_formulas
cat("Top candidate models by Cp:\n")
## Top candidate models by Cp:
print(best_formulas1)
## $Model_9
## y ~ displacement + horsepower + weight + year + `cylinders:acceleration` + 
##     `displacement:weight` + `displacement:acceleration` + `displacement:year` + 
##     `horsepower:year`
## <environment: 0x00000151ceb111e8>
## 
## $Model_10
## y ~ displacement + horsepower + weight + acceleration + `cylinders:displacement` + 
##     `cylinders:horsepower` + `displacement:weight` + `displacement:year` + 
##     `horsepower:year` + `acceleration:year`
## <environment: 0x00000151ceb111e8>
## 
## $Model_11
## y ~ displacement + horsepower + weight + acceleration + year + 
##     `cylinders:acceleration` + `displacement:weight` + `displacement:acceleration` + 
##     `displacement:year` + `horsepower:year` + `acceleration:year`
## <environment: 0x00000151ceb111e8>
## 
## $Model_13
## y ~ cylinders + displacement + horsepower + weight + acceleration + 
##     year + `cylinders:acceleration` + `cylinders:year` + `displacement:weight` + 
##     `displacement:acceleration` + `displacement:year` + `horsepower:year` + 
##     `acceleration:year`
## <environment: 0x00000151ceb111e8>
## 
## $Model_12
## y ~ displacement + horsepower + weight + acceleration + year + 
##     `cylinders:displacement` + `cylinders:weight` + `displacement:weight` + 
##     `displacement:acceleration` + `displacement:year` + `horsepower:year` + 
##     `acceleration:year`
## <environment: 0x00000151ceb111e8>
cat("Cp values for these models:\n")
## Cp values for these models:
print(bs_results$cp_vals)
## [1] 8.708209 9.295648 9.484580 9.715710 9.934279

Creating a PRESS calculator

calc_press <- function(model) {
  resid <- residuals(model)
  hat_vals <- hatvalues(model)
  sum((resid / (1 - hat_vals))^2)
}
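
This relies on the standard leave-one-out identity: the prediction error for observation i when it is held out equals the ordinary residual $e_i$ inflated by its leverage $h_{ii}$, so PRESS can be computed from a single fit without refitting:

$$\mathrm{PRESS} = \sum_{i=1}^{n} \left( \frac{e_i}{1 - h_{ii}} \right)^{2}$$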

Obtaining our PRESS values for the top candidate models

train_df <- as.data.frame(cbind(y = y_train, X_train))
press_values <- sapply(best_formulas1, function(form) {
  mod <- lm(form, data = train_df)
  calc_press(mod)
})
cat("PRESS values for top models:\n")
## PRESS values for top models:
print(press_values)
##  Model_9 Model_10 Model_11 Model_13 Model_12 
## 2385.696 2393.448 2395.702 2392.663 2403.257

The top model is chosen as the one with the minimum PRESS value

best_model_name <- names(which.min(press_values))
best_formula1 <- best_formulas1[[best_model_name]]
cat("Selected model by min PRESS:\n")
## Selected model by min PRESS:
print(best_formula1)
## y ~ displacement + horsepower + weight + year + `cylinders:acceleration` + 
##     `displacement:weight` + `displacement:acceleration` + `displacement:year` + 
##     `horsepower:year`
## <environment: 0x00000151ceb111e8>
final_model <- lm(best_formula1, data = train_df)
cat("Final model summary:\n")
## Final model summary:
print(summary(final_model))
## 
## Call:
## lm(formula = best_formula1, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7608 -1.4214 -0.0954  1.2451 11.3314 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -9.348e+01  1.107e+01  -8.448 1.23e-15 ***
## displacement                -3.053e-01  7.706e-02  -3.962 9.27e-05 ***
## horsepower                   1.282e+00  2.007e-01   6.385 6.38e-10 ***
## weight                      -7.204e-03  9.036e-04  -7.972 3.16e-14 ***
## year                         1.936e+00  1.407e-01  13.760  < 2e-16 ***
## `cylinders:acceleration`     3.768e-02  1.883e-02   2.001 0.046283 *  
## `displacement:weight`        1.719e-05  2.345e-06   7.333 2.04e-12 ***
## `displacement:acceleration` -2.537e-03  7.324e-04  -3.464 0.000609 ***
## `displacement:year`          3.668e-03  1.053e-03   3.483 0.000568 ***
## `horsepower:year`           -1.830e-02  2.724e-03  -6.715 9.17e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.702 on 305 degrees of freedom
## Multiple R-squared:  0.8847, Adjusted R-squared:  0.8813 
## F-statistic:   260 on 9 and 305 DF,  p-value: < 2.2e-16

Evaluating the final model on the test data by comparing actual and predicted values

test_df <- as.data.frame(cbind(y = y_test, X_test))
preds <- predict(final_model, newdata = test_df)
results_df <- data.frame(actual = y_test, predicted = preds)
cat("Comparison of actual vs. predicted on test data (1st few rows):\n")
## Comparison of actual vs. predicted on test data (1st few rows):
print(head(results_df))
##    actual predicted
## 2      15  14.10196
## 13     15  14.77870
## 16     22  17.48390
## 17     18  17.67007
## 26     10  11.54983
## 34     19  18.66122
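
Correlation (used in the plots below) measures agreement only up to a linear rescaling; a test-set RMSE, in mpg units, is a useful complement. A quick sketch using the objects already defined above:

# sketch: test-set RMSE for the selected leaps model, in mpg units
sqrt(mean((y_test - preds)^2))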

The LARS fit is run here so the document knits cleanly to HTML (and from there to PDF), and so the fitted model can be reused in the by-origin comparison below

lars_results <- best_lars_model(X_train, y_train)

## Non-zero coefficients in LARS model at best step (step 0.5555556 ):
lars_model <- lars_results$lars_model
best_step <- lars_results$best_step

Generating predictions from the Leaps and LARS models in order to plot them against the actual test values

#generating predictions for Leaps model
preds_leaps <- predict(final_model, newdata = test_df)
corr_leaps <- cor(y_test, preds_leaps)

#generating predictions for LARS model 
preds_lars <- predict(lars_model, newx = X_test, s = best_step, type = "fit", mode = "fraction")$fit
corr_lars  <- cor(y_test, preds_lars)


par(mfrow = c(1, 2))

# Leaps plot
plot(y_test, preds_leaps,
     main = paste0("Leaps Model Predictions\nCorrelation: ", round(corr_leaps, 3)),
     xlab = "Actual MPG", ylab = "Predicted MPG")
abline(0, 1, col = "red")

# Lars plot
plot(y_test, preds_lars,
     main = paste0("LARS Model Predictions\nCorrelation: ", round(corr_lars, 3)),
     xlab = "Actual MPG", ylab = "Predicted MPG")
abline(0, 1, col = "red")

par(mfrow = c(1, 1))

Separating the data into 3 sets by origin (1 = American, 2 = European, 3 = Japanese)

auto_data1 <- Auto[Auto$origin == 1, ]
auto_data1 <- auto_data1[, !(names(auto_data1) %in% c("origin", "name"))]

auto_data2 <- Auto[Auto$origin == 2, ]
auto_data2 <- auto_data2[, !(names(auto_data2) %in% c("origin", "name"))]

auto_data3 <- Auto[Auto$origin == 3, ]
auto_data3 <- auto_data3[, !(names(auto_data3) %in% c("origin", "name"))]

Setting up our X's and y's for each origin and splitting each into training and testing sets (80/20)

preped_data1 <- preparing_data(auto_data1, target = "mpg")
X1 <- preped_data1$X
y1 <- preped_data1$y

set.seed(42)
train_index1 <- createDataPartition(y1, p = 0.8, list = FALSE)

X1_train <- X1[train_index1, ]
y1_train <- y1[train_index1]

X1_test  <- X1[-train_index1, ]
y1_test  <- y1[-train_index1]

preped_data2 <- preparing_data(auto_data2, target = "mpg")
X2 <- preped_data2$X
y2 <- preped_data2$y

set.seed(42)
train_index2 <- createDataPartition(y2, p = 0.8, list = FALSE)

X2_train <- X2[train_index2, ]
y2_train <- y2[train_index2]

X2_test  <- X2[-train_index2, ]
y2_test  <- y2[-train_index2]

preped_data3 <- preparing_data(auto_data3, target = "mpg")
X3 <- preped_data3$X
y3 <- preped_data3$y

set.seed(42)
train_index3 <- createDataPartition(y3, p = 0.8, list = FALSE)

X3_train <- X3[train_index3, ]
y3_train <- y3[train_index3]

X3_test  <- X3[-train_index3, ]
y3_test  <- y3[-train_index3]
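
The three per-origin preparations above repeat the same steps verbatim; a more compact equivalent (a sketch only; `split_by_origin` is a hypothetical name, and the explicit objects above are what the rest of the document uses) could loop over the origins:

# sketch: the same per-origin prep and 80/20 split in one lapply
split_by_origin <- lapply(split(Auto, Auto$origin), function(d) {
  d <- d[, !(names(d) %in% c("origin", "name"))]
  prep <- preparing_data(d, target = "mpg")
  set.seed(42)
  idx <- createDataPartition(prep$y, p = 0.8, list = FALSE)
  list(X_train = prep$X[idx, ], y_train = prep$y[idx],
       X_test  = prep$X[-idx, ], y_test  = prep$y[-idx])
})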

All of the above steps are now reused in a loop to compare the three origins

datasets <- list(
  origin1 = list(X_train = X1_train, y_train = y1_train,
                 X_test = X1_test, y_test = y1_test),
  origin2 = list(X_train = X2_train, y_train = y2_train,
                 X_test = X2_test, y_test = y2_test),
  origin3 = list(X_train = X3_train, y_train = y3_train,
                 X_test = X3_test, y_test = y3_test)
)

for(origin in names(datasets)) {
  data <- datasets[[origin]]
  bs_results <- best_subset_selection(data$X_train, data$y_train, num_models = 5)
  
  cat(sprintf("Top candidate models by Cp for country %s:\n", origin))
  print(bs_results$best_formulas)
  cat("Cp values for these models:\n")
  print(bs_results$cp_vals)
  cat("\n")
  
  train_df <- data.frame(y = data$y_train, data$X_train, check.names = FALSE)
  
  press_values <- sapply(bs_results$best_formulas, function(form) {
    mod <- lm(form, data = train_df)
    calc_press(mod)
  })
  cat("PRESS values for top models:\n")
  print(press_values)
  cat("\n")
  
  best_model_name <- names(which.min(press_values))
  best_formula <- bs_results$best_formulas[[best_model_name]]
  cat("Selected model by min PRESS:\n")
  print(best_formula)
  
  # Fit the final model using the selected formula
  final_model <- lm(best_formula, data = train_df)
  cat("Final model summary:\n")
  print(summary(final_model))
  cat("\n")
  
  test_df <- data.frame(y = data$y_test, data$X_test, check.names = FALSE)
  preds <- predict(final_model, newdata = test_df)
  results_df <- data.frame(actual = data$y_test, predicted = preds)
  
  cat("Comparison of actual vs. predicted on test data (1st few rows):\n")
  print(head(results_df))
  
  preds_leaps <- predict(final_model, newdata = test_df)
  corr_leaps <- cor(data$y_test, preds_leaps)
  
  # LARS predictions reuse the model fit earlier on the full (all-origin) training data,
  # evaluated at its CV-selected fraction and applied to this origin's test set
  preds_lars <- predict(lars_model, newx = data$X_test, s = best_step, type = "fit", mode = "fraction")$fit
  corr_lars  <- cor(data$y_test, preds_lars)
  
  par(mfrow = c(1, 2))
  #Leaps plot
  plot(data$y_test, preds_leaps,
       main = paste0("Leaps Model Predictions for ", origin, "\nCorrelation: ", round(corr_leaps, 3)),
       xlab = "Actual MPG", ylab = "Predicted MPG",
       cex.main = 0.8, cex.lab = 0.8, cex.axis = 0.8)
  abline(0, 1, col = "red")
  #LARS plot
  plot(data$y_test, preds_lars,
       main = paste0("LARS Model Predictions for ", origin, "\nCorrelation: ", round(corr_lars, 3)),
       xlab = "Actual MPG", ylab = "Predicted MPG",
       cex.main = 0.8, cex.lab = 0.8, cex.axis = 0.8)
  abline(0, 1, col = "red")
  par(mfrow = c(1, 1))

  cat("******************************************************************************************\n")
}
## Top candidate models by Cp for country origin1:
## $Model_8
## y ~ cylinders + horsepower + acceleration + `cylinders:weight` + 
##     `cylinders:acceleration` + `horsepower:weight` + `weight:year` + 
##     `acceleration:year`
## <environment: 0x00000151d9c29858>
## 
## $Model_9
## y ~ cylinders + horsepower + acceleration + `cylinders:weight` + 
##     `cylinders:acceleration` + `cylinders:year` + `horsepower:weight` + 
##     `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
## 
## $Model_10
## y ~ displacement + acceleration + `cylinders:weight` + `cylinders:acceleration` + 
##     `cylinders:year` + `displacement:year` + `horsepower:weight` + 
##     `horsepower:year` + `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
## 
## $Model_11
## y ~ displacement + horsepower + weight + acceleration + `cylinders:weight` + 
##     `cylinders:acceleration` + `cylinders:year` + `displacement:horsepower` + 
##     `displacement:year` + `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
## 
## $Model_7
## y ~ cylinders + acceleration + `cylinders:weight` + `cylinders:acceleration` + 
##     `horsepower:acceleration` + `weight:year` + `acceleration:year`
## <environment: 0x00000151d9c29858>
## 
## Cp values for these models:
## [1] 1.007339 2.146979 2.282387 3.156909 3.787229
## 
## PRESS values for top models:
##  Model_8  Model_9 Model_10 Model_11  Model_7 
## 884.1332 887.9229 907.1569 952.1318 896.7424 
## 
## Selected model by min PRESS:
## y ~ cylinders + horsepower + acceleration + `cylinders:weight` + 
##     `cylinders:acceleration` + `horsepower:weight` + `weight:year` + 
##     `acceleration:year`
## <environment: 0x00000151d9c29858>
## Final model summary:
## 
## Call:
## lm(formula = best_formula, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2002 -1.2647  0.0155  1.1076 11.8067 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               9.156e+01  5.762e+00  15.889  < 2e-16 ***
## cylinders                -5.751e+00  1.207e+00  -4.766 3.73e-06 ***
## horsepower               -1.398e-01  4.346e-02  -3.216  0.00153 ** 
## acceleration             -7.412e+00  4.475e-01 -16.564  < 2e-16 ***
## `cylinders:weight`        7.737e-04  2.705e-04   2.860  0.00471 ** 
## `cylinders:acceleration`  2.127e-01  4.738e-02   4.490 1.24e-05 ***
## `horsepower:weight`       2.939e-05  1.045e-05   2.813  0.00542 ** 
## `weight:year`            -1.745e-04  1.664e-05 -10.483  < 2e-16 ***
## `acceleration:year`       7.671e-02  4.408e-03  17.402  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.056 on 189 degrees of freedom
## Multiple R-squared:  0.9008, Adjusted R-squared:  0.8966 
## F-statistic: 214.5 on 8 and 189 DF,  p-value: < 2.2e-16
## 
## 
## Comparison of actual vs. predicted on test data (1st few rows):
##    actual predicted
## 1      18  15.57781
## 2      15  14.23848
## 13     15  15.27868
## 14     14  12.51643
## 25     21  19.94057
## 26     10  12.09805

## ******************************************************************************************
## Top candidate models by Cp for country origin2:
## $Model_5
## y ~ horsepower + acceleration + year + `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
## 
## $Model_6
## y ~ horsepower + year + `cylinders:acceleration` + `cylinders:year` + 
##     `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
## 
## $Model_7
## y ~ horsepower + weight + acceleration + year + `horsepower:year` + 
##     `weight:acceleration` + `weight:year`
## <environment: 0x00000151d1e90400>
## 
## $Model_8
## y ~ cylinders + horsepower + acceleration + year + `cylinders:weight` + 
##     `horsepower:weight` + `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
## 
## $Model_4
## y ~ horsepower + weight + year + `horsepower:year`
## <environment: 0x00000151d1e90400>
## 
## Cp values for these models:
## [1] -1.7613949 -0.4831622  0.7532218  0.9017915  1.5105327
## 
## PRESS values for top models:
##  Model_5  Model_6  Model_7  Model_8  Model_4 
## 481.2531 478.8528 481.6577 494.7330 521.8340 
## 
## Selected model by min PRESS:
## y ~ horsepower + year + `cylinders:acceleration` + `cylinders:year` + 
##     `horsepower:year` + `weight:acceleration`
## <environment: 0x00000151d1e90400>
## Final model summary:
## 
## Call:
## lm(formula = best_formula, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7495 -1.4228  0.2323  1.2989  8.6054 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.193e+02  3.699e+01  -5.929 3.01e-07 ***
## horsepower                2.111e+00  4.714e-01   4.479 4.50e-05 ***
## year                      3.521e+00  4.789e-01   7.351 1.90e-09 ***
## `cylinders:acceleration`  3.301e-01  9.244e-02   3.571 0.000809 ***
## `cylinders:year`         -7.124e-02  2.337e-02  -3.048 0.003707 ** 
## `horsepower:year`        -2.874e-02  6.280e-03  -4.576 3.25e-05 ***
## `weight:acceleration`    -3.749e-04  8.085e-05  -4.637 2.66e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.807 on 49 degrees of freedom
## Multiple R-squared:  0.8338, Adjusted R-squared:  0.8134 
## F-statistic: 40.97 on 6 and 49 DF,  p-value: < 2.2e-16
## 
## 
## Comparison of actual vs. predicted on test data (1st few rows):
##     actual predicted
## 51      28  24.16980
## 53      30  23.75635
## 78      22  23.71001
## 80      26  25.58853
## 103     26  29.91588
## 121     19  22.21875

## ******************************************************************************************
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 3 linear dependencies found
## Reordering variables and trying again:
## Top candidate models by Cp for country origin3:
## $Model_6
## y ~ year + `cylinders:displacement` + `cylinders:horsepower` + 
##     `displacement:acceleration` + `displacement:year` + `horsepower:year`
## <environment: 0x00000151d00c0d68>
## 
## $Model_5
## y ~ weight + acceleration + `displacement:acceleration` + `displacement:year` + 
##     `weight:acceleration`
## <environment: 0x00000151d00c0d68>
## 
## $Model_7
## y ~ displacement + weight + `cylinders:displacement` + `cylinders:horsepower` + 
##     `displacement:acceleration` + `horsepower:year` + `weight:year`
## <environment: 0x00000151d00c0d68>
## 
## $Model_8
## y ~ displacement + weight + `cylinders:displacement` + `cylinders:horsepower` + 
##     `displacement:acceleration` + `displacement:year` + `horsepower:year` + 
##     `weight:year`
## <environment: 0x00000151d00c0d68>
## 
## $Model_10
## y ~ displacement + weight + `cylinders:displacement` + `cylinders:horsepower` + 
##     `displacement:horsepower` + `displacement:acceleration` + 
##     `horsepower:weight` + `horsepower:year` + `weight:acceleration` + 
##     `weight:year`
## <environment: 0x00000151d00c0d68>
## 
## Cp values for these models:
## [1] -0.0645841  1.2196160  1.2789209  2.4662333  3.0777192
## 
## PRESS values for top models:
##  Model_6  Model_5  Model_7  Model_8 Model_10 
## 513.6535 598.7956 548.8406 544.5331 630.6537 
## 
## Selected model by min PRESS:
## y ~ year + `cylinders:displacement` + `cylinders:horsepower` + 
##     `displacement:acceleration` + `displacement:year` + `horsepower:year`
## <environment: 0x00000151d00c0d68>
## Final model summary:
## 
## Call:
## lm(formula = best_formula, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6496 -1.6434 -0.0782  1.3417  9.5466 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -53.249604  13.034385  -4.085 0.000163 ***
## year                          1.402686   0.166081   8.446 4.01e-11 ***
## `cylinders:displacement`     -0.174018   0.043717  -3.981 0.000227 ***
## `cylinders:horsepower`        0.259501   0.058034   4.472 4.61e-05 ***
## `displacement:acceleration`  -0.006397   0.003254  -1.966 0.054999 .  
## `displacement:year`           0.007557   0.002262   3.341 0.001603 ** 
## `horsepower:year`            -0.013781   0.002799  -4.923 1.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.969 on 49 degrees of freedom
## Multiple R-squared:  0.7467, Adjusted R-squared:  0.7157 
## F-statistic: 24.07 on 6 and 49 DF,  p-value: 4.738e-13
## 
## 
## Comparison of actual vs. predicted on test data (1st few rows):
##     actual predicted
## 55      35  30.67135
## 72      19  22.50731
## 109     20  26.16063
## 112     18  24.69484
## 124     20  26.06789
## 150     24  24.37393

## ******************************************************************************************
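
To condense the long per-origin output above, the LARS test correlations can be collected into a single table (a sketch; the leaps correlations would need to be saved inside the loop to be reported alongside them):

# sketch: LARS test correlation for each origin, using the model fit on the full training data
lars_corr_by_origin <- sapply(datasets, function(d) {
  p <- predict(lars_model, newx = d$X_test, s = best_step,
               type = "fit", mode = "fraction")$fit
  cor(d$y_test, p)
})
round(lars_corr_by_origin, 3)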