Please hand this result in on Canvas no later than 11:59pm on Thursday, May 8!

# Load required packages
library(rpart)       
library(rpart.plot)  
library(randomForest) 
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)         
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(caret)       
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(MASS)        
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
# Set seed
set.seed(1234)

# Load data
mushrooms <- read.csv("mushrooms.csv")

# Check structure of the data
str(mushrooms)
## 'data.frame':    8124 obs. of  23 variables:
##  $ class                   : chr  "p" "e" "e" "p" ...
##  $ cap.shape               : chr  "x" "x" "b" "x" ...
##  $ cap.surface             : chr  "s" "s" "s" "y" ...
##  $ cap.color               : chr  "n" "y" "w" "w" ...
##  $ bruises                 : chr  "t" "t" "t" "t" ...
##  $ odor                    : chr  "p" "a" "l" "p" ...
##  $ gill.attachment         : chr  "f" "f" "f" "f" ...
##  $ gill.spacing            : chr  "c" "c" "c" "c" ...
##  $ gill.size               : chr  "n" "b" "b" "n" ...
##  $ gill.color              : chr  "k" "k" "n" "n" ...
##  $ stalk.shape             : chr  "e" "e" "e" "e" ...
##  $ stalk.root              : chr  "e" "c" "c" "e" ...
##  $ stalk.surface.above.ring: chr  "s" "s" "s" "s" ...
##  $ stalk.surface.below.ring: chr  "s" "s" "s" "s" ...
##  $ stalk.color.above.ring  : chr  "w" "w" "w" "w" ...
##  $ stalk.color.below.ring  : chr  "w" "w" "w" "w" ...
##  $ veil.type               : chr  "p" "p" "p" "p" ...
##  $ veil.color              : chr  "w" "w" "w" "w" ...
##  $ ring.number             : chr  "o" "o" "o" "o" ...
##  $ ring.type               : chr  "p" "p" "p" "p" ...
##  $ spore.print.color       : chr  "k" "n" "n" "k" ...
##  $ population              : chr  "s" "n" "n" "s" ...
##  $ habitat                 : chr  "u" "g" "m" "u" ...

Questions

  1. Experimenting with tree-based methods: In this problem, you will use CARTs, bagging, random forests, and boosting in an attempt to accurately predict whether mushrooms are edible or poisonous, based on a large collection of variables describing the mushrooms (cap shape, cap surface, odor, gill spacing, gill size, stalk color, etc.). The data can be found under the Files tab in folder Exam 2/Final Exam (mushrooms.csv) on Canvas. More information regarding this data (including variable level designations for interpretation) can be found at the following site: https://www.kaggle.com/uciml/mushroom-classification/data
  a. Split the data into a training and test set, each consisting of 50% of the entire sample. Do this randomly, not based on positioning in the dataset (i.e., do not take the first half as your test set, etc., but instead use the sample() function).
# Create index for random split
n <- nrow(mushrooms)
training_index <- sample(1:n, size = n/2)

# Create the training and test sets
train_data <- mushrooms[training_index, ]
test_data <- mushrooms[-training_index, ]

# Verify the split worked correctly
cat("Training set size:", nrow(train_data), "observations\n")
## Training set size: 4062 observations
cat("Test set size:", nrow(test_data), "observations\n")
## Test set size: 4062 observations
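As a quick sanity check beyond the row counts (not required by the prompt), we can also confirm that the random split left the two classes roughly balanced in each half:

# Class proportions in each half of the split
prop.table(table(train_data$class))
prop.table(table(test_data$class))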
  b. Using a CART, fit a classification tree to the training data and employ cost-complexity pruning, with CV used to determine the optimal number of terminal nodes to include. Then test this “best” model on the test data set. Interpret the final best model using the tree structure, and visualize it with text added to represent the decisions.
# Fit a classification tree (CART) to the training data
cart_model <- rpart(class ~ ., data = train_data, method = "class")

# Look at the full model
print(cart_model)
## n= 4062 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 4062 1953 e (0.51920236 0.48079764)  
##   2) odor=a,l,n 2167   58 e (0.97323489 0.02676511)  
##     4) spore.print.color=b,h,k,n,o,u,w,y 2135   26 e (0.98782201 0.01217799) *
##     5) spore.print.color=r 32    0 p (0.00000000 1.00000000) *
##   3) odor=c,f,m,p,s,y 1895    0 p (0.00000000 1.00000000) *
# Cross-validation for cost complexity pruning
printcp(cart_model)
## 
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] odor              spore.print.color
## 
## Root node error: 1953/4062 = 0.4808
## 
## n= 4062 
## 
##         CP nsplit rel error   xerror      xstd
## 1 0.970302      0  1.000000 1.000000 0.0163049
## 2 0.016385      1  0.029698 0.029698 0.0038716
## 3 0.010000      2  0.013313 0.013313 0.0026025
plotcp(cart_model) # Visual representation of cross-validation results

# Find the optimal complexity parameter (cp) value
# This selects the cp that minimizes cross-validated error
optimal_cp <- cart_model$cptable[which.min(cart_model$cptable[,"xerror"]), "CP"]
cat("Optimal complexity parameter (cp):", optimal_cp, "\n")
## Optimal complexity parameter (cp): 0.01
# Prune the tree using the optimal cp value
pruned_cart <- prune(cart_model, cp = optimal_cp)

# Visualize the pruned tree with descriptive text
rpart.plot(pruned_cart, extra = 104, # Shows class probabilities and percentages
           box.palette = "RdGn", # Red-Green color scheme (red for poisonous, green for edible)
           shadow.col = "gray", # Add shadows for better visibility
           nn = TRUE, # Display node numbers
           main = "Classification Tree for Mushroom Edibility")

# Test the pruned model on the test data
cart_predictions <- predict(pruned_cart, test_data, type = "class")

# Create confusion matrix to evaluate performance
conf_matrix <- table(Predicted = cart_predictions, Actual = test_data$class)
print(conf_matrix)
##          Actual
## Predicted    e    p
##         e 2099   22
##         p    0 1941
# Calculate accuracy
cart_accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Test set accuracy:", round(cart_accuracy * 100, 2), "%\n")
## Test set accuracy: 99.46 %
# Interpret the model structure
cat("\nModel Interpretation:\n")
## 
## Model Interpretation:
cat("Number of terminal nodes:", length(pruned_cart$frame$var[pruned_cart$frame$var == "<leaf>"]), "\n")
## Number of terminal nodes: 3
cat("Important splitting variables:", "\n")
## Important splitting variables:
imp_vars <- pruned_cart$variable.importance
print(imp_vars[order(imp_vars, decreasing = TRUE)])
##                     odor        spore.print.color               gill.color 
##                 1915.109                 1399.578                 1153.134 
##                ring.type stalk.surface.above.ring stalk.surface.below.ring 
##                 1031.835                 1013.644                 1002.527

Interpretation:

Looking at the tree visualization, we can see that odor is the dominant splitter. Mushrooms with odor c, f, m, p, s, or y (per the Kaggle codebook: creosote, foul, musty, pungent, spicy, or fishy) fall into a pure poisonous terminal node containing 1,895 training observations with zero misclassifications. For mushrooms with odor a, l, or n (almond, anise, or none), a second split on spore.print.color isolates the 32 green-spored (r) mushrooms, all poisonous; the remaining 2,135 are predicted edible with only about a 1.2% training error rate.

Our performance summary shows a test accuracy of 99.46% ((2099 + 1941)/4062). All 22 test errors are poisonous mushrooms predicted as edible, which is the costlier direction of misclassification for this application.

In terms of our tuning with the complexity parameter, the cross-validated error (xerror) is minimized at cp = 0.01 with 2 splits, so the “pruned” tree is identical to the tree as originally grown: cost-complexity pruning removes nothing here, and the best model has 3 terminal nodes.
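Since which.min() takes the cp at the raw minimum of xerror, an alternative worth sketching is the 1-SE rule: choose the simplest tree whose xerror is within one standard error of that minimum. This is a hedged illustration rather than part of the graded answer; the object names (min_row, one_se, cp_1se, pruned_1se) are ours.

# Sketch: 1-SE rule for selecting cp (alternative to which.min on xerror)
cpt     <- cart_model$cptable
min_row <- which.min(cpt[, "xerror"])
one_se  <- cpt[min_row, "xerror"] + cpt[min_row, "xstd"]
cp_1se  <- cpt[which(cpt[, "xerror"] <= one_se)[1], "CP"]
pruned_1se <- prune(cart_model, cp = cp_1se)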

  c. Use bootstrap aggregation on the training set to build a model which can predict edible vs poisonous status. Test your resulting bagged tree model on the testing data created in part a). Try some different values for \(B\), the number of bagged trees, to assess the sensitivity of your results. How many trees seem sufficient? Create variable importance plots for this final model and interpret the findings.
# Try different values of B (number of bagged trees)
b_values <- c(1, 2, 5, 10, 20, 30, 40, 50, 100, 250)
num_predictors <- ncol(train_data) - 1

# Get a list of all character/factor columns
char_cols <- names(train_data)[sapply(train_data, is.character) | sapply(train_data, is.factor)]

# Convert character columns to factors with consistent levels
for (col in char_cols) {
  # Get all possible levels from both datasets
  all_levels <- unique(c(as.character(train_data[[col]]), as.character(test_data[[col]])))
  
  # Convert both to factors with the same levels
  train_data[[col]] <- factor(train_data[[col]], levels = all_levels)
  test_data[[col]] <- factor(test_data[[col]], levels = all_levels)
}

# Loop through B values
results <- data.frame(B = b_values, Accuracy = NA)

for(i in 1:length(b_values)) {
  b <- b_values[i]
  # Fit bagging model
  bag_model <- randomForest(class ~ ., data = train_data,
                           ntree = b, mtry = num_predictors, 
                           importance = TRUE)
  
  # Get predictions and calculate accuracy
  predictions <- predict(bag_model, test_data)
  conf <- table(predictions, test_data$class)
  accuracy <- sum(diag(conf)) / sum(conf)
  
  results$Accuracy[i] <- accuracy
  cat("B =", b, "trees | Accuracy:", round(accuracy * 100, 2), "%\n")
}
## B = 1 trees | Accuracy: 99.95 %
## B = 2 trees | Accuracy: 100 %
## B = 5 trees | Accuracy: 100 %
## B = 10 trees | Accuracy: 99.95 %
## B = 20 trees | Accuracy: 99.95 %
## B = 30 trees | Accuracy: 99.95 %
## B = 40 trees | Accuracy: 99.95 %
## B = 50 trees | Accuracy: 99.95 %
## B = 100 trees | Accuracy: 99.95 %
## B = 250 trees | Accuracy: 99.95 %
# Store the best accuracy for comparison later
accuracy_bagging <- max(results$Accuracy)

# Find best model
best_idx <- which.max(results$Accuracy)
best_b <- b_values[best_idx]
cat("\nBest B:", best_b, "| Accuracy:", round(results$Accuracy[best_idx] * 100, 2), "%\n")
## 
## Best B: 2 | Accuracy: 100 %
# Refit with best B for final model
final_bag <- randomForest(class ~ ., data = train_data,
                         ntree = best_b, mtry = num_predictors, 
                         importance = TRUE)

# Plot variable importance
varImpPlot(final_bag, main = "Variable Importance (Bagging)", 
          sort = TRUE, n.var = 10)

# Confusion matrix
conf_matrix <- table(Predicted = predict(final_bag, test_data),
                    Actual = test_data$class)
print(conf_matrix)
##          Actual
## Predicted    p    e
##         p 1963    0
##         e    0 2099

Variable Importance:

With importance = TRUE, the bagged model reports importance on two scales (mean decrease in accuracy and mean decrease in Gini impurity). Consistent with the CART fit, the plot is dominated by odor and spore.print.color, with gill-related variables carrying secondary importance and the constant veil.type contributing nothing.

Bagging Performance:

Bagging is essentially perfect on this data: even a single tree reaches 99.95% test accuracy, B = 2 and B = 5 hit 100%, and larger values of B settle at 99.95%. The refit B = 2 model classifies the test set perfectly (4,062/4,062 in the confusion matrix). A handful of trees is clearly sufficient here, though with noisier data a larger B would be preferred for stability.
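Because each bagged tree is grown on a bootstrap sample, roughly a third of the training observations are left out of each tree, and this out-of-bag (OOB) error offers a way to judge how many trees suffice without touching the test set. A hedged sketch (oob_model is our name, not part of the assignment):

# Sketch: OOB error as a function of the number of bagged trees
oob_model <- randomForest(class ~ ., data = train_data,
                          ntree = 100, mtry = num_predictors)
plot(oob_model, main = "OOB error vs. number of bagged trees")
head(oob_model$err.rate)  # column "OOB" is the overall out-of-bag error rate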

  d. Use random forests on the training set to build a model which can predict edible vs poisonous status. Test your resulting random forest model on the testing data created in part a). Try some alternative values for \(mtry\), the number of predictors which are “tried” as optimal splitters at each node, and also \(B\). Compare these models in terms of their test performance. Which \(B\) and \(mtry\) combination seems best? Create variable importance plots for this final model and interpret the findings.
# d) Use random forests on the training set

# Define values to try for mtry and B
mtry_values <- c(2, 4, 6, 8)  # Number of variables to try at each split
b_values <- c(50, 100, 200)   # Number of trees

# Create a data frame to store results
rf_results <- expand.grid(mtry = mtry_values, B = b_values, Accuracy = NA)

# Loop through combinations
for(i in 1:nrow(rf_results)) {
  mtry <- rf_results$mtry[i]
  b <- rf_results$B[i]
  
  # Fit random forest model
  rf_model <- randomForest(class ~ ., 
                          data = train_data,
                          ntree = b, 
                          mtry = mtry,
                          importance = TRUE)
  
  # Calculate accuracy on test set
  predictions <- predict(rf_model, test_data)
  conf <- table(predictions, test_data$class)
  accuracy <- sum(diag(conf)) / sum(conf)
  
  # Store results
  rf_results$Accuracy[i] <- accuracy
  
  # Print progress
  cat("Random Forest with mtry =", mtry, ", B =", b, 
      "| Accuracy:", round(accuracy * 100, 2), "%\n")
}
## Random Forest with mtry = 2 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 4 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 6 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 8 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 2 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 4 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 6 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 8 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 2 , B = 200 | Accuracy: 100 %
## Random Forest with mtry = 4 , B = 200 | Accuracy: 100 %
## Random Forest with mtry = 6 , B = 200 | Accuracy: 100 %
## Random Forest with mtry = 8 , B = 200 | Accuracy: 100 %
# After finding best combination
best_idx <- which.max(rf_results$Accuracy)
best_mtry <- rf_results$mtry[best_idx]
best_b <- rf_results$B[best_idx]

# Store the best RF accuracy for comparison in part f
accuracy_rf <- rf_results$Accuracy[best_idx]

cat("\nBest combination: mtry =", best_mtry, ", B =", best_b,
    "| Accuracy:", round(rf_results$Accuracy[best_idx] * 100, 2), "%\n")
## 
## Best combination: mtry = 2 , B = 50 | Accuracy: 100 %
# Refit with best parameters
final_rf <- randomForest(class ~ ., 
                        data = train_data,
                        ntree = best_b, 
                        mtry = best_mtry,
                        importance = TRUE)

# Variable importance plot
varImpPlot(final_rf, 
          main = "Variable Importance (Random Forest)",
          sort = TRUE, 
          n.var = 10)

# Confusion matrix
conf_matrix <- table(Predicted = predict(final_rf, test_data),
                    Actual = test_data$class)
print(conf_matrix)
##          Actual
## Predicted    p    e
##         p 1963    0
##         e    0 2099

Variable Importance:

As with bagging, the random forest reports mean decrease in accuracy and mean decrease in Gini for each predictor, and odor and spore.print.color again sit at the top, matching the CART tree that classified nearly perfectly using only those two variables.

Random Forest Performance:

Every (mtry, B) combination tried achieves 100% test accuracy (4,062/4,062 in the confusion matrix), so the grid cannot distinguish them, and which.max() simply returns the first: mtry = 2 with B = 50. With signal this strong, restricting each split to a small random subset of predictors costs nothing, and any of these settings would be a defensible final model.
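The importance plot can be backed by the underlying numbers via the importance() accessor from randomForest; a short sketch:

# Numeric importance measures behind varImpPlot()
imp <- importance(final_rf)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), ])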

  e. Use boosting on the training set to build a model which can predict edible vs poisonous status. Test your resulting boosted model on the testing data created in part a). You should assess alternative values for \(\lambda\), perhaps 0.001, 0.01, 0.1, etc. What do you notice about the relationship between \(\lambda\) and \(B\)? You may use “stumps” for trees in the boosting algorithm if you’d like, so you will not need to modify interaction.depth, and can instead keep it set to 1. Which combination (of \(\lambda\), \(B\)) seems best? Create variable importance plots for this final model and interpret the findings.
# e) Use boosting on the training set

# Convert class to numeric for gbm (edible = 1, poisonous = 0)
train_data_gbm <- train_data
train_data_gbm$class_num <- ifelse(train_data_gbm$class == "e", 1, 0)

test_data_gbm <- test_data
test_data_gbm$class_num <- ifelse(test_data_gbm$class == "e", 1, 0)

# Define values to try
lambda_values <- c(0.001, 0.01, 0.1, 0.5)  # Learning rate values
b_values <- c(100, 500, 1000, 5000)       # Number of trees

# Create a data frame to store results
boost_results <- expand.grid(lambda = lambda_values, 
                             B = b_values, 
                             Accuracy = NA)

# Loop through combinations
for(i in 1:nrow(boost_results)) {
  lambda <- boost_results$lambda[i]
  b <- boost_results$B[i]
  
  # Fit boosting model
  boost_model <- gbm(class_num ~ . - class,  # Exclude original class variable
                    data = train_data_gbm,
                    distribution = "bernoulli",
                    n.trees = b,
                    interaction.depth = 1,  # Using stumps as specified
                    shrinkage = lambda,
                    bag.fraction = 0.5,
                    train.fraction = 1.0,
                    verbose = FALSE)
  
  # Make predictions
  pred_probs <- predict(boost_model, 
                       test_data_gbm, 
                       n.trees = b, 
                       type = "response")
  predictions <- ifelse(pred_probs > 0.5, 1, 0)
  
  # Calculate accuracy
  accuracy <- mean(predictions == test_data_gbm$class_num)
  boost_results$Accuracy[i] <- accuracy
  
  # Print progress
  cat("Boosting with lambda =", lambda, ", B =", b, 
      "| Accuracy:", round(accuracy * 100, 2), "%\n")
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 , B = 100 | Accuracy: 98.47 %
## Boosting with lambda = 0.01 , B = 100 | Accuracy: 98.47 %
## Boosting with lambda = 0.1 , B = 100 | Accuracy: 99.75 %
## Boosting with lambda = 0.5 , B = 100 | Accuracy: 100 %
## Boosting with lambda = 0.001 , B = 500 | Accuracy: 98.47 %
## Boosting with lambda = 0.01 , B = 500 | Accuracy: 99.46 %
## Boosting with lambda = 0.1 , B = 500 | Accuracy: 100 %
## Boosting with lambda = 0.5 , B = 500 | Accuracy: 100 %
## Boosting with lambda = 0.001 , B = 1000 | Accuracy: 98.47 %
## Boosting with lambda = 0.01 , B = 1000 | Accuracy: 99.75 %
## Boosting with lambda = 0.1 , B = 1000 | Accuracy: 100 %
## Boosting with lambda = 0.5 , B = 1000 | Accuracy: 100 %
## Boosting with lambda = 0.001 , B = 5000 | Accuracy: 99.46 %
## Boosting with lambda = 0.01 , B = 5000 | Accuracy: 100 %
## Boosting with lambda = 0.1 , B = 5000 | Accuracy: 100 %
## Boosting with lambda = 0.5 , B = 5000 | Accuracy: 100 %

(gbm emits the same veil.type warning for every fit: that variable is constant in this data, so it is never selected as a splitter and the warning is harmless.)
# Find best combination
best_idx <- which.max(boost_results$Accuracy)
best_lambda <- boost_results$lambda[best_idx]
best_b <- boost_results$B[best_idx]

cat("\nBest combination: lambda =", best_lambda, ", B =", best_b,
    "| Accuracy:", round(boost_results$Accuracy[best_idx] * 100, 2), "%\n")
## 
## Best combination: lambda = 0.5 , B = 100 | Accuracy: 100 %
# Refit with best parameters
final_boost <- gbm(class_num ~ . - class,
                  data = train_data_gbm,
                  distribution = "bernoulli",
                  n.trees = best_b,
                  interaction.depth = 1,
                  shrinkage = best_lambda,
                  bag.fraction = 0.5,
                  train.fraction = 1.0,
                  verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
# Look at variable importance
summary(final_boost, n.trees = best_b, plotit = TRUE, cBars = 10)
# Create confusion matrix
pred_probs <- predict(final_boost, 
                     test_data_gbm, 
                     n.trees = best_b, 
                     type = "response")
predictions <- ifelse(pred_probs > 0.5, "e", "p")
conf_matrix <- table(Predicted = predictions, 
                    Actual = test_data$class)
print(conf_matrix)
##          Actual
## Predicted    p    e
##         e    0 2099
##         p 1963    0
# Examine relationship between lambda and B
# Plot a subset of the results to visualize relationship
library(ggplot2)
ggplot(boost_results, aes(x = B, y = Accuracy, color = factor(lambda))) +
  geom_line() +
  geom_point() +
  labs(title = "Relationship between Number of Trees (B) and Learning Rate (lambda)",
       x = "Number of Trees (B)",
       y = "Accuracy",
       color = "Lambda") +
  theme_minimal()

Boosting Performance:

Boosting also reaches 100% test accuracy; which.max() selects the first such combination, \(\lambda = 0.5\) with \(B = 100\), and the refit final model classifies the test set perfectly (0 errors in the confusion matrix).

Variable Importance:

The summary() relative-influence output concentrates essentially all of the influence in a few predictors; as in the earlier fits, odor dominates, and veil.type contributes exactly zero because it has no variation.

Relationship Between Lambda and B:

The grid results and the plot show that \(\lambda\) and \(B\) trade off against each other: a small learning rate needs many trees (\(\lambda = 0.001\) never exceeds 99.46% even at \(B = 5000\), and \(\lambda = 0.01\) needs 5000 trees to reach 100%), while a large learning rate gets there quickly (\(\lambda = 0.5\) is at 100% by \(B = 100\)). Because each boosted stump is fit to the residuals of the current ensemble and its contribution is shrunk by \(\lambda\), slower learning simply requires more iterations to accumulate the same total fit.
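Since the models were fit with bag.fraction = 0.5, gbm also tracks an out-of-bag improvement estimate, which gives a way to choose the number of trees without consulting the test set. A hedged sketch using gbm.perf() (gbm itself cautions that the OOB criterion tends to be conservative):

# Sketch: OOB-based estimate of the optimal number of boosting iterations
best_iter <- gbm.perf(final_boost, method = "OOB", plot.it = TRUE)
print(best_iter)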

  f. Based on your results to b) through e), which statistical learning algorithm seems to yield the best accuracy results? Select the best model from all models. Is this likely to be the same answer that other students will get on this exam, assuming you correctly used your name in setting the seed on part a) (set.seed(1234))?
# Collect accuracies from all methods
model_accuracies <- c(
  "CART" = cart_accuracy,          # From part b
  "Bagging (best B)" = accuracy_bagging, # From part c
  "Random Forest (best)" = accuracy_rf,  # From part d
  "Boosting (best)" = max(boost_results$Accuracy) # From part e (not the leftover loop variable)
)

# Find the best model
best_model_name <- names(model_accuracies)[which.max(model_accuracies)]
best_accuracy <- max(model_accuracies)

# Display results
cat("Comparison of Model Accuracies:\n")
## Comparison of Model Accuracies:
for(i in 1:length(model_accuracies)) {
  cat(names(model_accuracies)[i], ": ", round(model_accuracies[i] * 100, 2), "%\n", sep="")
}
## CART: 99.46%
## Bagging (best B): 100%
## Random Forest (best): 100%
## Boosting (best): 100%
cat("\nBest model: ", best_model_name, " with accuracy: ", 
    round(best_accuracy * 100, 2), "%\n", sep="")
## 
## Best model: Bagging (best B) with accuracy: 100%

Model Comparison and Conclusion:

Bagging, random forests, and boosting all achieve 100% test accuracy, while the single pruned CART trails slightly at 99.46%; which.max() labels "Bagging (best B)" the winner only because it is listed first among the three tied ensembles. Other students should reach essentially the same conclusion even with different seeds: this data is highly separable (odor alone nearly suffices), so the ensemble methods will be at or near 100% for everyone, although the exact CART accuracy and which tied model gets the "best" label can vary from split to split.
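To back up that claim about seeds, a quick (hypothetical) stability check is to repeat the split-and-fit pipeline under a few other arbitrary seeds and watch the CART test accuracy; a sketch:

# Sketch: sensitivity of the CART test accuracy to the random split
ms <- as.data.frame(lapply(mushrooms, factor))  # ensure factors throughout
for (s in c(1, 42, 2025)) {                     # arbitrary alternative seeds
  set.seed(s)
  idx  <- sample(1:nrow(ms), size = nrow(ms)/2)
  fit  <- rpart(class ~ ., data = ms[idx, ], method = "class")
  pred <- predict(fit, ms[-idx, ], type = "class")
  cat("seed", s, "| CART test accuracy:", round(mean(pred == ms[-idx, "class"]), 4), "\n")
}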

  2. Using the Bootstrap to estimate standard errors: In this problem, we will employ bootstrapping to estimate the variability of an estimator for which no standard software exists. Use the boot() function, or risk losing points!

Using the Auto data from package ISLR, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.

# Define function to calculate difference in median mpg between 4 and 8 cylinder cars
median_diff <- function(data, indices) {
  # Subset data using bootstrap indices
  d <- data[indices, ]
  
  # Calculate median mpg for 4 cylinder cars
  median_4cyl <- median(d$mpg[d$cylinders == 4])
  
  # Calculate median mpg for 8 cylinder cars
  median_8cyl <- median(d$mpg[d$cylinders == 8])
  
  # Return the difference (4cyl - 8cyl)
  return(median_4cyl - median_8cyl)
}

# Perform bootstrap with 1000 resamples
set.seed(1234)  # For reproducibility
boot_results <- boot(Auto, median_diff, R = 1000)

# Print bootstrap results
print(boot_results)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Auto, statistic = median_diff, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original   bias    std. error
## t1*     14.4 -0.30745    0.797002
# Extract the point estimate and its bootstrap standard error
se_boot <- sd(boot_results$t)
cat("Estimated difference in median mpg:", boot_results$t0, "\n")
## Estimated difference in median mpg: 14.4
cat("Standard error of the difference:", se_boot, "\n")
## Standard error of the difference: 0.797002
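The same resamples can also be turned into a confidence interval; a short sketch using boot.ci() with the percentile method (the choice of interval type is ours, not required by the prompt):

# Sketch: 95% percentile bootstrap CI for the difference in medians
boot.ci(boot_results, conf = 0.95, type = "perc")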
  3. Write your own function for conducting 10-fold cross-validation to assess the test error in a linear regression model predicting estimated performance (perf) in the cpus dataset from R package MASS, using only minimum main memory (mmin) and maximum main memory (mmax) as predictors. DO NOT worry about including interaction effects or higher-order effects as in the first exam. Just a main effect for each variable will be sufficient. You need to show how to use the createFolds() function (it creates the folds for you) from the R package caret, as explained in class.
# Define a function for 10-fold cv of linear regression
cv_linear_regression <- function(data, formula) {
  # Create 10 folds
  folds <- createFolds(y = 1:nrow(data), k = 10, list = TRUE)
  
  # Create vector to store MSE for each fold
  mse_values <- numeric(10)
  
  # Perform 10-fold cv
  for (i in 1:10) {
    # Split data into training/test sets
    test_indices <- folds[[i]]
    train_data <- data[-test_indices, ]
    test_data <- data[test_indices, ]
    
    # train linear regression model
    model <- lm(formula, data = train_data)
    
    # Make predictions on test data
    predictions <- predict(model, newdata = test_data)
    
    # Calculate MSE for this fold (response values extracted via the formula,
    # so the function is not tied to a response named "perf")
    actual <- model.response(model.frame(formula, data = test_data))
    mse_values[i] <- mean((actual - predictions)^2)
  }
  
  # Return results
  return(list(
    fold_mse = mse_values,
    avg_mse = mean(mse_values),
    sd_mse = sd(mse_values)
  ))
}

# Apply the function to cpus dataset with mmin and mmax as predictors
data(cpus)
result <- cv_linear_regression(cpus, perf ~ mmin + mmax)

# Print results
cat("MSE for each fold:\n")
## MSE for each fold:
print(result$fold_mse)
##  [1] 23330.724  2174.315  3904.244  2936.811  1783.400  2169.889  2831.968
##  [8]  4610.988 12789.835  6337.623
cat("\nAverage MSE across folds:", result$avg_mse, "\n")
## 
## Average MSE across folds: 6286.98
cat("Standard deviation of MSE:", result$sd_mse, "\n")
## Standard deviation of MSE: 6819.683
# Fit the model on the full dataset for comparison
full_model <- lm(perf ~ mmin + mmax, data = cpus)
summary(full_model)
## 
## Call:
## lm(formula = perf ~ mmin + mmax, data = cpus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -173.65  -27.65    3.75   23.46  535.66 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.277e+01  7.255e+00  -4.516 1.06e-05 ***
## mmin         1.371e-02  2.024e-03   6.776 1.27e-10 ***
## mmax         8.397e-03  6.695e-04  12.542  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.83 on 206 degrees of freedom
## Multiple R-squared:  0.7913, Adjusted R-squared:  0.7892 
## F-statistic: 390.5 on 2 and 206 DF,  p-value: < 2.2e-16
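As a sanity check on the hand-rolled function, the boot package (already loaded above) provides cv.glm(), which computes a comparable K-fold estimate; a hedged sketch (fold assignment is random, so the estimate will differ somewhat from ours):

# Sketch: 10-fold CV error via boot::cv.glm for comparison
glm_fit <- glm(perf ~ mmin + mmax, data = cpus)
cv.glm(cpus, glm_fit, K = 10)$delta[1]  # raw 10-fold CV estimate of test MSE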