Do not work in groups!
Do not discuss the exam with other students, or share any ideas or thoughts regarding the exam. This is academic misconduct, and will not be tolerated. Your presented solution must be your own work!
You must submit your compiled RMarkdown file in PDF or HTML format.
If you cannot compile the RMarkdown file, please email me for help before Thursday, May 8!
# Load required packages
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(MASS)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
# Set seed
set.seed(1234)
# Load data
mushrooms <- read.csv("mushrooms.csv")
# Check structure of the data
str(mushrooms)
## 'data.frame': 8124 obs. of 23 variables:
## $ class : chr "p" "e" "e" "p" ...
## $ cap.shape : chr "x" "x" "b" "x" ...
## $ cap.surface : chr "s" "s" "s" "y" ...
## $ cap.color : chr "n" "y" "w" "w" ...
## $ bruises : chr "t" "t" "t" "t" ...
## $ odor : chr "p" "a" "l" "p" ...
## $ gill.attachment : chr "f" "f" "f" "f" ...
## $ gill.spacing : chr "c" "c" "c" "c" ...
## $ gill.size : chr "n" "b" "b" "n" ...
## $ gill.color : chr "k" "k" "n" "n" ...
## $ stalk.shape : chr "e" "e" "e" "e" ...
## $ stalk.root : chr "e" "c" "c" "e" ...
## $ stalk.surface.above.ring: chr "s" "s" "s" "s" ...
## $ stalk.surface.below.ring: chr "s" "s" "s" "s" ...
## $ stalk.color.above.ring : chr "w" "w" "w" "w" ...
## $ stalk.color.below.ring : chr "w" "w" "w" "w" ...
## $ veil.type : chr "p" "p" "p" "p" ...
## $ veil.color : chr "w" "w" "w" "w" ...
## $ ring.number : chr "o" "o" "o" "o" ...
## $ ring.type : chr "p" "p" "p" "p" ...
## $ spore.print.color : chr "k" "n" "n" "k" ...
## $ population : chr "s" "n" "n" "s" ...
## $ habitat : chr "u" "g" "m" "u" ...
# Create index for random split
n <- nrow(mushrooms)
training_index <- sample(1:n, size = n/2)
# Create the training and test sets
train_data <- mushrooms[training_index, ]
test_data <- mushrooms[-training_index, ]
# Verify the split worked correctly
cat("Training set size:", nrow(train_data), "observations\n")
## Training set size: 4062 observations
cat("Test set size:", nrow(test_data), "observations\n")
## Test set size: 4062 observations
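As an optional check (a sketch, not required by the assignment), the class proportions in the two halves should be similar, since the split is random:
# Class proportions in the training and test sets (sketch)
prop.table(table(train_data$class))
prop.table(table(test_data$class))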
# Fit a classification tree (CART) to the training data
cart_model <- rpart(class ~ ., data = train_data, method = "class")
# Look at the full model
print(cart_model)
## n= 4062
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4062 1953 e (0.51920236 0.48079764)
## 2) odor=a,l,n 2167 58 e (0.97323489 0.02676511)
## 4) spore.print.color=b,h,k,n,o,u,w,y 2135 26 e (0.98782201 0.01217799) *
## 5) spore.print.color=r 32 0 p (0.00000000 1.00000000) *
## 3) odor=c,f,m,p,s,y 1895 0 p (0.00000000 1.00000000) *
# Cross-validation for cost complexity pruning
printcp(cart_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] odor spore.print.color
##
## Root node error: 1953/4062 = 0.4808
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.970302 0 1.000000 1.000000 0.0163049
## 2 0.016385 1 0.029698 0.029698 0.0038716
## 3 0.010000 2 0.013313 0.013313 0.0026025
plotcp(cart_model) # Visual representation of cross-validation results
# Find the optimal complexity parameter (cp) value
# This selects the cp that minimizes cross-validated error
optimal_cp <- cart_model$cptable[which.min(cart_model$cptable[,"xerror"]), "CP"]
cat("Optimal complexity parameter (cp):", optimal_cp, "\n")
## Optimal complexity parameter (cp): 0.01
# Prune the tree using the optimal cp value
pruned_cart <- prune(cart_model, cp = optimal_cp)
# Visualize the pruned tree with descriptive text
rpart.plot(pruned_cart,
           extra = 104,           # Show class probabilities and percentages
           box.palette = "RdGn",  # Red/green scheme (red for poisonous, green for edible)
           shadow.col = "gray",   # Add shadows for better visibility
           nn = TRUE,             # Display node numbers
           main = "Classification Tree for Mushroom Edibility")
# Test the pruned model on the test data
cart_predictions <- predict(pruned_cart, test_data, type = "class")
# Create confusion matrix to evaluate performance
conf_matrix <- table(Predicted = cart_predictions, Actual = test_data$class)
print(conf_matrix)
## Actual
## Predicted e p
## e 2099 22
## p 0 1941
# Calculate accuracy
cart_accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Test set accuracy:", round(cart_accuracy * 100, 2), "%\n")
## Test set accuracy: 99.46 %
# Interpret the model structure
cat("\nModel Interpretation:\n")
##
## Model Interpretation:
cat("Number of terminal nodes:", length(pruned_cart$frame$var[pruned_cart$frame$var == "<leaf>"]), "\n")
## Number of terminal nodes: 3
cat("Important splitting variables:", "\n")
## Important splitting variables:
imp_vars <- pruned_cart$variable.importance
print(imp_vars[order(imp_vars, decreasing = TRUE)])
## odor spore.print.color gill.color
## 1915.109 1399.578 1153.134
## ring.type stalk.surface.above.ring stalk.surface.below.ring
## 1031.835 1013.644 1002.527
Looking at the tree visualization, we can see that:
The root node shows the entire training set, split roughly 52% edible and 48% poisonous.
The first and most important split is based on odor: mushrooms with odors 'a', 'l', or 'n' go to the left branch (node 2), while those with any other odor go to the right branch (node 3). A quick cross-tabulation below confirms this pattern.
Mushrooms with odors 'a', 'l', or 'n' (left branch, node 2; about 97% edible and 3% poisonous) are further split on spore.print.color.
If the spore print color is 'b', 'h', 'k', 'n', 'o', 'u', 'w', or 'y', they go to node 4 (about 99% edible).
If the spore print color is 'r', they go to node 5 (100% poisonous).
Mushrooms with any other odor (right branch, node 3) are classified as 100% poisonous.
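As a quick sanity check (an optional sketch, not part of the graded answer), we can cross-tabulate class against odor in the training data to confirm that, in the training sample, edible mushrooms only occur with odors 'a', 'l', or 'n':
# Cross-tabulation of class by odor in the training data (sketch)
table(train_data$class, train_data$odor)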
Our performance summary shows:
The model achieved 99.46% accuracy on the test set.
The confusion matrix shows that all 2,099 edible mushrooms were classified correctly, while 22 of the 1,963 poisonous mushrooms were misclassified as edible; the remaining 1,941 poisonous mushrooms were classified correctly.
Because calling a poisonous mushroom edible is the costliest mistake for this problem, those 22 errors are the model's main weakness despite the high overall accuracy.
Odor, spore print color, gill color, and ring type were the four most important variables, followed closely by the stalk surface above and below the ring.
In terms of our tuning of the complexity parameter:
The optimal complexity parameter (cp) was determined to be 0.01.
The pruned tree has only 3 terminal nodes, so it is very simple yet highly accurate.
From the CP plot, once the tree reaches 2 splits (3 terminal nodes), adding more complexity does not improve the cross-validated error; the one-standard-error rule, sketched below, selects the same tree.
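An optional sketch of the one-standard-error (1-SE) rule: choose the simplest tree whose cross-validated error is within one standard error of the minimum. With the cptable printed above, this selects the same cp as the minimum-xerror rule.
# 1-SE rule for selecting cp (sketch; uses the cptable from cart_model)
cp_tab <- cart_model$cptable
min_row <- which.min(cp_tab[, "xerror"])
xerr_threshold <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
cp_1se <- cp_tab[which(cp_tab[, "xerror"] <= xerr_threshold)[1], "CP"]
cat("cp chosen by the 1-SE rule:", cp_1se, "\n")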
# Try different values of B (number of bagged trees)
b_values <- c(1, 2, 5, 10, 20, 30, 40, 50, 100, 250)
num_predictors <- ncol(train_data) - 1
# Get a list of all character/factor columns
char_cols <- names(train_data)[sapply(train_data, is.character) | sapply(train_data, is.factor)]
# Convert character columns to factors with consistent levels
for (col in char_cols) {
  # Get all possible levels from both datasets
  all_levels <- unique(c(as.character(train_data[[col]]), as.character(test_data[[col]])))
  # Convert both to factors with the same levels
  train_data[[col]] <- factor(train_data[[col]], levels = all_levels)
  test_data[[col]] <- factor(test_data[[col]], levels = all_levels)
}
# Loop through B values
results <- data.frame(B = b_values, Accuracy = NA)
for (i in 1:length(b_values)) {
  b <- b_values[i]
  # Fit bagging model
  bag_model <- randomForest(class ~ ., data = train_data,
                            ntree = b, mtry = num_predictors,
                            importance = TRUE)
  # Get predictions and calculate accuracy
  predictions <- predict(bag_model, test_data)
  conf <- table(predictions, test_data$class)
  accuracy <- sum(diag(conf)) / sum(conf)
  results$Accuracy[i] <- accuracy
  cat("B =", b, "trees | Accuracy:", round(accuracy * 100, 2), "%\n")
}
## B = 1 trees | Accuracy: 99.95 %
## B = 2 trees | Accuracy: 100 %
## B = 5 trees | Accuracy: 100 %
## B = 10 trees | Accuracy: 99.95 %
## B = 20 trees | Accuracy: 99.95 %
## B = 30 trees | Accuracy: 99.95 %
## B = 40 trees | Accuracy: 99.95 %
## B = 50 trees | Accuracy: 99.95 %
## B = 100 trees | Accuracy: 99.95 %
## B = 250 trees | Accuracy: 99.95 %
# Store the best accuracy for comparison later
accuracy_bagging <- max(results$Accuracy)
# Find best model
best_idx <- which.max(results$Accuracy)
best_b <- b_values[best_idx]
cat("\nBest B:", best_b, "| Accuracy:", round(results$Accuracy[best_idx] * 100, 2), "%\n")
##
## Best B: 2 | Accuracy: 100 %
# Refit with best B for final model
final_bag <- randomForest(class ~ ., data = train_data,
                          ntree = best_b, mtry = num_predictors,
                          importance = TRUE)
# Plot variable importance
varImpPlot(final_bag, main = "Variable Importance (Bagging)",
           sort = TRUE, n.var = 10)
# Confusion matrix
conf_matrix <- table(Predicted = predict(final_bag, test_data),
                     Actual = test_data$class)
print(conf_matrix)
## Actual
## Predicted p e
## p 1963 0
## e 0 2099
Variable Importance: the plot above ranks the predictors by mean decrease in accuracy and mean decrease in Gini index, showing which features drive the bagged ensemble; the numeric scores are printed below.
Bagging Performance: the confusion matrix shows no misclassifications on the test set; even B = 2 bagged trees was enough to reach 100% accuracy here.
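An optional sketch complementing the importance plot: print the numeric scores directly (type = 1 gives the permutation-based mean decrease in accuracy).
# Numeric importance scores for the final bagged model (sketch)
bag_imp <- importance(final_bag, type = 1)
head(bag_imp[order(bag_imp[, 1], decreasing = TRUE), , drop = FALSE], 10)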
# d) Use random forests on the training set
# Define values to try for mtry and B
mtry_values <- c(2, 4, 6, 8) # Number of variables to try at each split
b_values <- c(50, 100, 200) # Number of trees
# Create a data frame to store results
rf_results <- expand.grid(mtry = mtry_values, B = b_values, Accuracy = NA)
# Loop through combinations
for (i in 1:nrow(rf_results)) {
  mtry <- rf_results$mtry[i]
  b <- rf_results$B[i]
  # Fit random forest model
  rf_model <- randomForest(class ~ .,
                           data = train_data,
                           ntree = b,
                           mtry = mtry,
                           importance = TRUE)
  # Calculate accuracy on the test set
  predictions <- predict(rf_model, test_data)
  conf <- table(predictions, test_data$class)
  accuracy <- sum(diag(conf)) / sum(conf)
  # Store results
  rf_results$Accuracy[i] <- accuracy
  # Print progress
  cat("Random Forest with mtry =", mtry, ", B =", b,
      "| Accuracy:", round(accuracy * 100, 2), "%\n")
}
## Random Forest with mtry = 2 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 4 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 6 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 8 , B = 50 | Accuracy: 100 %
## Random Forest with mtry = 2 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 4 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 6 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 8 , B = 100 | Accuracy: 100 %
## Random Forest with mtry = 2 , B = 200 | Accuracy: 100 %
## Random Forest with mtry = 4 , B = 200 | Accuracy: 100 %
## Random Forest with mtry = 6 , B = 200 | Accuracy: 100 %
## Random Forest with mtry = 8 , B = 200 | Accuracy: 100 %
# After finding best combination
best_idx <- which.max(rf_results$Accuracy)
best_mtry <- rf_results$mtry[best_idx]
best_b <- rf_results$B[best_idx]
# Store the best RF accuracy for comparison in part f
accuracy_rf <- rf_results$Accuracy[best_idx]
cat("\nBest combination: mtry =", best_mtry, ", B =", best_b,
"| Accuracy:", round(rf_results$Accuracy[best_idx] * 100, 2), "%\n")
##
## Best combination: mtry = 2 , B = 50 | Accuracy: 100 %
# Refit with best parameters
final_rf <- randomForest(class ~ .,
                         data = train_data,
                         ntree = best_b,
                         mtry = best_mtry,
                         importance = TRUE)
# Variable importance plot
varImpPlot(final_rf,
           main = "Variable Importance (Random Forest)",
           sort = TRUE,
           n.var = 10)
# Confusion matrix
conf_matrix <- table(Predicted = predict(final_rf, test_data),
                     Actual = test_data$class)
print(conf_matrix)
## Actual
## Predicted p e
## p 1963 0
## e 0 2099
Variable Importance: as with bagging, the plot ranks the predictors by mean decrease in accuracy and mean decrease in Gini index.
Random Forest Performance: every (mtry, B) combination tried reached 100% test accuracy, and the confusion matrix for the final model (mtry = 2, B = 50) shows no misclassifications.
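As an optional internal check (a sketch, not required by the assignment), the random forest's out-of-bag (OOB) error provides an error estimate that never touches the test set:
# OOB error of the final random forest (sketch)
oob_error <- final_rf$err.rate[final_rf$ntree, "OOB"]
cat("OOB error of the final random forest:", round(oob_error * 100, 2), "%\n")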
# e) Use boosting on the training set
# Convert class to numeric for gbm (edible = 1, poisonous = 0)
train_data_gbm <- train_data
train_data_gbm$class_num <- ifelse(train_data_gbm$class == "e", 1, 0)
test_data_gbm <- test_data
test_data_gbm$class_num <- ifelse(test_data_gbm$class == "e", 1, 0)
# Define values to try
lambda_values <- c(0.001, 0.01, 0.1, 0.5) # Learning rate values
b_values <- c(100, 500, 1000, 5000) # Number of trees
# Create a data frame to store results
boost_results <- expand.grid(lambda = lambda_values,
                             B = b_values,
                             Accuracy = NA)
# Loop through combinations
for (i in 1:nrow(boost_results)) {
  lambda <- boost_results$lambda[i]
  b <- boost_results$B[i]
  # Fit boosting model
  boost_model <- gbm(class_num ~ . - class,  # Exclude the original class variable
                     data = train_data_gbm,
                     distribution = "bernoulli",
                     n.trees = b,
                     interaction.depth = 1,  # Using stumps as specified
                     shrinkage = lambda,
                     bag.fraction = 0.5,
                     train.fraction = 1.0,
                     verbose = FALSE)
  # Make predictions
  pred_probs <- predict(boost_model,
                        test_data_gbm,
                        n.trees = b,
                        type = "response")
  predictions <- ifelse(pred_probs > 0.5, 1, 0)
  # Calculate accuracy
  accuracy <- mean(predictions == test_data_gbm$class_num)
  boost_results$Accuracy[i] <- accuracy
  # Print progress
  cat("Boosting with lambda =", lambda, ", B =", b,
      "| Accuracy:", round(accuracy * 100, 2), "%\n")
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 , B = 100 | Accuracy: 98.47 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 , B = 100 | Accuracy: 98.47 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 , B = 100 | Accuracy: 99.75 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.5 , B = 100 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 , B = 500 | Accuracy: 98.47 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 , B = 500 | Accuracy: 99.46 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 , B = 500 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.5 , B = 500 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 , B = 1000 | Accuracy: 98.47 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 , B = 1000 | Accuracy: 99.75 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 , B = 1000 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.5 , B = 1000 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 , B = 5000 | Accuracy: 99.46 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 , B = 5000 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 , B = 5000 | Accuracy: 100 %
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.5 , B = 5000 | Accuracy: 100 %
# Find best combination
best_idx <- which.max(boost_results$Accuracy)
best_lambda <- boost_results$lambda[best_idx]
best_b <- boost_results$B[best_idx]
cat("\nBest combination: lambda =", best_lambda, ", B =", best_b,
"| Accuracy:", round(boost_results$Accuracy[best_idx] * 100, 2), "%\n")
##
## Best combination: lambda = 0.5 , B = 100 | Accuracy: 100 %
# Refit with best parameters
final_boost <- gbm(class_num ~ . - class,
                   data = train_data_gbm,
                   distribution = "bernoulli",
                   n.trees = best_b,
                   interaction.depth = 1,
                   shrinkage = best_lambda,
                   bag.fraction = 0.5,
                   train.fraction = 1.0,
                   verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
# Look at variable importance
summary(final_boost, n.trees = best_b, plotit = TRUE, cBars = 10)
# Create confusion matrix
pred_probs <- predict(final_boost,
                      test_data_gbm,
                      n.trees = best_b,
                      type = "response")
predictions <- ifelse(pred_probs > 0.5, "e", "p")
conf_matrix <- table(Predicted = predictions,
                     Actual = test_data$class)
print(conf_matrix)
## Actual
## Predicted p e
## e 0 2099
## p 1963 0
# Examine relationship between lambda and B
# Plot a subset of the results to visualize relationship
library(ggplot2)
ggplot(boost_results, aes(x = B, y = Accuracy, color = factor(lambda))) +
  geom_line() +
  geom_point() +
  labs(title = "Relationship between Number of Trees (B) and Learning Rate (lambda)",
       x = "Number of Trees (B)",
       y = "Accuracy",
       color = "Lambda") +
  theme_minimal()
Boosting Performance: the best combination found was lambda = 0.5 with B = 100 trees, reaching 100% test accuracy; several other combinations with a larger learning rate or more trees also reached 100%, and the confusion matrix for the final boosted model shows no misclassifications. The repeated warning simply notes that veil.type has no variation in this dataset and therefore carries no information.
Variable Importance: the relative-influence summary above ranks the predictors by how much each one reduces the loss function across the boosting iterations.
Relationship Between Lambda and B:
Higher lambda values (0.1, 0.5) reach maximum accuracy quickly with fewer trees, showing almost immediate perfect performance. This demonstrates that faster learning rates effectively capture the pattern with minimal iterations.
Lower lambda values (0.001, 0.01) require substantially more trees to achieve comparable accuracy, and the slowest rate (0.001) still does not reach perfect accuracy even at 5000 trees. This illustrates the trade-off between the learning rate and the number of trees needed; the short summary sketched below makes the pattern explicit.
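A sketch using the boost_results table from above: for each learning rate, find the smallest number of trees (among those tried) that attains that rate's best test accuracy.
# Smallest B reaching the best accuracy for each lambda (sketch)
min_b_per_lambda <- do.call(rbind, lapply(split(boost_results, boost_results$lambda), function(d) {
  data.frame(lambda = d$lambda[1],
             best_accuracy = max(d$Accuracy),
             smallest_B = min(d$B[d$Accuracy == max(d$Accuracy)]))
}))
print(min_b_per_lambda)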
# Collect accuracies from all methods
model_accuracies <- c(
  "CART" = cart_accuracy,                          # From part b
  "Bagging (best B)" = accuracy_bagging,           # From part c
  "Random Forest (best)" = accuracy_rf,            # From part d
  "Boosting (best)" = max(boost_results$Accuracy)  # From part e
)
# Find the best model
best_model_name <- names(model_accuracies)[which.max(model_accuracies)]
best_accuracy <- max(model_accuracies)
# Display results
cat("Comparison of Model Accuracies:\n")
## Comparison of Model Accuracies:
for (i in 1:length(model_accuracies)) {
  cat(names(model_accuracies)[i], ": ", round(model_accuracies[i] * 100, 2), "%\n", sep = "")
}
## CART: 99.46%
## Bagging (best B): 100%
## Random Forest (best): 100%
## Boosting (best): 100%
cat("\nBest model: ", best_model_name, " with accuracy: ",
round(best_accuracy * 100, 2), "%\n", sep="")
##
## Best model: Bagging (best B) with accuracy: 100%
Model Comparison and Conclusion:
Based on the results from parts b through e, all models achieved near-perfect accuracy, with minimal differences between them; their variable importance rankings were also very similar.
The simple CART model reached 99.46% accuracy with just two splits, while the ensemble methods (bagging, random forests, and boosting) reached 100% with minimal parameter tuning. If anything, the main concern is how easy it was to reach 100%, which may leave students wondering whether they have missed something important.
While these results technically suggest that ensemble methods performed best, the marginal improvement over CART doesn’t justify their added complexity for this particular dataset. The mushroom classification problem appears to have such clear decision boundaries that sophisticated methods are overkill.
This dataset, while illustrative of tree-based methods’ effectiveness for classification, doesn’t provide an optimal demonstration of when more complex ensemble approaches are truly necessary.
Other students using the same seed should find similar results. While they might select different "best" models, CART remains appealing for its interpretability; note, however, that the pruned CART misclassified 22 poisonous mushrooms as edible, whereas the ensemble methods made no such errors, so an ensemble model is the safer choice if that type of mistake matters most. A quick check of these false-"edible" counts is sketched below.
A dataset with more subtle patterns would have better highlighted the relative strengths of these different approaches and provided more meaningful opportunities to apply the concepts.
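A sketch (assuming the fitted objects from parts b through e are still in memory): count, for each final model, how many poisonous test mushrooms were predicted to be edible, the costliest kind of error for this problem.
# Poisonous test mushrooms predicted as edible, per final model (sketch)
false_edible <- c(
  CART     = sum(cart_predictions == "e" & test_data$class == "p"),
  Bagging  = sum(predict(final_bag, test_data) == "e" & test_data$class == "p"),
  RF       = sum(predict(final_rf, test_data) == "e" & test_data$class == "p"),
  Boosting = sum(predictions == "e" & test_data$class == "p")  # boosting predictions from part e
)
print(false_edible)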
Using the Auto data from the ISLR package, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.
# Define a function to calculate the difference in median mpg between 4- and 8-cylinder cars
median_diff <- function(data, indices) {
  # Subset the data using the bootstrap indices
  d <- data[indices, ]
  # Median mpg for 4-cylinder cars
  median_4cyl <- median(d$mpg[d$cylinders == 4])
  # Median mpg for 8-cylinder cars
  median_8cyl <- median(d$mpg[d$cylinders == 8])
  # Return the difference (4-cylinder minus 8-cylinder)
  return(median_4cyl - median_8cyl)
}
# Perform bootstrap with 1000 resamples
set.seed(1234) # For reproducibility
boot_results <- boot(Auto, median_diff, R = 1000)
# Print bootstrap results
print(boot_results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = median_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.30745 0.797002
# Extract the point estimate and the bootstrap standard error
se <- sd(boot_results$t)
cat("Estimated difference in median mpg:", boot_results$t0, "\n")
## Estimated difference in median mpg: 14.4
cat("Standard error of the difference:", se, "\n")
## Standard error of the difference: 0.797002
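As a quick sanity check (sketch), the observed difference can be recomputed directly from the data; it should match boot_results$t0 (14.4) above.
# Observed difference in median mpg, computed without the bootstrap (sketch)
obs_diff <- median(Auto$mpg[Auto$cylinders == 4]) - median(Auto$mpg[Auto$cylinders == 8])
obs_diff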
# Define a function for 10-fold CV of a linear regression model
cv_linear_regression <- function(data, formula) {
  # Name of the response variable, taken from the formula
  response <- all.vars(formula)[1]
  # Create 10 folds
  folds <- createFolds(y = 1:nrow(data), k = 10, list = TRUE)
  # Vector to store the MSE for each fold
  mse_values <- numeric(10)
  # Perform 10-fold CV
  for (i in 1:10) {
    # Split the data into training and test folds
    test_indices <- folds[[i]]
    train_fold <- data[-test_indices, ]
    test_fold <- data[test_indices, ]
    # Train the linear regression model
    model <- lm(formula, data = train_fold)
    # Make predictions on the held-out fold
    predictions <- predict(model, newdata = test_fold)
    # Calculate the MSE for this fold
    mse_values[i] <- mean((test_fold[[response]] - predictions)^2)
  }
  # Return the per-fold MSEs along with their mean and standard deviation
  return(list(
    fold_mse = mse_values,
    avg_mse = mean(mse_values),
    sd_mse = sd(mse_values)
  ))
}
# Apply the function to cpus dataset with mmin and mmax as predictors
data(cpus)
result <- cv_linear_regression(cpus, perf ~ mmin + mmax)
# Print results
cat("MSE for each fold:\n")
## MSE for each fold:
print(result$fold_mse)
## [1] 23330.724 2174.315 3904.244 2936.811 1783.400 2169.889 2831.968
## [8] 4610.988 12789.835 6337.623
cat("\nAverage MSE across folds:", result$avg_mse, "\n")
##
## Average MSE across folds: 6286.98
cat("Standard deviation of MSE:", result$sd_mse, "\n")
## Standard deviation of MSE: 6819.683
# Fit the model on the full dataset for comparison
full_model <- lm(perf ~ mmin + mmax, data = cpus)
summary(full_model)
##
## Call:
## lm(formula = perf ~ mmin + mmax, data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -173.65 -27.65 3.75 23.46 535.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.277e+01 7.255e+00 -4.516 1.06e-05 ***
## mmin 1.371e-02 2.024e-03 6.776 1.27e-10 ***
## mmax 8.397e-03 6.695e-04 12.542 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.83 on 206 degrees of freedom
## Multiple R-squared: 0.7913, Adjusted R-squared: 0.7892
## F-statistic: 390.5 on 2 and 206 DF, p-value: < 2.2e-16
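As a final sketch, we can compare the cross-validated MSE to the naive in-sample MSE of the full model; the CV estimate is larger here, as expected, because it is computed on observations the model did not see.
# In-sample MSE of the full model versus the 10-fold CV estimate (sketch)
in_sample_mse <- mean(residuals(full_model)^2)
cat("In-sample MSE:", round(in_sample_mse, 1),
    "| 10-fold CV MSE:", round(result$avg_mse, 1), "\n")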