library(MASS) # For Boston dataset
library(randomForest) # For random forest models
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2) # For visualization
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Set seed for reproducibility
set.seed(123)
# Load Boston dataset
data(Boston)
# Split data into training and testing sets (70% training, 30% testing)
train_indices <- sample(1:nrow(Boston), 0.7*nrow(Boston))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
# Define ranges for mtry and ntree
mtry_values <- 1:13 # Boston dataset has 13 predictors
ntree_values <- c(25, 50, 100, 200, 300, 400, 500)
# Create a data frame to store results
results <- data.frame()
# Run random forests with different combinations of mtry and ntree
for (m in mtry_values) {
  for (n in ntree_values) {
    # Fit random forest model
    rf_model <- randomForest(medv ~ .,
                             data = train_data,
                             mtry = m,
                             ntree = n,
                             importance = TRUE)
    # Make predictions on test set
    predictions <- predict(rf_model, test_data)
    # Calculate test MSE
    test_mse <- mean((predictions - test_data$medv)^2)
    # Store results
    results <- rbind(results, data.frame(mtry = m, ntree = n, test_mse = test_mse))
  }
  # Print progress
  cat("Completed mtry =", m, "\n")
}
## Completed mtry = 1
## Completed mtry = 2
## Completed mtry = 3
## Completed mtry = 4
## Completed mtry = 5
## Completed mtry = 6
## Completed mtry = 7
## Completed mtry = 8
## Completed mtry = 9
## Completed mtry = 10
## Completed mtry = 11
## Completed mtry = 12
## Completed mtry = 13
# Create plot similar to Figure 8.10
ggplot(results, aes(x = mtry, y = test_mse, color = factor(ntree))) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Set1", name = "Number of Trees") +
  labs(title = "Test MSE for Random Forests on Boston Housing Data",
       x = "Number of Variables Considered at Each Split (mtry)",
       y = "Test Mean Squared Error") +
  theme_minimal() +
  theme(legend.position = "right")
# Alternative visualization: heatmap of test MSE
ggplot(results, aes(x = factor(mtry), y = factor(ntree), fill = test_mse)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Test MSE") +
  labs(title = "Test MSE for Random Forests on Boston Housing Data",
       x = "Number of Variables Considered (mtry)",
       y = "Number of Trees (ntree)") +
  theme_minimal()
# Find the combination with minimum test MSE
best_result <- results[which.min(results$test_mse), ]
cat("Best combination: mtry =", best_result$mtry,
", ntree =", best_result$ntree,
", Test MSE =", round(best_result$test_mse, 4), "\n")
## Best combination: mtry = 4 , ntree = 400 , Test MSE = 10.8984
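Because this grid was evaluated on a single train/test split, the MSE surface is noisy and the winning pair may not replicate. As a rough sketch (the helper function and repetition count are illustrative, not part of the original analysis), one could average the test MSE over a few random splits before trusting the winner:
# Sketch: average the test MSE over several random splits (n_reps is arbitrary)
avg_test_mse <- function(m, n, n_reps = 5) {
  mean(sapply(seq_len(n_reps), function(r) {
    idx <- sample(1:nrow(Boston), 0.7 * nrow(Boston))
    fit <- randomForest(medv ~ ., data = Boston[idx, ], mtry = m, ntree = n)
    mean((predict(fit, Boston[-idx, ]) - Boston[-idx, "medv"])^2)
  }))
}
# Example: re-check the selected combination
avg_test_mse(best_result$mtry, best_result$ntree)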
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(randomForest)
library(BART)
## Warning: package 'BART' was built under R version 4.4.3
## Loading required package: nlme
## Loading required package: survival
# Load the Carseats dataset
data(Carseats)
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets (70% training, 30% testing)
train_indices <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))
train_data <- Carseats[train_indices, ]
test_data <- Carseats[-train_indices, ]
# Output the dimensions of the training and test sets
dim(train_data)
## [1] 280 11
dim(test_data)
## [1] 120 11
# Fit a regression tree to the training set
reg_tree <- rpart(Sales ~ ., data = train_data, method = "anova")
# Plot the tree
rpart.plot(reg_tree, type = 4, extra = 101, fallen.leaves = TRUE,
           main = "Regression Tree for Carseats Sales")
# Make predictions on the test set
predictions <- predict(reg_tree, test_data)
# Calculate test MSE
test_mse <- mean((predictions - test_data$Sales)^2)
print(paste("Test MSE:", round(test_mse, 4)))
## [1] "Test MSE: 3.7844"
# Print a summary of the tree
printcp(reg_tree)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = train_data, method = "anova")
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Price ShelveLoc
##
## Root node error: 2227.2/280 = 7.9544
##
## n= 280
##
## CP nsplit rel error xerror xstd
## 1 0.254622 0 1.00000 1.00742 0.084722
## 2 0.092152 1 0.74538 0.76140 0.059011
## 3 0.070902 2 0.65323 0.73872 0.058054
## 4 0.043245 3 0.58232 0.63047 0.048350
## 5 0.036049 4 0.53908 0.62552 0.050904
## 6 0.032271 5 0.50303 0.63451 0.052964
## 7 0.024286 7 0.43849 0.59448 0.048571
## 8 0.017482 8 0.41420 0.59106 0.048463
## 9 0.015915 9 0.39672 0.60713 0.052976
## 10 0.015626 10 0.38080 0.61605 0.053264
## 11 0.014133 11 0.36518 0.60544 0.052892
## 12 0.013544 12 0.35104 0.59908 0.052783
## 13 0.012653 14 0.32396 0.59610 0.052405
## 14 0.010461 15 0.31130 0.58343 0.051559
## 15 0.010000 16 0.30084 0.58139 0.051643
# Cross-validation to determine optimal complexity parameter
cv_tree <- rpart(Sales ~ ., data = train_data, method = "anova",
                 control = rpart.control(cp = 0.001, xval = 10))
# Plot the cross-validation results
plotcp(cv_tree)
# Find the optimal cp value
optimal_cp <- cv_tree$cptable[which.min(cv_tree$cptable[,"xerror"]), "CP"]
print(paste("Optimal complexity parameter:", optimal_cp))
## [1] "Optimal complexity parameter: 0.001"
# Prune the tree using the optimal cp
pruned_tree <- prune(cv_tree, cp = optimal_cp)
# Plot the pruned tree
rpart.plot(pruned_tree, type = 4, extra = 101, fallen.leaves = TRUE,
           main = "Pruned Regression Tree for Carseats Sales")
# Make predictions with the pruned tree
pruned_predictions <- predict(pruned_tree, test_data)
# Calculate test MSE for the pruned tree
pruned_test_mse <- mean((pruned_predictions - test_data$Sales)^2)
print(paste("Pruned Tree Test MSE:", round(pruned_test_mse, 4)))
## [1] "Pruned Tree Test MSE: 3.5192"
# Fit a bagging model (random forest with m = p)
bagging_model <- randomForest(Sales ~ ., data = train_data,
                              mtry = ncol(train_data) - 1, # all 10 predictors
                              importance = TRUE,
                              ntree = 500)
# Make predictions on the test set
bagging_predictions <- predict(bagging_model, test_data)
# Calculate test MSE for the bagging model
bagging_test_mse <- mean((bagging_predictions - test_data$Sales)^2)
print(paste("Bagging Test MSE:", round(bagging_test_mse, 4)))
## [1] "Bagging Test MSE: 2.2414"
# Variable importance
importance(bagging_model)
## %IncMSE IncNodePurity
## CompPrice 34.6929966 263.733263
## Income 9.4304377 113.884814
## Advertising 19.7679196 145.470012
## Population -2.0798516 64.696448
## Price 66.7489094 632.515073
## ShelveLoc 72.3378601 690.312936
## Age 18.4365623 187.336470
## Education 3.4295024 64.064236
## Urban 1.4639584 9.568371
## US 0.8511541 7.816334
varImpPlot(bagging_model, main = "Variable Importance from Bagging")
# Try different values of mtry (Carseats has 10 predictors; mtry = 10 is the
# bagging fit above, so we sweep 1 through 9 here)
mtry_values <- 1:9
rf_results <- data.frame(mtry = integer(), mse = numeric())
for (m in mtry_values) {
  rf_model <- randomForest(Sales ~ ., data = train_data,
                           mtry = m,
                           importance = TRUE,
                           ntree = 500)
  # Make predictions
  rf_predictions <- predict(rf_model, test_data)
  # Calculate test MSE
  rf_test_mse <- mean((rf_predictions - test_data$Sales)^2)
  # Store results
  rf_results <- rbind(rf_results, data.frame(mtry = m, mse = rf_test_mse))
}
# Find optimal mtry
optimal_mtry <- rf_results$mtry[which.min(rf_results$mse)]
print(paste("Optimal mtry:", optimal_mtry))
## [1] "Optimal mtry: 8"
print(paste("Minimum Test MSE:", round(min(rf_results$mse), 4)))
## [1] "Minimum Test MSE: 2.2227"
# Plot mtry vs MSE
plot(rf_results$mtry, rf_results$mse, type = "b",
     xlab = "Number of Variables at Each Split (mtry)",
     ylab = "Test MSE", main = "Random Forest: mtry vs Test MSE")
# Fit the random forest with optimal mtry
optimal_rf <- randomForest(Sales ~ ., data = train_data,
                           mtry = optimal_mtry,
                           importance = TRUE,
                           ntree = 500)
# Variable importance
importance(optimal_rf)
## %IncMSE IncNodePurity
## CompPrice 29.837365 253.609378
## Income 8.789126 122.113482
## Advertising 21.677064 162.568311
## Population -1.445769 69.865277
## Price 60.172243 588.860323
## ShelveLoc 65.302222 676.736537
## Age 19.149264 209.696319
## Education 2.726531 70.367105
## Urban 1.071614 9.385424
## US 2.388123 9.628030
varImpPlot(optimal_rf, main = "Variable Importance from Random Forest")
# Prepare data for BART
x_train <- train_data[, names(train_data) != "Sales"]
y_train <- train_data$Sales
x_test <- test_data[, names(test_data) != "Sales"]
y_test <- test_data$Sales
# Convert categorical variables to factors if they aren't already
for (i in 1:ncol(x_train)) {
  if (is.character(x_train[, i])) {
    x_train[, i] <- as.factor(x_train[, i])
    x_test[, i] <- as.factor(x_test[, i])
  }
}
# Fit BART model
bart_model <- gbart(x_train, y_train, x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 280, 14, 120
## y1,yn: 3.222214, -2.907786
## x1,x[n*p]: 104.000000, 1.000000
## xp1,xp[np*p]: 138.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 67 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.198596,7.43779
## *****sigma: 1.009719
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 5s
## trcnt,tecnt: 1000,1000
# Extract predictions (BART returns posterior samples, take the mean)
bart_predictions <- bart_model$yhat.test.mean
# Calculate test MSE for BART
bart_test_mse <- mean((bart_predictions - y_test)^2)
print(paste("BART Test MSE:", round(bart_test_mse, 4)))
## [1] "BART Test MSE: 1.5911"
# Variable importance in BART: the average number of times each variable
# appears in a splitting rule across posterior draws. gbart dummy-codes factor
# predictors, so there are 14 design-matrix columns here (10 predictors, with
# ShelveLoc, Urban, and US expanded into dummies); varcount.mean carries the
# matching column names, so we use it directly rather than names(x_train).
bart_var_importance <- sort(bart_model$varcount.mean, decreasing = TRUE)
print("BART Variable Importance:")
## [1] "BART Variable Importance:"
print(bart_var_importance)
# Plot variable importance
barplot(bart_var_importance,
        main = "Variable Importance from BART",
        xlab = "Variables", ylab = "Average Splits per Posterior Draw",
        las = 2, cex.names = 0.7)
"BART Test MSE: 1.6107"
library(ISLR)
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(class)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the Caravan dataset
data(Caravan)
# Convert Purchase to a binary factor (0/1)
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)
# Create training and test sets
train_data <- Caravan[1:1000, ]
test_data <- Caravan[1001:nrow(Caravan), ]
# Check dimensions
dim(train_data)
## [1] 1000 86
dim(test_data)
## [1] 4822 86
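Purchase is heavily imbalanced in this dataset, which is why a 0.2 probability cutoff (rather than 0.5) is used below; a quick check of the class balance:
# Fraction of purchasers in each set (roughly 6% overall)
mean(train_data$Purchase)
mean(test_data$Purchase)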
# Define shrinkage values to try
shrinkage_values <- c(0.01, 0.05, 0.1, 0.2)
# Store models and their performance
boost_models <- list()
test_errors <- numeric(length(shrinkage_values))
importance_lists <- list()
# Fit boosting models with different shrinkage values
for (i in 1:length(shrinkage_values)) {
  lambda <- shrinkage_values[i]
  # Fit boosting model
  boost_model <- gbm(Purchase ~ .,
                     data = train_data,
                     distribution = "bernoulli",
                     n.trees = 1000,
                     interaction.depth = 4,
                     shrinkage = lambda,
                     verbose = FALSE)
  # Store the model
  boost_models[[i]] <- boost_model
  # Make predictions on test set
  pred_probs <- predict(boost_model,
                        newdata = test_data,
                        n.trees = 1000,
                        type = "response")
  # Calculate test error (misclassification rate with 0.2 threshold)
  pred_class <- ifelse(pred_probs > 0.2, 1, 0)
  test_errors[i] <- mean(pred_class != test_data$Purchase)
  # Get variable importance
  importance <- summary(boost_model, plotit = FALSE)
  importance_lists[[i]] <- importance
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
# Find the best shrinkage value
best_index <- which.min(test_errors)
best_lambda <- shrinkage_values[best_index]
best_model <- boost_models[[best_index]]
# Print results
cat("Best shrinkage value:", best_lambda, "\n")
## Best shrinkage value: 0.01
cat("Test error rate:", test_errors[best_index], "\n\n")
## Test error rate: 0.09083368
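Selecting shrinkage by test error means the test set is used for model choice as well as evaluation. A sketch of a leak-free alternative using gbm's built-in cross-validation (cv.folds and gbm.perf are part of the gbm API; the fold count here is my choice):
# Sketch: pick the number of trees by 5-fold CV for one shrinkage value
cv_boost <- gbm(Purchase ~ ., data = train_data,
                distribution = "bernoulli",
                n.trees = 1000, interaction.depth = 4,
                shrinkage = best_lambda, cv.folds = 5, verbose = FALSE)
best_n_trees <- gbm.perf(cv_boost, method = "cv", plot.it = FALSE)
best_n_trees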
# Plot variable importance for the best model
print("Most important predictors:")
## [1] "Most important predictors:"
importance <- importance_lists[[best_index]]
head(importance, 10)
## var rel.inf
## PPERSAUT PPERSAUT 7.919325
## MGODGE MGODGE 5.113786
## MOSTYPE MOSTYPE 4.623106
## MKOOPKLA MKOOPKLA 4.570109
## MOPLHOOG MOPLHOOG 4.131745
## PBRAND PBRAND 4.008690
## MGODPR MGODPR 3.446960
## MBERMIDD MBERMIDD 3.355803
## MINK3045 MINK3045 3.059326
## MAUT2 MAUT2 2.939429
# Plot variable importance
par(mar = c(10, 4, 4, 2) + 0.1)
barplot(importance$rel.inf[1:10],
        names.arg = importance$var[1:10],
        las = 2,
        main = "Top 10 Most Important Variables",
        cex.names = 0.7)
# Use the best boosting model to predict on test data
best_boost_probs <- predict(best_model,
                            newdata = test_data,
                            n.trees = 1000,
                            type = "response")
# Create predictions using 0.2 threshold
best_boost_preds <- ifelse(best_boost_probs > 0.2, 1, 0)
# Create confusion matrix
boost_conf_matrix <- table(Predicted = best_boost_preds,
                           Actual = test_data$Purchase)
print("Boosting Confusion Matrix:")
## [1] "Boosting Confusion Matrix:"
print(boost_conf_matrix)
## Actual
## Predicted 0 1
## 0 4353 258
## 1 180 31
# Calculate precision (fraction of predicted purchases that are actual purchases)
boost_precision <- boost_conf_matrix[2,2] / sum(boost_conf_matrix[2,])
cat("Boosting Precision (fraction of predicted purchases that are actual):",
round(boost_precision, 4), "\n")
## Boosting Precision (fraction of predicted purchases that are actual): 0.1469
# Calculate other metrics
boost_recall <- boost_conf_matrix[2,2] / sum(boost_conf_matrix[,2])
boost_accuracy <- sum(diag(boost_conf_matrix)) / sum(boost_conf_matrix)
cat("Boosting Recall:", round(boost_recall, 4), "\n")
## Boosting Recall: 0.1073
cat("Boosting Accuracy:", round(boost_accuracy, 4), "\n\n")
## Boosting Accuracy: 0.9092
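Accuracy and a single cutoff hide the ranking quality of the probabilities themselves; since pROC is already loaded, a short sketch of the ROC curve and AUC for the boosting model:
# Sketch: ROC curve and AUC for the boosting probabilities
boost_roc <- roc(test_data$Purchase, best_boost_probs)
auc(boost_roc)
plot(boost_roc, main = "ROC Curve: Boosting on Caravan Test Set")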
# Now implement KNN with different k values
# First, drop predictors with zero variance in the training set (PVRAAUT and
# AVRAAUT are constant, as the gbm warnings above noted; scaling them would
# produce NaNs), then standardize using the training means and SDs
predictors <- setdiff(names(train_data), "Purchase")
train_sds <- apply(train_data[, predictors], 2, sd)
keep <- predictors[train_sds > 0]
standardized_train <- scale(train_data[, keep])
standardized_test <- scale(test_data[, keep],
                           center = attr(standardized_train, "scaled:center"),
                           scale = attr(standardized_train, "scaled:scale"))
# Try different values of k
k_values <- c(1, 3, 5, 7, 9, 11, 13, 15, 17, 19)
knn_test_errors <- numeric(length(k_values))
knn_precision <- numeric(length(k_values))
for (i in 1:length(k_values)) {
  k <- k_values[i]
  # Fit KNN on the standardized predictors (Purchase is the class label,
  # not a feature)
  knn_preds <- knn(train = standardized_train,
                   test = standardized_test,
                   cl = train_data$Purchase,
                   k = k)
  # Convert to binary predictions
  knn_preds <- as.numeric(as.character(knn_preds))
  # Calculate test error
  knn_test_errors[i] <- mean(knn_preds != test_data$Purchase)
  # Calculate precision (only defined when at least one purchase is predicted)
  if (sum(knn_preds) > 0) {
    knn_precision[i] <- sum(knn_preds & test_data$Purchase) / sum(knn_preds)
  } else {
    knn_precision[i] <- NA
  }
}
# Find best k
best_k_index <- which.min(knn_test_errors)
best_k <- k_values[best_k_index]
# Print KNN results
cat("Best k value:", best_k, "\n")
## Best k value: 11
cat("KNN Test error rate:", knn_test_errors[best_k_index], "\n")
## KNN Test error rate: 0.05993364
cat("KNN Precision:", knn_precision[best_k_index], "\n\n")
## KNN Precision: 0.5
# Final comparison
cat("Comparison of Best Models:\n")
## Comparison of Best Models:
cat("Boosting (shrinkage =", best_lambda, ") - Precision:", round(boost_precision, 4), "\n")
## Boosting (shrinkage = 0.01 ) - Precision: 0.1469
cat("KNN (k =", best_k, ") - Precision:", round(knn_precision[best_k_index], 4), "\n")
## KNN (k = 11 ) - Precision: 0.5
# Plot comparison of error rates (note: the x-axis overlays two different
# tuning parameters -- shrinkage for boosting and k/100 for KNN)
plot(shrinkage_values, test_errors, type = "b", col = "blue",
     xlab = "Shrinkage (Boosting) / k/100 (KNN)", ylab = "Test Error Rate",
     main = "Comparison of Model Performance",
     ylim = range(c(test_errors, knn_test_errors)))
points(k_values / 100, knn_test_errors, type = "b", col = "red")
legend("topright", legend = c("Boosting", "KNN"),
       col = c("blue", "red"), lty = 1, pch = 1)
Interpretation:
KNN (red line) achieves a noticeably lower test error rate (around 0.06) than boosting (blue line, around 0.09). Note that the shared x-axis overlays two different tuning parameters: shrinkage for boosting and k/100 for KNN.
KNN's error drops sharply as k increases from 1 to about 10 and then levels off, so values of k around 11 and above perform comparably well.
Boosting shows only slight changes across shrinkage values, maintaining a relatively consistent error rate. Since only about 6% of customers purchase, the precision comparison above (0.5 for KNN at k = 11 versus 0.147 for boosting) is the more informative contrast for targeting likely purchasers.