library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(MASS) # for the Boston dataset
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Load the Boston housing dataset
data(Boston)
# Split the data into train and test sets
set.seed(123)
train_indices <- sample(1:nrow(Boston), 0.7 * nrow(Boston))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
# Define a grid of mtry and ntree values
max_mtry <- floor(sqrt(ncol(train_data) - 1)) # Upper end of the grid: floor(sqrt(p)), a common heuristic (the true maximum is p itself)
mtry_values <- seq(1, max_mtry, by = 2) # Range of mtry values
ntree_values <- c(25, 100, 250, 500, 750, 1000) # Range of ntree values
# Initialize an empty data frame to store test errors
error_df <- data.frame(mtry = numeric(0), ntree = numeric(0), test_error = numeric(0))
# Train random forest models for each combination of mtry and ntree
for (mtry in mtry_values) {
  for (ntree in ntree_values) {
    # Fit a random forest with this (mtry, ntree) combination
    rf_model <- randomForest(medv ~ ., data = train_data, mtry = mtry, ntree = ntree)
    # Make predictions on the test data
    predictions <- predict(rf_model, newdata = test_data)
    # Compute the test MSE
    test_error <- mean((predictions - test_data$medv)^2)
    # Append the result to the data frame
    error_df <- rbind(error_df, data.frame(mtry = mtry, ntree = ntree, test_error = test_error))
  }
}
print(error_df)
## mtry ntree test_error
## 1 1 25 23.28242
## 2 1 100 18.36389
## 3 1 250 19.82422
## 4 1 500 19.31549
## 5 1 750 19.64971
## 6 1 1000 19.80716
## 7 3 25 12.17295
## 8 3 100 11.86997
## 9 3 250 10.52562
## 10 3 500 11.20792
## 11 3 750 11.26762
## 12 3 1000 11.15690
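Equivalently, the same grid search could be written without nested loops; a sketch using expand.grid and mapply with the objects defined above:
# Sketch: the same search via expand.grid instead of nested for-loops
param_grid <- expand.grid(mtry = mtry_values, ntree = ntree_values)
param_grid$test_error <- mapply(function(m, n) {
  fit <- randomForest(medv ~ ., data = train_data, mtry = m, ntree = n)
  mean((predict(fit, newdata = test_data) - test_data$medv)^2)
}, param_grid$mtry, param_grid$ntree)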
# Plot the test error as a function of mtry and ntree
ggplot(error_df, aes(x = ntree, y = mtry, fill = test_error)) +
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(x = "Number of Trees (ntree)", y = "Number of Variables (mtry)", fill = "Test Error") +
  theme_minimal() +
  theme(axis.text = element_text(size = 10), axis.title = element_text(size = 12))
For mtry = 1:
The test error is highest at ntree = 25 (about 23.28), drops to about 18.36 at ntree = 100, and then fluctuates between roughly 19.3 and 19.8 with no clear trend as ntree grows to 1000. It remains well above the errors achieved with mtry = 3 at every forest size.
For mtry = 3:
The test error decreases from about 12.17 at ntree = 25 to a minimum of about 10.53 at ntree = 250, then levels off around 11.2 for larger forests. Overall, the larger value of mtry (here mtry = 3) yields substantially lower test error than mtry = 1, and increasing ntree helps mainly at small forest sizes; beyond a few hundred trees the error flattens rather than continuing to fall, consistent with random forests reducing variance without overfitting as more trees are added.
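As an alternative that avoids spending the test set on tuning, the out-of-bag (OOB) error that randomForest tracks internally could be used instead; a minimal sketch, assuming the same train_data as above:
# Sketch: tune mtry using out-of-bag error rather than the test set
oob_mse <- sapply(mtry_values, function(m) {
  fit <- randomForest(medv ~ ., data = train_data, mtry = m, ntree = 500)
  fit$mse[fit$ntree] # OOB MSE after the final tree
})
data.frame(mtry = mtry_values, oob_mse = oob_mse)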
library(randomForest)
library(ISLR) # for the Carseats dataset
data(Carseats)
# Set the seed for reproducibility
set.seed(123)
# Train/test split for Carseats (a 70/30 split is assumed here for completeness)
train_indices <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))
train_data <- Carseats[train_indices, ]
test_data <- Carseats[-train_indices, ]
# Determine the number of candidate predictors in the dataset
ncols <- ncol(Carseats)
nCandVar <- ncols - 1 # All columns except the response, Sales
# Fit the bagging model (mtry = number of predictors) on the training data
# and evaluate on the test data; Sales is column 1, so [, -1] drops it
bagg_model <- randomForest(x = train_data[, -1], y = train_data$Sales,
                           xtest = test_data[, -1], ytest = test_data$Sales,
                           ntree = 500, mtry = nCandVar)
# Plot the test MSE over the number of trees
plot(x = 1:500, y = bagg_model$test$mse, type = "l",
     xlab = "Number of Trees", ylab = "Test MSE")
# Find the index of the minimum test MSE
min_index <- which.min(bagg_model$test$mse)
# Display the minimum test MSE and the corresponding number of trees
min_mse <- bagg_model$test$mse[min_index]
min_trees <- min_index
min_mse
## [1] 2.297977
min_trees
## [1] 475
# Extract variable importance measures
var_importance <- importance(bagg_model)
# Plot variable importance
varImpPlot(bagg_model)
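By default randomForest records only the node-purity measure; a hedged sketch, assuming we refit with importance = TRUE, to also obtain the permutation-based %IncMSE measure:
# Sketch: refit with importance = TRUE to get permutation importance as well
bagg_model_imp <- randomForest(x = train_data[, -1], y = train_data$Sales,
                               ntree = 500, mtry = nCandVar, importance = TRUE)
importance(bagg_model_imp) # columns: %IncMSE and IncNodePurity
varImpPlot(bagg_model_imp)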
library(randomForest)
# Set the seed for reproducibility
set.seed(123)
# Determine the number of candidate predictors in the dataset
ncols <- ncol(Carseats)
nCandVar <- ncols - 1 # Remove Sales
# Fit the model on the training data; note that with mtry = nCandVar every
# predictor is a split candidate, so this is again bagging rather than a
# decorrelated random forest
rf_model <- randomForest(Sales ~ ., data = train_data,
                         ntree = 500, mtry = nCandVar)
# Make predictions on the test set
predictions_rf <- predict(rf_model, newdata = test_data)
# Calculate test MSE
test_mse_rf <- mean((predictions_rf - test_data$Sales)^2)
test_mse_rf
## [1] 2.306931
# Get variable importance measures
var_importance_rf <- importance(rf_model)
# Display variable importance measures
var_importance_rf
## IncNodePurity
## CompPrice 264.667285
## Income 113.628559
## Advertising 153.556067
## Population 62.515055
## Price 617.820503
## ShelveLoc 697.081817
## Age 178.707596
## Education 68.491233
## Urban 8.387530
## US 7.612992
# Plot variable importance
varImpPlot(rf_model)
Because mtry was set to the full number of predictors, this second fit is effectively bagging as well, which explains why its test MSE (about 2.31) is nearly identical to the bagging result (about 2.30). Altering the value of m could still influence outcomes in other settings, so a sensitivity analysis over m remains worthwhile, as sketched below.
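A minimal sketch of such an analysis, assuming the Carseats train/test split above; the grid of sqrt(p), p/2, and p values is an illustrative choice, not part of the original analysis:
# Sketch: test MSE as a function of mtry (illustrative grid of m values)
mtry_grid <- c(floor(sqrt(nCandVar)), floor(nCandVar / 2), nCandVar)
test_mse_by_m <- sapply(mtry_grid, function(m) {
  fit <- randomForest(Sales ~ ., data = train_data, ntree = 500, mtry = m)
  mean((predict(fit, newdata = test_data) - test_data$Sales)^2)
})
data.frame(mtry = mtry_grid, test_mse = test_mse_by_m)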
# Load the Caravan dataset
data("Caravan")
# Set seed for reproducibility
set.seed(123)
# Create a training set consisting of the first 1,000 observations
train_data <- Caravan[1:1000, ]
# Create a test set consisting of the remaining observations
test_data <- Caravan[1001:nrow(Caravan), ]
# Load the required library
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Set seed for reproducibility
set.seed(123)
# Convert the response variable Purchase to binary
train_data$Purchase_binary <- ifelse(train_data$Purchase == "Yes", 1, 0)
test_data$Purchase_binary <- ifelse(test_data$Purchase == "Yes", 1, 0)
# Fit boosting models with different shrinkage values
gbm1 <- gbm(Purchase_binary ~ . - Purchase, data = train_data,
            shrinkage = 0.01, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
gbm2 <- gbm(Purchase_binary ~ . - Purchase, data = train_data,
            shrinkage = 0.05, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
gbm3 <- gbm(Purchase_binary ~ . - Purchase, data = train_data,
            shrinkage = 0.1, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
gbm4 <- gbm(Purchase_binary ~ . - Purchase, data = train_data,
            shrinkage = 0.2, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
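The repeated warnings indicate that PVRAAUT and AVRAAUT are constant within the 1,000-observation training set. A minimal sketch of dropping constant columns before fitting, which would silence these warnings (the models above were fit without this step):
# Sketch: identify and drop predictors with no variation in the training set
constant_cols <- sapply(train_data, function(col) length(unique(col)) <= 1)
names(train_data)[constant_cols] # should list PVRAAUT and AVRAAUT
train_data_nzv <- train_data[, !constant_cols]
# The gbm calls above could then be refit with data = train_data_nzv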
# Summary of variable importance for each model
summary_gbm1 <- summary(gbm1)
summary_gbm2 <- summary(gbm2)
summary_gbm3 <- summary(gbm3)
summary_gbm4 <- summary(gbm4)
# Test error rate for each model
test_error_gbm1 <- mean(test_data$Purchase_binary != ifelse(predict(gbm1, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
test_error_gbm2 <- mean(test_data$Purchase_binary != ifelse(predict(gbm2, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
test_error_gbm3 <- mean(test_data$Purchase_binary != ifelse(predict(gbm3, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
test_error_gbm4 <- mean(test_data$Purchase_binary != ifelse(predict(gbm4, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
# Display test error rate for each model
test_error_gbm1
## [1] 0.06097055
test_error_gbm2
## [1] 0.07341352
test_error_gbm3
## [1] 0.0748652
test_error_gbm4
## [1] 0.08378266
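Rather than fixing n.trees = 1000, the number of boosting iterations could also be chosen by cross-validation; a hedged sketch using gbm's built-in cv.folds, assuming the same training data:
# Sketch: choose the number of boosting iterations by 5-fold cross-validation
gbm_cv <- gbm(Purchase_binary ~ . - Purchase, data = train_data,
              shrinkage = 0.01, interaction.depth = 2,
              distribution = "bernoulli", n.trees = 1000, cv.folds = 5)
best_iter <- gbm.perf(gbm_cv, method = "cv") # CV-optimal number of trees
probs_cv <- predict(gbm_cv, newdata = test_data, n.trees = best_iter, type = "response")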
# Predictions for test set using gbm1
predictions_gbm1 <- ifelse(predict(gbm1, newdata = test_data, n.trees = 1000, type = "response") > 0.5, "Yes", "No")
table(obs = test_data$Purchase, predict = predictions_gbm1)
## predict
## obs No Yes
## No 4520 13
## Yes 281 8
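Because buyers are rare, these misclassification rates are best read against the trivial classifier that always predicts "No"; a quick check, using the test split above:
# Sketch: error rate of always predicting "No" (the base purchase rate)
mean(test_data$Purchase_binary) # about 0.06 on this split (289 buyers of 4,822, per the matrix above)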
# Plot variable importance for gbm1
smaller_data <- data.frame(var = summary_gbm1$var[1:10], relInf = summary_gbm1$rel.inf[1:10])
ggplot(smaller_data, aes(x = reorder(var, relInf), y = relInf)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("Relative Influence") + xlab("Variable")
# Predict probabilities of purchase for the test data using gbm1
probabilities <- predict(gbm1, newdata = test_data, n.trees = 1000, type = "response")
# Convert probabilities to binary predictions using a threshold of 20%
predictions <- ifelse(probabilities > 0.2, 1, 0)
# Form a confusion matrix
confusion_matrix <- table(observed = test_data$Purchase_binary, predicted = predictions)
# Calculate the fraction of people predicted to make a purchase who do make one
fraction_correct <- confusion_matrix[2, 2] / sum(predictions)
# Display confusion matrix
confusion_matrix
## predicted
## observed 0 1
## 0 4364 169
## 1 251 38
# Display fraction of people predicted to make a purchase who do make one
fraction_correct
## [1] 0.1835749
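So about 18% of the people the boosted model flags at the 20% threshold actually make a purchase. That is roughly three times the base purchase rate of about 6% in the test set, meaning the model concentrates likely buyers far better than random targeting, even though its absolute precision is modest.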