7. In the lab, we applied random forests to the Boston data using mtry = 6 and using ntree = 25 and ntree = 500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. You can model your plot after Figure 8.10. Describe the results obtained.

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(MASS) # for the Boston dataset
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Load the Boston housing dataset
data(Boston)

# Split the data into train and test sets
set.seed(123)
train_indices <- sample(1:nrow(Boston), 0.7 * nrow(Boston))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]

# Define a grid of mtry and ntree values
max_mtry <- floor(sqrt(ncol(train_data) - 1))  # floor(sqrt(p)) = 3; the largest possible mtry is p = 13
mtry_values <- seq(1, max_mtry, by = 2)  # mtry values considered: 1 and 3
ntree_values <- c(25, 100, 250, 500, 750, 1000)  # Range of ntree values

# Initialize an empty data frame to store test errors
error_df <- data.frame(mtry = numeric(0), ntree = numeric(0), test_error = numeric(0))

# Train random forest models for each combination of mtry and ntree
for (mtry in mtry_values) {
  for (ntree in ntree_values) {
    # Fit random forest model
    rf_model <- randomForest(medv ~ ., data = train_data, mtry = mtry, ntree = ntree)
    
    # Make predictions on test data
    predictions <- predict(rf_model, newdata = test_data)
    
    # Calculate test error
    test_error <- mean((predictions - test_data$medv)^2)
    
    # Store results in data frame
    error_df <- rbind(error_df, data.frame(mtry = mtry, ntree = ntree, test_error = test_error))
  }
}

print(error_df)
##    mtry ntree test_error
## 1     1    25   23.28242
## 2     1   100   18.36389
## 3     1   250   19.82422
## 4     1   500   19.31549
## 5     1   750   19.64971
## 6     1  1000   19.80716
## 7     3    25   12.17295
## 8     3   100   11.86997
## 9     3   250   10.52562
## 10    3   500   11.20792
## 11    3   750   11.26762
## 12    3  1000   11.15690
# Plot the test error as a function of mtry and ntree
ggplot(error_df, aes(x = ntree, y = mtry, fill = test_error)) +
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(x = "Number of Trees (ntree)", y = "Number of Variables (mtry)", fill = "Test Error") +
  theme_minimal() +
  theme(axis.text = element_text(size = 10), axis.title = element_text(size = 12))

For mtry = 1:

As the number of trees (ntree) increases from 25 to 1,000, the test error ranges from roughly 18.4 (at ntree = 100) to 23.3 (at ntree = 25) and hovers around 19-20 once ntree exceeds 100, with no clear trend beyond that point. The test error remains high compared to mtry = 3.

For mtry = 3:

The test error drops from about 12.2 at ntree = 25 to roughly 10.5-11.3 once ntree reaches 250 or more, so adding trees improves performance up to a point and then levels off. Overall, these results suggest that the larger value of mtry (here, mtry = 3) yields a substantially lower test error than mtry = 1, and that most of the benefit of increasing ntree is realized within the first few hundred trees. The choice of mtry has a much larger effect on the test error than the choice of ntree.
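The heatmap above encodes the test error as a fill colour; the same results can also be drawn in the style of Figure 8.10, with test MSE on the y-axis and one curve per value of mtry. A minimal sketch, assuming the error_df built in the loop above (the identical code would work unchanged for a wider mtry grid, e.g. one that also includes 6 and 13):

# Figure 8.10-style plot: test MSE versus ntree, one curve per mtry value
ggplot(error_df, aes(x = ntree, y = test_error,
                     colour = factor(mtry), group = factor(mtry))) +
  geom_line() +
  geom_point() +
  labs(x = "Number of Trees (ntree)", y = "Test MSE", colour = "mtry") +
  theme_minimal()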

(d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.

library(randomForest)

# Set the seed for reproducibility
set.seed(123)

# Determine the number of candidate predictors in the dataset
ncols <- ncol(Carseats)
nCandVar <- ncols - 1  # Remove Sales

# Fit the bagging model on the training data and test on the test data
bagg_model <- randomForest(x = train_data[, -1], y = train_data$Sales,
                           xtest = test_data[, -1], ytest = test_data$Sales,
                           ntree = 500, mtry = nCandVar)

# Plot the test error rate over the number of trees
plot(x = 1:500, y = bagg_model$test$mse, type = "l")

# Find the index of the minimum test MSE
min_index <- which.min(bagg_model$test$mse)

# Display the minimum test MSE and the corresponding number of trees
min_mse <- bagg_model$test$mse[min_index]
min_trees <- min_index

min_mse
## [1] 2.297977
min_trees
## [1] 475
# Extract variable importance measures
var_importance <- importance(bagg_model)

# Plot variable importance
varImpPlot(bagg_model)
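Two small additions round out part (d). The minimum over trees reported above is slightly optimistic, so the test MSE of the full 500-tree bagged model can be read off directly from the stored test-set predictions, and the importance scores extracted above can be ranked explicitly (only IncNodePurity is available because importance = TRUE was not set when fitting). A minimal sketch, assuming the objects created above:

# Test MSE using all 500 trees (test-set predictions stored via xtest/ytest)
mean((bagg_model$test$predicted - test_data$Sales)^2)

# Rank predictors of the bagged model by IncNodePurity, largest first
var_importance[order(var_importance[, "IncNodePurity"], decreasing = TRUE), , drop = FALSE]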

(e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.

library(randomForest)

# Set the seed for reproducibility
set.seed(123)

# Determine the number of candidate predictors in the dataset
ncols <- ncol(Carseats)
nCandVar <- ncols - 1  # Remove Sales

# Fit the random forest model on the training data
# (mtry = nCandVar considers all predictors at each split, i.e. the bagging setting m = p)
rf_model <- randomForest(Sales ~ ., data = train_data, 
                         ntree = 500, mtry = nCandVar)

# Make predictions on the test set
predictions_rf <- predict(rf_model, newdata = test_data)

# Calculate test MSE
test_mse_rf <- mean((predictions_rf - test_data$Sales)^2)
test_mse_rf
## [1] 2.306931
# Get variable importance measures
var_importance_rf <- importance(rf_model)

# Display variable importance measures
var_importance_rf
##             IncNodePurity
## CompPrice      264.667285
## Income         113.628559
## Advertising    153.556067
## Population      62.515055
## Price          617.820503
## ShelveLoc      697.081817
## Age            178.707596
## Education       68.491233
## Urban            8.387530
## US               7.612992
# Plot variable importance
varImpPlot(rf_model)

Because the fit above uses m = p (mtry = nCandVar), it is effectively the same bagging model as in part (d), which is why the test MSE (about 2.31) is essentially unchanged. To describe the effect of m on the error rate, the model has to be refit with smaller values of m and the resulting test MSEs compared; a sketch of such a sensitivity analysis is given below. In general, choosing m < p decorrelates the trees, and the value of m that minimizes the test error is data dependent.
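A minimal sketch of that sensitivity analysis, assuming the same Carseats train/test split used in parts (d) and (e):

# Refit the model for several values of m and compare the resulting test MSEs
set.seed(123)
m_values <- c(3, 5, 7, nCandVar)
mse_by_m <- sapply(m_values, function(m) {
  fit <- randomForest(Sales ~ ., data = train_data, ntree = 500, mtry = m)
  mean((predict(fit, newdata = test_data) - test_data$Sales)^2)
})
data.frame(m = m_values, test_mse = mse_by_m)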

11. This question uses the Caravan data set.

(a) Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.

# Load the Caravan dataset
data("Caravan")

# Set seed for reproducibility
set.seed(123)

# Create a training set consisting of the first 1,000 observations
train_data <- Caravan[1:1000, ]

# Create a test set consisting of the remaining observations
test_data <- Caravan[1001:nrow(Caravan), ]

(b) Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?

# Load the required library
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Set seed for reproducibility
set.seed(123)

# Convert the response variable Purchase to binary
train_data$Purchase_binary <- ifelse(train_data$Purchase == "Yes", 1, 0)
test_data$Purchase_binary <- ifelse(test_data$Purchase == "Yes", 1, 0)

# Fit boosting models with different shrinkage values
gbm1 <- gbm(Purchase_binary ~ . - Purchase, data = train_data, 
            shrinkage = 0.01, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
gbm2 <- gbm(Purchase_binary ~ . - Purchase, data = train_data, 
            shrinkage = 0.05, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
gbm3 <- gbm(Purchase_binary ~ . - Purchase, data = train_data, 
            shrinkage = 0.1, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
gbm4 <- gbm(Purchase_binary ~ . - Purchase, data = train_data, 
            shrinkage = 0.2, interaction.depth = 2,
            distribution = "bernoulli", n.trees = 1000)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
# Summary of variable importance for each model
summary_gbm1 <- summary(gbm1)

summary_gbm2 <- summary(gbm2)

summary_gbm3 <- summary(gbm3)

summary_gbm4 <- summary(gbm4)

# Test error rate for each model
test_error_gbm1 <- mean(test_data$Purchase_binary != ifelse(predict(gbm1, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
test_error_gbm2 <- mean(test_data$Purchase_binary != ifelse(predict(gbm2, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
test_error_gbm3 <- mean(test_data$Purchase_binary != ifelse(predict(gbm3, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))
test_error_gbm4 <- mean(test_data$Purchase_binary != ifelse(predict(gbm4, newdata = test_data, n.trees = 1000, type = "response") > 0.5, 1, 0))

# Display test error rate for each model
test_error_gbm1
## [1] 0.06097055
test_error_gbm2
## [1] 0.07341352
test_error_gbm3
## [1] 0.0748652
test_error_gbm4
## [1] 0.08378266
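# (Sketch) Collect the four shrinkage settings and their test error rates into
# one table for easier comparison; assumes the test_error_gbm* objects above.
data.frame(shrinkage = c(0.01, 0.05, 0.1, 0.2),
           test_error = c(test_error_gbm1, test_error_gbm2,
                          test_error_gbm3, test_error_gbm4))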
# Predictions for test set using gbm1
predictions_gbm1 <- ifelse(predict(gbm1, newdata = test_data, n.trees = 1000, type = "response") > 0.5, "Yes", "No")
table(obs = test_data$Purchase, predict = predictions_gbm1)
##      predict
## obs     No  Yes
##   No  4520   13
##   Yes  281    8
# Plot variable importance for gbm1
smaller_data <- data.frame(var = summary_gbm1$var[1:10], relInf = summary_gbm1$rel.inf[1:10])
ggplot(aes(x = reorder(var, relInf), y = relInf), data = smaller_data) + 
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("Relative Influence") + xlab("Variable")

(c) Use the boosting model to predict the response on the test data. Predict that a person will make a purchase if the estimated probability of purchase is greater than 20 %. Form a confusion matrix. What fraction of the people predicted to make a purchase do in fact make one? How does this compare with the results obtained from applying KNN or logistic regression to this data set?

# Predict probabilities of purchase for the test data using gbm1
probabilities <- predict(gbm1, newdata = test_data, n.trees = 1000, type = "response")

# Convert probabilities to binary predictions using a threshold of 20%
predictions <- ifelse(probabilities > 0.2, 1, 0)

# Form a confusion matrix
confusion_matrix <- table(observed = test_data$Purchase_binary, predicted = predictions)

# Calculate the fraction of people predicted to make a purchase who do make one
fraction_correct <- confusion_matrix[2, 2] / sum(predictions)

# Display confusion matrix
confusion_matrix
##         predicted
## observed    0    1
##        0 4364  169
##        1  251   38
# Display fraction of people predicted to make a purchase who do make one
fraction_correct
## [1] 0.1835749
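The question also asks how this compares with KNN or logistic regression on the same split. A minimal sketch of the logistic-regression comparison, using the same 20% threshold (glm may emit warnings here because of the zero-variance predictors noted above; the resulting fraction is not reproduced in this write-up):

# Logistic regression on the same training set, 20% probability threshold
logit_model <- glm(Purchase_binary ~ . - Purchase, data = train_data,
                   family = binomial)
logit_probs <- predict(logit_model, newdata = test_data, type = "response")
logit_preds <- ifelse(logit_probs > 0.2, 1, 0)

# Confusion matrix and the fraction of predicted purchasers who actually purchase
table(observed = test_data$Purchase_binary, predicted = logit_preds)
sum(logit_preds == 1 & test_data$Purchase_binary == 1) / sum(logit_preds)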