library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)
library(AppliedPredictiveModeling)
library(mice)
library(rpart)
library(rpart.plot)
library(RANN)

Exercise 8.1

Data Setup

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y) %>%
  as.data.frame()
colnames(simulated)[ncol(simulated)] <- "y"

Part (a)

set.seed(200)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)

rfImp1_df <- rfImp1 %>%
  rownames_to_column("Variable") %>%
  arrange(desc(Overall))

ggplot(rfImp1_df, aes(x = reorder(Variable, Overall), y = Overall)) +
  geom_col(fill = "#2c7bb6") +
  coord_flip() +
  labs(title = "Variable Importance - Random Forest (Original Predictors)",
       x = "Predictor", y = "Importance (MSE Increase)") +
  theme_minimal()

Observation: The model correctly ignores V6-V10. V1 and V2 dominate because of their nonlinear interaction in the true formula.

Part (b)

simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9476651
set.seed(200)
model2 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp2 <- varImp(model2, scale = FALSE)

rfImp2_df <- rfImp2 %>%
  rownames_to_column("Variable") %>%
  arrange(desc(Overall))

ggplot(rfImp2_df, aes(x = reorder(Variable, Overall), y = Overall)) +
  geom_col(fill = "#d7191c") +
  coord_flip() +
  labs(title = "Variable Importance - Random Forest (With Duplicate of V1)",
       x = "Predictor", y = "Importance (MSE Increase)") +
  theme_minimal()

Observation: V1’s importance drops sharply once its clone enters the picture. The two nearly identical predictors split the credit between them.

Part (c)

set.seed(200)
cfModel <- cforest(y ~ ., data = simulated)

cfImp_traditional <- varimp(cfModel, conditional = FALSE)
cfImp_conditional  <- varimp(cfModel, conditional = TRUE)

cfImp_df <- tibble(
  Variable    = names(cfImp_traditional),
  Traditional = cfImp_traditional,
  Conditional = cfImp_conditional
) %>%
  pivot_longer(-Variable, names_to = "Type", values_to = "Importance")

ggplot(cfImp_df, aes(x = reorder(Variable, Importance), y = Importance,
                     fill = Type)) +
  geom_col(position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = c("Traditional" = "#2c7bb6",
                                "Conditional"  = "#fdae61")) +
  labs(title = "cforest Variable Importance: Traditional vs Conditional",
       x = "Predictor", y = "Importance") +
  theme_minimal()

Observation: Conditional importance fixes the problem that duplicate1 gets knocked down and V1 earns back its rightful spot at the top.

Part (d)

set.seed(200)
gbmModel <- train(y ~ ., data = simulated,
                  method = "gbm",
                  tuneLength = 5,
                  verbose = FALSE)

gbmImp <- varImp(gbmModel)$importance %>%
  rownames_to_column("Variable") %>%
  rename(Importance = Overall) %>%
  arrange(desc(Importance))

ggplot(gbmImp, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "#1a9641") +
  coord_flip() +
  labs(title = "Variable Importance - Gradient Boosted Trees",
       x = "Predictor", y = "Importance") +
  theme_minimal()

set.seed(200)
cubistModel <- train(y ~ ., data = simulated,
                     method = "cubist",
                     tuneLength = 5)

cubistImp <- varImp(cubistModel)$importance %>%
  rownames_to_column("Variable") %>%
  rename(Importance = Overall) %>%
  arrange(desc(Importance))

ggplot(cubistImp, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "#7b2d8b") +
  coord_flip() +
  labs(title = "Variable Importance - Cubist",
       x = "Predictor", y = "Importance") +
  theme_minimal()

Observation: GBM and Cubist tell the same story as random forest. V1-V5 matter, V6-V10 do not, though correlated predictors still dilute each other.

Exercise 8.2

set.seed(200)
n <- 500

# y is random noise - no true relationship with any predictor
y_noise <- rnorm(n)

bias_data <- tibble(
  y          = y_noise,
  continuous = rnorm(n),                          # many unique values
  fine       = round(rnorm(n), 1),                # moderate granularity
  coarse     = round(rnorm(n)),                   # integer - low granularity
  binary     = sample(c(0, 1), n, replace = TRUE) # only 2 values
)

set.seed(200)
bias_tree <- rpart(y ~ ., data = bias_data, control = rpart.control(maxdepth = 3))

bias_imp <- bias_tree$variable.importance

if (length(bias_imp) > 0) {
  tibble(Variable   = names(bias_imp),
         Importance = bias_imp) %>%
    ggplot(aes(x = reorder(Variable, Importance), y = Importance)) +
    geom_col(fill = "#e66101") +
    coord_flip() +
    labs(title = "Tree Bias: Importance by Predictor Granularity",
         subtitle = "All predictors are pure noise - y has no true relationship with any",
         x = "Predictor", y = "Importance") +
    theme_minimal()
} else {
  message("No splits were made - all predictors correctly ignored.")
}

Observation: The continuous predictor wins even though none of them mean anything. More split points means more chances to look useful by accident.

Exercise 8.3

data(solubility)

sol_df <- solTrainXtrans %>%
  as.data.frame() %>%
  mutate(y = solTrainY)
set.seed(200)
gbm_low <- train(y ~ ., data = sol_df,
                 method = "gbm",
                 verbose = FALSE,
                 tuneGrid = expand.grid(
                   n.trees           = 1000,
                   interaction.depth = 4,
                   shrinkage         = 0.1,
                   n.minobsinnode    = 10
                 ))

imp_low <- varImp(gbm_low)$importance %>%
  rownames_to_column("Variable") %>%
  rename(Importance = Overall) %>%
  arrange(desc(Importance)) %>%
  slice(1:20) %>%
  mutate(Setting = "Low (0.1 / 0.1)")
set.seed(200)
gbm_high <- train(y ~ ., data = sol_df,
                  method = "gbm",
                  verbose = FALSE,
                  tuneGrid = expand.grid(
                    n.trees           = 1000,
                    interaction.depth = 4,
                    shrinkage         = 0.9,
                    n.minobsinnode    = 10
                  ))

imp_high <- varImp(gbm_high)$importance %>%
  rownames_to_column("Variable") %>%
  rename(Importance = Overall) %>%
  arrange(desc(Importance)) %>%
  slice(1:20) %>%
  mutate(Setting = "High (0.9 / 0.9)")
bind_rows(imp_low, imp_high) %>%
  ggplot(aes(x = reorder(Variable, Importance), y = Importance, fill = Setting)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ Setting, scales = "free") +
  scale_fill_manual(values = c("Low (0.1 / 0.1)"  = "#2c7bb6",
                                "High (0.9 / 0.9)" = "#d7191c")) +
  labs(title = "GBM Variable Importance: Low vs High Shrinkage",
       x = "Predictor", y = "Importance") +
  theme_minimal() +
  theme(legend.position = "none")

Part (a): The high-shrinkage model concentrates almost all importance on just a handful of predictors. There’s steep dropoff after the top 2 or 3.

Part (b): The low-shrinkage model is likely to generalize better. High shrinkage risks overfitting to whichever predictors happen to correlate with the noise in the training set.

Part (c): Increasing interaction depth would steepen the importance slope even further in the high-shrinkage model.

Exercise 8.7

Data Setup

data(ChemicalManufacturingProcess)

# Separate predictors and response
chem_x <- ChemicalManufacturingProcess[, -1]   # all columns except yield
chem_y <- ChemicalManufacturingProcess[, 1] 

# Impute missing values with kNN imputation
set.seed(200)
impute_model <- preProcess(chem_x, method = "knnImpute")
chem_x_imp   <- predict(impute_model, chem_x)

# Remove near-zero variance predictors
nzv       <- nearZeroVar(chem_x_imp)
chem_x_imp <- chem_x_imp[, -nzv]

# Remove highly correlated predictors (threshold 0.90)
cor_mat    <- cor(chem_x_imp, use = "pairwise.complete.obs")
high_cor   <- findCorrelation(cor_mat, cutoff = 0.90)
chem_x_imp <- chem_x_imp[, -high_cor]

# Train / test split (80/20)
set.seed(200)
train_idx  <- createDataPartition(chem_y, p = 0.80, list = FALSE)

train_x <- chem_x_imp[train_idx, ]
test_x  <- chem_x_imp[-train_idx, ]
train_y <- chem_y[train_idx]
test_y  <- chem_y[-train_idx]

train_df <- train_x %>% as.data.frame() %>% mutate(yield = train_y)
test_df  <- test_x  %>% as.data.frame() %>% mutate(yield = test_y)

Part (a)

ctrl <- trainControl(method = "cv", number = 10)

set.seed(200)
cart_model <- train(yield ~ ., data = train_df,
                    method = "rpart",
                    tuneLength = 10,
                    trControl = ctrl)

set.seed(200)
rf_model <- train(yield ~ ., data = train_df,
                  method = "rf",
                  tuneLength = 5,
                  ntree = 500,
                  trControl = ctrl)

set.seed(200)
gbm_model <- train(yield ~ ., data = train_df,
                   method = "gbm",
                   tuneLength = 5,
                   verbose = FALSE,
                   trControl = ctrl)
resamps <- resamples(list(CART = cart_model,
                          RF   = rf_model,
                          GBM  = gbm_model))

summary(resamps)
## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: CART, RF, GBM 
## Number of resamples: 10 
## 
## MAE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
## CART 0.7521420 0.9143625 1.0173337 1.0014329 1.0487896 1.272429    0
## RF   0.7160432 0.7604402 0.7871186 0.8577466 0.9142958 1.261202    0
## GBM  0.5995838 0.7775140 0.8527816 0.8722095 0.9254392 1.335244    0
## 
## RMSE 
##           Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## CART 1.0272083 1.1363924 1.214104 1.233823 1.317890 1.543316    0
## RF   0.8999486 0.9519647 1.051475 1.123891 1.157207 1.738877    0
## GBM  0.8539661 0.9401171 1.045415 1.091232 1.143023 1.698583    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.4254823 0.5276662 0.5651583 0.5561615 0.5994056 0.6942489    0
## RF   0.3414838 0.5646167 0.6537324 0.6304415 0.7036304 0.8294807    0
## GBM  0.4525038 0.5857719 0.6417557 0.6401266 0.7222969 0.8316321    0
dotplot(resamps, metric = "RMSE",
        main = "Cross-Validated RMSE: CART vs RF vs GBM")

results <- tibble(
  Model = c("CART", "RF", "GBM"),
  Test_RMSE = c(
    RMSE(predict(cart_model, test_df), test_y),
    RMSE(predict(rf_model,  test_df), test_y),
    RMSE(predict(gbm_model, test_df), test_y)
  ),
  Test_R2 = c(
    cor(predict(cart_model, test_df), test_y)^2,
    cor(predict(rf_model,  test_df), test_y)^2,
    cor(predict(gbm_model, test_df), test_y)^2
  )
)

results
## # A tibble: 3 × 3
##   Model Test_RMSE Test_R2
##   <chr>     <dbl>   <dbl>
## 1 CART       1.76   0.255
## 2 RF         1.40   0.523
## 3 GBM        1.37   0.544

Observation: RF and GBM both crush the single tree,ensembling handle the noise in this dataset much better than any one split structure can.

Part (b)

best_model <- gbm_model  

imp_df <- varImp(best_model)$importance %>%
  rownames_to_column("Variable") %>%
  rename(Importance = Overall) %>%
  arrange(desc(Importance)) %>%
  slice(1:15) %>%
  mutate(
    Type = if_else(str_detect(Variable, "^Bio"),
                   "Biological", "Process")
  )

ggplot(imp_df, aes(x = reorder(Variable, Importance),
                   y = Importance, fill = Type)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("Biological" = "#2c7bb6",
                                "Process"    = "#d7191c")) +
  labs(title = "Top 15 Predictors - Optimal Tree-Based Model (GBM)",
       x = "Predictor", y = "Importance") +
  theme_minimal()

Observation: Process variables own the top of the list. Unlike biological inputs, these can actually be adjusted on the factory floor.

Part (c)

best_cart <- cart_model$finalModel

rpart.plot(best_cart,
           type   = 4,
           extra  = 101,
           fallen.leaves = TRUE,
           main   = "Optimal Single Tree - Chemical Manufacturing Yield")

# Get node assignments and plot yield distribution by terminal node
node_assign <- predict(best_cart, train_df, type = "vector")

# rpart node memberships
train_df %>%
  mutate(node = as.factor(best_cart$where)) %>%
  ggplot(aes(x = node, y = yield, fill = node)) +
  geom_boxplot(alpha = 0.7) +
  labs(title = "Yield Distribution by Terminal Node - Single Tree",
       x = "Terminal Node", y = "Yield") +
  theme_minimal() +
  theme(legend.position = "none")

Observation: Each leaf captures a meaningfully different yield range, and the wide boxes show exactly where the tree runs out of answers. ```