library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)
library(AppliedPredictiveModeling)
library(mice)
library(rpart)
library(rpart.plot)
library(RANN)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y) %>%
as.data.frame()
colnames(simulated)[ncol(simulated)] <- "y"
set.seed(200)
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1_df <- rfImp1 %>%
rownames_to_column("Variable") %>%
arrange(desc(Overall))
ggplot(rfImp1_df, aes(x = reorder(Variable, Overall), y = Overall)) +
geom_col(fill = "#2c7bb6") +
coord_flip() +
labs(title = "Variable Importance - Random Forest (Original Predictors)",
x = "Predictor", y = "Importance (MSE Increase)") +
theme_minimal()
Observation: The model correctly ignores V6-V10. V1 and V2 dominate because of their nonlinear interaction in the true formula.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9476651
set.seed(200)
model2 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2_df <- rfImp2 %>%
rownames_to_column("Variable") %>%
arrange(desc(Overall))
ggplot(rfImp2_df, aes(x = reorder(Variable, Overall), y = Overall)) +
geom_col(fill = "#d7191c") +
coord_flip() +
labs(title = "Variable Importance - Random Forest (With Duplicate of V1)",
x = "Predictor", y = "Importance (MSE Increase)") +
theme_minimal()
Observation: V1’s importance drops sharply once its clone enters the picture. The two nearly identical predictors split the credit between them.
set.seed(200)
cfModel <- cforest(y ~ ., data = simulated)
cfImp_traditional <- varimp(cfModel, conditional = FALSE)
cfImp_conditional <- varimp(cfModel, conditional = TRUE)
cfImp_df <- tibble(
Variable = names(cfImp_traditional),
Traditional = cfImp_traditional,
Conditional = cfImp_conditional
) %>%
pivot_longer(-Variable, names_to = "Type", values_to = "Importance")
ggplot(cfImp_df, aes(x = reorder(Variable, Importance), y = Importance,
fill = Type)) +
geom_col(position = "dodge") +
coord_flip() +
scale_fill_manual(values = c("Traditional" = "#2c7bb6",
"Conditional" = "#fdae61")) +
labs(title = "cforest Variable Importance: Traditional vs Conditional",
x = "Predictor", y = "Importance") +
theme_minimal()
Observation: Conditional importance fixes the problem that duplicate1 gets knocked down and V1 earns back its rightful spot at the top.
set.seed(200)
gbmModel <- train(y ~ ., data = simulated,
method = "gbm",
tuneLength = 5,
verbose = FALSE)
gbmImp <- varImp(gbmModel)$importance %>%
rownames_to_column("Variable") %>%
rename(Importance = Overall) %>%
arrange(desc(Importance))
ggplot(gbmImp, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "#1a9641") +
coord_flip() +
labs(title = "Variable Importance - Gradient Boosted Trees",
x = "Predictor", y = "Importance") +
theme_minimal()
set.seed(200)
cubistModel <- train(y ~ ., data = simulated,
method = "cubist",
tuneLength = 5)
cubistImp <- varImp(cubistModel)$importance %>%
rownames_to_column("Variable") %>%
rename(Importance = Overall) %>%
arrange(desc(Importance))
ggplot(cubistImp, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "#7b2d8b") +
coord_flip() +
labs(title = "Variable Importance - Cubist",
x = "Predictor", y = "Importance") +
theme_minimal()
Observation: GBM and Cubist tell the same story as random forest. V1-V5 matter, V6-V10 do not, though correlated predictors still dilute each other.
set.seed(200)
n <- 500
# y is random noise - no true relationship with any predictor
y_noise <- rnorm(n)
bias_data <- tibble(
y = y_noise,
continuous = rnorm(n), # many unique values
fine = round(rnorm(n), 1), # moderate granularity
coarse = round(rnorm(n)), # integer - low granularity
binary = sample(c(0, 1), n, replace = TRUE) # only 2 values
)
set.seed(200)
bias_tree <- rpart(y ~ ., data = bias_data, control = rpart.control(maxdepth = 3))
bias_imp <- bias_tree$variable.importance
if (length(bias_imp) > 0) {
tibble(Variable = names(bias_imp),
Importance = bias_imp) %>%
ggplot(aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "#e66101") +
coord_flip() +
labs(title = "Tree Bias: Importance by Predictor Granularity",
subtitle = "All predictors are pure noise - y has no true relationship with any",
x = "Predictor", y = "Importance") +
theme_minimal()
} else {
message("No splits were made - all predictors correctly ignored.")
}
Observation: The continuous predictor wins even though none of them mean anything. More split points means more chances to look useful by accident.
data(solubility)
sol_df <- solTrainXtrans %>%
as.data.frame() %>%
mutate(y = solTrainY)
set.seed(200)
gbm_low <- train(y ~ ., data = sol_df,
method = "gbm",
verbose = FALSE,
tuneGrid = expand.grid(
n.trees = 1000,
interaction.depth = 4,
shrinkage = 0.1,
n.minobsinnode = 10
))
imp_low <- varImp(gbm_low)$importance %>%
rownames_to_column("Variable") %>%
rename(Importance = Overall) %>%
arrange(desc(Importance)) %>%
slice(1:20) %>%
mutate(Setting = "Low (0.1 / 0.1)")
set.seed(200)
gbm_high <- train(y ~ ., data = sol_df,
method = "gbm",
verbose = FALSE,
tuneGrid = expand.grid(
n.trees = 1000,
interaction.depth = 4,
shrinkage = 0.9,
n.minobsinnode = 10
))
imp_high <- varImp(gbm_high)$importance %>%
rownames_to_column("Variable") %>%
rename(Importance = Overall) %>%
arrange(desc(Importance)) %>%
slice(1:20) %>%
mutate(Setting = "High (0.9 / 0.9)")
bind_rows(imp_low, imp_high) %>%
ggplot(aes(x = reorder(Variable, Importance), y = Importance, fill = Setting)) +
geom_col() +
coord_flip() +
facet_wrap(~ Setting, scales = "free") +
scale_fill_manual(values = c("Low (0.1 / 0.1)" = "#2c7bb6",
"High (0.9 / 0.9)" = "#d7191c")) +
labs(title = "GBM Variable Importance: Low vs High Shrinkage",
x = "Predictor", y = "Importance") +
theme_minimal() +
theme(legend.position = "none")
Part (a): The high-shrinkage model concentrates almost all importance on just a handful of predictors. There’s steep dropoff after the top 2 or 3.
Part (b): The low-shrinkage model is likely to generalize better. High shrinkage risks overfitting to whichever predictors happen to correlate with the noise in the training set.
Part (c): Increasing interaction depth would steepen the importance slope even further in the high-shrinkage model.
data(ChemicalManufacturingProcess)
# Separate predictors and response
chem_x <- ChemicalManufacturingProcess[, -1] # all columns except yield
chem_y <- ChemicalManufacturingProcess[, 1]
# Impute missing values with kNN imputation
set.seed(200)
impute_model <- preProcess(chem_x, method = "knnImpute")
chem_x_imp <- predict(impute_model, chem_x)
# Remove near-zero variance predictors
nzv <- nearZeroVar(chem_x_imp)
chem_x_imp <- chem_x_imp[, -nzv]
# Remove highly correlated predictors (threshold 0.90)
cor_mat <- cor(chem_x_imp, use = "pairwise.complete.obs")
high_cor <- findCorrelation(cor_mat, cutoff = 0.90)
chem_x_imp <- chem_x_imp[, -high_cor]
# Train / test split (80/20)
set.seed(200)
train_idx <- createDataPartition(chem_y, p = 0.80, list = FALSE)
train_x <- chem_x_imp[train_idx, ]
test_x <- chem_x_imp[-train_idx, ]
train_y <- chem_y[train_idx]
test_y <- chem_y[-train_idx]
train_df <- train_x %>% as.data.frame() %>% mutate(yield = train_y)
test_df <- test_x %>% as.data.frame() %>% mutate(yield = test_y)
ctrl <- trainControl(method = "cv", number = 10)
set.seed(200)
cart_model <- train(yield ~ ., data = train_df,
method = "rpart",
tuneLength = 10,
trControl = ctrl)
set.seed(200)
rf_model <- train(yield ~ ., data = train_df,
method = "rf",
tuneLength = 5,
ntree = 500,
trControl = ctrl)
set.seed(200)
gbm_model <- train(yield ~ ., data = train_df,
method = "gbm",
tuneLength = 5,
verbose = FALSE,
trControl = ctrl)
resamps <- resamples(list(CART = cart_model,
RF = rf_model,
GBM = gbm_model))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: CART, RF, GBM
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.7521420 0.9143625 1.0173337 1.0014329 1.0487896 1.272429 0
## RF 0.7160432 0.7604402 0.7871186 0.8577466 0.9142958 1.261202 0
## GBM 0.5995838 0.7775140 0.8527816 0.8722095 0.9254392 1.335244 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 1.0272083 1.1363924 1.214104 1.233823 1.317890 1.543316 0
## RF 0.8999486 0.9519647 1.051475 1.123891 1.157207 1.738877 0
## GBM 0.8539661 0.9401171 1.045415 1.091232 1.143023 1.698583 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.4254823 0.5276662 0.5651583 0.5561615 0.5994056 0.6942489 0
## RF 0.3414838 0.5646167 0.6537324 0.6304415 0.7036304 0.8294807 0
## GBM 0.4525038 0.5857719 0.6417557 0.6401266 0.7222969 0.8316321 0
dotplot(resamps, metric = "RMSE",
main = "Cross-Validated RMSE: CART vs RF vs GBM")
results <- tibble(
Model = c("CART", "RF", "GBM"),
Test_RMSE = c(
RMSE(predict(cart_model, test_df), test_y),
RMSE(predict(rf_model, test_df), test_y),
RMSE(predict(gbm_model, test_df), test_y)
),
Test_R2 = c(
cor(predict(cart_model, test_df), test_y)^2,
cor(predict(rf_model, test_df), test_y)^2,
cor(predict(gbm_model, test_df), test_y)^2
)
)
results
## # A tibble: 3 × 3
## Model Test_RMSE Test_R2
## <chr> <dbl> <dbl>
## 1 CART 1.76 0.255
## 2 RF 1.40 0.523
## 3 GBM 1.37 0.544
Observation: RF and GBM both crush the single tree,ensembling handle the noise in this dataset much better than any one split structure can.
best_model <- gbm_model
imp_df <- varImp(best_model)$importance %>%
rownames_to_column("Variable") %>%
rename(Importance = Overall) %>%
arrange(desc(Importance)) %>%
slice(1:15) %>%
mutate(
Type = if_else(str_detect(Variable, "^Bio"),
"Biological", "Process")
)
ggplot(imp_df, aes(x = reorder(Variable, Importance),
y = Importance, fill = Type)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = c("Biological" = "#2c7bb6",
"Process" = "#d7191c")) +
labs(title = "Top 15 Predictors - Optimal Tree-Based Model (GBM)",
x = "Predictor", y = "Importance") +
theme_minimal()
Observation: Process variables own the top of the list. Unlike biological inputs, these can actually be adjusted on the factory floor.
best_cart <- cart_model$finalModel
rpart.plot(best_cart,
type = 4,
extra = 101,
fallen.leaves = TRUE,
main = "Optimal Single Tree - Chemical Manufacturing Yield")
# Get node assignments and plot yield distribution by terminal node
node_assign <- predict(best_cart, train_df, type = "vector")
# rpart node memberships
train_df %>%
mutate(node = as.factor(best_cart$where)) %>%
ggplot(aes(x = node, y = yield, fill = node)) +
geom_boxplot(alpha = 0.7) +
labs(title = "Yield Distribution by Terminal Node - Single Tree",
x = "Terminal Node", y = "Yield") +
theme_minimal() +
theme(legend.position = "none")
Observation: Each leaf captures a meaningfully different yield range, and the wide boxes show exactly where the tree runs out of answers. ```