library(mlbench)
library(randomForest)
library(caret)

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
print(rfImp1)
## Overall
## V1 8.84289608
## V2 6.74508245
## V3 0.67830653
## V4 7.75934674
## V5 2.23628276
## V6 0.11429887
## V7 0.03724747
## V8 -0.05349642
## V9 -0.04495617
## V10 0.03863205
importance_scores <- data.frame(Variable = rownames(rfImp1), Importance = rfImp1$Overall)
uninformative <- importance_scores[importance_scores$Variable %in% paste0("V", 6:10), ]
informative <- importance_scores[!importance_scores$Variable %in% paste0("V", 6:10), ]
cat("Uninformative Predictors (V6-V10):\n")
## Uninformative Predictors (V6-V10):
print(uninformative)
## Variable Importance
## 6 V6 0.11429887
## 7 V7 0.03724747
## 8 V8 -0.05349642
## 9 V9 -0.04495617
## 10 V10 0.03863205
cat("\nInformative Predictors (V1-V5):\n")
##
## Informative Predictors (V1-V5):
print(informative)
## Variable Importance
## 1 V1 8.8428961
## 2 V2 6.7450825
## 3 V3 0.6783065
## 4 V4 7.7593467
## 5 V5 2.2362828
The random forest cleanly separates the informative predictors from the uninformative ones: V1, V2, and V4 score highest, V5 and V3 contribute modestly, and the importance of V6–V10 is indistinguishable from zero. The noise predictors are effectively ignored by the model.
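The conditional-importance comparison that follows includes two extra predictors, duplicate1 and duplicate2, which the importance tables show are highly correlated with V1. The step that creates them is not shown above; a minimal sketch, assuming additive noise on a 0.1 scale (the exact scale is an assumption):
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1  # near-copy of V1; noise scale assumed
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1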
library(party)

set.seed(200)
cforest_model <- cforest(y ~ ., data = simulated,
                         controls = cforest_unbiased(ntree = 1000, mtry = 5))
traditional_importance <- varimp(cforest_model, conditional = FALSE)
cat("Traditional Importance:\n")
## Traditional Importance:
print(traditional_importance)
## V1 V2 V3 V4 V5 V6
## 5.695285074 6.014531785 0.017901587 7.423626514 1.891218755 -0.025398835
## V7 V8 V9 V10 duplicate1 duplicate2
## 0.027401958 -0.024664819 -0.002522453 -0.007525192 2.689790351 1.522413571
conditional_importance <- varimp(cforest_model, conditional = TRUE)
cat("\nConditional Importance:\n")
##
## Conditional Importance:
print(conditional_importance)
## V1 V2 V3 V4 V5 V6
## 2.543747111 4.759973453 0.009821272 6.296274284 1.320378722 0.019950345
## V7 V8 V9 V10 duplicate1 duplicate2
## 0.006028479 -0.006716411 0.002773191 -0.003680054 1.406771323 0.611015757
importance_comparison <- data.frame(
  Variable = names(traditional_importance),
  Traditional = traditional_importance,
  Conditional = conditional_importance
)
importance_comparison <- importance_comparison[order(-importance_comparison$Traditional), ]
print(importance_comparison)
## Variable Traditional Conditional
## V4 V4 7.423626514 6.296274284
## V2 V2 6.014531785 4.759973453
## V1 V1 5.695285074 2.543747111
## duplicate1 duplicate1 2.689790351 1.406771323
## V5 V5 1.891218755 1.320378722
## duplicate2 duplicate2 1.522413571 0.611015757
## V7 V7 0.027401958 0.006028479
## V3 V3 0.017901587 0.009821272
## V9 V9 -0.002522453 0.002773191
## V10 V10 -0.007525192 -0.003680054
## V8 V8 -0.024664819 -0.006716411
## V6 V6 -0.025398835 0.019950345
The conditional importance of V1 (2.5437) is less than half its traditional score (5.6953), because the conditional scheme discounts the information V1 shares with duplicate1 and duplicate2.
V4 (6.2963) and V2 (4.7600) remain prominent: they are not strongly correlated with the other predictors, so their contributions stand on their own.
The scores for V6–V10 remain near zero under both measures, confirming the model makes no meaningful use of them.
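A quick verification sketch of the redundancy driving V1's drop, using the variables already in simulated:
# duplicate1/duplicate2 should correlate strongly with V1 but not with V2 or V4
cor(simulated[, c("V1", "duplicate1", "duplicate2")])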
set.seed(123)
n <- 1000
X1 <- runif(n, 0, 10)
X2 <- factor(sample(letters[1:5], n, replace = TRUE))
y <- 0.5 * X1 + as.numeric(X2) + rnorm(n, 0, 1)
data <- data.frame(X1, X2, y)
if (!require("rpart")) install.packages("rpart", dependencies = TRUE)
library(rpart)
tree_model <- rpart(y ~ X1 + X2, data = data, method = "anova")
plot(tree_model)
text(tree_model, use.n = TRUE)
tree_importance <- tree_model$variable.importance
print(tree_importance)
## X2 X1
## 1933.049 1895.806
if (!require("randomForest")) install.packages("randomForest", dependencies = TRUE)
library(randomForest)
rf_model <- randomForest(y ~ X1 + X2, data = data, importance = TRUE)
rf_importance <- importance(rf_model)
print(rf_importance)
## %IncMSE IncNodePurity
## X1 93.31373 2033.961
## X2 249.75163 1954.218
importance_comparison <- data.frame(
  Model = c("Decision Tree", "Random Forest"),
  X1 = c(unname(tree_importance["X1"]), rf_importance["X1", "IncNodePurity"]),
  X2 = c(unname(tree_importance["X2"]), rf_importance["X2", "IncNodePurity"])
)
print(importance_comparison)
##           Model       X1       X2
## 1 Decision Tree 1895.806 1933.049
## 2 Random Forest 2033.961 1954.218
barplot(as.matrix(importance_comparison[-1]), beside = TRUE,
        col = c("skyblue", "orange"), legend.text = importance_comparison$Model,
        main = "Variable Importance Comparison", ylab = "Importance Score")
Both predictors contribute comparably to y by construction, yet the forest's IncNodePurity slightly favors the continuous X1 over the five-level factor X2: random forests exhibit a mild bias toward predictors with finer granularity, though they mitigate this selection bias better than a single tree does.
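The bias is easier to see in isolation. In the sketch below (variable names are illustrative), neither predictor is related to the response, but the continuous one offers roughly n candidate split points against a single possible split for the binary factor, so an unpruned tree tends to select it:
set.seed(321)
noise_cont <- runif(n)                               # many distinct split points
noise_bin <- factor(sample(0:1, n, replace = TRUE))  # one possible split
y_noise <- rnorm(n)                                  # pure-noise response
noise_tree <- rpart(y_noise ~ noise_cont + noise_bin,
                    control = rpart.control(cp = 0, maxdepth = 2))
noise_tree$variable.importance                       # typically dominated by noise_cont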
library(AppliedPredictiveModeling)

data(ChemicalManufacturingProcess)
predictors <- ChemicalManufacturingProcess[, -1]
preProcess_params <- preProcess(predictors, method = "medianImpute")
predictors_imputed <- predict(preProcess_params, predictors)
ChemicalManufacturingProcess_imputed <- cbind(
  Yield = ChemicalManufacturingProcess$Yield,
  predictors_imputed
)
set.seed(123)
trainIndex <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.8, list = FALSE)
train_data <- ChemicalManufacturingProcess_imputed[trainIndex, ]
test_data <- ChemicalManufacturingProcess_imputed[-trainIndex, ]
train_predictors <- train_data[, -1]
train_yield <- train_data$Yield
test_predictors <- test_data[, -1]
test_yield <- test_data$Yield
train_control <- trainControl(method = "cv", number = 10)
rpart_model <- train(
  x = train_predictors,
  y = train_yield,
  method = "rpart",
  trControl = train_control,
  tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
This warning is typical for rpart in caret: the largest candidate cp values prune the tree to a single node, so some folds produce constant predictions and an undefined R-squared. It does not affect the selected model.
rf_model <- train(
  x = train_predictors,
  y = train_yield,
  method = "rf",
  trControl = train_control,
  importance = TRUE,
  tuneLength = 10
)
gbm_model <- train(
  x = train_predictors,
  y = train_yield,
  method = "gbm",
  trControl = train_control,
  tuneLength = 10,
  verbose = FALSE
)
rpart_rmse <- RMSE(predict(rpart_model, test_predictors), test_yield)
rf_rmse <- RMSE(predict(rf_model, test_predictors), test_yield)
gbm_rmse <- RMSE(predict(gbm_model, test_predictors), test_yield)
model_performance <- data.frame(
  Model = c("Decision Tree", "Random Forest", "Gradient Boosting"),
  RMSE = c(rpart_rmse, rf_rmse, gbm_rmse)
)
print(model_performance)
## Model RMSE
## 1 Decision Tree 1.800803
## 2 Random Forest 1.270549
## 3 Gradient Boosting 1.115692
Gradient boosting gives the lowest test RMSE (1.12), narrowly beating the random forest (1.27); both clearly outperform the single decision tree (1.80).
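A single test-set RMSE is one draw; the resampling distributions from the ten CV folds give a fuller comparison. A sketch using caret's resamples() (strictly, the same seed should be set before each train() call so all three models see identical folds):
cv_results <- resamples(list(
  CART = rpart_model,
  RF = rf_model,
  GBM = gbm_model
))
summary(cv_results)  # RMSE, Rsquared, and MAE for each model across the folds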
gbm_importance <- summary(gbm_model$finalModel, n.trees = gbm_model$bestTune$n.trees)
top_10_gbm <- head(gbm_importance[order(-gbm_importance$rel.inf), ], 10)
print(top_10_gbm)
## var rel.inf
## ManufacturingProcess32 ManufacturingProcess32 25.713553
## ManufacturingProcess17 ManufacturingProcess17 4.597161
## ManufacturingProcess13 ManufacturingProcess13 4.451986
## BiologicalMaterial03 BiologicalMaterial03 4.406043
## BiologicalMaterial12 BiologicalMaterial12 4.355325
## ManufacturingProcess31 ManufacturingProcess31 4.154567
## ManufacturingProcess27 ManufacturingProcess27 3.261691
## ManufacturingProcess06 ManufacturingProcess06 2.998499
## BiologicalMaterial09 BiologicalMaterial09 2.933156
## ManufacturingProcess09 ManufacturingProcess09 2.603285
Manufacturing process variables dominate the list: ManufacturingProcess32 alone accounts for about 26% of the relative influence, more than five times the next predictor.
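caret's varImp() gives the same ranking in a model-agnostic form and plots directly; a minimal sketch:
gbm_imp <- varImp(gbm_model)  # scaled to 0-100 by default
plot(gbm_imp, top = 10)       # dotplot of the ten most influential predictors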
library(rpart.plot)
rpart.plot(rpart_model$finalModel, main = "Optimal Decision Tree")
train_data$Node <- rpart_model$finalModel$where
node_summary <- aggregate(
  Yield ~ Node, data = train_data,
  FUN = function(x) c(mean = mean(x), sd = sd(x))
)
print(node_summary)
## Node Yield.mean Yield.sd
## 1 5 36.9771429 0.9937759
## 2 7 38.0938462 0.4383024
## 3 8 39.1190909 0.8099686
## 4 9 39.3275000 0.6390075
## 5 11 39.5461111 0.9947399
## 6 12 41.4870000 0.9461154
## 7 15 40.3590000 0.9218511
## 8 16 42.0809091 1.3927559
## 9 18 42.1178947 0.8464867
## 10 19 43.5957143 1.4366147
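The spread within each terminal node is easier to judge graphically; a one-line sketch using the Node column just added:
boxplot(Yield ~ Node, data = train_data,
        xlab = "Terminal node", ylab = "Yield",
        main = "Yield distribution by terminal node")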
rpart_importance <- rpart_model$finalModel$variable.importance
rpart_importance_sorted <- sort(rpart_importance, decreasing = TRUE)
top_10_rpart <- head(rpart_importance_sorted, 10)
print(top_10_rpart)
## ManufacturingProcess32 ManufacturingProcess33 ManufacturingProcess36
## 207.37619 130.97444 127.33626
## ManufacturingProcess26 ManufacturingProcess19 ManufacturingProcess28
## 102.38823 80.03993 80.03993
## BiologicalMaterial11 BiologicalMaterial06 BiologicalMaterial03
## 65.84740 53.12715 52.80795
## BiologicalMaterial02
## 52.21608
The ranking broadly agrees with the boosted model: ManufacturingProcess32 again leads by a wide margin, and manufacturing process variables outnumber the biological ones in the top 10.
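One way to quantify that agreement, using the two top-10 tables computed above:
# Predictors appearing in both the rpart and gbm top-10 lists
shared <- intersect(names(top_10_rpart), as.character(top_10_gbm$var))
print(shared)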