# ============================================================
# LAB 010 — Random Forests & Gradient Boosting
# Dataset: Breast Cancer (Wisconsin)
# Στόχος: Πρόβλεψη κακοήθειας από κυτταρολογικά χαρακτηριστικά
# ============================================================
set.seed(42)
# --- Φόρτωση δεδομένων ---
data("BreastCancer", package = "mlbench")
bc <- BreastCancer
# Καθαρισμός: αφαίρεση ID, χειρισμός missing, μετατροπή σε numeric
bc$Id <- NULL
bc <- na.omit(bc)
bc[, 1:9] <- lapply(bc[, 1:9], function(x) as.numeric(as.character(x)))
str(bc)
## 'data.frame': 683 obs. of 10 variables:
## $ Cl.thickness : num 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : num 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : num 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: num 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
table(bc$Class)
##
## benign malignant
## 444 239
Ποιο Accuracy πήρατε;
Ποια ήταν τα top-3 features σας;
Είναι 97% accuracy «αρκετό» σε ιατρικό context;
Θα απαντήθούν κατά την επίλυση
set.seed(42)
train_idx <- createDataPartition(bc$Class, p = 0.7, list = FALSE)
train <- bc[train_idx, ]
test <- bc[-train_idx, ]
# Έλεγχος ότι οι αναλογίες διατηρήθηκαν (stratified split)
prop.table(table(train$Class)) %>% round(3)
##
## benign malignant
## 0.649 0.351
prop.table(table(test$Class)) %>% round(3)
##
## benign malignant
## 0.652 0.348
set.seed(42)
rf_model <- randomForest(
Class ~ .,
data = train,
ntree = 500,
importance = TRUE
)
print(rf_model)
##
## Call:
## randomForest(formula = Class ~ ., data = train, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 3.76%
## Confusion matrix:
## benign malignant class.error
## benign 301 10 0.03215434
## malignant 8 160 0.04761905
rf_pred <- predict(rf_model, test)
rf_prob <- predict(rf_model, test, type = "prob")[, "malignant"]
# Confusion Matrix
rf_cm <- confusionMatrix(rf_pred, test$Class, positive = "malignant")
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 131 2
## malignant 2 69
##
## Accuracy : 0.9804
## 95% CI : (0.9506, 0.9946)
## No Information Rate : 0.652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9568
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9718
## Specificity : 0.9850
## Pos Pred Value : 0.9718
## Neg Pred Value : 0.9850
## Prevalence : 0.3480
## Detection Rate : 0.3382
## Detection Prevalence : 0.3480
## Balanced Accuracy : 0.9784
##
## 'Positive' Class : malignant
##
# AUC
rf_auc <- roc(test$Class, rf_prob, levels = c("benign", "malignant"))$auc
## Setting direction: controls < cases
cat("RF AUC:", round(rf_auc, 3), "\n")
## RF AUC: 0.999
Άρα απαντώντας στην Ερώτηση 1, το Accuracy = 0.9804. Πολύ υψηλο!
varImpPlot(rf_model, main = "Variable Importance — Random Forest")
importance(rf_model)
## benign malignant MeanDecreaseAccuracy MeanDecreaseGini
## Cl.thickness 12.702656 19.830096 19.363837 11.845927
## Cell.size 11.605774 16.330563 19.826188 60.344346
## Cell.shape 10.417048 20.502944 21.800283 53.561147
## Marg.adhesion 6.228395 12.425305 13.009741 5.977215
## Epith.c.size 10.510480 3.579421 11.128863 11.709247
## Bare.nuclei 20.271034 19.036320 26.035566 34.154820
## Bl.cromatin 6.086543 14.621620 14.691182 14.587449
## Normal.nucleoli 11.849778 11.676750 15.229482 23.815034
## Mitoses 4.984157 0.839361 4.608274 1.550928
Για την Ερώτηση 2, τα top-3 features είναι: 1. Bare.nuclei 2. Cell.shape 3. Cell.size
Ποιο μοντέλο νίκησε;Με πόση διαφορά;
Σας εξέπληξε κάτι στα αποτελέσματα;
Όσοι έκαναν το BONUS: τι παρατηρήσατε με τα 3 διαφορετικά eta;
# Μετατροπή σε numeric matrix
train_x <- as.matrix(train[, 1:9])
train_y <- ifelse(train$Class == "malignant", 1, 0)
test_x <- as.matrix(test[, 1:9])
test_y <- ifelse(test$Class == "malignant", 1, 0)
# Δημιουργία xgb.DMatrix (το optimized format του XGBoost)
dtrain <- xgb.DMatrix(data = train_x, label = train_y)
dtest <- xgb.DMatrix(data = test_x, label = test_y)
set.seed(42)
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = 4,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8
)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 500,
watchlist = list(train = dtrain, test = dtest),
early_stopping_rounds = 20,
print_every_n = 25,
verbose = 1
)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 20 rounds.
##
## [1] train-auc:0.977109 test-auc:0.967489
## [26] train-auc:0.997722 test-auc:0.995446
## [51] train-auc:0.999541 test-auc:0.997458
## [76] train-auc:0.999847 test-auc:0.997564
## Stopping. Best iteration:
## [95] train-auc:0.999923 test-auc:0.997458
##
## [95] train-auc:0.999923 test-auc:0.997458
xgb_prob <- predict(xgb_model, dtest)
xgb_pred <- factor(ifelse(xgb_prob > 0.5, "malignant", "benign"),
levels = c("benign", "malignant"))
xgb_cm <- confusionMatrix(xgb_pred, test$Class, positive = "malignant")
xgb_auc <- roc(test$Class, xgb_prob, levels = c("benign", "malignant"))$auc
## Setting direction: controls < cases
results <- data.frame(
Model = c("Random Forest", "XGBoost"),
Accuracy = c(rf_cm$overall["Accuracy"],
xgb_cm$overall["Accuracy"]),
Sensitivity = c(rf_cm$byClass["Sensitivity"],
xgb_cm$byClass["Sensitivity"]),
Specificity = c(rf_cm$byClass["Specificity"],
xgb_cm$byClass["Specificity"]),
AUC = c(rf_auc, xgb_auc)
) %>% mutate(across(where(is.numeric), ~ round(.x, 3)))
print(results)
## Model Accuracy Sensitivity Specificity AUC
## 1 Random Forest 0.980 0.972 0.985 0.999
## 2 XGBoost 0.975 0.958 0.985 0.998
Από το βήμα “Εκπαίδευση XGBoost μοντέλου με early stopping”, με eta=0.1 έχουμε 95 επαναλήψεις με AUC = 0.997458
#eta = 0.01
set.seed(42)
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = 4,
eta = 0.01,
subsample = 0.8,
colsample_bytree = 0.8
)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 500,
watchlist = list(train = dtrain, test = dtest),
early_stopping_rounds = 20,
print_every_n = 25,
verbose = 1
)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 20 rounds.
##
## [1] train-auc:0.977109 test-auc:0.967489
## [26] train-auc:0.992402 test-auc:0.995605
## [51] train-auc:0.992861 test-auc:0.996294
## Stopping. Best iteration:
## [76] train-auc:0.994851 test-auc:0.996399
##
## [76] train-auc:0.994851 test-auc:0.996399
#eta = 0.3
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = 4,
eta = 0.3,
subsample = 0.8,
colsample_bytree = 0.8
)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 500,
watchlist = list(train = dtrain, test = dtest),
early_stopping_rounds = 20,
print_every_n = 25,
verbose = 1
)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 20 rounds.
##
## [1] train-auc:0.982507 test-auc:0.967807
## [26] train-auc:0.999694 test-auc:0.997353
## [51] train-auc:0.999962 test-auc:0.997564
## Stopping. Best iteration:
## [59] train-auc:1.000000 test-auc:0.997035
##
## [59] train-auc:1.000000 test-auc:0.997035
Για eta = 0.01, έχουμε 76 επαναλήψεις με AUC = 0.996399 Για eta = 0.3, έχουμε 59 επαναλήψεις με AUC = 0.997035.
roc_rf <- roc(test$Class, rf_prob, levels = c("benign", "malignant"))
## Setting direction: controls < cases
roc_xgb <- roc(test$Class, xgb_prob, levels = c("benign", "malignant"))
## Setting direction: controls < cases
plot(roc_rf, col = "forestgreen", lwd = 2, main = "ROC Curves Comparison")
lines(roc_xgb, col = "steelblue", lwd = 2)
legend("bottomright",
legend = c(paste0("RF (AUC = ", round(rf_auc, 3), ")"),
paste0("XGB (AUC = ", round(xgb_auc, 3), ")")),
col = c("forestgreen", "steelblue"),
lwd = 2)
Συγκεντρωτικά τα στοιχεία είναι (με αυξανόμενο eta):
| eta | repeats | AUC |
|---|---|---|
| 0.01 | 76 | 0.996399 |
| 0.1 | 95 | 0.997458 |
| 0.3 | 59 | 0.997035 |
Παρατηρούμε πως με μικρότερο (0.01) ή μεγάλητερο (0.3) eta οι επαναλήψεις μειώνονται αλλά και το AUC μειώνεται σε σχεση με eta=0,1. Οπότε, παρόλο που το eta = 0.1 έχει τις περισσότερες επαναλήψεις, το AUC του είναι μεγαλήτερο. Άρα, για ένα καλύτερο αποτέλεσμα θα πρέπει να επιλεχθεί μια μέτρια τιμή eta όμως με αντάλαγμα την ταχύτητα.