#============================================================ #LAB 010 — Random Forests & Gradient Boosting #Dataset: Breast Cancer (Wisconsin) #Στόχος: Πρόβλεψη κακοήθειας από κυτταρολογικά χαρακτηριστικά #============================================================

set.seed(42)

Φόρτωση δεδομένων

data("BreastCancer", package = "mlbench")
bc <- BreastCancer

Καθαρισμός: αφαίρεση ID, χειρισμός missing, μετατροπή σε numeric

bc$Id <- NULL
bc <- na.omit(bc)
bc[, 1:9] <- lapply(bc[, 1:9], function(x) as.numeric(as.character(x)))

str(bc)
## 'data.frame':    683 obs. of  10 variables:
##  $ Cl.thickness   : num  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : num  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : num  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : num  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : num  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : num  1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : num  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: num  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : num  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
table(bc$Class)
## 
##    benign malignant 
##       444       239

ΜΕΡΟΣ Α — BASELINE με Random Forest

TODO 1: Κάνε stratified train/test split (70/30)

#Hint: createDataPartition() από το caret

train_index <- createDataPartition(bc$Class, p = 0.7, list = FALSE)
train_set <- bc[train_index, ]
test_set  <- bc[-train_index, ]

TODO 2: Εκπαίδευσε ένα Random Forest με ntree=500, importance=TRUE

#Στόχος: μοντέλο που προβλέπει το Class

rf_model <- randomForest(Class ~ ., data = train_set, ntree = 500, importance = TRUE)

TODO 3: Υπολόγισε Accuracy, Sensitivity, AUC στο test set

#Hint: confusionMatrix() + roc()

rf_preds <- predict(rf_model, test_set)
rf_probs <- predict(rf_model, test_set, type = "prob")[, "malignant"]
rf_cm <- confusionMatrix(rf_preds, test_set$Class, positive = "malignant")
rf_roc <- roc(test_set$Class, rf_probs)
## Setting levels: control = benign, case = malignant
## Setting direction: controls < cases
print(rf_cm)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign       131         2
##   malignant      2        69
##                                           
##                Accuracy : 0.9804          
##                  95% CI : (0.9506, 0.9946)
##     No Information Rate : 0.652           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9568          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9718          
##             Specificity : 0.9850          
##          Pos Pred Value : 0.9718          
##          Neg Pred Value : 0.9850          
##              Prevalence : 0.3480          
##          Detection Rate : 0.3382          
##    Detection Prevalence : 0.3480          
##       Balanced Accuracy : 0.9784          
##                                           
##        'Positive' Class : malignant       
## 
cat("AUC:", auc(rf_roc), "\n")
## AUC: 0.9981997

TODO 4: Δείξε το Variable Importance plot

#Ποια features είναι τα top 3;

varImpPlot(rf_model, main = "RF Feature Importance")

MΕΡΟΣ Β — BOOSTING & TUNING

TODO 5: Προετοίμασε τα δεδομένα για XGBoost

#Hint: as.matrix() + ifelse() για το target

train_x <- as.matrix(train_set[, 1:9])
train_y <- ifelse(train_set$Class == "malignant", 1, 0)
test_x  <- as.matrix(test_set[, 1:9])
test_y  <- ifelse(test_set$Class == "malignant", 1, 0)

dtrain <- xgb.DMatrix(data = train_x, label = train_y)
dtest  <- xgb.DMatrix(data = test_x, label = test_y)

TODO 6: Εκπαίδευσε ένα XGBoost μοντέλο με early stopping

#Παράμετροι: max_depth=4, eta=0.1, nrounds=500, early_stopping_rounds=20

xgb_params <- list(objective = "binary:logistic", max_depth = 4, eta = 0.1)
xgb_model <- xgb.train(params = xgb_params, data = dtrain, nrounds = 500, 
                       watchlist = list(val = dtest, train = dtrain),
                       early_stopping_rounds = 20, verbose = 0)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
# Προβλέψεις XGBoost
xgb_probs <- predict(xgb_model, test_x)
xgb_preds <- ifelse(xgb_probs > 0.5, "malignant", "benign")
xgb_preds <- factor(xgb_preds, levels = c("benign", "malignant"))

TODO 7: Σύγκρινε XGBoost vs Random Forest σε ένα πίνακα

#Στήλες: Accuracy, Sensitivity, Specificity, AUC

xgb_cm <- confusionMatrix(xgb_preds, test_set$Class, positive = "malignant")
xgb_roc <- roc(test_set$Class, xgb_probs)
## Setting levels: control = benign, case = malignant
## Setting direction: controls < cases
comparison_table <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "AUC"),
  RF = c(rf_cm$overall["Accuracy"], rf_cm$byClass["Sensitivity"], rf_cm$byClass["Specificity"], auc(rf_roc)),
  XGBoost = c(xgb_cm$overall["Accuracy"], xgb_cm$byClass["Sensitivity"], xgb_cm$byClass["Specificity"], auc(xgb_roc))
)
print(comparison_table)
##                  Metric        RF   XGBoost
## Accuracy       Accuracy 0.9803922 0.9705882
## Sensitivity Sensitivity 0.9718310 0.9436620
## Specificity Specificity 0.9849624 0.9849624
##                     AUC 0.9981997 0.9955523