#============================================================ #LAB 010 — Random Forests & Gradient Boosting #Dataset: Breast Cancer (Wisconsin) #Στόχος: Πρόβλεψη κακοήθειας από κυτταρολογικά χαρακτηριστικά #============================================================
set.seed(42)
data("BreastCancer", package = "mlbench")
bc <- BreastCancer
bc$Id <- NULL
bc <- na.omit(bc)
bc[, 1:9] <- lapply(bc[, 1:9], function(x) as.numeric(as.character(x)))
str(bc)
## 'data.frame': 683 obs. of 10 variables:
## $ Cl.thickness : num 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : num 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : num 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: num 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
table(bc$Class)
##
## benign malignant
## 444 239
#Hint: createDataPartition() από το caret
train_index <- createDataPartition(bc$Class, p = 0.7, list = FALSE)
train_set <- bc[train_index, ]
test_set <- bc[-train_index, ]
#Στόχος: μοντέλο που προβλέπει το Class
rf_model <- randomForest(Class ~ ., data = train_set, ntree = 500, importance = TRUE)
#Hint: confusionMatrix() + roc()
rf_preds <- predict(rf_model, test_set)
rf_probs <- predict(rf_model, test_set, type = "prob")[, "malignant"]
rf_cm <- confusionMatrix(rf_preds, test_set$Class, positive = "malignant")
rf_roc <- roc(test_set$Class, rf_probs)
## Setting levels: control = benign, case = malignant
## Setting direction: controls < cases
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 131 2
## malignant 2 69
##
## Accuracy : 0.9804
## 95% CI : (0.9506, 0.9946)
## No Information Rate : 0.652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9568
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9718
## Specificity : 0.9850
## Pos Pred Value : 0.9718
## Neg Pred Value : 0.9850
## Prevalence : 0.3480
## Detection Rate : 0.3382
## Detection Prevalence : 0.3480
## Balanced Accuracy : 0.9784
##
## 'Positive' Class : malignant
##
cat("AUC:", auc(rf_roc), "\n")
## AUC: 0.9981997
#Ποια features είναι τα top 3;
varImpPlot(rf_model, main = "RF Feature Importance")
#Hint: as.matrix() + ifelse() για το target
train_x <- as.matrix(train_set[, 1:9])
train_y <- ifelse(train_set$Class == "malignant", 1, 0)
test_x <- as.matrix(test_set[, 1:9])
test_y <- ifelse(test_set$Class == "malignant", 1, 0)
dtrain <- xgb.DMatrix(data = train_x, label = train_y)
dtest <- xgb.DMatrix(data = test_x, label = test_y)
#Παράμετροι: max_depth=4, eta=0.1, nrounds=500, early_stopping_rounds=20
xgb_params <- list(objective = "binary:logistic", max_depth = 4, eta = 0.1)
xgb_model <- xgb.train(params = xgb_params, data = dtrain, nrounds = 500,
watchlist = list(val = dtest, train = dtrain),
early_stopping_rounds = 20, verbose = 0)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
# Προβλέψεις XGBoost
xgb_probs <- predict(xgb_model, test_x)
xgb_preds <- ifelse(xgb_probs > 0.5, "malignant", "benign")
xgb_preds <- factor(xgb_preds, levels = c("benign", "malignant"))
#Στήλες: Accuracy, Sensitivity, Specificity, AUC
xgb_cm <- confusionMatrix(xgb_preds, test_set$Class, positive = "malignant")
xgb_roc <- roc(test_set$Class, xgb_probs)
## Setting levels: control = benign, case = malignant
## Setting direction: controls < cases
comparison_table <- data.frame(
Metric = c("Accuracy", "Sensitivity", "Specificity", "AUC"),
RF = c(rf_cm$overall["Accuracy"], rf_cm$byClass["Sensitivity"], rf_cm$byClass["Specificity"], auc(rf_roc)),
XGBoost = c(xgb_cm$overall["Accuracy"], xgb_cm$byClass["Sensitivity"], xgb_cm$byClass["Specificity"], auc(xgb_roc))
)
print(comparison_table)
## Metric RF XGBoost
## Accuracy Accuracy 0.9803922 0.9705882
## Sensitivity Sensitivity 0.9718310 0.9436620
## Specificity Specificity 0.9849624 0.9849624
## AUC 0.9981997 0.9955523