─────────────────────────────────────────────────────────────────────────────
1) Split the data
─────────────────────────────────────────────────────────────────────────────
library(caret)          # createDataPartition, downSample, upSample, confusionMatrix
library(ROSE)           # ovun.sample
library(randomForest)
library(gbm)
set.seed(42)            # reproducible partitions (the ovun.sample calls below also use seed = 42)
# 60% train / 20% validation / 20% test, stratified on the label
train_idx  <- createDataPartition(df$label, p = 0.60, list = FALSE)
train_all  <- df[ train_idx, ]
rest       <- df[-train_idx, ]
val_idx    <- createDataPartition(rest$label, p = 0.50, list = FALSE)
validation <- rest[ val_idx, ]
test       <- rest[-val_idx, ]
predictors <- setdiff(names(train_all), "label")
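createDataPartition stratifies on the label, so each split should preserve the class mix; a quick sanity check on the three splits:
# Class proportions should be near-identical across train/validation/test
sapply(list(train = train_all, validation = validation, test = test),
       function(d) prop.table(table(d$label)))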
─────────────────────────────────────────────────────────────────────────────
2) Build four versions of the TRAINING set (Original, Down, Up, ROSE)
─────────────────────────────────────────────────────────────────────────────
# Down-sample the majority class / up-sample the minority class (caret)
train_down <- downSample(x = train_all[, predictors],
                         y = train_all$label,
                         yname = "label")
train_up   <- upSample(x = train_all[, predictors],
                       y = train_all$label,
                       yname = "label")
# ROSE's method = "both" mixes over- and under-sampling; target size is
# twice the majority-class count
N_target   <- max(table(train_all$label)) * 2
train_rose <- ovun.sample(label ~ ., data = train_all,
                          method = "both",
                          N      = N_target,
                          seed   = 42)$data
train_sets <- list(
  Original = train_all,
  Down     = train_down,
  Up       = train_up,
  ROSE     = train_rose
)
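A quick look at the class counts confirms what each scheme did to the training set:
# Class counts per training set; Original stays imbalanced by design
sapply(train_sets, function(d) table(d$label))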
─────────────────────────────────────────────────────────────────────────────
3) Helpers: compute metrics & find best threshold
─────────────────────────────────────────────────────────────────────────────
compute_metrics <- function(probs, truth, thr) {
  pred <- factor(ifelse(probs > thr, "phish", "ham"),
                 levels = c("ham", "phish"))
  cm   <- confusionMatrix(pred, truth, positive = "phish")
  # unname() keeps cm$byClass names from leaking into the returned vector,
  # so downstream code can rely on clean column names like "F1"
  sens <- unname(cm$byClass["Sensitivity"])
  spec <- unname(cm$byClass["Specificity"])
  prec <- unname(cm$byClass["Pos Pred Value"])
  rec  <- sens
  ba   <- (sens + spec) / 2
  f1   <- ifelse((prec + rec) == 0, NA, 2 * prec * rec / (prec + rec))
  c(Threshold        = thr,
    BalancedAccuracy = ba,
    Precision        = prec,
    Recall           = rec,
    F1               = f1)
}
find_best_threshold <- function(probs, truth) {
  ths        <- seq(0.01, 0.99, by = 0.01)
  metrics    <- t(sapply(ths, function(th) compute_metrics(probs, truth, th)))
  metrics_df <- as.data.frame(metrics)
  # NA F1 values (thresholds with no positive predictions) are skipped by which.max()
  best <- which.max(metrics_df$F1)
  metrics_df[best, , drop = FALSE]
}
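As a quick smoke test, the helpers can be exercised on synthetic scores (hypothetical toy data, not part of the pipeline):
set.seed(1)
toy_truth <- factor(sample(c("ham", "phish"), 200, replace = TRUE, prob = c(0.9, 0.1)),
                    levels = c("ham", "phish"))
# Give phish cases higher scores on average so a useful threshold exists
toy_probs <- ifelse(toy_truth == "phish", rbeta(200, 4, 2), rbeta(200, 2, 4))
find_best_threshold(toy_probs, toy_truth)   # one row: the F1-best threshold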
─────────────────────────────────────────────────────────────────────────────
4) Train ensembles & evaluate on validation
─────────────────────────────────────────────────────────────────────────────
results <- NULL
for (balance in names(train_sets)) {
  tr <- train_sets[[balance]]

  # ---- Random Forest (factor response) ----
  tr$label <- factor(tr$label, levels = c("ham", "phish"))
  rf_mod  <- randomForest(label ~ ., data = tr, ntree = 100)
  rf_prob <- predict(rf_mod, validation, type = "prob")[, "phish"]
  rf_best <- find_best_threshold(rf_prob, validation$label)
  rf_row  <- data.frame(
    Sampling         = balance,
    Model            = "RandomForest",
    Threshold        = rf_best$Threshold,
    BalancedAccuracy = rf_best$BalancedAccuracy,
    Precision        = rf_best$Precision,
    Recall           = rf_best$Recall,
    F1               = rf_best$F1,
    row.names        = NULL
  )

  # ---- GBM ("bernoulli" needs a numeric 0/1 response) ----
  tr_gbm         <- tr
  tr_gbm$label_n <- ifelse(tr_gbm$label == "phish", 1, 0)
  gbm_mod <- gbm(label_n ~ ., data = tr_gbm[, c(predictors, "label_n")],
                 distribution      = "bernoulli",
                 n.trees           = 100,
                 interaction.depth = 3,
                 shrinkage         = 0.1,
                 bag.fraction      = 0.8,
                 n.minobsinnode    = 5,
                 verbose           = FALSE)
  gbm_prob <- predict(gbm_mod,
                      newdata = validation[, predictors],
                      n.trees = 100,
                      type    = "response")
  gbm_best <- find_best_threshold(gbm_prob, validation$label)
  gbm_row  <- data.frame(
    Sampling         = balance,
    Model            = "GBM",
    Threshold        = gbm_best$Threshold,
    BalancedAccuracy = gbm_best$BalancedAccuracy,
    Precision        = gbm_best$Precision,
    Recall           = gbm_best$Recall,
    F1               = gbm_best$F1,
    row.names        = NULL
  )

  results <- rbind(results, rf_row, gbm_row)
}
─────────────────────────────────────────────────────────────────────────────
5) Results, champion selection & final test evaluation
─────────────────────────────────────────────────────────────────────────────
print(results)
##   Sampling        Model Threshold BalancedAccuracy  Precision     Recall         F1
## 1 Original RandomForest      0.09        0.5813481 0.18548387 0.17293233 0.17898833
## 2 Original          GBM      0.06        0.5946628 0.16666667 0.20300752 0.18305085
## 3     Down RandomForest      0.91        0.5728665 0.14893617 0.15789474 0.15328467
## 4     Down          GBM      0.78        0.6064668 0.10491803 0.24060150 0.14611872
## 5       Up RandomForest      0.57        0.6008263 0.03751144 0.30827068 0.06688418
## 6       Up          GBM      0.72        0.6069735 0.10847458 0.24060150 0.14953271
## 7     ROSE RandomForest      0.82        0.5414231 0.08125000 0.09774436 0.08873720
## 8     ROSE          GBM      0.73        0.6007216 0.11194030 0.22556391 0.14962594
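Validation F1 is low for every combination, so it helps to rank the rows before committing to a champion:
# Rank the validation results by F1, best first
results[order(results$F1, decreasing = TRUE), ]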
# 1) Pick the row with the highest Recall
best_rec_row <- results[which.max(results$Recall), ]
best_sampling <- best_rec_row$Sampling
best_model <- best_rec_row$Model
best_threshold <- best_rec_row$Threshold
cat("Highest‑Recall champion:\n",
" Sampling: ", best_sampling, "\n",
" Model: ", best_model, "\n",
" Threshold: ", best_threshold, "\n\n")
## Highest‑Recall champion:
## Sampling: Up
## Model: RandomForest
## Threshold: 0.57
# 2) Build the full 80% development set (train + validation)
dev_set <- rbind(train_all, validation)
# 3) Re-balance dev_set exactly as the champion scheme dictates
if (best_sampling == "Original") {
  dev_train <- dev_set
} else if (best_sampling == "Down") {
  dev_train <- downSample(x = dev_set[, predictors],
                          y = dev_set$label,
                          yname = "label")
} else if (best_sampling == "Up") {
  dev_train <- upSample(x = dev_set[, predictors],
                        y = dev_set$label,
                        yname = "label")
} else if (best_sampling == "ROSE") {
  N_target  <- max(table(dev_set$label)) * 2
  dev_train <- ovun.sample(label ~ ., data = dev_set,
                           method = "both",
                           N      = N_target,
                           seed   = 42)$data
}
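Before retraining, it is worth confirming that dev_train has the balance the champion scheme implies:
# Sanity check: up-sampling should leave equal ham/phish counts here
table(dev_train$label)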
# 4) Retrain the chosen model on the full dev set
if (best_model == "RandomForest") {
  dev_train$label <- factor(dev_train$label, levels = c("ham", "phish"))
  champ_mod <- randomForest(label ~ ., data = dev_train, ntree = 100)
  predict_prob <- function(mod, newdata) {
    predict(mod, newdata, type = "prob")[, "phish"]
  }
} else if (best_model == "GBM") {
  dev_train$label_n <- ifelse(dev_train$label == "phish", 1, 0)
  champ_mod <- gbm(label_n ~ ., data = dev_train[, c(predictors, "label_n")],
                   distribution      = "bernoulli",
                   n.trees           = 100,
                   interaction.depth = 3,
                   shrinkage         = 0.1,
                   bag.fraction      = 0.8,
                   n.minobsinnode    = 5,
                   verbose           = FALSE)
  predict_prob <- function(mod, newdata) {
    predict(mod, newdata, n.trees = 100, type = "response")
  }
} else {
  stop("Unknown model: ", best_model)
}
# 5) Score on TEST at the Recall-maximizing threshold
test_prob <- predict_prob(champ_mod, test)
test_pred <- factor(ifelse(test_prob > best_threshold, "phish", "ham"),
                    levels = c("ham", "phish"))
cm   <- confusionMatrix(test_pred, test$label, positive = "phish")$byClass
sens <- unname(cm["Sensitivity"])
spec <- unname(cm["Specificity"])
prec <- unname(cm["Pos Pred Value"])
ba   <- (sens + spec) / 2
f1   <- ifelse((prec + sens) == 0, NA, 2 * prec * sens / (prec + sens))
final_recall_champ <- data.frame(
  Sampling         = best_sampling,
  Model            = best_model,
  Threshold        = best_threshold,
  BalancedAccuracy = ba,
  Precision        = prec,
  Recall           = sens,
  F1               = f1,
  row.names        = NULL
)
print(final_recall_champ)
##   Sampling        Model Threshold BalancedAccuracy  Precision    Recall         F1
## 1       Up RandomForest      0.57        0.6377014 0.04798464 0.3759398 0.08510638
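Test precision lands near 0.05, meaning most flagged messages are false alarms; the raw confusion counts behind these metrics make that concrete:
# Raw test-set confusion counts (rows = predicted, columns = actual)
table(Predicted = test_pred, Actual = test$label)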
# 1) Define the two validation F1 leaders (both on Original sampling) to refit
to_refit <- data.frame(
  Sampling  = c("Original", "Original"),
  Model     = c("RandomForest", "GBM"),
  Threshold = c(0.09, 0.06),
  stringsAsFactors = FALSE
)
# 2) Build the full 80% development set
dev_set <- rbind(train_all, validation)
# 3) Placeholder for the final metrics
final_two <- data.frame()
# 4) Loop over each champion spec
for (i in seq_len(nrow(to_refit))) {
  samp <- to_refit$Sampling[i]
  modn <- to_refit$Model[i]
  thr  <- to_refit$Threshold[i]

  # 4a) For "Original", dev_train is just the dev_set
  dev_train <- dev_set

  # 4b) Fit the specified model (packages are loaded at the top)
  if (modn == "RandomForest") {
    dev_train$label <- factor(dev_train$label, levels = c("ham", "phish"))
    fit <- randomForest(label ~ ., data = dev_train, ntree = 100)
    pred_prob <- predict(fit, test, type = "prob")[, "phish"]
  } else if (modn == "GBM") {
    tmp <- dev_train
    tmp$label_n <- ifelse(tmp$label == "phish", 1, 0)
    fit <- gbm(label_n ~ ., data = tmp[, c(predictors, "label_n")],
               distribution      = "bernoulli",
               n.trees           = 100,
               interaction.depth = 3,
               shrinkage         = 0.1,
               bag.fraction      = 0.8,
               n.minobsinnode    = 5,
               verbose           = FALSE)
    pred_prob <- predict(fit, newdata = test[, predictors],
                         n.trees = 100, type = "response")
  } else {
    stop("Unknown model: ", modn)
  }

  # 5) Binarize at the chosen threshold & compute metrics
  pred_class <- factor(ifelse(pred_prob > thr, "phish", "ham"),
                       levels = c("ham", "phish"))
  cm   <- caret::confusionMatrix(pred_class, test$label, positive = "phish")$byClass
  ba   <- (cm["Sensitivity"] + cm["Specificity"]) / 2
  prec <- cm["Pos Pred Value"]
  rec  <- cm["Sensitivity"]
  f1   <- ifelse((prec + rec) == 0, NA, 2 * prec * rec / (prec + rec))

  # 6) Store
  final_two <- rbind(final_two, data.frame(
    Sampling         = samp,
    Model            = modn,
    Threshold        = thr,
    BalancedAccuracy = unname(ba),
    Precision        = unname(prec),
    Recall           = unname(rec),
    F1               = unname(f1),
    row.names        = NULL
  ))
}
# 7) Show the results
print(final_recall_champ)
##   Sampling        Model Threshold BalancedAccuracy  Precision    Recall         F1
## 1       Up RandomForest      0.57        0.6377014 0.04798464 0.3759398 0.08510638
print(final_two)
##   Sampling        Model Threshold BalancedAccuracy Precision    Recall        F1
## 1 Original RandomForest      0.09        0.5747921 0.2038835 0.1578947 0.1779661
## 2 Original          GBM      0.06        0.5793718 0.1411043 0.1729323 0.1554054
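Stacking all three test rows makes the trade-off explicit: the up-sampled Random Forest roughly doubles recall but drops precision to about 0.05, while the Original-sampling refits keep precision at 0.14-0.20 with recall near 0.16-0.17:
# Side-by-side test-set comparison of the three refitted champions
rbind(final_recall_champ, final_two)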