ROCR
needed for ROC and AUC predicting,
to find the most optimal cutoff point for our model, as well as loading
the dataset we are currently working on,
bank-additional.csv
, which will be renamed
bank_additional
for ease and convenience.library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(lattice)
library(ggplot2)
library(gam)
## Warning: package 'gam' was built under R version 4.3.3
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(readr)
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.3
bank_additional <- read_delim("bank-additional.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 4119 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (11): job, marital, education, default, housing, loan, contact, month, d...
## dbl (10): age, duration, campaign, pdays, previous, emp.var.rate, cons.price...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(bank_additional)
summary(bank_additional)
## age job marital education
## Min. :18.00 Length:4119 Length:4119 Length:4119
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.11
## 3rd Qu.:47.00
## Max. :88.00
## default housing loan contact
## Length:4119 Length:4119 Length:4119 Length:4119
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:4119 Length:4119 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 103.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 181.0 Median : 2.000
## Mean : 256.8 Mean : 2.537
## 3rd Qu.: 317.0 3rd Qu.: 3.000
## Max. :3643.0 Max. :35.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.0000 Length:4119 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.0000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.0000 Mode :character Median : 1.10000
## Mean :960.4 Mean :0.1903 Mean : 0.08497
## 3rd Qu.:999.0 3rd Qu.:0.0000 3rd Qu.: 1.40000
## Max. :999.0 Max. :6.0000 Max. : 1.40000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. :92.20 Min. :-50.8 Min. :0.635 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.334 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5166
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Length:4119
## Class :character
## Mode :character
##
##
##
str(bank_additional)
## spc_tbl_ [4,119 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:4119] 30 39 25 38 47 32 32 41 31 35 ...
## $ job : chr [1:4119] "blue-collar" "services" "services" "services" ...
## $ marital : chr [1:4119] "married" "single" "married" "married" ...
## $ education : chr [1:4119] "basic.9y" "high.school" "high.school" "basic.9y" ...
## $ default : chr [1:4119] "no" "no" "no" "no" ...
## $ housing : chr [1:4119] "yes" "no" "yes" "unknown" ...
## $ loan : chr [1:4119] "no" "no" "no" "unknown" ...
## $ contact : chr [1:4119] "cellular" "telephone" "telephone" "telephone" ...
## $ month : chr [1:4119] "may" "may" "jun" "jun" ...
## $ day_of_week : chr [1:4119] "fri" "fri" "wed" "fri" ...
## $ duration : num [1:4119] 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : num [1:4119] 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : num [1:4119] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : num [1:4119] 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : chr [1:4119] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num [1:4119] -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num [1:4119] 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num [1:4119] -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num [1:4119] 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num [1:4119] 5099 5191 5228 5228 5196 ...
## $ y : chr [1:4119] "no" "no" "no" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. job = col_character(),
## .. marital = col_character(),
## .. education = col_character(),
## .. default = col_character(),
## .. housing = col_character(),
## .. loan = col_character(),
## .. contact = col_character(),
## .. month = col_character(),
## .. day_of_week = col_character(),
## .. duration = col_double(),
## .. campaign = col_double(),
## .. pdays = col_double(),
## .. previous = col_double(),
## .. poutcome = col_character(),
## .. emp.var.rate = col_double(),
## .. cons.price.idx = col_double(),
## .. cons.conf.idx = col_double(),
## .. euribor3m = col_double(),
## .. nr.employed = col_double(),
## .. y = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
levels(as.factor(bank_additional$y))
## [1] "no" "yes"
levels(as.factor(bank_additional$job))
## [1] "admin." "blue-collar" "entrepreneur" "housemaid"
## [5] "management" "retired" "self-employed" "services"
## [9] "student" "technician" "unemployed" "unknown"
levels(as.factor(bank_additional$marital))
## [1] "divorced" "married" "single" "unknown"
levels(as.factor(bank_additional$education))
## [1] "basic.4y" "basic.6y" "basic.9y"
## [4] "high.school" "illiterate" "professional.course"
## [7] "university.degree" "unknown"
levels(as.factor(bank_additional$default))
## [1] "no" "unknown" "yes"
levels(as.factor(bank_additional$housing))
## [1] "no" "unknown" "yes"
levels(as.factor(bank_additional$loan))
## [1] "no" "unknown" "yes"
levels(as.factor(bank_additional$contact))
## [1] "cellular" "telephone"
levels(as.factor(bank_additional$month))
## [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
levels(as.factor(bank_additional$day_of_week))
## [1] "fri" "mon" "thu" "tue" "wed"
bank_additional$y = as.factor(bank_additional$y)
str(bank_additional$y)
## Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
bank_additional$job = as.factor(bank_additional$job)
bank_additional$marital = as.factor(bank_additional$marital)
bank_additional$education = as.factor(bank_additional$education)
bank_additional$default = as.factor(bank_additional$default)
bank_additional$housing = as.factor(bank_additional$housing)
bank_additional$loan = as.factor(bank_additional$loan)
bank_additional$contact = as.factor(bank_additional$contact)
bank_additional$month = as.factor(bank_additional$month)
bank_additional$day_of_week = as.factor(bank_additional$day_of_week)
str(bank_additional$job)
## Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
str(bank_additional$day_of_week)
## Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
pdays
heavily skews the dataset, leading to creating a
categorical variable.####When the observation is less than or equal to 998, it means an individual was contacted, anything above means no contact previously
#Warning run only once
bank_additional$pdays = ifelse(bank_additional$pdays <= 998, "yes", "no")
bank_additional$pdays = as.factor(bank_additional$pdays)
str(bank_additional$pdays)
## Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
duration
from analysisbank_additional <- dplyr::select(bank_additional, -duration)
summary(bank_additional)
## age job marital education
## Min. :18.00 admin. :1012 divorced: 446 university.degree :1264
## 1st Qu.:32.00 blue-collar: 884 married :2509 high.school : 921
## Median :38.00 technician : 691 single :1153 basic.9y : 574
## Mean :40.11 services : 393 unknown : 11 professional.course: 535
## 3rd Qu.:47.00 management : 324 basic.4y : 429
## Max. :88.00 retired : 166 basic.6y : 228
## (Other) : 649 (Other) : 168
## default housing loan contact month
## no :3315 no :1839 no :3349 cellular :2652 may :1378
## unknown: 803 unknown: 105 unknown: 105 telephone:1467 jul : 711
## yes : 1 yes :2175 yes : 665 aug : 636
## jun : 530
## nov : 446
## apr : 215
## (Other): 203
## day_of_week campaign pdays previous poutcome
## fri:768 Min. : 1.000 no :3959 Min. :0.0000 Length:4119
## mon:855 1st Qu.: 1.000 yes: 160 1st Qu.:0.0000 Class :character
## thu:860 Median : 2.000 Median :0.0000 Mode :character
## tue:841 Mean : 2.537 Mean :0.1903
## wed:795 3rd Qu.: 3.000 3rd Qu.:0.0000
## Max. :35.000 Max. :6.0000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.8 Min. :0.635
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.334
## Median : 1.10000 Median :93.75 Median :-41.8 Median :4.857
## Mean : 0.08497 Mean :93.58 Mean :-40.5 Mean :3.621
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.9 Max. :5.045
##
## nr.employed y
## Min. :4964 no :3668
## 1st Qu.:5099 yes: 451
## Median :5191
## Mean :5166
## 3rd Qu.:5228
## Max. :5228
##
set.seed(2024)
index <- sample(1:nrow(bank_additional),nrow(bank_additional)*0.80) #using a 80 20 split
bank_train = bank_additional[index,]
bank_test = bank_additional[-index,]
# Your code here
#a) get rid of duration
glm_model <- glm(y ~. -age, family=binomial, data=bank_train)
summary(glm_model)
##
## Call:
## glm(formula = y ~ . - age, family = binomial, data = bank_train)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.730e+02 1.184e+02 -2.306 0.021132 *
## jobblue-collar -1.720e-01 2.495e-01 -0.689 0.490745
## jobentrepreneur -5.017e-01 4.304e-01 -1.166 0.243681
## jobhousemaid 7.260e-02 4.130e-01 0.176 0.860460
## jobmanagement -4.144e-01 2.688e-01 -1.542 0.123085
## jobretired -4.977e-02 2.978e-01 -0.167 0.867293
## jobself-employed -5.847e-01 3.781e-01 -1.546 0.121990
## jobservices -2.071e-01 2.731e-01 -0.758 0.448257
## jobstudent -1.803e-01 3.780e-01 -0.477 0.633421
## jobtechnician 5.537e-03 2.170e-01 0.026 0.979644
## jobunemployed -2.397e-01 4.118e-01 -0.582 0.560526
## jobunknown -8.340e-01 8.942e-01 -0.933 0.350986
## maritalmarried 1.135e-01 2.180e-01 0.521 0.602550
## maritalsingle 6.401e-02 2.385e-01 0.268 0.788414
## maritalunknown 2.935e-01 1.176e+00 0.250 0.802883
## educationbasic.6y 2.489e-01 3.768e-01 0.661 0.508881
## educationbasic.9y 2.214e-01 2.995e-01 0.739 0.459832
## educationhigh.school 2.526e-01 2.887e-01 0.875 0.381660
## educationilliterate -1.218e+01 5.354e+02 -0.023 0.981856
## educationprofessional.course 1.509e-01 3.161e-01 0.477 0.633040
## educationuniversity.degree 3.677e-01 2.890e-01 1.272 0.203289
## educationunknown 3.711e-01 3.870e-01 0.959 0.337504
## defaultunknown -5.982e-02 1.970e-01 -0.304 0.761429
## defaultyes -1.020e+01 5.354e+02 -0.019 0.984799
## housingunknown -5.240e-01 4.820e-01 -1.087 0.277008
## housingyes -4.771e-02 1.305e-01 -0.366 0.714724
## loanunknown NA NA NA NA
## loanyes -4.683e-02 1.738e-01 -0.269 0.787551
## contacttelephone -1.118e+00 2.756e-01 -4.057 4.98e-05 ***
## monthaug 1.216e-01 3.935e-01 0.309 0.757320
## monthdec 1.107e+00 6.366e-01 1.739 0.082044 .
## monthjul -8.438e-02 3.350e-01 -0.252 0.801117
## monthjun -2.716e-01 4.152e-01 -0.654 0.513034
## monthmar 1.913e+00 4.996e-01 3.830 0.000128 ***
## monthmay -2.325e-01 2.811e-01 -0.827 0.408203
## monthnov -4.319e-01 3.929e-01 -1.099 0.271691
## monthoct 7.075e-02 4.929e-01 0.144 0.885854
## monthsep 4.195e-01 5.775e-01 0.726 0.467608
## day_of_weekmon 2.984e-02 2.024e-01 0.147 0.882794
## day_of_weekthu 7.654e-02 2.043e-01 0.375 0.707923
## day_of_weektue 1.888e-02 2.088e-01 0.090 0.927957
## day_of_weekwed 2.003e-01 2.098e-01 0.955 0.339544
## campaign -7.685e-02 3.765e-02 -2.041 0.041211 *
## pdaysyes 5.008e-01 5.955e-01 0.841 0.400337
## previous 1.723e-01 1.810e-01 0.952 0.341100
## poutcomenonexistent 6.095e-01 3.006e-01 2.028 0.042580 *
## poutcomesuccess 1.210e+00 5.971e-01 2.027 0.042652 *
## emp.var.rate -1.281e+00 4.449e-01 -2.880 0.003974 **
## cons.price.idx 2.275e+00 7.838e-01 2.902 0.003707 **
## cons.conf.idx 6.734e-02 2.588e-02 2.602 0.009267 **
## euribor3m -1.551e-01 4.044e-01 -0.383 0.701391
## nr.employed 1.179e-02 9.549e-03 1.235 0.216924
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2294.2 on 3294 degrees of freedom
## Residual deviance: 1789.5 on 3244 degrees of freedom
## AIC: 1891.5
##
## Number of Fisher Scoring iterations: 12
#b)
pred_prob_bank_test <- predict.glm(glm_model, newdata = bank_test, type = "response")
summary(pred_prob_bank_test)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.009409 0.040859 0.059132 0.107900 0.093820 0.921511
hist(pred_prob_bank_test)
#c)
# step 1. get binary classification with the new optimal p cut value
#pred_class_bank_train_optimal <- (pred_prob_bank_train>0.5)*1
# step 2. get confusion matrix
#conf_test <- table(bank_train$y, pred_class_bank_train_optimal, dnn = c("True", "Predicted"))
#conf_test
#MR<- 1 - sum(diag(conf_test)) / sum(conf_test)
#print(paste0("Testing MR:",MR))
# Confusion matrix to evaluate the model on train data
#table(true = bank_train$y, pred = pred_prob_bank_train)
#d)
pred_test <- prediction(pred_prob_bank_test, as.numeric(bank_test$y) - 1) #need the y variable to be 0 and 1
pred_test
## A prediction instance
## with 824 data points
ROC <- performance(pred_test, "tpr", "fpr")
plot(ROC, colorize=TRUE)
#Get the AUC
auc_test = unlist(slot(performance(pred_test, "auc"), "y.values")) #unlist makes it into a number
print(auc_test)
## [1] 0.7833239
text(0.5,0.5, paste("AUC:", auc_test))
# Extract performance metrics
perf_sens <- performance(pred_test, "sens")
perf_spec <- performance(pred_test, "spec")
# Extract y-values (sensitivity and specificity)
sens <- unlist(perf_sens@y.values)
spec <- unlist(perf_spec@y.values)
cutoffs <- unlist(perf_sens@x.values) # Sensitivity and specificity should share same cutoffs
# Ensure proper lengths
if (length(sens) == length(spec) && length(sens) == length(cutoffs)) {
# Plot sensitivity
plot(cutoffs, sens, type="l", lwd=2, ylab="Sensitivity", xlab="Cutoff", col="blue",
main = paste("Maximized Cutoff\n","AUC: ", round(auc_test, 3)))
par(new=TRUE) # Overlay specificity plot
# Plot specificity
plot(cutoffs, spec, type="l", lwd=2, col='red', ylab="", xlab="", axes=FALSE)
axis(4, at=seq(0,1,0.2)) # Specificity axis labels
mtext("Specificity", side=4, col='red')
# Find optimal cutoff where sensitivity ≈ specificity
min.diff <- which.min(abs(sens - spec))
optimal_cutoff <- cutoffs[min.diff]
# Draw reference lines
abline(h = sens[min.diff], lty = 3)
abline(v = optimal_cutoff, lty = 3)
text(optimal_cutoff, 0, paste("Optimal threshold=", round(optimal_cutoff, 2)), pos=3)
} else {
print("Error: Sensitivity, specificity, and cutoffs lengths do not match.")
}
# Use predicted probabilities, not `pred_test`
pr_class = ifelse(pred_prob_bank_test > optimal_cutoff, 1, 0) # Use optimal threshold
# Ensure bank_test$y is converted correctly
true_labels <- as.factor(ifelse(bank_test$y == "yes", 1, 0)) # Convert "yes"/"no" to 1/0
# Compute confusion matrix
caret::confusionMatrix(as.factor(pr_class), true_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 533 24
## 1 205 62
##
## Accuracy : 0.7221
## 95% CI : (0.6901, 0.7524)
## No Information Rate : 0.8956
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2296
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7222
## Specificity : 0.7209
## Pos Pred Value : 0.9569
## Neg Pred Value : 0.2322
## Prevalence : 0.8956
## Detection Rate : 0.6468
## Detection Prevalence : 0.6760
## Balanced Accuracy : 0.7216
##
## 'Positive' Class : 0
##
# Your code here
#a)
glm_model_2 <- glm(formula = y ~ job + marital + education + default + housing + loan + contact + month + day_of_week, family=binomial, data=bank_train)
summary(glm_model_2)
##
## Call:
## glm(formula = y ~ job + marital + education + default + housing +
## loan + contact + month + day_of_week, family = binomial,
## data = bank_train)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.699649 0.405972 -4.187 2.83e-05 ***
## jobblue-collar -0.275344 0.232493 -1.184 0.23629
## jobentrepreneur -0.659551 0.416977 -1.582 0.11371
## jobhousemaid 0.060233 0.381532 0.158 0.87456
## jobmanagement -0.451498 0.246319 -1.833 0.06681 .
## jobretired 0.421735 0.273664 1.541 0.12330
## jobself-employed -0.617558 0.355811 -1.736 0.08263 .
## jobservices -0.312796 0.254607 -1.229 0.21924
## jobstudent 0.380566 0.352226 1.080 0.27994
## jobtechnician -0.130566 0.198089 -0.659 0.50981
## jobunemployed -0.003002 0.366179 -0.008 0.99346
## jobunknown -0.476428 0.783321 -0.608 0.54305
## maritalmarried 0.131900 0.201665 0.654 0.51308
## maritalsingle 0.082768 0.219918 0.376 0.70665
## maritalunknown -0.051132 1.122788 -0.046 0.96368
## educationbasic.6y 0.177503 0.359717 0.493 0.62169
## educationbasic.9y 0.136619 0.286395 0.477 0.63334
## educationhigh.school 0.239788 0.272121 0.881 0.37822
## educationilliterate -11.725672 535.411306 -0.022 0.98253
## educationprofessional.course 0.217435 0.295561 0.736 0.46193
## educationuniversity.degree 0.388058 0.270166 1.436 0.15090
## educationunknown 0.626535 0.356293 1.758 0.07867 .
## defaultunknown -0.407244 0.188005 -2.166 0.03030 *
## defaultyes -11.642946 535.411343 -0.022 0.98265
## housingunknown -0.394013 0.427418 -0.922 0.35661
## housingyes -0.042480 0.121163 -0.351 0.72589
## loanunknown NA NA NA NA
## loanyes -0.034915 0.162122 -0.215 0.82948
## contacttelephone -1.430888 0.184181 -7.769 7.91e-15 ***
## monthaug -0.678088 0.255956 -2.649 0.00807 **
## monthdec 1.660659 0.523957 3.169 0.00153 **
## monthjul -0.735611 0.257394 -2.858 0.00426 **
## monthjun 0.659450 0.283126 2.329 0.01985 *
## monthmar 1.540677 0.395576 3.895 9.83e-05 ***
## monthmay -0.443370 0.245446 -1.806 0.07086 .
## monthnov -0.642430 0.278562 -2.306 0.02110 *
## monthoct 0.941250 0.354564 2.655 0.00794 **
## monthsep 1.464297 0.361823 4.047 5.19e-05 ***
## day_of_weekmon 0.042775 0.188544 0.227 0.82052
## day_of_weekthu 0.112377 0.187133 0.601 0.54816
## day_of_weektue 0.050272 0.192610 0.261 0.79409
## day_of_weekwed 0.115608 0.195692 0.591 0.55468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2294.2 on 3294 degrees of freedom
## Residual deviance: 2022.3 on 3254 degrees of freedom
## AIC: 2104.3
##
## Number of Fisher Scoring iterations: 12
#b)
pred_prob_bank_test_2 <- predict(glm_model_2, newdata = bank_test, type = "response")
hist(pred_prob_bank_test_2)
#d)
pred_test_2 <- prediction(pred_prob_bank_test_2, as.numeric(bank_test$y) - 1) #need the y variable to be 0 and 1
pred_test_2
## A prediction instance
## with 824 data points
ROC_2 <- performance(pred_test_2, "tpr", "fpr")
plot(ROC_2, colorize=TRUE)
#Get the AUC
auc_test_2 = unlist(slot(performance(pred_test_2, "auc"), "y.values")) #unlist makes it into a number
print(auc_test_2)
## [1] 0.7150139
text(0.5,0.5, paste("AUC:", auc_test_2))
# Extract performance metrics
perf_sens_2 <- performance(pred_test_2, "sens")
perf_spec_2 <- performance(pred_test_2, "spec")
# Extract y-values (sensitivity and specificity)
sens_2 <- unlist(perf_sens_2@y.values)
spec_2 <- unlist(perf_spec_2@y.values)
cutoffs_2 <- unlist(perf_sens_2@x.values) # Sensitivity and specificity should share same cutoffs
# Ensure proper lengths
if (length(sens_2) == length(spec_2) && length(sens_2) == length(cutoffs_2)) {
# Plot sensitivity
plot(cutoffs_2, sens_2, type="l", lwd=2, ylab="Sensitivity", xlab="Cutoff", col="blue",
main = paste("Maximized Cutoff\n","AUC: ", round(auc_test_2, 3)))
par(new=TRUE) # Overlay specificity plot
# Plot specificity
plot(cutoffs_2, spec_2, type="l", lwd=2, col='red', ylab="", xlab="", axes=FALSE)
axis(4, at=seq(0,1,0.2)) # Specificity axis labels
mtext("Specificity", side=4, col='red')
# Find optimal cutoff where sensitivity ≈ specificity
min.diff_2 <- which.min(abs(sens_2 - spec_2))
optimal_cutoff_2 <- cutoffs[min.diff_2]
# Draw reference lines
abline(h = sens[min.diff_2], lty = 3)
abline(v = optimal_cutoff_2, lty = 3)
text(optimal_cutoff_2, 0, paste("Optimal threshold=", round(optimal_cutoff_2, 2)), pos=3)
} else {
print("Error: Sensitivity, specificity, and cutoffs lengths do not match.")
}
# Use predicted probabilities, not `pred_test`
pr_class_2 = ifelse(pred_prob_bank_test > optimal_cutoff_2, 1, 0) # Use optimal threshold
# Ensure bank_test$y is converted correctly
true_labels_2 <- as.factor(ifelse(bank_test$y == "yes", 1, 0)) # Convert "yes"/"no" to 1/0
# Compute confusion matrix
caret::confusionMatrix(as.factor(pr_class_2), true_labels_2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 509 23
## 1 229 63
##
## Accuracy : 0.6942
## 95% CI : (0.6615, 0.7255)
## No Information Rate : 0.8956
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2052
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6897
## Specificity : 0.7326
## Pos Pred Value : 0.9568
## Neg Pred Value : 0.2158
## Prevalence : 0.8956
## Detection Rate : 0.6177
## Detection Prevalence : 0.6456
## Balanced Accuracy : 0.7111
##
## 'Positive' Class : 0
##