Integrate the preparation and the modeling/evaluation of the classifiers (optional packages: caret and pROC).
Note: the report is robust and does not require installing extra packages; when possible, it uses simple base R alternatives.
# Path (adjust if necessary)
base_dir <- "D:/Script_R/Aprendizado_Maquina_TrabGrupo/desafio_02/"
file_titanic <- file.path(base_dir, "titanic.txt")
if (!file.exists(file_titanic)) stop("File not found: ", file_titanic)
# Read with a fallback (tab-separated -> csv)
titanic <- tryCatch(
  read.table(file_titanic, header = TRUE, sep = "\t", stringsAsFactors = TRUE, quote = '"'),
  error = function(e) read.csv(file_titanic, header = TRUE, stringsAsFactors = TRUE, fileEncoding = "UTF-8")
)
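A quick structural check right after reading catches a wrong separator early (a single mashed-together column is the usual symptom). A minimal sketch; the expected column names are the ones used throughout the report, and trimws() guards against stray whitespace in the header:
str(titanic)
stopifnot(all(c("Class", "Sex", "Age", "Survived") %in% trimws(names(titanic))))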
# Cleaning and types
names(titanic) <- trimws(names(titanic))
titanic$Class <- factor(titanic$Class)
titanic$Sex <- factor(titanic$Sex)
titanic$Age <- factor(titanic$Age)
titanic$Survived <- factor(titanic$Survived, levels = c("No","Yes"))
titanic$y_bin <- as.integer(titanic$Survived == "Yes")
cat("Proportions of Survived (full sample):\n")
print(prop.table(table(titanic$Survived)))
## Proportions of Survived (full sample):
## 
##       No      Yes 
## 0.676965 0.323035
# Stratified split without caret (manual fallback)
p <- 0.7
set.seed(123)
idx_tr <- unlist(lapply(split(seq_len(nrow(titanic)), titanic$Survived),
                        function(ix) sample(ix, size = floor(p * length(ix)))))
train <- titanic[idx_tr, ]
test <- titanic[-idx_tr, ]
cat("Proporções no treino:\n"); print(prop.table(table(train$Survived)))
## Proporções no treino:
##
## No Yes
## 0.6772727 0.3227273
## Proporções no teste:
##
## No Yes
## 0.6762481 0.3237519
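If caret is available, the same stratified 70/30 split can be delegated to createDataPartition. A minimal sketch, equivalent in intent to the manual fallback above (idx_tr2, train2 and test2 are illustrative names, not part of the pipeline):
if (requireNamespace("caret", quietly = TRUE)) {
  set.seed(123)
  idx_tr2 <- caret::createDataPartition(titanic$Survived, p = 0.7, list = FALSE)
  train2 <- titanic[idx_tr2, ]
  test2 <- titanic[-idx_tr2, ]
}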
# Dummy coding and standardization with base R
mm_train <- model.matrix(~ Class + Sex + Age - 1, data = train)
mm_test <- model.matrix(~ Class + Sex + Age - 1, data = test)
# Align the test columns with the training columns
miss_cols <- setdiff(colnames(mm_train), colnames(mm_test))
if (length(miss_cols)) {
  mm_test <- cbind(mm_test, matrix(0, nrow(mm_test), length(miss_cols),
                                   dimnames = list(NULL, miss_cols)))
}
extra_cols <- setdiff(colnames(mm_test), colnames(mm_train))
if (length(extra_cols)) mm_test <- mm_test[, setdiff(colnames(mm_test), extra_cols), drop = FALSE]
mm_test <- mm_test[, colnames(mm_train), drop = FALSE]
# Standardization (training mean/sd)
sc <- scale(mm_train)
X_train_num <- as.data.frame(sc)
X_test_num <- as.data.frame(scale(mm_test,
                                  center = attr(sc, "scaled:center"),
                                  scale = attr(sc, "scaled:scale")))
y_train <- train$Survived
y_test <- test$Survived
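A quick sanity check on the standardization: the training columns should have mean approximately 0 and sd approximately 1, while the test columns reuse the training parameters and therefore deviate slightly. A sketch:
summary(colMeans(X_train_num)) # all approximately 0
summary(apply(X_train_num, 2, sd)) # all approximately 1
summary(colMeans(X_test_num)) # close to 0, but not exactly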
# Safe conversions
to_num <- function(x, name = "obj") {
  if (is.null(x)) stop(name, " is NULL.")
  if (is.data.frame(x)) x <- x[[1L]]
  if (is.list(x)) x <- unlist(x, use.names = FALSE)
  if (is.factor(x)) x <- as.character(x)
  suppressWarnings(xn <- as.double(x))
  as.vector(xn)
}
to_YesNo <- function(x) factor(as.character(x), levels = c("No","Yes"))
# Build a 2x2 confusion matrix (even if a class is missing)
conf_mat_from <- function(pred, truth) {
  pred <- factor(pred, levels = c("No","Yes"))
  truth <- factor(truth, levels = c("No","Yes"))
  tab0 <- table(pred, truth)
  mat <- matrix(0L, 2, 2, dimnames = list(Pred = c("No","Yes"), Truth = c("No","Yes")))
  if (length(tab0)) mat[rownames(tab0), colnames(tab0)] <- tab0
  mat
}
# Metrics from the 2x2 matrix (with NAs handled)
cm_metrics <- function(mat2x2) {
  getv <- function(m, r, c) { v <- suppressWarnings(as.numeric(m[r, c])); if (is.na(v)) 0 else v }
  TN <- getv(mat2x2, "No", "No")
  FP <- getv(mat2x2, "Yes", "No")
  FN <- getv(mat2x2, "No", "Yes")
  TP <- getv(mat2x2, "Yes", "Yes")
  N <- TN + FP + FN + TP
  acc <- ifelse(N > 0, (TP + TN) / N, NA_real_)
  sens <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  spec <- ifelse((TN + FP) > 0, TN / (TN + FP), NA_real_)
  prec <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  f1 <- ifelse(is.finite(prec) & is.finite(sens) & (prec + sens) > 0,
               2 * prec * sens / (prec + sens), NA_real_)
  c(Accuracy = acc, Sensitivity = sens, Specificity = spec, Precision = prec, F1 = f1)
}
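A toy example of why conf_mat_from forces a full 2x2 matrix: even when the classifier never predicts "Yes", cm_metrics still finds every cell it needs (Precision is NA by construction here, since TP + FP = 0):
m <- conf_mat_from(pred = rep("No", 4), truth = c("No", "No", "Yes", "Yes"))
cm_metrics(m) # Accuracy 0.5, Sensitivity 0, Specificity 1, Precision NA, F1 NA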
# Manual ROC (sorted by decreasing probability)
roc_points <- function(truth, prob_yes) {
  y <- as.integer(to_YesNo(truth) == "Yes")
  ord <- order(prob_yes, decreasing = TRUE)
  y <- y[ord]; p <- prob_yes[ord]
  P <- sum(y == 1); N <- sum(y == 0)
  tp <- cumsum(y)
  fp <- cumsum(1 - y)
  tpr <- tp / P; fpr <- fp / N
  data.frame(fpr = c(0, fpr, 1), tpr = c(0, tpr, 1))
}
auc_trapz <- function(fpr, tpr) sum(diff(fpr) * (head(tpr,-1) + tail(tpr,-1)) / 2)
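A sanity check for the roc_points/auc_trapz pair: a score that separates the classes perfectly must give AUC = 1.
r <- roc_points(c("No", "No", "Yes", "Yes"), c(0.1, 0.2, 0.8, 0.9))
auc_trapz(r$fpr, r$tpr) # 1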
# Best Youden cutoff by threshold sweep
youden_cut <- function(truth, prob_yes) {
  thr <- sort(unique(prob_yes))
  thr <- c(-Inf, thr, Inf) # ensure the extremes
  best <- list(J = -Inf, cutoff = 0.5, metr = NULL, mat = NULL)
  for (t in thr) {
    pred <- ifelse(prob_yes >= t, "Yes", "No")
    mat <- conf_mat_from(pred, truth)
    metr <- cm_metrics(mat)
    J <- metr["Sensitivity"] + metr["Specificity"] - 1
    if (!is.na(J) && J > best$J) best <- list(J = J, cutoff = t, metr = metr, mat = mat)
  }
  best
}
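On the same toy data, youden_cut should land on the threshold that separates the classes perfectly (J = 1 at cutoff 0.8):
youden_cut(c("No", "No", "Yes", "Yes"), c(0.1, 0.2, 0.8, 0.9))$cutoff # 0.8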
# Evaluation pipeline for probabilistic models
evaluate_prob_model <- function(truth, prob_yes, cutoff = 0.5) {
  prob <- to_num(prob_yes, "prob_yes")
  truth <- to_YesNo(truth)
  # Cutoff (default 0.5)
  pred05 <- ifelse(prob >= cutoff, "Yes", "No")
  mat05 <- conf_mat_from(pred05, truth)
  metr05 <- cm_metrics(mat05)
  risk05 <- 1 - metr05["Accuracy"]
  # ROC/AUC
  rocdf <- roc_points(truth, prob)
  auc <- auc_trapz(rocdf$fpr, rocdf$tpr)
  # Youden
  ydn <- youden_cut(truth, prob)
  list(
    cutoff_05 = list(conf = mat05, metr = metr05, risk = risk05),
    roc = rocdf, auc = auc,
    youden = ydn
  )
}
fit_log <- glm(Survived ~ Class + Sex + Age, family = binomial, data = train)
summary(fit_log)$coefficients
##               Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)  2.0179899  0.2021449   9.982890 1.811138e-23
## Class2nd    -1.0406971  0.2382103  -4.368817 1.249214e-05
## Class3rd    -1.7247104  0.2051267  -8.408027 4.169663e-17
## ClassCrew   -0.8400688  0.1895045  -4.432976 9.294112e-06
## SexMale     -2.3668857  0.1657657 -14.278499 2.979497e-46
## AgeChild     0.6441659  0.2953938   2.180702 2.920544e-02
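To read the logistic coefficients on a more interpretable scale, exponentiate them into odds ratios (confint.default gives Wald intervals). For example, exp(-2.3669) is roughly 0.09: the estimated odds of survival for males are about a tenth of those for females, all else equal.
round(exp(cbind(OR = coef(fit_log), confint.default(fit_log))), 3)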
prob_log <- predict(fit_log, newdata = test, type = "response")
res_log <- evaluate_prob_model(y_test, prob_log)
res_log$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  413 104
##   Yes  34 110
round(res_log$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7912      0.5140      0.9239      0.7639      0.6145 
cat("Risk (1 - acc) at 0.5 cutoff:", round(res_log$cutoff_05$risk, 4), "\n")
## Risk (1 - acc) at 0.5 cutoff: 0.2088
cat("AUC (manual ROC):", round(res_log$auc, 4), "\n")
## AUC (manual ROC): 0.6911
cat("Optimal cutoff (Youden):", round(res_log$youden$cutoff, 4), "\n")
## Optimal cutoff (Youden): 0.3218
res_log$youden$mat
##      Truth
## Pred   No Yes
##   No  373  83
##   Yes  74 131
round(res_log$youden$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7625      0.6121      0.8345      0.6390      0.6253
# Linear probability model on the 0/1 target
fit_lm <- lm(y_bin ~ Class + Sex + Age, data = train)
summary(fit_lm)$coefficients
##               Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)  0.8919253 0.03180600  28.042673 4.852027e-140
## Class2nd    -0.1905020 0.04067186  -4.683879  3.063727e-06
## Class3rd    -0.3015101 0.03388646  -8.897656  1.570172e-18
## ClassCrew   -0.1716377 0.03423984  -5.012807  5.986687e-07
## SexMale     -0.4856684 0.02781910 -17.458096  2.140948e-62
## AgeChild     0.1090399 0.04981697   2.188811  2.876087e-02
pred_lm <- predict(fit_lm, newdata = test)
prob_lm <- pmin(pmax(to_num(pred_lm), 0), 1)
res_lm <- evaluate_prob_model(y_test, prob_lm)
res_lm$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  413 104
##   Yes  34 110
round(res_lm$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7912      0.5140      0.9239      0.7639      0.6145 
cat("Risk (1 - acc) at 0.5 cutoff:", round(res_lm$cutoff_05$risk, 4), "\n")
## Risk (1 - acc) at 0.5 cutoff: 0.2088
cat("AUC (manual ROC):", round(res_lm$auc, 4), "\n")
## AUC (manual ROC): 0.69
cat("Optimal cutoff (Youden):", round(res_lm$youden$cutoff, 4), "\n")
## Optimal cutoff (Youden): 0.3248
res_lm$youden$mat
##      Truth
## Pred   No Yes
##   No  373  83
##   Yes  74 131
round(res_lm$youden$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7625      0.6121      0.8345      0.6390      0.6253
have_nb <- requireNamespace("e1071", quietly = TRUE)
if (have_nb) {
  fit_nb <- e1071::naiveBayes(Survived ~ Class + Sex + Age, data = train)
  prob_nb <- predict(fit_nb, newdata = test, type = "raw")[, "Yes"]
  res_nb <- evaluate_prob_model(y_test, prob_nb)
  print(res_nb$cutoff_05$conf)
  print(round(res_nb$cutoff_05$metr, 4))
  cat("Risk (1 - acc) at 0.5 cutoff:", round(res_nb$cutoff_05$risk, 4), "\n")
  cat("AUC (manual ROC):", round(res_nb$auc, 4), "\n")
  cat("Optimal cutoff (Youden):", round(res_nb$youden$cutoff, 4), "\n")
  print(res_nb$youden$mat)
  print(round(res_nb$youden$metr, 4))
} else {
  cat("Package 'e1071' not available; Naive Bayes skipped.\n")
  res_nb <- NULL
}
## Risk (1 - acc) at 0.5 cutoff: 0.2088
## AUC (manual ROC): 0.6606
## Optimal cutoff (Youden): 0.4301
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7625      0.6121      0.8345      0.6390      0.6253
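If e1071 were absent, the same posterior could be assembled by hand, since Naive Bayes with these three categorical predictors is just the prior times per-class frequency tables. A minimal sketch, without Laplace smoothing (nb_prob_yes is an illustrative name, not part of the report's pipeline):
nb_prob_yes <- function(train, test, vars = c("Class", "Sex", "Age")) {
  prior <- prop.table(table(train$Survived))
  tabs <- lapply(vars, function(v)
    prop.table(table(train[[v]], train$Survived), margin = 2)) # P(x | class)
  names(tabs) <- vars
  sapply(seq_len(nrow(test)), function(i) {
    s <- sapply(c("No", "Yes"), function(cl) {
      prior[[cl]] * prod(sapply(vars, function(v)
        tabs[[v]][as.character(test[[v]][i]), cl]))
    })
    s[["Yes"]] / sum(s)
  })
}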
if (!requireNamespace("MASS", quietly = TRUE)) stop("Pacote 'MASS' ausente.")
library(MASS)
fit_lda <- MASS::lda(Survived ~ Class + Sex + Age, data = train)
prob_lda <- predict(fit_lda, newdata = test)$posterior[, "Yes"]
res_lda <- evaluate_prob_model(y_test, prob_lda)
fit_qda <- MASS::qda(Survived ~ Class + Sex + Age, data = train)
prob_qda <- predict(fit_qda, newdata = test)$posterior[, "Yes"]
res_qda <- evaluate_prob_model(y_test, prob_qda)
# LDA
res_lda$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  413 104
##   Yes  34 110
round(res_lda$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7912      0.5140      0.9239      0.7639      0.6145 
cat("LDA risk:", round(res_lda$cutoff_05$risk, 4), "| AUC:", round(res_lda$auc, 4), "\n")
## LDA risk: 0.2088 | AUC: 0.69
# QDA
res_qda$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  364  78
##   Yes  83 136
round(res_qda$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7564      0.6355      0.8143      0.6210      0.6282 
cat("QDA risk:", round(res_qda$cutoff_05$risk, 4), "| AUC:", round(res_qda$auc, 4), "\n")
## QDA risk: 0.2436 | AUC: 0.7029
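When pROC is installed, its auc() can cross-check the manual trapezoid computation (direction = "<" says controls score lower than cases):
if (requireNamespace("pROC", quietly = TRUE)) {
  print(pROC::auc(pROC::roc(y_test, prob_lda, levels = c("No", "Yes"), direction = "<")))
}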
# KNN without caret (probability via the 'prob' attribute)
if (!requireNamespace("class", quietly = TRUE)) stop("Pacote 'class' (KNN) ausente.")
library(class)
k <- 11
pred_knn <- class::knn(train = as.matrix(X_train_num),
                       test = as.matrix(X_test_num),
                       cl = y_train, k = k, prob = TRUE)
p_win <- attr(pred_knn, "prob")
prob_knn <- ifelse(pred_knn == "Yes", p_win, 1 - p_win)
res_knn <- evaluate_prob_model(y_test, prob_knn)
res_knn$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  442 125
##   Yes   5  89
round(res_knn$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.8033      0.4159      0.9888      0.9468      0.5779 
cat("Risk (1 - acc) at 0.5 cutoff:", round(res_knn$cutoff_05$risk, 4), "\n")
## Risk (1 - acc) at 0.5 cutoff: 0.1967
cat("AUC (manual ROC):", round(res_knn$auc, 4), "\n")
## AUC (manual ROC): 0.7056
cat("Optimal cutoff (Youden):", round(res_knn$youden$cutoff, 4), "\n")
## Optimal cutoff (Youden): 0.2353
res_knn$youden$mat
##      Truth
## Pred   No Yes
##   No  364  78
##   Yes  83 136
round(res_knn$youden$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7564      0.6355      0.8143      0.6210      0.6282
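k = 11 was fixed a priori. A quick sweep over a few odd values of k (odd avoids ties in the majority vote) shows how sensitive the test accuracy is to that choice; a sketch:
set.seed(123)
for (k_try in c(3, 5, 11, 21, 51)) {
  pr <- class::knn(as.matrix(X_train_num), as.matrix(X_test_num),
                   cl = y_train, k = k_try)
  cat("k =", k_try, "accuracy:", round(mean(pr == y_test), 4), "\n")
}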
linhas <- list(
  Logistic = res_log,
  Linear = res_lm,
  `Naive Bayes` = res_nb,
  LDA = res_lda,
  QDA = res_qda,
  KNN = res_knn
)
resumo <- do.call(rbind, lapply(names(linhas), function(nm) {
  obj <- linhas[[nm]]
  if (is.null(obj)) return(NULL)
  data.frame(
    Model = nm,
    Misclassification = as.numeric(obj$cutoff_05$risk),
    AUC = as.numeric(obj$auc),
    row.names = NULL
  )
}))
resumo <- resumo[order(resumo$Misclassification), ]
knitr::kable(resumo, digits = 4)
|   | Model       | Misclassification |    AUC |
|:--|:------------|------------------:|-------:|
| 6 | KNN         |            0.1967 | 0.7056 |
| 1 | Logistic    |            0.2088 | 0.6911 |
| 2 | Linear      |            0.2088 | 0.6900 |
| 3 | Naive Bayes |            0.2088 | 0.6606 |
| 4 | LDA         |            0.2088 | 0.6900 |
| 5 | QDA         |            0.2436 | 0.7029 |
plot(0:1, 0:1, type = "n", xlab = "FPR (1 - Specificity)", ylab = "TPR (Sensitivity)",
     main = "ROC - Probabilistic Models (Test Set)")
abline(0, 1, lty = 3, col = "gray")
add_roc <- function(obj, col) {
  if (is.null(obj)) return()
  lines(obj$roc$fpr, obj$roc$tpr, lwd = 2, col = col)
}
cols <- c("steelblue","tomato","darkgreen","purple","orange","black")
add_roc(res_log, cols[1])
add_roc(res_lm, cols[2])
if (!is.null(res_nb)) add_roc(res_nb, cols[3])
add_roc(res_lda, cols[4])
add_roc(res_qda, cols[5])
add_roc(res_knn, cols[6])
legend("bottomright",
legend = c("Logística","Linear","Naive Bayes","LDA","QDA","KNN"),
col = cols, lwd = 2, bty = "n")