1. Objective

Integrate the data preparation with the modeling and evaluation of the classifiers.

Note: the report is designed to be robust: it does not require installing extra packages and, where possible, uses simple base R alternatives.

2. Data and Reading

# Path (adjust if needed)
base_dir <- "D:/Script_R/Aprendizado_Maquina_TrabGrupo/desafio_02/"
file_titanic <- file.path(base_dir, "titanic.txt")
if (!file.exists(file_titanic)) stop("File not found: ", file_titanic)

# Reading with fallback (tab -> csv)
titanic <- tryCatch(
  read.table(file_titanic, header = TRUE, sep = "\t", stringsAsFactors = TRUE, quote = '"'),
  error = function(e) read.csv(file_titanic, header = TRUE, stringsAsFactors = TRUE, fileEncoding = "UTF-8")
)

# Cleanup and type coercion
names(titanic) <- trimws(names(titanic))
titanic$Class    <- factor(titanic$Class)
titanic$Sex      <- factor(titanic$Sex)
titanic$Age      <- factor(titanic$Age)
titanic$Survived <- factor(titanic$Survived, levels = c("No","Yes"))
titanic$y_bin    <- as.integer(titanic$Survived == "Yes")

cat("Proportions of Survived (full sample):\n")
## Proportions of Survived (full sample):
prop.table(table(titanic$Survived))
## 
##       No      Yes 
## 0.676965 0.323035
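
A quick structural check (an optional sanity sketch, not part of the original pipeline) helps confirm that the reading and coercions succeeded:

# Optional sanity check: dimensions, types, and expected columns
str(titanic)
stopifnot(all(c("Class", "Sex", "Age", "Survived", "y_bin") %in% names(titanic)))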

3. Stratified Train/Test Split (70/30)

# Stratification without caret (manual fallback)
p <- 0.7
set.seed(123)
idx_tr <- unlist(lapply(split(seq_len(nrow(titanic)), titanic$Survived),
                        function(ix) sample(ix, size = floor(p * length(ix)))))
train <- titanic[idx_tr, ]
test  <- titanic[-idx_tr, ]

cat("Proporções no treino:\n");  print(prop.table(table(train$Survived)))
## Proporções no treino:
## 
##        No       Yes 
## 0.6772727 0.3227273
cat("Proporções no teste:\n");   print(prop.table(table(test$Survived)))
## Proporções no teste:
## 
##        No       Yes 
## 0.6762481 0.3237519
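
For reference, the same kind of stratified split could be produced with caret when it is installed; a minimal sketch, not required by this report (the sampled indices may differ from idx_tr above):

# Equivalent stratified index with caret (optional)
if (requireNamespace("caret", quietly = TRUE)) {
  set.seed(123)
  idx_caret <- caret::createDataPartition(titanic$Survived, p = 0.7, list = FALSE)
  # idx_caret plays the same role as idx_tr above
}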

4. Feature Engineering (Dummies + Standardization)

# Dummies and standardization with base R
mm_train <- model.matrix(~ Class + Sex + Age - 1, data = train)
mm_test  <- model.matrix(~ Class + Sex + Age - 1, data = test)

# Align test columns to the training columns
miss_cols <- setdiff(colnames(mm_train), colnames(mm_test))
if (length(miss_cols)) {
  mm_test <- cbind(mm_test, matrix(0, nrow(mm_test), length(miss_cols),
                                   dimnames = list(NULL, miss_cols)))
}
extra_cols <- setdiff(colnames(mm_test), colnames(mm_train))
if (length(extra_cols)) mm_test <- mm_test[, setdiff(colnames(mm_test), extra_cols), drop = FALSE]
mm_test <- mm_test[, colnames(mm_train), drop = FALSE]

# Standardization (training mean/sd)
sc <- scale(mm_train)
X_train_num <- as.data.frame(sc)
X_test_num  <- as.data.frame(scale(mm_test,
                                   center = attr(sc, "scaled:center"),
                                   scale  = attr(sc, "scaled:scale")))

y_train <- train$Survived
y_test  <- test$Survived
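
Two defensive checks (a sketch added here for illustration) confirm the column alignment and that the scaling really uses the training statistics:

# Sanity checks: identical column order; train columns centered and scaled
stopifnot(identical(colnames(mm_train), colnames(mm_test)))
range(colMeans(X_train_num))       # ~0 by construction
range(apply(X_train_num, 2, sd))   # ~1 by construction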

5. Evaluation Functions (robust, no dependencies)

# Safe conversions
to_num <- function(x, name = "obj") {
  if (is.null(x)) stop(name, " is NULL.")
  if (is.data.frame(x)) x <- x[[1L]]
  if (is.list(x))       x <- unlist(x, use.names = FALSE)
  if (is.factor(x))     x <- as.character(x)
  suppressWarnings(xn <- as.double(x))
  as.vector(xn)
}
to_YesNo <- function(x) factor(as.character(x), levels = c("No","Yes"))

# Builds a 2x2 matrix (even if a class is missing)
conf_mat_from <- function(pred, truth) {
  pred  <- factor(pred,  levels = c("No","Yes"))
  truth <- factor(truth, levels = c("No","Yes"))
  tab0 <- table(pred, truth)
  mat  <- matrix(0L, 2, 2, dimnames = list(Pred=c("No","Yes"), Truth=c("No","Yes")))
  if (length(tab0)) mat[rownames(tab0), colnames(tab0)] <- tab0
  mat
}

# Metrics from the 2x2 matrix (with NAs handled)
cm_metrics <- function(mat2x2) {
  getv <- function(m, r, c) { v <- suppressWarnings(as.numeric(m[r, c])); if (is.na(v)) 0 else v }
  TN <- getv(mat2x2, "No",  "No")
  FP <- getv(mat2x2, "Yes", "No")
  FN <- getv(mat2x2, "No",  "Yes")
  TP <- getv(mat2x2, "Yes", "Yes")
  N  <- TN + FP + FN + TP
  acc  <- ifelse(N > 0, (TP + TN) / N, NA_real_)
  sens <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  spec <- ifelse((TN + FP) > 0, TN / (TN + FP), NA_real_)
  prec <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  f1   <- ifelse(is.finite(prec) & is.finite(sens) & (prec + sens) > 0,
                 2 * prec * sens / (prec + sens), NA_real_)
  c(Accuracy = acc, Sensitivity = sens, Specificity = spec, Precision = prec, F1 = f1)
}

# Manual ROC (sorted by decreasing probability)
roc_points <- function(truth, prob_yes) {
  y <- as.integer(to_YesNo(truth) == "Yes")
  ord <- order(prob_yes, decreasing = TRUE)
  y <- y[ord]; p <- prob_yes[ord]
  P <- sum(y == 1); N <- sum(y == 0)
  tp <- cumsum(y)
  fp <- cumsum(1 - y)
  tpr <- tp / P; fpr <- fp / N
  data.frame(fpr = c(0, fpr, 1), tpr = c(0, tpr, 1))
}
auc_trapz <- function(fpr, tpr) sum(diff(fpr) * (head(tpr,-1) + tail(tpr,-1)) / 2)
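
A quick self-test (an illustrative sketch) shows the two helpers behave as expected: a perfectly separable score yields AUC 1, while an uninformative score stays near 0.5:

# Self-test on synthetic labels and scores
set.seed(1)
y_fake <- rep(c("Yes", "No"), each = 50)
p_perf <- c(runif(50, 0.6, 1), runif(50, 0, 0.4))  # separable scores
p_rand <- runif(100)                               # uninformative scores
r1 <- roc_points(y_fake, p_perf); auc_trapz(r1$fpr, r1$tpr)  # expected: 1
r2 <- roc_points(y_fake, p_rand); auc_trapz(r2$fpr, r2$tpr)  # expected: ~0.5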

# Best Youden cutoff via threshold sweep
youden_cut <- function(truth, prob_yes) {
  thr <- sort(unique(prob_yes))
  thr <- c(-Inf, thr, Inf)  # guarantees the extremes
  best <- list(J=-Inf, cutoff=0.5, metr=NULL, mat=NULL)
  for (t in thr) {
    pred <- ifelse(prob_yes >= t, "Yes", "No")
    mat  <- conf_mat_from(pred, truth)
    metr <- cm_metrics(mat)
    J <- metr["Sensitivity"] + metr["Specificity"] - 1
    if (!is.na(J) && J > best$J) best <- list(J=J, cutoff=t, metr=metr, mat=mat)
  }
  best
}

# Evaluation pipeline for probabilistic models
evaluate_prob_model <- function(truth, prob_yes, cutoff = 0.5) {
  prob <- to_num(prob_yes, "prob_yes")
  truth <- to_YesNo(truth)
  # 0.5 cutoff
  pred05 <- ifelse(prob >= cutoff, "Yes", "No")
  mat05  <- conf_mat_from(pred05, truth)
  metr05 <- cm_metrics(mat05)
  risk05 <- 1 - metr05["Accuracy"]
  # ROC/AUC
  rocdf <- roc_points(truth, prob)
  auc   <- auc_trapz(rocdf$fpr, rocdf$tpr)
  # Youden
  ydn <- youden_cut(truth, prob)
  list(
    cutoff_05 = list(conf = mat05, metr = metr05, risk = risk05),
    roc = rocdf, auc = auc,
    youden = ydn
  )
}
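
A minimal usage sketch on toy data (illustrative only; the real calls appear in Section 6):

# evaluate_prob_model() on a toy vector of truths and probabilities
toy_truth <- c("Yes", "No", "Yes", "No", "Yes", "No")
toy_prob  <- c(0.90, 0.20, 0.60, 0.40, 0.30, 0.10)
toy_res   <- evaluate_prob_model(toy_truth, toy_prob)
toy_res$cutoff_05$metr  # metrics at the 0.5 cutoff
toy_res$auc             # trapezoidal AUC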

6. Modeling and Results (per model)

6.1 Logistic Regression

fit_log <- glm(Survived ~ Class + Sex + Age, family = binomial, data = train)
summary(fit_log)$coefficients
##               Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)  2.0179899  0.2021449   9.982890 1.811138e-23
## Class2nd    -1.0406971  0.2382103  -4.368817 1.249214e-05
## Class3rd    -1.7247104  0.2051267  -8.408027 4.169663e-17
## ClassCrew   -0.8400688  0.1895045  -4.432976 9.294112e-06
## SexMale     -2.3668857  0.1657657 -14.278499 2.979497e-46
## AgeChild     0.6441659  0.2953938   2.180702 2.920544e-02
prob_log <- predict(fit_log, newdata = test, type = "response")
res_log  <- evaluate_prob_model(y_test, prob_log)

res_log$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  413 104
##   Yes  34 110
round(res_log$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7912      0.5140      0.9239      0.7639      0.6145
cat("Risco (1 - acc) corte 0.5:", round(res_log$cutoff_05$risk, 4), "\n")
## Risco (1 - acc) corte 0.5: 0.2088
cat("AUC (ROC manual):", round(res_log$auc, 4), "\n")
## AUC (ROC manual): 0.6911
cat("Corte ótimo (Youden):", round(res_log$youden$cutoff, 4), "\n")
## Corte ótimo (Youden): 0.3218
res_log$youden$mat
##      Truth
## Pred   No Yes
##   No  373  83
##   Yes  74 131
round(res_log$youden$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7625      0.6121      0.8345      0.6390      0.6253
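
For interpretation, the coefficients above translate into odds ratios; a sketch using base R's confint.default for approximate Wald intervals:

# Odds ratios with approximate 95% Wald intervals
round(exp(cbind(OR = coef(fit_log), confint.default(fit_log))), 3)

For example, the SexMale estimate of -2.367 corresponds to an odds ratio of exp(-2.367) ≈ 0.094: men had roughly 9% of the survival odds of women, holding class and age group fixed.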

6.2 Linear Regression (on y_bin)

fit_lm <- lm(y_bin ~ Class + Sex + Age, data = train)
summary(fit_lm)$coefficients
##               Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)  0.8919253 0.03180600  28.042673 4.852027e-140
## Class2nd    -0.1905020 0.04067186  -4.683879  3.063727e-06
## Class3rd    -0.3015101 0.03388646  -8.897656  1.570172e-18
## ClassCrew   -0.1716377 0.03423984  -5.012807  5.986687e-07
## SexMale     -0.4856684 0.02781910 -17.458096  2.140948e-62
## AgeChild     0.1090399 0.04981697   2.188811  2.876087e-02
pred_lm <- predict(fit_lm, newdata = test)
prob_lm <- pmin(pmax(to_num(pred_lm), 0), 1)
res_lm  <- evaluate_prob_model(y_test, prob_lm)

res_lm$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  413 104
##   Yes  34 110
round(res_lm$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7912      0.5140      0.9239      0.7639      0.6145
cat("Risco (1 - acc) corte 0.5:", round(res_lm$cutoff_05$risk, 4), "\n")
## Risco (1 - acc) corte 0.5: 0.2088
cat("AUC (ROC manual):", round(res_lm$auc, 4), "\n")
## AUC (ROC manual): 0.69
cat("Corte ótimo (Youden):", round(res_lm$youden$cutoff, 4), "\n")
## Corte ótimo (Youden): 0.3248
res_lm$youden$mat
##      Truth
## Pred   No Yes
##   No  373  83
##   Yes  74 131
round(res_lm$youden$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7625      0.6121      0.8345      0.6390      0.6253
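
The confusion matrix at the 0.5 cutoff matches the logistic one exactly, which suggests the two models agree on every 0.5 decision even though their probability scales differ. A quick check (a sketch):

# Agreement between the clamped linear probabilities and the logistic ones
cor(prob_lm, prob_log)                                      # score correlation
table(Linear = prob_lm >= 0.5, Logistic = prob_log >= 0.5)  # joint 0.5 decisions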

6.3 Naive Bayes

have_nb <- requireNamespace("e1071", quietly = TRUE)
if (have_nb) {
  fit_nb <- e1071::naiveBayes(Survived ~ Class + Sex + Age, data = train)
  prob_nb <- predict(fit_nb, newdata = test, type = "raw")[, "Yes"]
  res_nb  <- evaluate_prob_model(y_test, prob_nb)
  
  # Inside a braced block, results are not auto-printed; use print() to show them
  print(res_nb$cutoff_05$conf)
  print(round(res_nb$cutoff_05$metr, 4))
  cat("Risk (1 - acc) at cutoff 0.5:", round(res_nb$cutoff_05$risk, 4), "\n")
  cat("AUC (manual ROC):", round(res_nb$auc, 4), "\n")
  cat("Optimal cutoff (Youden):", round(res_nb$youden$cutoff, 4), "\n")
  print(res_nb$youden$mat)
  print(round(res_nb$youden$metr, 4))
} else {
  cat("Package 'e1071' not available; Naive Bayes skipped.\n")
  res_nb <- NULL
}
## Risk (1 - acc) at cutoff 0.5: 0.2088 
## AUC (manual ROC): 0.6606 
## Optimal cutoff (Youden): 0.4301
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7625      0.6121      0.8345      0.6390      0.6253
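
When e1071 is present, Laplace smoothing is a natural variation to probe; a sketch not run in the original report:

# Naive Bayes with Laplace smoothing (laplace = 1)
if (have_nb) {
  fit_nb_l  <- e1071::naiveBayes(Survived ~ Class + Sex + Age, data = train, laplace = 1)
  prob_nb_l <- predict(fit_nb_l, newdata = test, type = "raw")[, "Yes"]
  print(round(evaluate_prob_model(y_test, prob_nb_l)$cutoff_05$metr, 4))
}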

6.4 LDA and QDA

if (!requireNamespace("MASS", quietly = TRUE)) stop("Pacote 'MASS' ausente.")
library(MASS)

fit_lda <- MASS::lda(Survived ~ Class + Sex + Age, data = train)
prob_lda <- predict(fit_lda, newdata = test)$posterior[, "Yes"]
res_lda  <- evaluate_prob_model(y_test, prob_lda)

fit_qda <- MASS::qda(Survived ~ Class + Sex + Age, data = train)
prob_qda <- predict(fit_qda, newdata = test)$posterior[, "Yes"]
res_qda  <- evaluate_prob_model(y_test, prob_qda)

# LDA
res_lda$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  413 104
##   Yes  34 110
round(res_lda$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7912      0.5140      0.9239      0.7639      0.6145
cat("Risco LDA:", round(res_lda$cutoff_05$risk, 4), "| AUC:", round(res_lda$auc, 4), "\n")
## Risco LDA: 0.2088 | AUC: 0.69
# QDA
res_qda$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  364  78
##   Yes  83 136
round(res_qda$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7564      0.6355      0.8143      0.6210      0.6282
cat("Risco QDA:", round(res_qda$cutoff_05$risk, 4), "| AUC:", round(res_qda$auc, 4), "\n")
## Risco QDA: 0.2436 | AUC: 0.7029
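
What LDA and QDA estimated can be inspected directly from the fitted objects; a short sketch:

# Estimated priors and group means (dummy-coded predictor space)
fit_lda$prior   # class priors from the training split
fit_lda$means   # per-class means of the predictors
fit_qda$prior   # QDA estimates the same priors but class-specific covariances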

6.5 KNN

# KNN without caret (class probability via the 'prob' attribute)
if (!requireNamespace("class", quietly = TRUE)) stop("Package 'class' (KNN) is missing.")

k <- 11
pred_knn <- class::knn(train = as.matrix(X_train_num),
                       test  = as.matrix(X_test_num),
                       cl    = y_train, k = k, prob = TRUE)
p_win <- attr(pred_knn, "prob")
prob_knn <- ifelse(pred_knn == "Yes", p_win, 1 - p_win)
res_knn  <- evaluate_prob_model(y_test, prob_knn)

res_knn$cutoff_05$conf
##      Truth
## Pred   No Yes
##   No  442 125
##   Yes   5  89
round(res_knn$cutoff_05$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.8033      0.4159      0.9888      0.9468      0.5779
cat("Risco (1 - acc) corte 0.5:", round(res_knn$cutoff_05$risk, 4), "\n")
## Risco (1 - acc) corte 0.5: 0.1967
cat("AUC (ROC manual):", round(res_knn$auc, 4), "\n")
## AUC (ROC manual): 0.7056
cat("Corte ótimo (Youden):", round(res_knn$youden$cutoff, 4), "\n")
## Corte ótimo (Youden): 0.2353
res_knn$youden$mat
##      Truth
## Pred   No Yes
##   No  364  78
##   Yes  83 136
round(res_knn$youden$metr, 4)
##    Accuracy Sensitivity Specificity   Precision          F1 
##      0.7564      0.6355      0.8143      0.6210      0.6282
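
The value k = 11 above is fixed a priori; a simple sweep over odd k is sketched below (a validation split or cross-validation would be preferable to reusing the test set):

# Test-set accuracy for a grid of odd k values
ks <- seq(1, 31, by = 2)
acc_k <- sapply(ks, function(k) {
  pk <- class::knn(as.matrix(X_train_num), as.matrix(X_test_num), cl = y_train, k = k)
  mean(pk == y_test)
})
data.frame(k = ks, accuracy = round(acc_k, 4))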

7. Comparison by Test-Set Risk (0.5 cutoff)

linhas <- list(
  Logistic      = res_log,
  Linear        = res_lm,
  `Naive Bayes` = res_nb,
  LDA           = res_lda,
  QDA           = res_qda,
  KNN           = res_knn
)

resumo <- do.call(rbind, lapply(names(linhas), function(nm) {
  obj <- linhas[[nm]]
  if (is.null(obj)) return(NULL)
  data.frame(
    Model = nm,
    Misclassification = as.numeric(obj$cutoff_05$risk),
    AUC = as.numeric(obj$auc),
    row.names = NULL
  )
}))
resumo <- resumo[order(resumo$Misclassification), ]
knitr::kable(resumo, digits = 4, row.names = FALSE)
Model         Misclassification     AUC
KNN                      0.1967  0.7056
Logistic                 0.2088  0.6911
Linear                   0.2088  0.6900
Naive Bayes              0.2088  0.6606
LDA                      0.2088  0.6900
QDA                      0.2436  0.7029
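
The same comparison can be repeated at each model's Youden-optimal cutoff; a sketch reusing the objects above:

# Risk comparison at the Youden-optimal cutoffs (illustrative)
resumo_youden <- do.call(rbind, lapply(names(linhas), function(nm) {
  obj <- linhas[[nm]]
  if (is.null(obj)) return(NULL)
  data.frame(Model = nm,
             Misclassification = as.numeric(1 - obj$youden$metr["Accuracy"]),
             Cutoff = as.numeric(obj$youden$cutoff),
             row.names = NULL)
}))
knitr::kable(resumo_youden[order(resumo_youden$Misclassification), ],
             digits = 4, row.names = FALSE)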

8. ROC Curves (Test Set)

plot(0:1, 0:1, type="n", xlab="FPR (1 - Specificity)", ylab="TPR (Sensitivity)",
     main="ROC Curves: Probabilistic Models (Test Set)")
abline(0,1,lty=3,col="gray")

add_roc <- function(obj, col) {
  if (is.null(obj)) return()
  lines(obj$roc$fpr, obj$roc$tpr, lwd=2, col=col)
}
cols <- c("steelblue","tomato","darkgreen","purple","orange","black")
add_roc(res_log, cols[1])
add_roc(res_lm,  cols[2])
if (!is.null(res_nb)) add_roc(res_nb, cols[3])
add_roc(res_lda, cols[4])
add_roc(res_qda, cols[5])
add_roc(res_knn, cols[6])

legend("bottomright",
       legend = c("Logística","Linear","Naive Bayes","LDA","QDA","KNN"),
       col = cols, lwd = 2, bty = "n")

9. Final Discussion