suppressPackageStartupMessages({
library(MASS)
library(glmnet)
library(caret)
library(readr)
##Q1
})
datos <- read.csv("q1.csv")
summary(datos)
## X payment_status installment grade
## Min. : 5 Length:473 Min. : 33.21 Min. :1.000
## 1st Qu.:224 Class :character 1st Qu.: 268.96 1st Qu.:2.000
## Median :423 Mode :character Median : 403.64 Median :2.000
## Mean :433 Mean : 454.27 Mean :2.556
## 3rd Qu.:660 3rd Qu.: 620.06 3rd Qu.:3.000
## Max. :871 Max. :1240.72 Max. :7.000
## emp_title emp_length home_ownership annual_inc
## Min. : 2.0 Min. : 1.00 Min. :1.000 Min. : 16968
## 1st Qu.:197.0 1st Qu.: 3.00 1st Qu.:1.000 1st Qu.: 50400
## Median :335.5 Median : 3.00 Median :1.000 Median : 70000
## Mean :335.9 Mean : 4.34 Mean :1.805 Mean : 78498
## 3rd Qu.:484.0 3rd Qu.: 6.00 3rd Qu.:3.000 3rd Qu.:100000
## Max. :648.0 Max. :11.00 Max. :3.000 Max. :300000
## verification_status title addr_state delinq_2yrs
## Min. :1.000 Min. : 1.000 Min. : 1.00 Min. : 0.000
## 1st Qu.:1.000 1st Qu.: 3.000 1st Qu.: 9.00 1st Qu.: 0.000
## Median :2.000 Median : 4.000 Median :20.00 Median : 0.000
## Mean :1.814 Mean : 4.209 Mean :21.13 Mean : 0.351
## 3rd Qu.:2.000 3rd Qu.: 4.000 3rd Qu.:31.00 3rd Qu.: 0.000
## Max. :3.000 Max. :11.000 Max. :46.00 Max. :15.000
## earliest_cr_line fico_range_high last_pymnt_amnt last_credit_pull_d
## Min. : 1.0 Min. :664.0 Min. : 0.59 Min. : 1.00
## 1st Qu.: 75.0 1st Qu.:679.0 1st Qu.: 417.35 1st Qu.:13.00
## Median :171.0 Median :694.0 Median : 1575.63 Median :25.00
## Mean :164.8 Mean :704.3 Mean : 5565.16 Mean :20.03
## 3rd Qu.:251.0 3rd Qu.:719.0 3rd Qu.: 8450.13 3rd Qu.:26.00
## Max. :315.0 Max. :850.0 Max. :34484.92 Max. :37.00
## last_fico_range_high last_fico_range_low tot_coll_amt tot_cur_bal
## Min. :499.0 Min. : 0.0 Min. : 0.0 Min. : 1806
## 1st Qu.:639.0 1st Qu.:635.0 1st Qu.: 0.0 1st Qu.: 38464
## Median :699.0 Median :695.0 Median : 0.0 Median :105650
## Mean :683.9 Mean :666.3 Mean : 306.6 Mean :150597
## 3rd Qu.:739.0 3rd Qu.:735.0 3rd Qu.: 0.0 3rd Qu.:229082
## Max. :839.0 Max. :835.0 Max. :102841.0 Max. :787778
## open_acc_6m open_act_il open_il_24m open_rv_12m
## Min. :0.000 Min. : 0.000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.: 0.000
## Median :1.000 Median : 2.000 Median :1.000 Median : 1.000
## Mean :1.239 Mean : 2.808 Mean :1.674 Mean : 1.562
## 3rd Qu.:2.000 3rd Qu.: 3.000 3rd Qu.:3.000 3rd Qu.: 2.000
## Max. :8.000 Max. :19.000 Max. :8.000 Max. :12.000
## open_rv_24m max_bal_bc all_util total_cu_tl
## Min. : 0.000 Min. : 0 Min. : 5.00 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 2493 1st Qu.: 47.00 1st Qu.: 0.000
## Median : 2.000 Median : 4739 Median : 60.00 Median : 0.000
## Mean : 3.091 Mean : 5895 Mean : 58.83 Mean : 1.503
## 3rd Qu.: 4.000 3rd Qu.: 7862 3rd Qu.: 73.00 3rd Qu.: 2.000
## Max. :17.000 Max. :28431 Max. :114.00 Max. :16.000
## inq_last_12m bc_open_to_buy mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## Min. : 0.000 Min. : 0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2076 1st Qu.: 3.00 1st Qu.: 2.000
## Median : 2.000 Median : 6092 Median : 6.00 Median : 4.000
## Mean : 2.296 Mean : 12268 Mean : 11.67 Mean : 6.719
## 3rd Qu.: 3.000 3rd Qu.: 14788 3rd Qu.: 15.00 3rd Qu.: 9.000
## Max. :16.000 Max. :263953 Max. :139.00 Max. :82.000
## mort_acc mths_since_recent_inq num_bc_sats num_bc_tl
## Min. :0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 5.000
## Median :1.000 Median : 5.000 Median : 4.000 Median : 7.000
## Mean :1.776 Mean : 6.326 Mean : 4.901 Mean : 8.243
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 6.000 3rd Qu.:10.000
## Max. :9.000 Max. :23.000 Max. :21.000 Max. :27.000
## num_rev_accts num_sats num_tl_op_past_12m
## Min. : 2.00 Min. : 3.00 Min. : 0.000
## 1st Qu.:10.00 1st Qu.: 9.00 1st Qu.: 1.000
## Median :14.00 Median :11.00 Median : 2.000
## Mean :15.34 Mean :12.23 Mean : 2.495
## 3rd Qu.:19.00 3rd Qu.:15.00 3rd Qu.: 4.000
## Max. :57.00 Max. :34.00 Max. :12.000
drop_idx <- c("Unnamed..0", "Unnamed: 0")
datos <- datos[ , setdiff(names(datos), drop_idx), drop = FALSE]
# variable objetivo
if (!("payment_status" %in% names(datos))) {
stop("No se encontró la columna 'payment_status' en el CSV. Revisa names(df).")
}
#Limpieza
is_char <- sapply(df, is.character)
datos[is_char] <- lapply(datos[is_char], factor)
# factor 2 clases
datos$payment_status <- as.factor(datos$payment_status)
if (nlevels(datos$payment_status) != 2) {
stop("payment_status debe tener exactamente 2 clases.")
}
# Quitar NAs simples
datos <- na.omit(datos)
# ---------- 3) Train/Test split ----------
set.seed(20)
n <- nrow(datos)
idx_train <- sample.int(n, size = floor(0.7 * n)) # 70/30
train <- datos[idx_train, ]
test <- datos[-idx_train, ]
#ver balance de clases
cat("Clases (train):\n"); print(table(train$payment_status))
## Clases (train):
##
## No paga Paga
## 54 277
cat("Clases (test):\n"); print(table(test$payment_status))
## Clases (test):
##
## No paga Paga
## 24 118
# Diseño X consistente para todos los modelos
y_train <- train$payment_status
y_test <- test$payment_status
x_cols <- setdiff(names(train), "payment_status")
# TRAIN
terms_x <- terms(~ ., data = train[, x_cols, drop = FALSE])
# Matrices de diseño para TRAIN y TEST
mm_train <- model.matrix(terms_x, data = train[, x_cols, drop = FALSE])
mm_test <- model.matrix(terms_x, data = test[, x_cols, drop = FALSE])
# Quitamos el intercepto de las matrices
if ("(Intercept)" %in% colnames(mm_train)) {
mm_train <- mm_train[, colnames(mm_train) != "(Intercept)", drop = FALSE]
}
if ("(Intercept)" %in% colnames(mm_test)) {
mm_test <- mm_test[, colnames(mm_test) != "(Intercept)", drop = FALSE]
}
# Data frames para GLM
glm_train <- data.frame(payment_status = y_train, as.data.frame(mm_train))
glm_test <- data.frame(payment_status = y_test, as.data.frame(mm_test))
# Función de Accuracy
acc <- function(y_true, y_pred) mean(y_true == y_pred)
#segundo nivel del factor
positive_level <- levels(y_train)[2]
baseline_level <- levels(y_train)[1]
# LOGIT
logit_fit <- glm(payment_status ~ .,
data = glm_train,
family = binomial(link = "logit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
p_train_logit <- predict(logit_fit, newdata = glm_train, type = "response")
p_test_logit <- predict(logit_fit, newdata = glm_test, type = "response")
pred_train_logit <- factor(ifelse(p_train_logit >= 0.5, positive_level, baseline_level),
levels = levels(y_train))
pred_test_logit <- factor(ifelse(p_test_logit >= 0.5, positive_level, baseline_level),
levels = levels(y_train))
acc_train_logit <- acc(y_train, pred_train_logit)
acc_test_logit <- acc(y_test, pred_test_logit)
# PROBIT
probit_fit <- glm(payment_status ~ .,
data = glm_train,
family = binomial(link = "probit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
p_train_probit <- predict(probit_fit, newdata = glm_train, type = "response")
p_test_probit <- predict(probit_fit, newdata = glm_test, type = "response")
pred_train_probit <- factor(ifelse(p_train_probit >= 0.5, positive_level, baseline_level),
levels = levels(y_train))
pred_test_probit <- factor(ifelse(p_test_probit >= 0.5, positive_level, baseline_level),
levels = levels(y_train))
acc_train_probit <- acc(y_train, pred_train_probit)
acc_test_probit <- acc(y_test, pred_test_probit)
# LDA
lda_fit <- lda(x = mm_train, grouping = y_train)
pred_train_lda <- predict(lda_fit, newdata = mm_train)$class
pred_test_lda <- predict(lda_fit, newdata = mm_test)$class
acc_train_lda <- acc(y_train, pred_train_lda)
acc_test_lda <- acc(y_test, pred_test_lda)
# Resultados
cat("\n=== Accuracy (70/30 split, seed=20) ===\n")
##
## === Accuracy (70/30 split, seed=20) ===
cat(sprintf("LOGIT -> Train: %.4f | Test: %.4f\n", acc_train_logit, acc_test_logit))
## LOGIT -> Train: 1.0000 | Test: 0.8803
cat(sprintf("PROBIT -> Train: %.4f | Test: %.4f\n", acc_train_probit, acc_test_probit))
## PROBIT -> Train: 1.0000 | Test: 0.8803
cat(sprintf("LDA -> Train: %.4f | Test: %.4f\n", acc_train_lda, acc_test_lda))
## LDA -> Train: 0.9456 | Test: 0.9225
# Confusion matrices (Test)
cat("\n--- Confusion Matrices (Test) ---\n")
##
## --- Confusion Matrices (Test) ---
print(confusionMatrix(pred_test_logit, y_test))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No paga Paga
## No paga 13 6
## Paga 11 112
##
## Accuracy : 0.8803
## 95% CI : (0.8152, 0.9287)
## No Information Rate : 0.831
## P-Value [Acc > NIR] : 0.06829
##
## Kappa : 0.5352
##
## Mcnemar's Test P-Value : 0.33198
##
## Sensitivity : 0.54167
## Specificity : 0.94915
## Pos Pred Value : 0.68421
## Neg Pred Value : 0.91057
## Prevalence : 0.16901
## Detection Rate : 0.09155
## Detection Prevalence : 0.13380
## Balanced Accuracy : 0.74541
##
## 'Positive' Class : No paga
##
"Con “No paga” como clase positiva, el modelo logra 88.0% de exactitud , solo un poco por encima del NIR = 0.831 y con mejora no significativa al 5% p = 0.068. Recupera 13 de 24 deudores sensibilidad = 0.542 y clasifica bien 112 de 118 pagadores,con 11 falsos negativos y 6 falsos positivos; cuando alerta “No paga” acierta 68% y cuando dice “Paga” suele acertar 91%. En síntesis, el modelo protege bien a los buenos pagadores pero deja escapar casi la mitad de los deudores; si el costo de perder deudores es alto, conviene subir la sensibilidad o rebalancear clases."
## [1] "Con “No paga” como clase positiva, el modelo logra 88.0% de exactitud , solo un poco por encima del NIR = 0.831 y con mejora no significativa al 5% p = 0.068. Recupera 13 de 24 deudores sensibilidad = 0.542 y clasifica bien 112 de 118 pagadores,con 11 falsos negativos y 6 falsos positivos; cuando alerta “No paga” acierta 68% y cuando dice “Paga” suele acertar 91%. En síntesis, el modelo protege bien a los buenos pagadores pero deja escapar casi la mitad de los deudores; si el costo de perder deudores es alto, conviene subir la sensibilidad o rebalancear clases."
print(confusionMatrix(pred_test_probit, y_test))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No paga Paga
## No paga 13 6
## Paga 11 112
##
## Accuracy : 0.8803
## 95% CI : (0.8152, 0.9287)
## No Information Rate : 0.831
## P-Value [Acc > NIR] : 0.06829
##
## Kappa : 0.5352
##
## Mcnemar's Test P-Value : 0.33198
##
## Sensitivity : 0.54167
## Specificity : 0.94915
## Pos Pred Value : 0.68421
## Neg Pred Value : 0.91057
## Prevalence : 0.16901
## Detection Rate : 0.09155
## Detection Prevalence : 0.13380
## Balanced Accuracy : 0.74541
##
## 'Positive' Class : No paga
##
"Con “No paga” como clase positiva,el modelo recupera 13 de 24 deudores sensibilidad = 0.54167, clasifica bien 112 de 118 pagadores especificidad = 0.94915, y cuando alerta “No paga” acierta 68.4% PPV = 0.68421; cuando dice “Paga” suele acertar 91.1% NPV = 0.91057. En síntesis, protege bien a los buenos pagadores pero deja escapar 46% de deudores, por lo que, si ese error es costoso, conviene subir la sensibilidad ajustando o rebalanceando clases."
## [1] "Con “No paga” como clase positiva,el modelo recupera 13 de 24 deudores sensibilidad = 0.54167, clasifica bien 112 de 118 pagadores especificidad = 0.94915, y cuando alerta “No paga” acierta 68.4% PPV = 0.68421; cuando dice “Paga” suele acertar 91.1% NPV = 0.91057. En síntesis, protege bien a los buenos pagadores pero deja escapar 46% de deudores, por lo que, si ese error es costoso, conviene subir la sensibilidad ajustando o rebalanceando clases."
print(confusionMatrix(pred_test_lda, y_test))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No paga Paga
## No paga 16 3
## Paga 8 115
##
## Accuracy : 0.9225
## 95% CI : (0.8656, 0.9607)
## No Information Rate : 0.831
## P-Value [Acc > NIR] : 0.001224
##
## Kappa : 0.6993
##
## Mcnemar's Test P-Value : 0.227800
##
## Sensitivity : 0.6667
## Specificity : 0.9746
## Pos Pred Value : 0.8421
## Neg Pred Value : 0.9350
## Prevalence : 0.1690
## Detection Rate : 0.1127
## Detection Prevalence : 0.1338
## Balanced Accuracy : 0.8206
##
## 'Positive' Class : No paga
##
"Con “No paga” como clase positiva, este modelo alcanza 92.25% de exactitud y supera de forma significativa al NIR de 0.831 p = 0.0012,recupera 16 de 24 deudores y protege muy bien a los buenos pagadores (especificidad = 0.975); cuando marca “No paga” acierta 84% PPV = 0.842 y cuando dice “Paga” acierta 93.5% NPV = 0.935. Tiene muy buen desempeño, con bajo costo en falsos positivos y una sensibilidad ya razonable;"
## [1] "Con “No paga” como clase positiva, este modelo alcanza 92.25% de exactitud y supera de forma significativa al NIR de 0.831 p = 0.0012,recupera 16 de 24 deudores y protege muy bien a los buenos pagadores (especificidad = 0.975); cuando marca “No paga” acierta 84% PPV = 0.842 y cuando dice “Paga” acierta 93.5% NPV = 0.935. Tiene muy buen desempeño, con bajo costo en falsos positivos y una sensibilidad ya razonable;"
# ibrerías
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
options(repos = c(CRAN = "https://cloud.r-project.org"))
need <- c("MASS", "rpart", "glmnet")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
##Q2
datos <- datos[, !grepl("^Unnamed", names(datos), ignore.case = TRUE), drop = FALSE]
target <- "payment_status"
if (!target %in% names(datos)) stop("No encuentro la columna 'payment_status' en tus datos.")
## limpieza
for (nm in setdiff(names(datos), target)) {
if (is.character(datos[[nm]])) datos[[nm]] <- factor(datos[[nm]])
}
num_cols <- names(which(sapply(datos, is.numeric)))
for (c in num_cols) {
if (all(is.na(datos[[c]]))) next
datos[[c]][is.na(datos[[c]])] <- stats::median(datos[[c]], na.rm = TRUE)
}
fac_cols <- names(which(sapply(datos, is.factor)))
for (c in setdiff(fac_cols, target)) {
if (anyNA(datos[[c]])) {
moda <- names(which.max(table(datos[[c]])))
datos[[c]][is.na(datos[[c]])] <- moda
}
}
##factor binario
datos[[target]] <- as.factor(datos[[target]])
if (nlevels(datos[[target]]) != 2) {
stop("La variable 'payment_status' debe ser binaria (2 clases).")
}
levels_y <- levels(datos[[target]])
## Clase positiva
tbl <- table(datos[[target]])
pos_class <- names(tbl)[which.min(tbl)]
neg_class <- setdiff(levels(datos[[target]]), pos_class)[1]
## Train/Test split 70/30
set.seed(20)
idx_train <- unlist(tapply(seq_len(nrow(datos)), datos[[target]],
function(ix) sample(ix, ceiling(0.7 * length(ix)))))
train <- datos[idx_train, , drop = FALSE]
test <- datos[-idx_train, , drop = FALSE]
# obtener dummies
make_mm <- function(df, drop_col) {
mm <- model.matrix(reformulate(setdiff(names(df), drop_col)), data = df)
mm <- mm[, -1, drop = FALSE] # quitar intercepto
return(mm)
}
metrics_acc_recall <- function(y_true, y_pred, positive) {
y_true <- factor(y_true, levels = c(positive, setdiff(unique(y_true), positive)))
y_pred <- factor(y_pred, levels = levels(y_true))
acc <- mean(y_true == y_pred)
cm <- table(y_true, y_pred)
TP <- ifelse(!is.na(cm[positive, positive]), cm[positive, positive], 0)
FN <- sum(cm[positive, ], na.rm = TRUE) - TP
rec <- ifelse((TP + FN) == 0, NA_real_, TP / (TP + FN))
c(Accuracy = acc, Recall = rec)
}
## Matrices para los modelos LDA y Ridge
Xtr_mm <- make_mm(train, target)
Xte_mm <- make_mm(test, target)
## 0/1 para Ridge 1 = clase positiva
ytr_bin <- ifelse(train[[target]] == pos_class, 1, 0)
yte_lbl <- test[[target]]
results <- list()
## LOGIT glm
fit_logit <- glm(reformulate(setdiff(names(train), target), response = target),
data = train, family = binomial(link = "logit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
p_logit <- predict(fit_logit, newdata = test, type = "response")
#Convertimos a clase según niveles del train:
levs <- levels(train[[target]])
pred_logit <- ifelse(p_logit >= 0.5, levs[2], levs[1])
m_logit <- metrics_acc_recall(test[[target]], pred_logit, pos_class)
results[["Logit"]] <- m_logit
##PROBIT
fit_probit <- glm(reformulate(setdiff(names(train), target), response = target),
data = train, family = binomial(link = "probit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
p_probit <- predict(fit_probit, newdata = test, type = "response")
pred_probit <- ifelse(p_probit >= 0.5, levs[2], levs[1])
m_probit <- metrics_acc_recall(test[[target]], pred_probit, pos_class)
results[["Probit"]] <- m_probit
## LDA
fit_lda <- MASS::lda(x = Xtr_mm, grouping = train[[target]])
pred_lda <- predict(fit_lda, Xte_mm)$class
m_lda <- metrics_acc_recall(test[[target]], pred_lda, pos_class)
results[["LDA"]] <- m_lda
## RIDGE
cv_ridge <- glmnet::cv.glmnet(x = as.matrix(Xtr_mm), y = ytr_bin,
family = "binomial", alpha = 0, nfolds = 5)
p_ridge <- as.numeric(predict(cv_ridge, newx = as.matrix(Xte_mm),
s = "lambda.min", type = "response"))
pred_ridge <- ifelse(p_ridge >= 0.5, pos_class, neg_class)
m_ridge <- metrics_acc_recall(test[[target]], pred_ridge, pos_class)
results[["Ridge"]] <- m_ridge
## DECISION TREE
fit_dt <- rpart::rpart(reformulate(setdiff(names(train), target), response = target),
data = train, method = "class")
pred_dt <- predict(fit_dt, newdata = test, type = "class")
m_dt <- metrics_acc_recall(test[[target]], pred_dt, pos_class)
results[["DT"]] <- m_dt
## ===== 10) Tabla final y mejor modelo =====
results_df <- do.call(rbind, results)
results_df <- data.frame(Model = rownames(results_df), results_df, row.names = NULL)
results_df <- results_df[order(-results_df$Recall, -results_df$Accuracy), ]
# Redondea SOLO las columnas numéricas
results_df$Accuracy <- round(as.numeric(results_df$Accuracy), 4)
results_df$Recall <- round(as.numeric(results_df$Recall), 4)
cat("\nResultados (Accuracy y Recall):\n")
##
## Resultados (Accuracy y Recall):
print(results_df)
## Model Accuracy Recall
## 1 Logit 0.9078 0.9130
## 2 Probit 0.9078 0.9130
## 5 DT 0.9574 0.8696
## 3 LDA 0.9007 0.8696
## 4 Ridge 0.9433 0.8261
"En la comparación de modelos el Árbol de Decisión tiene la mayor exactitud 95.7% pero un recall más bajo 86.9%, o sea, clasifica muy bien en general pero deja escapar más deudores;
Logit y Probit empatan en exactitud 90.8% y logran el mejor recall 91.3%, Ridge ofrece alta exactitud 94.3% pero el recall más bajo 82.6%, es un modelo conservador que protege a buenos pagadores pero deja ir más deudores;
LDA es el más flojo en exactitud 90.1% y tiene un recall intermedio 86.9%.
En general si el queremos que el costo de perder deudores sea alto, Logit/Probit son preferibles; pero si buscamos una máxima exactitud y pocos falsos positivos, DT o Ridge funcionan mejor"
## [1] "En la comparación de modelos el Árbol de Decisión tiene la mayor exactitud 95.7% pero un recall más bajo 86.9%, o sea, clasifica muy bien en general pero deja escapar más deudores; \nLogit y Probit empatan en exactitud 90.8% y logran el mejor recall 91.3%, Ridge ofrece alta exactitud 94.3% pero el recall más bajo 82.6%, es un modelo conservador que protege a buenos pagadores pero deja ir más deudores; \nLDA es el más flojo en exactitud 90.1% y tiene un recall intermedio 86.9%. \nEn general si el queremos que el costo de perder deudores sea alto, Logit/Probit son preferibles; pero si buscamos una máxima exactitud y pocos falsos positivos, DT o Ridge funcionan mejor"
best <- results_df[1, ]
cat(sprintf(
"\nMejor modelo: %s (Recall = %.4f, Accuracy = %.4f).\nSe elige por que tiene un mayor Recall; Accuracy como desempate. Clase positiva usada: '%s'.\n",
best$Model, best$Recall, best$Accuracy, pos_class
))
##
## Mejor modelo: Logit (Recall = 0.9130, Accuracy = 0.9078).
## Se elige por que tiene un mayor Recall; Accuracy como desempate. Clase positiva usada: 'No paga'.
## Q3
options(repos = c(CRAN = "https://cloud.r-project.org"))
need <- c("MASS", "rpart", "glmnet")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(need, require, character.only = TRUE))
## Loading required package: rpart
target <- "payment_status"
mode_one <- function(x) {
tx <- table(x, useNA = "no"); if (length(tx) == 0) return(NA)
names(tx)[which.max(tx)]
}
make_mm <- function(df, drop_col) {
# Si no existe drop_col, no se usa en la fórmula
preds <- setdiff(names(df), drop_col)
mm <- model.matrix(reformulate(preds), data = df)
mm <- mm[, -1, drop = FALSE] # sin intercepto
return(mm)
}
metrics_acc_recall <- function(y_true, y_pred, positive) {
y_true <- factor(y_true, levels = c(positive, setdiff(unique(y_true), positive)))
y_pred <- factor(y_pred, levels = levels(y_true))
acc <- mean(y_true == y_pred)
cm <- table(y_true, y_pred)
TP <- ifelse(!is.na(cm[positive, positive]), cm[positive, positive], 0)
FN <- sum(cm[positive, ], na.rm = TRUE) - TP
rec <- ifelse((TP + FN) == 0, NA_real_, TP / (TP + FN))
c(Accuracy = acc, Recall = rec)
}
if (!exists("train") || !exists("fit_logit") || !exists("fit_probit") ||
!exists("fit_lda") || !exists("cv_ridge") || !exists("fit_dt") ||
!exists("results_df")) {
stop("No encuentro objetos de Q2 (modelos entrenados y 'results_df'). Ejecuta Q2 antes de Q3.")
}
if (!exists("Xtr_mm")) {
# si no quedó en el entorno, lo recalculamos
Xtr_mm <- make_mm(train, target)
}
levs <- levels(train[[target]])
labels_y <- levels(train[[target]])
pat_default <- "(default|impago|moroso|mora|unpaid|late|no.?pago|no.?pay|0|false|no)"
cand <- labels_y[grepl(pat_default, labels_y, ignore.case = TRUE)]
if (length(cand) >= 1) {
default_class <- cand[1]
} else {
if (!exists("pos_class")) {
tbl_all <- table(train[[target]])
pos_class <- names(tbl_all)[which.min(tbl_all)]
}
default_class <- pos_class
}
other_class <- setdiff(labels_y, default_class)[1]
# Cargar prueba.csv
if (file.exists("prueba.csv")) {
newdat_raw <- read.csv("prueba.csv", header = TRUE, stringsAsFactors = FALSE)
} else {
newdat_raw <- read.csv(file.choose(), header = TRUE, stringsAsFactors = FALSE)
}
# Quitar columnas 'Unnamed'
newdat_raw <- newdat_raw[, !grepl("^Unnamed", names(newdat_raw), ignore.case = TRUE), drop = FALSE]
train_preds <- setdiff(names(train), target)
for (nm in train_preds) {
if (!nm %in% names(newdat_raw)) {
if (is.numeric(train[[nm]])) {
med <- suppressWarnings(stats::median(train[[nm]], na.rm = TRUE))
if (is.na(med)) med <- 0
newdat_raw[[nm]] <- med
} else if (is.factor(train[[nm]])) {
md <- mode_one(train[[nm]])
newdat_raw[[nm]] <- md
} else {
newdat_raw[[nm]] <- NA
}
}
}
newdat <- newdat_raw[, train_preds, drop = FALSE]
for (nm in names(newdat)) {
if (is.character(newdat[[nm]])) newdat[[nm]] <- factor(newdat[[nm]])
}
for (nm in intersect(names(newdat), names(train))) {
if (is.factor(train[[nm]])) {
lvl <- levels(train[[nm]])
newdat[[nm]] <- factor(newdat[[nm]], levels = lvl)
if (anyNA(newdat[[nm]])) {
md <- mode_one(train[[nm]])
newdat[[nm]][is.na(newdat[[nm]])] <- md
newdat[[nm]] <- factor(newdat[[nm]], levels = lvl)
}
}
}
for (nm in names(newdat)) {
if (is.numeric(train[[nm]])) {
med <- suppressWarnings(stats::median(train[[nm]], na.rm = TRUE))
if (is.na(med)) med <- 0
newdat[[nm]][is.na(newdat[[nm]])] <- med
if (!is.numeric(newdat[[nm]])) {
newdat[[nm]] <- suppressWarnings(as.numeric(as.character(newdat[[nm]])))
newdat[[nm]][is.na(newdat[[nm]])] <- med
}
}
}
tmp_for_mm <- newdat
tmp_for_mm[[target]] <- factor(rep(levs[1], nrow(tmp_for_mm)), levels = levs)
Xnew_mm <- make_mm(tmp_for_mm, target)
# alinear columnas con el train
missing_cols <- setdiff(colnames(Xtr_mm), colnames(Xnew_mm))
extra_cols <- setdiff(colnames(Xnew_mm), colnames(Xtr_mm))
if (length(missing_cols)) {
Xnew_mm <- cbind(Xnew_mm, matrix(0, nrow = nrow(Xnew_mm), ncol = length(missing_cols),
dimnames = list(NULL, missing_cols)))
}
if (length(extra_cols)) {
Xnew_mm <- Xnew_mm[, setdiff(colnames(Xnew_mm), extra_cols), drop = FALSE]
}
# reordenar
Xnew_mm <- Xnew_mm[, colnames(Xtr_mm), drop = FALSE]
# --- Seleccionar modelo ganador de Q2 ---
if (!exists("best")) best <- results_df[1, ]
chosen_model <- as.character(best$Model)
predict_default <- function(model_name, debug = FALSE) {
model_name <- toupper(model_name)
n <- nrow(newdat)
levs <- levels(train[[target]])
if (model_name == "LOGIT") {
p <- as.numeric(predict(fit_logit, newdata = newdat, type = "response"))
p_default <- if (default_class == levs[2]) p else 1 - p
pred <- ifelse(p_default >= 0.5, default_class, other_class)
} else if (model_name == "PROBIT") {
p <- as.numeric(predict(fit_probit, newdata = newdat, type = "response"))
p_default <- if (default_class == levs[2]) p else 1 - p
pred <- ifelse(p_default >= 0.5, default_class, other_class)
} else if (model_name == "LDA") {
pred_obj <- predict(fit_lda, Xnew_mm)
post <- as.matrix(pred_obj$posterior)
if (!(default_class %in% colnames(post)))
stop(sprintf("No encuentro posterior para '%s' en LDA.", default_class))
p_default <- as.numeric(post[, default_class])
pred <- ifelse(p_default >= 0.5, default_class, other_class)
} else if (model_name == "RIDGE") {
if (!exists("pos_class")) {
tbl_all <- table(train[[target]])
pos_class <- names(tbl_all)[which.min(tbl_all)]
}
p_pos <- as.numeric(predict(cv_ridge, newx = as.matrix(Xnew_mm),
s = "lambda.min", type = "response"))
p_default <- if (identical(default_class, pos_class)) p_pos else (1 - p_pos)
pred <- ifelse(p_default >= 0.5, default_class, other_class)
} else if (model_name == "DT" || model_name == "DECISION TREE") {
pr <- predict(fit_dt, newdata = newdat, type = "prob")
pr_df <- as.data.frame(pr)
if (!(default_class %in% colnames(pr_df)))
stop(sprintf("No encuentro la prob. para '%s' en DT.", default_class))
p_default <- as.numeric(pr_df[[default_class]])
pred <- ifelse(p_default >= 0.5, default_class, other_class)
} else {
stop(sprintf("Modelo '%s' no reconocido.", model_name))
}
if (debug) {
cat("\n[DEBUG] Modelo:", model_name,
"| default_class:", default_class,
"| other_class:", other_class, "\n")
cat("[DEBUG] Primeras 10 probabilidades de default:\n")
print(head(p_default, 10))
cat("[DEBUG] Predicciones (primeras 10):\n")
print(head(pred, 10))
}
out <- data.frame(
row_id = seq_len(n),
Predicted_Class = pred,
Prob_Default = round(p_default, 4),
stringsAsFactors = FALSE
)
return(out)
resultado_Q3 <- predict_default(chosen_model, debug = TRUE)
print(resultado_Q3)
}