act1

suppressPackageStartupMessages({
  library(MASS)    
  library(glmnet)   
  library(caret)  
  library(readr)
##Q1
  })
datos <- read.csv("q1.csv")  
summary(datos)

##        X       payment_status      installment          grade      
##  Min.   :  5   Length:473         Min.   :  33.21   Min.   :1.000  
##  1st Qu.:224   Class :character   1st Qu.: 268.96   1st Qu.:2.000  
##  Median :423   Mode  :character   Median : 403.64   Median :2.000  
##  Mean   :433                      Mean   : 454.27   Mean   :2.556  
##  3rd Qu.:660                      3rd Qu.: 620.06   3rd Qu.:3.000  
##  Max.   :871                      Max.   :1240.72   Max.   :7.000  
##    emp_title       emp_length    home_ownership    annual_inc    
##  Min.   :  2.0   Min.   : 1.00   Min.   :1.000   Min.   : 16968  
##  1st Qu.:197.0   1st Qu.: 3.00   1st Qu.:1.000   1st Qu.: 50400  
##  Median :335.5   Median : 3.00   Median :1.000   Median : 70000  
##  Mean   :335.9   Mean   : 4.34   Mean   :1.805   Mean   : 78498  
##  3rd Qu.:484.0   3rd Qu.: 6.00   3rd Qu.:3.000   3rd Qu.:100000  
##  Max.   :648.0   Max.   :11.00   Max.   :3.000   Max.   :300000  
##  verification_status     title          addr_state     delinq_2yrs    
##  Min.   :1.000       Min.   : 1.000   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.:1.000       1st Qu.: 3.000   1st Qu.: 9.00   1st Qu.: 0.000  
##  Median :2.000       Median : 4.000   Median :20.00   Median : 0.000  
##  Mean   :1.814       Mean   : 4.209   Mean   :21.13   Mean   : 0.351  
##  3rd Qu.:2.000       3rd Qu.: 4.000   3rd Qu.:31.00   3rd Qu.: 0.000  
##  Max.   :3.000       Max.   :11.000   Max.   :46.00   Max.   :15.000  
##  earliest_cr_line fico_range_high last_pymnt_amnt    last_credit_pull_d
##  Min.   :  1.0    Min.   :664.0   Min.   :    0.59   Min.   : 1.00     
##  1st Qu.: 75.0    1st Qu.:679.0   1st Qu.:  417.35   1st Qu.:13.00     
##  Median :171.0    Median :694.0   Median : 1575.63   Median :25.00     
##  Mean   :164.8    Mean   :704.3   Mean   : 5565.16   Mean   :20.03     
##  3rd Qu.:251.0    3rd Qu.:719.0   3rd Qu.: 8450.13   3rd Qu.:26.00     
##  Max.   :315.0    Max.   :850.0   Max.   :34484.92   Max.   :37.00     
##  last_fico_range_high last_fico_range_low  tot_coll_amt       tot_cur_bal    
##  Min.   :499.0        Min.   :  0.0       Min.   :     0.0   Min.   :  1806  
##  1st Qu.:639.0        1st Qu.:635.0       1st Qu.:     0.0   1st Qu.: 38464  
##  Median :699.0        Median :695.0       Median :     0.0   Median :105650  
##  Mean   :683.9        Mean   :666.3       Mean   :   306.6   Mean   :150597  
##  3rd Qu.:739.0        3rd Qu.:735.0       3rd Qu.:     0.0   3rd Qu.:229082  
##  Max.   :839.0        Max.   :835.0       Max.   :102841.0   Max.   :787778  
##   open_acc_6m     open_act_il      open_il_24m     open_rv_12m    
##  Min.   :0.000   Min.   : 0.000   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:0.000   1st Qu.: 1.000   1st Qu.:1.000   1st Qu.: 0.000  
##  Median :1.000   Median : 2.000   Median :1.000   Median : 1.000  
##  Mean   :1.239   Mean   : 2.808   Mean   :1.674   Mean   : 1.562  
##  3rd Qu.:2.000   3rd Qu.: 3.000   3rd Qu.:3.000   3rd Qu.: 2.000  
##  Max.   :8.000   Max.   :19.000   Max.   :8.000   Max.   :12.000  
##   open_rv_24m       max_bal_bc       all_util       total_cu_tl    
##  Min.   : 0.000   Min.   :    0   Min.   :  5.00   Min.   : 0.000  
##  1st Qu.: 1.000   1st Qu.: 2493   1st Qu.: 47.00   1st Qu.: 0.000  
##  Median : 2.000   Median : 4739   Median : 60.00   Median : 0.000  
##  Mean   : 3.091   Mean   : 5895   Mean   : 58.83   Mean   : 1.503  
##  3rd Qu.: 4.000   3rd Qu.: 7862   3rd Qu.: 73.00   3rd Qu.: 2.000  
##  Max.   :17.000   Max.   :28431   Max.   :114.00   Max.   :16.000  
##   inq_last_12m    bc_open_to_buy   mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl  
##  Min.   : 0.000   Min.   :     0   Min.   :  0.00        Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:  2076   1st Qu.:  3.00        1st Qu.: 2.000  
##  Median : 2.000   Median :  6092   Median :  6.00        Median : 4.000  
##  Mean   : 2.296   Mean   : 12268   Mean   : 11.67        Mean   : 6.719  
##  3rd Qu.: 3.000   3rd Qu.: 14788   3rd Qu.: 15.00        3rd Qu.: 9.000  
##  Max.   :16.000   Max.   :263953   Max.   :139.00        Max.   :82.000  
##     mort_acc     mths_since_recent_inq  num_bc_sats       num_bc_tl     
##  Min.   :0.000   Min.   : 0.000        Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:0.000   1st Qu.: 2.000        1st Qu.: 3.000   1st Qu.: 5.000  
##  Median :1.000   Median : 5.000        Median : 4.000   Median : 7.000  
##  Mean   :1.776   Mean   : 6.326        Mean   : 4.901   Mean   : 8.243  
##  3rd Qu.:3.000   3rd Qu.: 9.000        3rd Qu.: 6.000   3rd Qu.:10.000  
##  Max.   :9.000   Max.   :23.000        Max.   :21.000   Max.   :27.000  
##  num_rev_accts      num_sats     num_tl_op_past_12m
##  Min.   : 2.00   Min.   : 3.00   Min.   : 0.000    
##  1st Qu.:10.00   1st Qu.: 9.00   1st Qu.: 1.000    
##  Median :14.00   Median :11.00   Median : 2.000    
##  Mean   :15.34   Mean   :12.23   Mean   : 2.495    
##  3rd Qu.:19.00   3rd Qu.:15.00   3rd Qu.: 4.000    
##  Max.   :57.00   Max.   :34.00   Max.   :12.000

drop_idx <- c("Unnamed..0", "Unnamed: 0")
datos <- datos[ , setdiff(names(datos), drop_idx), drop = FALSE]

# variable objetivo
if (!("payment_status" %in% names(datos))) {
  stop("No se encontró la columna 'payment_status' en el CSV. Revisa names(df).")
}

#Limpieza 
is_char <- sapply(df, is.character)
datos[is_char] <- lapply(datos[is_char], factor)

# factor 2 clases
datos$payment_status <- as.factor(datos$payment_status)
if (nlevels(datos$payment_status) != 2) {
  stop("payment_status debe tener exactamente 2 clases.")
}

# Quitar NAs simples 
datos <- na.omit(datos)

# ---------- 3) Train/Test split ----------
set.seed(20)
n <- nrow(datos)
idx_train <- sample.int(n, size = floor(0.7 * n)) # 70/30
train <- datos[idx_train, ]
test  <- datos[-idx_train, ]

#ver balance de clases
cat("Clases (train):\n"); print(table(train$payment_status))

## Clases (train):

## 
## No paga    Paga 
##      54     277

cat("Clases (test):\n");  print(table(test$payment_status))

## Clases (test):

## 
## No paga    Paga 
##      24     118

# Diseño X consistente para todos los modelos
y_train <- train$payment_status
y_test  <- test$payment_status

x_cols <- setdiff(names(train), "payment_status")

#  TRAIN
terms_x <- terms(~ ., data = train[, x_cols, drop = FALSE])

# Matrices de diseño para TRAIN y TEST 
mm_train <- model.matrix(terms_x, data = train[, x_cols, drop = FALSE])
mm_test  <- model.matrix(terms_x, data = test[,  x_cols, drop = FALSE])

# Quitamos el intercepto de las matrices 
if ("(Intercept)" %in% colnames(mm_train)) {
  mm_train <- mm_train[, colnames(mm_train) != "(Intercept)", drop = FALSE]
}
if ("(Intercept)" %in% colnames(mm_test)) {
  mm_test <- mm_test[, colnames(mm_test) != "(Intercept)", drop = FALSE]
}

# Data frames para GLM 
glm_train <- data.frame(payment_status = y_train, as.data.frame(mm_train))
glm_test  <- data.frame(payment_status = y_test,  as.data.frame(mm_test))

# Función de Accuracy 
acc <- function(y_true, y_pred) mean(y_true == y_pred)

#segundo nivel del factor
positive_level <- levels(y_train)[2]
baseline_level <- levels(y_train)[1]

# LOGIT 
logit_fit <- glm(payment_status ~ .,
                 data = glm_train,
                 family = binomial(link = "logit"))

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

p_train_logit <- predict(logit_fit, newdata = glm_train, type = "response")
p_test_logit  <- predict(logit_fit, newdata = glm_test,  type = "response")

pred_train_logit <- factor(ifelse(p_train_logit >= 0.5, positive_level, baseline_level),
                           levels = levels(y_train))
pred_test_logit  <- factor(ifelse(p_test_logit  >= 0.5, positive_level, baseline_level),
                           levels = levels(y_train))

acc_train_logit <- acc(y_train, pred_train_logit)
acc_test_logit  <- acc(y_test,  pred_test_logit)

# PROBIT 
probit_fit <- glm(payment_status ~ .,
                  data = glm_train,
                  family = binomial(link = "probit"))

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

p_train_probit <- predict(probit_fit, newdata = glm_train, type = "response")
p_test_probit  <- predict(probit_fit, newdata = glm_test,  type = "response")

pred_train_probit <- factor(ifelse(p_train_probit >= 0.5, positive_level, baseline_level),
                            levels = levels(y_train))
pred_test_probit  <- factor(ifelse(p_test_probit  >= 0.5, positive_level, baseline_level),
                            levels = levels(y_train))

acc_train_probit <- acc(y_train, pred_train_probit)
acc_test_probit  <- acc(y_test,  pred_test_probit)

# LDA 
lda_fit <- lda(x = mm_train, grouping = y_train)

pred_train_lda <- predict(lda_fit, newdata = mm_train)$class
pred_test_lda  <- predict(lda_fit, newdata = mm_test)$class

acc_train_lda <- acc(y_train, pred_train_lda)
acc_test_lda  <- acc(y_test,  pred_test_lda)

#  Resultados 
cat("\n=== Accuracy (70/30 split, seed=20) ===\n")

## 
## === Accuracy (70/30 split, seed=20) ===

cat(sprintf("LOGIT  -> Train: %.4f | Test: %.4f\n",  acc_train_logit,  acc_test_logit))

## LOGIT  -> Train: 1.0000 | Test: 0.8803

cat(sprintf("PROBIT -> Train: %.4f | Test: %.4f\n",  acc_train_probit, acc_test_probit))

## PROBIT -> Train: 1.0000 | Test: 0.8803

cat(sprintf("LDA    -> Train: %.4f | Test: %.4f\n",  acc_train_lda,   acc_test_lda))

## LDA    -> Train: 0.9456 | Test: 0.9225

# Confusion matrices (Test)
cat("\n--- Confusion Matrices (Test) ---\n")

## 
## --- Confusion Matrices (Test) ---

print(confusionMatrix(pred_test_logit,  y_test))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No paga Paga
##    No paga      13    6
##    Paga         11  112
##                                           
##                Accuracy : 0.8803          
##                  95% CI : (0.8152, 0.9287)
##     No Information Rate : 0.831           
##     P-Value [Acc > NIR] : 0.06829         
##                                           
##                   Kappa : 0.5352          
##                                           
##  Mcnemar's Test P-Value : 0.33198         
##                                           
##             Sensitivity : 0.54167         
##             Specificity : 0.94915         
##          Pos Pred Value : 0.68421         
##          Neg Pred Value : 0.91057         
##              Prevalence : 0.16901         
##          Detection Rate : 0.09155         
##    Detection Prevalence : 0.13380         
##       Balanced Accuracy : 0.74541         
##                                           
##        'Positive' Class : No paga         
##

"Con “No paga” como clase positiva, el modelo logra 88.0% de exactitud , solo un poco por encima del NIR = 0.831 y con mejora no significativa al 5% p = 0.068. Recupera 13 de 24 deudores sensibilidad = 0.542 y clasifica bien 112 de 118 pagadores,con 11 falsos negativos y 6 falsos positivos; cuando alerta “No paga” acierta 68% y cuando dice “Paga” suele acertar 91%. En síntesis, el modelo protege bien a los buenos pagadores pero deja escapar casi la mitad de los deudores; si el costo de perder deudores es alto, conviene subir la sensibilidad o rebalancear clases."

## [1] "Con “No paga” como clase positiva, el modelo logra 88.0% de exactitud , solo un poco por encima del NIR = 0.831 y con mejora no significativa al 5% p = 0.068. Recupera 13 de 24 deudores sensibilidad = 0.542 y clasifica bien 112 de 118 pagadores,con 11 falsos negativos y 6 falsos positivos; cuando alerta “No paga” acierta 68% y cuando dice “Paga” suele acertar 91%. En síntesis, el modelo protege bien a los buenos pagadores pero deja escapar casi la mitad de los deudores; si el costo de perder deudores es alto, conviene subir la sensibilidad o rebalancear clases."

print(confusionMatrix(pred_test_probit, y_test))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No paga Paga
##    No paga      13    6
##    Paga         11  112
##                                           
##                Accuracy : 0.8803          
##                  95% CI : (0.8152, 0.9287)
##     No Information Rate : 0.831           
##     P-Value [Acc > NIR] : 0.06829         
##                                           
##                   Kappa : 0.5352          
##                                           
##  Mcnemar's Test P-Value : 0.33198         
##                                           
##             Sensitivity : 0.54167         
##             Specificity : 0.94915         
##          Pos Pred Value : 0.68421         
##          Neg Pred Value : 0.91057         
##              Prevalence : 0.16901         
##          Detection Rate : 0.09155         
##    Detection Prevalence : 0.13380         
##       Balanced Accuracy : 0.74541         
##                                           
##        'Positive' Class : No paga         
##

"Con “No paga” como clase positiva,el modelo  recupera 13 de 24 deudores  sensibilidad = 0.54167, clasifica bien 112 de 118 pagadores especificidad = 0.94915, y cuando alerta “No paga” acierta 68.4% PPV = 0.68421; cuando dice “Paga” suele acertar 91.1% NPV = 0.91057. En síntesis, protege bien a los buenos pagadores pero deja escapar 46% de deudores, por lo que, si ese error es costoso, conviene subir la sensibilidad ajustando o rebalanceando clases."

## [1] "Con “No paga” como clase positiva,el modelo  recupera 13 de 24 deudores  sensibilidad = 0.54167, clasifica bien 112 de 118 pagadores especificidad = 0.94915, y cuando alerta “No paga” acierta 68.4% PPV = 0.68421; cuando dice “Paga” suele acertar 91.1% NPV = 0.91057. En síntesis, protege bien a los buenos pagadores pero deja escapar 46% de deudores, por lo que, si ese error es costoso, conviene subir la sensibilidad ajustando o rebalanceando clases."

print(confusionMatrix(pred_test_lda,    y_test))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No paga Paga
##    No paga      16    3
##    Paga          8  115
##                                           
##                Accuracy : 0.9225          
##                  95% CI : (0.8656, 0.9607)
##     No Information Rate : 0.831           
##     P-Value [Acc > NIR] : 0.001224        
##                                           
##                   Kappa : 0.6993          
##                                           
##  Mcnemar's Test P-Value : 0.227800        
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.9746          
##          Pos Pred Value : 0.8421          
##          Neg Pred Value : 0.9350          
##              Prevalence : 0.1690          
##          Detection Rate : 0.1127          
##    Detection Prevalence : 0.1338          
##       Balanced Accuracy : 0.8206          
##                                           
##        'Positive' Class : No paga         
##

"Con “No paga” como clase positiva, este modelo alcanza 92.25% de exactitud y supera de forma significativa al NIR de 0.831 p = 0.0012,recupera 16 de 24 deudores  y protege muy bien a los buenos pagadores (especificidad = 0.975); cuando marca “No paga” acierta 84% PPV = 0.842 y cuando dice “Paga” acierta 93.5% NPV = 0.935. Tiene muy buen desempeño, con bajo costo en falsos positivos y una sensibilidad ya razonable;"

## [1] "Con “No paga” como clase positiva, este modelo alcanza 92.25% de exactitud y supera de forma significativa al NIR de 0.831 p = 0.0012,recupera 16 de 24 deudores  y protege muy bien a los buenos pagadores (especificidad = 0.975); cuando marca “No paga” acierta 84% PPV = 0.842 y cuando dice “Paga” acierta 93.5% NPV = 0.935. Tiene muy buen desempeño, con bajo costo en falsos positivos y una sensibilidad ya razonable;"

# ibrerías 
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
options(repos = c(CRAN = "https://cloud.r-project.org"))
need <- c("MASS", "rpart", "glmnet")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)

##Q2
datos <- datos[, !grepl("^Unnamed", names(datos), ignore.case = TRUE), drop = FALSE]

target <- "payment_status"
if (!target %in% names(datos)) stop("No encuentro la columna 'payment_status' en tus datos.")

## limpieza 
for (nm in setdiff(names(datos), target)) {
  if (is.character(datos[[nm]])) datos[[nm]] <- factor(datos[[nm]])
}

num_cols <- names(which(sapply(datos, is.numeric)))
for (c in num_cols) {
  if (all(is.na(datos[[c]]))) next
  datos[[c]][is.na(datos[[c]])] <- stats::median(datos[[c]], na.rm = TRUE)
}
fac_cols <- names(which(sapply(datos, is.factor)))
for (c in setdiff(fac_cols, target)) {
  if (anyNA(datos[[c]])) {
    moda <- names(which.max(table(datos[[c]])))
    datos[[c]][is.na(datos[[c]])] <- moda
  }
}

##factor binario
datos[[target]] <- as.factor(datos[[target]])
if (nlevels(datos[[target]]) != 2) {
  stop("La variable 'payment_status' debe ser binaria (2 clases).")
}
levels_y <- levels(datos[[target]])

## Clase positiva 
tbl <- table(datos[[target]])
pos_class <- names(tbl)[which.min(tbl)]
neg_class <- setdiff(levels(datos[[target]]), pos_class)[1]

## Train/Test split 70/30
set.seed(20)
idx_train <- unlist(tapply(seq_len(nrow(datos)), datos[[target]],
                           function(ix) sample(ix, ceiling(0.7 * length(ix)))))
train <- datos[idx_train, , drop = FALSE]
test  <- datos[-idx_train, , drop = FALSE]

# obtener dummies 
make_mm <- function(df, drop_col) {
  mm <- model.matrix(reformulate(setdiff(names(df), drop_col)), data = df)
  mm <- mm[, -1, drop = FALSE]  # quitar intercepto
  return(mm)
}

metrics_acc_recall <- function(y_true, y_pred, positive) {
  y_true <- factor(y_true, levels = c(positive, setdiff(unique(y_true), positive)))
  y_pred <- factor(y_pred, levels = levels(y_true))
  acc <- mean(y_true == y_pred)

  cm <- table(y_true, y_pred)
  TP <- ifelse(!is.na(cm[positive, positive]), cm[positive, positive], 0)
  FN <- sum(cm[positive, ], na.rm = TRUE) - TP
  rec <- ifelse((TP + FN) == 0, NA_real_, TP / (TP + FN))

  c(Accuracy = acc, Recall = rec)
}

## Matrices para los modelos LDA y Ridge
Xtr_mm <- make_mm(train, target)
Xte_mm <- make_mm(test,  target)

##  0/1 para Ridge 1 = clase positiva
ytr_bin <- ifelse(train[[target]] == pos_class, 1, 0)
yte_lbl <- test[[target]]

results <- list()

## LOGIT glm
fit_logit <- glm(reformulate(setdiff(names(train), target), response = target),
                 data = train, family = binomial(link = "logit"))

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

p_logit <- predict(fit_logit, newdata = test, type = "response")
#Convertimos a clase según niveles del train:
levs <- levels(train[[target]])
pred_logit <- ifelse(p_logit >= 0.5, levs[2], levs[1])
m_logit <- metrics_acc_recall(test[[target]], pred_logit, pos_class)
results[["Logit"]] <- m_logit

##PROBIT 
fit_probit <- glm(reformulate(setdiff(names(train), target), response = target),
                  data = train, family = binomial(link = "probit"))

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

p_probit <- predict(fit_probit, newdata = test, type = "response")
pred_probit <- ifelse(p_probit >= 0.5, levs[2], levs[1])
m_probit <- metrics_acc_recall(test[[target]], pred_probit, pos_class)
results[["Probit"]] <- m_probit

## LDA
fit_lda <- MASS::lda(x = Xtr_mm, grouping = train[[target]])
pred_lda <- predict(fit_lda, Xte_mm)$class
m_lda <- metrics_acc_recall(test[[target]], pred_lda, pos_class)
results[["LDA"]] <- m_lda

## RIDGE
cv_ridge <- glmnet::cv.glmnet(x = as.matrix(Xtr_mm), y = ytr_bin,
                              family = "binomial", alpha = 0, nfolds = 5)
p_ridge <- as.numeric(predict(cv_ridge, newx = as.matrix(Xte_mm),
                              s = "lambda.min", type = "response"))
pred_ridge <- ifelse(p_ridge >= 0.5, pos_class, neg_class)
m_ridge <- metrics_acc_recall(test[[target]], pred_ridge, pos_class)
results[["Ridge"]] <- m_ridge

## DECISION TREE 
fit_dt <- rpart::rpart(reformulate(setdiff(names(train), target), response = target),
                       data = train, method = "class")
pred_dt <- predict(fit_dt, newdata = test, type = "class")
m_dt <- metrics_acc_recall(test[[target]], pred_dt, pos_class)
results[["DT"]] <- m_dt
## ===== 10) Tabla final y mejor modelo =====
results_df <- do.call(rbind, results)
results_df <- data.frame(Model = rownames(results_df), results_df, row.names = NULL)
results_df <- results_df[order(-results_df$Recall, -results_df$Accuracy), ]

# Redondea SOLO las columnas numéricas
results_df$Accuracy <- round(as.numeric(results_df$Accuracy), 4)
results_df$Recall   <- round(as.numeric(results_df$Recall), 4)

cat("\nResultados (Accuracy y Recall):\n")

## 
## Resultados (Accuracy y Recall):

print(results_df)

##    Model Accuracy Recall
## 1  Logit   0.9078 0.9130
## 2 Probit   0.9078 0.9130
## 5     DT   0.9574 0.8696
## 3    LDA   0.9007 0.8696
## 4  Ridge   0.9433 0.8261

"En la comparación de modelos  el Árbol de Decisión tiene la mayor exactitud 95.7% pero un recall más bajo 86.9%, o sea, clasifica muy bien en general pero deja escapar más deudores; 
Logit y Probit empatan en exactitud 90.8% y logran el mejor recall 91.3%,  Ridge ofrece alta exactitud 94.3% pero el recall más bajo 82.6%,  es un modelo conservador que protege a buenos pagadores pero deja ir más deudores; 
LDA es el más flojo en exactitud 90.1% y tiene un recall intermedio 86.9%. 
En general  si el queremos que el costo de perder deudores sea alto, Logit/Probit son preferibles; pero si buscamos una máxima exactitud y pocos falsos positivos, DT o Ridge funcionan mejor"

## [1] "En la comparación de modelos  el Árbol de Decisión tiene la mayor exactitud 95.7% pero un recall más bajo 86.9%, o sea, clasifica muy bien en general pero deja escapar más deudores; \nLogit y Probit empatan en exactitud 90.8% y logran el mejor recall 91.3%,  Ridge ofrece alta exactitud 94.3% pero el recall más bajo 82.6%,  es un modelo conservador que protege a buenos pagadores pero deja ir más deudores; \nLDA es el más flojo en exactitud 90.1% y tiene un recall intermedio 86.9%. \nEn general  si el queremos que el costo de perder deudores sea alto, Logit/Probit son preferibles; pero si buscamos una máxima exactitud y pocos falsos positivos, DT o Ridge funcionan mejor"

best <- results_df[1, ]
cat(sprintf(
  "\nMejor modelo: %s (Recall = %.4f, Accuracy = %.4f).\nSe elige por que tiene un mayor Recall; Accuracy como desempate. Clase positiva usada: '%s'.\n",
  best$Model, best$Recall, best$Accuracy, pos_class
))

## 
## Mejor modelo: Logit (Recall = 0.9130, Accuracy = 0.9078).
## Se elige por que tiene un mayor Recall; Accuracy como desempate. Clase positiva usada: 'No paga'.

##  Q3 

options(repos = c(CRAN = "https://cloud.r-project.org"))
need <- c("MASS", "rpart", "glmnet")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(need, require, character.only = TRUE))

## Loading required package: rpart

target <- "payment_status"

mode_one <- function(x) {
  tx <- table(x, useNA = "no"); if (length(tx) == 0) return(NA)
  names(tx)[which.max(tx)]
}
make_mm <- function(df, drop_col) {
  # Si no existe drop_col, no se usa en la fórmula
  preds <- setdiff(names(df), drop_col)
  mm <- model.matrix(reformulate(preds), data = df)
  mm <- mm[, -1, drop = FALSE]  # sin intercepto
  return(mm)
}
metrics_acc_recall <- function(y_true, y_pred, positive) {
  y_true <- factor(y_true, levels = c(positive, setdiff(unique(y_true), positive)))
  y_pred <- factor(y_pred, levels = levels(y_true))
  acc <- mean(y_true == y_pred)
  cm <- table(y_true, y_pred)
  TP <- ifelse(!is.na(cm[positive, positive]), cm[positive, positive], 0)
  FN <- sum(cm[positive, ], na.rm = TRUE) - TP
  rec <- ifelse((TP + FN) == 0, NA_real_, TP / (TP + FN))
  c(Accuracy = acc, Recall = rec)
}

if (!exists("train") || !exists("fit_logit") || !exists("fit_probit") ||
    !exists("fit_lda") || !exists("cv_ridge") || !exists("fit_dt") ||
    !exists("results_df")) {
  stop("No encuentro objetos de Q2 (modelos entrenados y 'results_df'). Ejecuta Q2 antes de Q3.")
}
if (!exists("Xtr_mm")) {
  # si no quedó en el entorno, lo recalculamos
  Xtr_mm <- make_mm(train, target)
}
levs <- levels(train[[target]])

labels_y <- levels(train[[target]])
pat_default <- "(default|impago|moroso|mora|unpaid|late|no.?pago|no.?pay|0|false|no)"
cand <- labels_y[grepl(pat_default, labels_y, ignore.case = TRUE)]
if (length(cand) >= 1) {
  default_class <- cand[1]
} else {
  if (!exists("pos_class")) {
    tbl_all <- table(train[[target]])
    pos_class <- names(tbl_all)[which.min(tbl_all)]
  }
  default_class <- pos_class
}
other_class <- setdiff(labels_y, default_class)[1]

# Cargar prueba.csv 
if (file.exists("prueba.csv")) {
  newdat_raw <- read.csv("prueba.csv", header = TRUE, stringsAsFactors = FALSE)
} else {
  newdat_raw <- read.csv(file.choose(), header = TRUE, stringsAsFactors = FALSE)
}

# Quitar columnas  'Unnamed'
newdat_raw <- newdat_raw[, !grepl("^Unnamed", names(newdat_raw), ignore.case = TRUE), drop = FALSE]

train_preds <- setdiff(names(train), target)
for (nm in train_preds) {
  if (!nm %in% names(newdat_raw)) {
    if (is.numeric(train[[nm]])) {
      med <- suppressWarnings(stats::median(train[[nm]], na.rm = TRUE))
      if (is.na(med)) med <- 0
      newdat_raw[[nm]] <- med
    } else if (is.factor(train[[nm]])) {
      md <- mode_one(train[[nm]])
      newdat_raw[[nm]] <- md
    } else {
      newdat_raw[[nm]] <- NA
    }
  }
}
newdat <- newdat_raw[, train_preds, drop = FALSE]

for (nm in names(newdat)) {
  if (is.character(newdat[[nm]])) newdat[[nm]] <- factor(newdat[[nm]])
}
for (nm in intersect(names(newdat), names(train))) {
  if (is.factor(train[[nm]])) {
    lvl <- levels(train[[nm]])
    newdat[[nm]] <- factor(newdat[[nm]], levels = lvl)
    if (anyNA(newdat[[nm]])) {
      md <- mode_one(train[[nm]])
      newdat[[nm]][is.na(newdat[[nm]])] <- md
      newdat[[nm]] <- factor(newdat[[nm]], levels = lvl)
    }
  }
}

for (nm in names(newdat)) {
  if (is.numeric(train[[nm]])) {
    med <- suppressWarnings(stats::median(train[[nm]], na.rm = TRUE))
    if (is.na(med)) med <- 0
    newdat[[nm]][is.na(newdat[[nm]])] <- med
    if (!is.numeric(newdat[[nm]])) {
      newdat[[nm]] <- suppressWarnings(as.numeric(as.character(newdat[[nm]])))
      newdat[[nm]][is.na(newdat[[nm]])] <- med
    }
  }
}

tmp_for_mm <- newdat
tmp_for_mm[[target]] <- factor(rep(levs[1], nrow(tmp_for_mm)), levels = levs)
Xnew_mm <- make_mm(tmp_for_mm, target)

# alinear columnas con el train
missing_cols <- setdiff(colnames(Xtr_mm), colnames(Xnew_mm))
extra_cols   <- setdiff(colnames(Xnew_mm), colnames(Xtr_mm))
if (length(missing_cols)) {
  Xnew_mm <- cbind(Xnew_mm, matrix(0, nrow = nrow(Xnew_mm), ncol = length(missing_cols),
                                   dimnames = list(NULL, missing_cols)))
}
if (length(extra_cols)) {
  Xnew_mm <- Xnew_mm[, setdiff(colnames(Xnew_mm), extra_cols), drop = FALSE]
}
# reordenar
Xnew_mm <- Xnew_mm[, colnames(Xtr_mm), drop = FALSE]

# --- Seleccionar modelo ganador de Q2 ---
if (!exists("best")) best <- results_df[1, ]
chosen_model <- as.character(best$Model)

predict_default <- function(model_name, debug = FALSE) {
  model_name <- toupper(model_name)
  n <- nrow(newdat)
  levs <- levels(train[[target]])

  if (model_name == "LOGIT") {
    p <- as.numeric(predict(fit_logit, newdata = newdat, type = "response"))
    p_default <- if (default_class == levs[2]) p else 1 - p
    pred <- ifelse(p_default >= 0.5, default_class, other_class)

  } else if (model_name == "PROBIT") {
    p <- as.numeric(predict(fit_probit, newdata = newdat, type = "response"))
    p_default <- if (default_class == levs[2]) p else 1 - p
    pred <- ifelse(p_default >= 0.5, default_class, other_class)

  } else if (model_name == "LDA") {
    pred_obj <- predict(fit_lda, Xnew_mm)
    post <- as.matrix(pred_obj$posterior)
    if (!(default_class %in% colnames(post)))
      stop(sprintf("No encuentro posterior para '%s' en LDA.", default_class))
    p_default <- as.numeric(post[, default_class])
    pred <- ifelse(p_default >= 0.5, default_class, other_class)

  } else if (model_name == "RIDGE") {
    if (!exists("pos_class")) {
      tbl_all <- table(train[[target]])
      pos_class <- names(tbl_all)[which.min(tbl_all)]
    }
    p_pos <- as.numeric(predict(cv_ridge, newx = as.matrix(Xnew_mm),
                                s = "lambda.min", type = "response"))
    p_default <- if (identical(default_class, pos_class)) p_pos else (1 - p_pos)
    pred <- ifelse(p_default >= 0.5, default_class, other_class)

  } else if (model_name == "DT" || model_name == "DECISION TREE") {
    pr <- predict(fit_dt, newdata = newdat, type = "prob")
    pr_df <- as.data.frame(pr)
    if (!(default_class %in% colnames(pr_df)))
      stop(sprintf("No encuentro la prob. para '%s' en DT.", default_class))
    p_default <- as.numeric(pr_df[[default_class]])
    pred <- ifelse(p_default >= 0.5, default_class, other_class)

  } else {
    stop(sprintf("Modelo '%s' no reconocido.", model_name))
  }

  if (debug) {
    cat("\n[DEBUG] Modelo:", model_name,
        "| default_class:", default_class,
        "| other_class:", other_class, "\n")
    cat("[DEBUG] Primeras 10 probabilidades de default:\n")
    print(head(p_default, 10))
    cat("[DEBUG] Predicciones (primeras 10):\n")
    print(head(pred, 10))
  }

  out <- data.frame(
    row_id = seq_len(n),
    Predicted_Class = pred,
    Prob_Default = round(p_default, 4),
    stringsAsFactors = FALSE
  )
  return(out)
  
  resultado_Q3 <- predict_default(chosen_model, debug = TRUE)
print(resultado_Q3)

}

act1

Andrea Cruz A01635123

2025-08-22