Actividad 3 — Rotación de cargo

1) Carga, inspección y preparación

recode_ord <- function(x, labels) {
  factor(x, levels = seq_along(labels), labels = labels, ordered = TRUE)
}

### Carga y nombres
if ("rotacion" %in% data(package = "paqueteMODELOS")$results[, "Item"]) {
  data("rotacion", package = "paqueteMODELOS")
} else {
  rotacion <- tryCatch(paqueteMODELOS::rotacion, error = function(e) NULL)
}
stopifnot(!is.null(rotacion))

rotacion <- janitor::clean_names(rotacion)
glimpse(rotacion)

## Rows: 1,470
## Columns: 24
## $ rotacion                    <chr> "Si", "No", "Si", "No", "No", "No", "No", …
## $ edad                        <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35…
## $ viaje_de_negocios           <chr> "Raramente", "Frecuentemente", "Raramente"…
## $ departamento                <chr> "Ventas", "IyD", "IyD", "IyD", "IyD", "IyD…
## $ distancia_casa              <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 2…
## $ educacion                   <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, …
## $ campo_educacion             <chr> "Ciencias", "Ciencias", "Otra", "Ciencias"…
## $ satisfaccion_ambiental      <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, …
## $ genero                      <chr> "F", "M", "M", "F", "M", "M", "F", "M", "M…
## $ cargo                       <chr> "Ejecutivo_Ventas", "Investigador_Cientifi…
## $ satisfacion_laboral         <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, …
## $ estado_civil                <chr> "Soltero", "Casado", "Soltero", "Casado", …
## $ ingreso_mensual             <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, …
## $ trabajos_anteriores         <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, …
## $ horas_extra                 <chr> "Si", "No", "Si", "Si", "No", "No", "Si", …
## $ porcentaje_aumento_salarial <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13…
## $ rendimiento_laboral         <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, …
## $ anos_experiencia            <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5…
## $ capacitaciones              <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, …
## $ equilibrio_trabajo_vida     <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, …
## $ antiguedad                  <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2,…
## $ antiguedad_cargo            <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, …
## $ anos_ultima_promocion       <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, …
## $ anos_acargo_con_mismo_jefe  <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, …

### Recodificación
## --- Variable respuesta: 'rotacion' y binaria 'y' ---
if ("rotacion" %in% names(rotacion)) {
  v <- rotacion$rotacion

  if (is.numeric(v)) {
    v <- ifelse(v == 1, "Si", "No")
  } else if (is.logical(v)) {
    v <- ifelse(v, "Si", "No")
  } else {
    v <- as.character(v)
    v <- dplyr::case_when(
      v %in% c("1","si","sí","Si","SI","Yes","TRUE","True") ~ "Si",
      v %in% c("0","no","No","NO","FALSE","False")          ~ "No",
      TRUE ~ v
    )
  }

  rotacion$rotacion <- factor(v, levels = c("No","Si"))
  rotacion$y <- as.integer(rotacion$rotacion == "Si")   # 1 = Sí rota; 0 = No
}

## --- Ordinales (Likert) como factores ordenados ---
if ("rendimiento_laboral" %in% names(rotacion)) {
  rotacion$rendimiento_laboral <- recode_ord(rotacion$rendimiento_laboral,
    c("Bajo","Medio","Alto","Muy alto"))
}
if ("educacion" %in% names(rotacion)) {
  rotacion$educacion <- recode_ord(rotacion$educacion,
    c("Primaria","Secundaria","Técnico/tecnólogo","Pregrado","Posgrado"))
}
if ("satisfaccion_ambiental" %in% names(rotacion)) {
  rotacion$satisfaccion_ambiental <- recode_ord(rotacion$satisfaccion_ambiental,
    c("Muy insatisfecho","Insatisfecho","Satisfecho","Muy satisfecho"))
}
if ("satisfacion_laboral" %in% names(rotacion)) {
  rotacion$satisfacion_laboral <- recode_ord(rotacion$satisfacion_laboral,
    c("Muy insatisfecho","Insatisfecho","Satisfecho","Muy satisfecho"))
}
if ("equilibrio_trabajo_vida" %in% names(rotacion)) {
  rotacion$equilibrio_trabajo_vida <- recode_ord(rotacion$equilibrio_trabajo_vida,
    c("Muy bajo","Bajo","Medio","Alto"))
}

## --- Numéricas / enteras ---
if ("distancia_de_la_casa" %in% names(rotacion)) {
  rotacion$distancia_de_la_casa <- suppressWarnings(as.numeric(rotacion$distancia_de_la_casa))
}
if ("trabajos_anteriores" %in% names(rotacion)) {
  rotacion$trabajos_anteriores <- suppressWarnings(as.integer(rotacion$trabajos_anteriores))
}

glimpse(rotacion)

## Rows: 1,470
## Columns: 25
## $ rotacion                    <fct> Si, No, Si, No, No, No, No, No, No, No, No…
## $ edad                        <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35…
## $ viaje_de_negocios           <chr> "Raramente", "Frecuentemente", "Raramente"…
## $ departamento                <chr> "Ventas", "IyD", "IyD", "IyD", "IyD", "IyD…
## $ distancia_casa              <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 2…
## $ educacion                   <ord> Secundaria, Primaria, Secundaria, Pregrado…
## $ campo_educacion             <chr> "Ciencias", "Ciencias", "Otra", "Ciencias"…
## $ satisfaccion_ambiental      <ord> Insatisfecho, Satisfecho, Muy satisfecho, …
## $ genero                      <chr> "F", "M", "M", "F", "M", "M", "F", "M", "M…
## $ cargo                       <chr> "Ejecutivo_Ventas", "Investigador_Cientifi…
## $ satisfacion_laboral         <ord> Muy satisfecho, Insatisfecho, Satisfecho, …
## $ estado_civil                <chr> "Soltero", "Casado", "Soltero", "Casado", …
## $ ingreso_mensual             <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, …
## $ trabajos_anteriores         <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, …
## $ horas_extra                 <chr> "Si", "No", "Si", "Si", "No", "No", "Si", …
## $ porcentaje_aumento_salarial <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13…
## $ rendimiento_laboral         <ord> Alto, Muy alto, Alto, Alto, Alto, Alto, Mu…
## $ anos_experiencia            <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5…
## $ capacitaciones              <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, …
## $ equilibrio_trabajo_vida     <ord> Muy bajo, Medio, Medio, Medio, Medio, Bajo…
## $ antiguedad                  <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2,…
## $ antiguedad_cargo            <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, …
## $ anos_ultima_promocion       <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, …
## $ anos_acargo_con_mismo_jefe  <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, …
## $ y                           <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

2) Selección previa de variables (screening)

2.1 Variables numéricas

y <- rotacion$y  # objetivo 0/1

vars_num <- c("edad","distancia_casa","ingreso_mensual","trabajos_anteriores",
              "porcentaje_aumento_salarial","anos_experiencia","capacitaciones",
              "antiguedad","antiguedad_cargo","anos_ultima_promocion",
              "anos_acargo_con_mismo_jefe")

# coerción segura por si alguna quedó como char
for (v in vars_num) if (!is.numeric(rotacion[[v]])) {
  rotacion[[v]] <- suppressWarnings(as.numeric(rotacion[[v]]))
}

rank_num <- lapply(vars_num, function(v){
  x  <- rotacion[[v]]
  ok <- complete.cases(x, y)
  if (!any(ok)) return(NULL)
  r   <- suppressWarnings(cor(x[ok], y[ok], method="pearson"))  # punto-biserial
  m1  <- glm(y ~ x, family=binomial(), data=data.frame(x=x[ok], y=y[ok]))
  auc1<- as.numeric(pROC::auc(y[ok], predict(m1, type="response")))
  data.frame(var=v, r=unname(r), auc=auc1)
})

rank_num <- do.call(rbind, rank_num)
rank_num <- rank_num[order(-abs(rank_num$r), -rank_num$auc), ]
rank_num

2.2 Variables categóricas

vars_nom <- c("viaje_de_negocios","departamento","campo_educacion",
              "genero","cargo","estado_civil","horas_extra")
for (v in vars_nom) if (!is.factor(rotacion[[v]])) rotacion[[v]] <- factor(rotacion[[v]])

vars_ord <- c("educacion","satisfaccion_ambiental","satisfacion_laboral",
              "rendimiento_laboral","equilibrio_trabajo_vida")  # ya vienen <ord>

vars_cat <- c(vars_nom, vars_ord)

y <- rotacion$y  # 0/1
stopifnot(all(y %in% c(0,1)))

rank_cat_list <- lapply(vars_cat, function(v){
  x  <- rotacion[[v]]
  ok <- complete.cases(x, y)
  if (!any(ok)) return(NULL)

  df <- data.frame(y = y[ok], x = x[ok], stringsAsFactors = FALSE)
  m1 <- glm(y ~ x, family = binomial(), data = df)
  p  <- predict(m1, type = "response")

  auc1 <- as.numeric(pROC::auc(df$y, p))  # AUC univariada
  m0 <- glm(y ~ 1, family = binomial(), data = df)
  lr <- anova(m0, m1, test = "Chisq")
  dev_drop <- lr$Deviance[2]; p_lr <- lr$`Pr(>Chi)`[2]

  rho <- NA_real_
  if (is.ordered(x)) {
    tasas <- tapply(df$y, df$x, mean)
    rho   <- suppressWarnings(cor(seq_along(tasas), as.numeric(tasas), method = "spearman"))
  }

  data.frame(var = v, auc = auc1, dev_drop = dev_drop, p_lr = p_lr, rho = rho)
})

rank_cat <- do.call(rbind, Filter(Negate(is.null), rank_cat_list))
rank_cat <- rank_cat[order(-rank_cat$auc, -rank_cat$dev_drop, -abs(rank_cat$rho)), ]
rank_cat

2.3 Variables seleccionadas y hipótesis

top3_num <- head(rank_num$var, 3)
top3_cat <- head(rank_cat$var, 3)

top3_num; top3_cat

## [1] "anos_experiencia" "antiguedad_cargo" "ingreso_mensual"

## [1] "cargo"        "horas_extra"  "estado_civil"

form_pre <- as.formula(paste("y ~", paste(c(top3_cat, top3_num), collapse=" + ")))
form_pre

## y ~ cargo + horas_extra + estado_civil + anos_experiencia + antiguedad_cargo + 
##     ingreso_mensual

Hipótesis (resumen):
- Numéricas: anos_experiencia ↓, antiguedad_cargo ↓, ingreso_mensual ↓ los odds de rotación.
- Categóricas: cargo (hay roles con ↑ rotación), horas_extra (↑ rotación), estado_civil (Soltero ↑, Casado ↓).

3) Análisis univariado y bivariado

3.1 Rotación (prevalencia)

prev_tbl <- rotacion |> 
  dplyr::count(y) |> 
  dplyr::mutate(prop = n / sum(n))

prev_tbl

ggplot(prev_tbl, aes(x = factor(y), y = prop)) +
  geom_col() +
  geom_text(aes(label = scales::percent(prop)), vjust = -0.2) +
  labs(x = "Rotación (0=No, 1=Sí)", y = "Proporción", title = "Prevalencia de rotación") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

Interpretación. La proporción de rotación es 16%; la clase es desbalanceada, por lo que evaluamos con ROC/AUC y ajustamos umbrales en la fase predictiva.

3.2 Numéricas: resumen y gráficos

vars_num <- c(
  "edad","distancia_casa","ingreso_mensual","trabajos_anteriores",
  "porcentaje_aumento_salarial","anos_experiencia","capacitaciones",
  "antiguedad","antiguedad_cargo","anos_ultima_promocion",
  "anos_acargo_con_mismo_jefe"
)
vars_num <- intersect(vars_num, names(rotacion))

num_summary <- lapply(vars_num, function(v){
  x <- rotacion[[v]]
  data.frame(
    variable = v,
    n        = sum(!is.na(x)),
    na       = sum(is.na(x)),
    min      = min(x, na.rm = TRUE),
    q1       = quantile(x, 0.25, na.rm = TRUE),
    mediana  = median(x, na.rm = TRUE),
    q3       = quantile(x, 0.75, na.rm = TRUE),
    max      = max(x, na.rm = TRUE),
    media    = mean(x, na.rm = TRUE),
    sd       = sd(x, na.rm = TRUE),
    stringsAsFactors = FALSE
  )
}) |> dplyr::bind_rows()

knitr::kable(num_summary, caption = "Resumen descriptivo de variables numéricas")

Resumen descriptivo de variables numéricas
	variable	n	min	q1	mediana	q3	max	media	sd
25%…1	edad	1470	18	30	36	43	60	36.924490	9.135938
25%…2	distancia_casa	1470	1	2	7	14	29	9.192517	8.106864
25%…3	ingreso_mensual	1470	1009	2911	4919	8379	19999	6502.931293	4707.956783
25%…4	trabajos_anteriores	1470	0	1	2	4	9	2.693197	2.498009
25%…5	porcentaje_aumento_salarial	1470	11	12	14	18	25	15.209524	3.659938
25%…6	anos_experiencia	1470	0	6	10	15	40	11.279592	7.780782
25%…7	capacitaciones	1470	0	2	3	3	6	2.799320	1.289271
25%…8	antiguedad	1470	0	3	5	9	40	7.008163	6.126525
25%…9	antiguedad_cargo	1470	0	2	3	7	18	4.229252	3.623137
25%…10	anos_ultima_promocion	1470	0	0	1	3	15	2.187755	3.222430
25%…11	anos_acargo_con_mismo_jefe	1470	0	2	3	7	17	4.123129	3.568136

for(v in vars_num){
  print(
    ggplot(rotacion, aes(x = .data[[v]])) +
      geom_histogram(bins = 30) +
      labs(title = paste("Histograma -", v), x = v, y = "Frecuencia") +
      theme_minimal()
  )
  print(
    ggplot(rotacion, aes(y = .data[[v]])) +
      geom_boxplot() +
      labs(title = paste("Boxplot -", v), x = NULL, y = v) +
      theme_minimal()
  )
}

3.3 Categóricas: frecuencias y gráficos

vars_nom <- c("viaje_de_negocios","departamento","campo_educacion",
              "genero","cargo","estado_civil","horas_extra")
vars_ord <- c("educacion","satisfaccion_ambiental","satisfacion_laboral",
              "rendimiento_laboral","equilibrio_trabajo_vida")

vars_nom <- intersect(vars_nom, names(rotacion))
vars_ord <- intersect(vars_ord, names(rotacion))
vars_cat <- c(vars_nom, vars_ord)

for(v in vars_cat){
  df <- rotacion |> 
    dplyr::mutate(across(all_of(v), ~ as.factor(.))) |>
    dplyr::count(.data[[v]]) |>
    dplyr::mutate(prop = n / sum(n)) |>
    dplyr::arrange(dplyr::desc(n))
  print(df)

  g <- ggplot(df, aes(x = .data[[v]], y = prop)) +
    geom_col() +
    geom_text(aes(label = scales::percent(prop)), vjust = -0.2, size = 3) +
    labs(title = paste("Distribución -", v), x = v, y = "Proporción") +
    scale_y_continuous(labels = scales::percent_format()) +
    theme_minimal()
  print(g)
}

## # A tibble: 3 × 3
##   viaje_de_negocios     n  prop
##   <fct>             <int> <dbl>
## 1 Raramente          1043 0.710
## 2 Frecuentemente      277 0.188
## 3 No_Viaja            150 0.102

## # A tibble: 3 × 3
##   departamento     n   prop
##   <fct>        <int>  <dbl>
## 1 IyD            961 0.654 
## 2 Ventas         446 0.303 
## 3 RH              63 0.0429

## # A tibble: 6 × 3
##   campo_educacion     n   prop
##   <fct>           <int>  <dbl>
## 1 Ciencias          606 0.412 
## 2 Salud             464 0.316 
## 3 Mercadeo          159 0.108 
## 4 Tecnicos          132 0.0898
## 5 Otra               82 0.0558
## 6 Humanidades        27 0.0184

## # A tibble: 2 × 3
##   genero     n  prop
##   <fct>  <int> <dbl>
## 1 M        882   0.6
## 2 F        588   0.4

## # A tibble: 9 × 3
##   cargo                       n   prop
##   <fct>                   <int>  <dbl>
## 1 Ejecutivo_Ventas          326 0.222 
## 2 Investigador_Cientifico   292 0.199 
## 3 Tecnico_Laboratorio       259 0.176 
## 4 Director_Manofactura      145 0.0986
## 5 Representante_Salud       131 0.0891
## 6 Gerente                   102 0.0694
## 7 Representante_Ventas       83 0.0565
## 8 Director_Investigación     80 0.0544
## 9 Recursos_Humanos           52 0.0354

## # A tibble: 3 × 3
##   estado_civil     n  prop
##   <fct>        <int> <dbl>
## 1 Casado         673 0.458
## 2 Soltero        470 0.320
## 3 Divorciado     327 0.222

## # A tibble: 2 × 3
##   horas_extra     n  prop
##   <fct>       <int> <dbl>
## 1 No           1054 0.717
## 2 Si            416 0.283

## # A tibble: 5 × 3
##   educacion             n   prop
##   <ord>             <int>  <dbl>
## 1 Técnico/tecnólogo   572 0.389 
## 2 Pregrado            398 0.271 
## 3 Secundaria          282 0.192 
## 4 Primaria            170 0.116 
## 5 Posgrado             48 0.0327

## # A tibble: 4 × 3
##   satisfaccion_ambiental     n  prop
##   <ord>                  <int> <dbl>
## 1 Satisfecho               453 0.308
## 2 Muy satisfecho           446 0.303
## 3 Insatisfecho             287 0.195
## 4 Muy insatisfecho         284 0.193

## # A tibble: 4 × 3
##   satisfacion_laboral     n  prop
##   <ord>               <int> <dbl>
## 1 Muy satisfecho        459 0.312
## 2 Satisfecho            442 0.301
## 3 Muy insatisfecho      289 0.197
## 4 Insatisfecho          280 0.190

## # A tibble: 2 × 3
##   rendimiento_laboral     n  prop
##   <ord>               <int> <dbl>
## 1 Alto                 1244 0.846
## 2 Muy alto              226 0.154

## # A tibble: 4 × 3
##   equilibrio_trabajo_vida     n   prop
##   <ord>                   <int>  <dbl>
## 1 Medio                     893 0.607 
## 2 Bajo                      344 0.234 
## 3 Alto                      153 0.104 
## 4 Muy bajo                   80 0.0544

3.4 Bivariado: numéricas vs rotación

vars_num <- intersect(vars_num, names(rotacion))
num_biv <- lapply(vars_num, function(v){
  x <- rotacion[[v]]; y <- rotacion$y
  ok <- complete.cases(x,y)
  if(!any(ok)) return(NULL)
  df <- data.frame(x=x[ok], y=y[ok])

  gsum <- df |>
    dplyr::group_by(y) |>
    dplyr::summarise(n=dplyr::n(), media=mean(x), sd=sd(x), .groups="drop")

  tt <- tryCatch(t.test(x ~ y, data=df), error=function(e) NULL)
  dif_media <- if (!is.null(tt)) unname(diff(tt$estimate)) else NA_real_
  p_ttest   <- if (!is.null(tt)) tt$p.value else NA_real_

  m  <- glm(y ~ x, family=binomial(), data=df)
  b  <- coef(summary(m))["x","Estimate"]
  se <- coef(summary(m))["x","Std. Error"]
  or <- exp(b)
  li <- exp(b - 1.96*se); ls <- exp(b + 1.96*se)
  p  <- coef(summary(m))["x","Pr(>|z|)"]

  tibble::tibble(
    variable = v,
    media_y0 = gsum$media[gsum$y==0],
    media_y1 = gsum$media[gsum$y==1],
    dif_media = dif_media, p_ttest = p_ttest,
    beta = b, OR = or, OR_LI = li, OR_LS = ls, p_glm = p,
    signo = ifelse(b>0,"(+)","(-)")
  )
}) |> dplyr::bind_rows()

knitr::kable(num_biv, digits = 3, caption = "Bivariado numéricas vs rotación")

Bivariado numéricas vs rotación
variable	media_y0	media_y1	dif_media	p_ttest	beta	OR	OR_LI	OR_LS	p_glm	signo
edad	37.562	33.608	-3.954	0.000	-0.052	0.949	0.933	0.965	0.000	(-)
distancia_casa	8.916	10.633	1.717	0.004	0.025	1.025	1.008	1.042	0.003	(+)
ingreso_mensual	6832.740	4787.093	-2045.647	0.000	0.000	1.000	1.000	1.000	0.000	(-)
trabajos_anteriores	2.646	2.941	0.295	0.116	0.046	1.047	0.992	1.104	0.096	(+)
porcentaje_aumento_salarial	15.231	15.097	-0.134	0.614	-0.010	0.990	0.953	1.029	0.605	(-)
anos_experiencia	11.863	8.245	-3.618	0.000	-0.078	0.925	0.903	0.948	0.000	(-)
capacitaciones	2.833	2.624	-0.208	0.020	-0.130	0.878	0.785	0.982	0.023	(-)
antiguedad	7.369	5.131	-2.238	0.000	-0.081	0.922	0.894	0.952	0.000	(-)
antiguedad_cargo	4.484	2.903	-1.581	0.000	-0.146	0.864	0.824	0.906	0.000	(-)
anos_ultima_promocion	2.234	1.945	-0.289	0.199	-0.030	0.971	0.927	1.017	0.206	(-)
anos_acargo_con_mismo_jefe	4.367	2.852	-1.515	0.000	-0.141	0.868	0.828	0.910	0.000	(-)

3.5 Bivariado: categóricas vs rotación

cat_biv <- lapply(vars_cat, function(v){
  x  <- rotacion[[v]]; y <- rotacion$y
  ok <- complete.cases(x, y)
  if (!any(ok)) return(NULL)

  df <- data.frame(f = x[ok], y = y[ok])
  if (is.ordered(df$f)) df$f <- factor(df$f, ordered = FALSE)
  df$f <- factor(df$f)
  if (nlevels(df$f) < 2) return(NULL)

  tasas <- df |>
    dplyr::count(f) |>
    dplyr::mutate(rotacion = tapply(df$y, df$f, mean)[as.character(f)]) |>
    dplyr::arrange(dplyr::desc(rotacion))

  tab <- table(df$f, df$y)
  p_chi <- suppressWarnings(chisq.test(tab))$p.value

  df$f <- stats::relevel(df$f, ref = levels(df$f)[1])
  m  <- glm(y ~ f, family = binomial(), data = df)
  co <- broom::tidy(m, conf.int = TRUE, exponentiate = TRUE) |>
        dplyr::filter(term != "(Intercept)") |>
        dplyr::transmute(
          nivel = sub("^f", "", term),
          OR = estimate, OR_LI = conf.low, OR_LS = conf.high, p_glm = p.value
        )

  list(variable = v, tasas = tasas, p_chi = p_chi, ORs = co)
})

for(res in cat_biv){
  cat("\n\n==== Variable:", res$variable, "====\n")
  print(res$tasas)
  cat("Chi-cuadrado p-value:", signif(res$p_chi, 4), "\n")
  cat("OR por nivel vs. referencia (primer nivel):\n")
  print(res$ORs)
}

## 
## 
## ==== Variable: viaje_de_negocios ====
##                f    n  rotacion
## 1 Frecuentemente  277 0.2490975
## 2      Raramente 1043 0.1495686
## 3       No_Viaja  150 0.0800000
## Chi-cuadrado p-value: 5.609e-06 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 2 × 5
##   nivel        OR OR_LI OR_LS     p_glm
##   <chr>     <dbl> <dbl> <dbl>     <dbl>
## 1 No_Viaja  0.262 0.131 0.485 0.0000536
## 2 Raramente 0.530 0.386 0.734 0.000107 
## 
## 
## ==== Variable: departamento ====
##        f   n  rotacion
## 1 Ventas 446 0.2062780
## 2     RH  63 0.1904762
## 3    IyD 961 0.1383975
## Chi-cuadrado p-value: 0.004526 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 2 × 5
##   nivel     OR OR_LI OR_LS   p_glm
##   <chr>  <dbl> <dbl> <dbl>   <dbl>
## 1 RH      1.46 0.729  2.73 0.253  
## 2 Ventas  1.62 1.20   2.17 0.00131
## 
## 
## ==== Variable: campo_educacion ====
##             f   n  rotacion
## 1 Humanidades  27 0.2592593
## 2    Tecnicos 132 0.2424242
## 3    Mercadeo 159 0.2201258
## 4    Ciencias 606 0.1468647
## 5       Salud 464 0.1357759
## 6        Otra  82 0.1341463
## Chi-cuadrado p-value: 0.006774 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 5 × 5
##   nivel          OR OR_LI OR_LS   p_glm
##   <chr>       <dbl> <dbl> <dbl>   <dbl>
## 1 Humanidades 2.03  0.779  4.74 0.118  
## 2 Mercadeo    1.64  1.05   2.52 0.0267 
## 3 Otra        0.900 0.437  1.70 0.759  
## 4 Salud       0.913 0.642  1.29 0.607  
## 5 Tecnicos    1.86  1.16   2.91 0.00787
## 
## 
## ==== Variable: genero ====
##   f   n  rotacion
## 1 M 882 0.1700680
## 2 F 588 0.1479592
## Chi-cuadrado p-value: 0.2906 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 1 × 5
##   nivel    OR OR_LI OR_LS p_glm
##   <chr> <dbl> <dbl> <dbl> <dbl>
## 1 M      1.18 0.887  1.58 0.259
## 
## 
## ==== Variable: cargo ====
##                         f   n   rotacion
## 1    Representante_Ventas  83 0.39759036
## 2     Tecnico_Laboratorio 259 0.23938224
## 3        Recursos_Humanos  52 0.23076923
## 4        Ejecutivo_Ventas 326 0.17484663
## 5 Investigador_Cientifico 292 0.16095890
## 6    Director_Manofactura 145 0.06896552
## 7     Representante_Salud 131 0.06870229
## 8                 Gerente 102 0.04901961
## 9  Director_Investigación  80 0.02500000
## Chi-cuadrado p-value: 2.752e-15 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 8 × 5
##   nivel                      OR OR_LI OR_LS     p_glm
##   <chr>                   <dbl> <dbl> <dbl>     <dbl>
## 1 Director_Manofactura     2.89 0.738  19.1 0.178    
## 2 Ejecutivo_Ventas         8.26 2.50   51.1 0.00385  
## 3 Gerente                  2.01 0.421  14.3 0.412    
## 4 Investigador_Cientifico  7.48 2.24   46.4 0.00608  
## 5 Recursos_Humanos        11.7  3.00   77.6 0.00180  
## 6 Representante_Salud      2.88 0.718  19.2 0.184    
## 7 Representante_Ventas    25.7  7.37  163.  0.0000150
## 8 Tecnico_Laboratorio     12.3  3.71   75.9 0.000601 
## 
## 
## ==== Variable: estado_civil ====
##            f   n  rotacion
## 1    Soltero 470 0.2553191
## 2     Casado 673 0.1248143
## 3 Divorciado 327 0.1009174
## Chi-cuadrado p-value: 9.456e-11 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 2 × 5
##   nivel         OR OR_LI OR_LS        p_glm
##   <chr>      <dbl> <dbl> <dbl>        <dbl>
## 1 Divorciado 0.787 0.508  1.19 0.271       
## 2 Soltero    2.40  1.77   3.28 0.0000000254
## 
## 
## ==== Variable: horas_extra ====
##    f    n  rotacion
## 1 Si  416 0.3052885
## 2 No 1054 0.1043643
## Chi-cuadrado p-value: 8.158e-21 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 1 × 5
##   nivel    OR OR_LI OR_LS    p_glm
##   <chr> <dbl> <dbl> <dbl>    <dbl>
## 1 Si     3.77  2.83  5.03 1.35e-19
## 
## 
## ==== Variable: educacion ====
##                   f   n  rotacion
## 1          Primaria 170 0.1823529
## 2 Técnico/tecnólogo 572 0.1730769
## 3        Secundaria 282 0.1560284
## 4          Pregrado 398 0.1457286
## 5          Posgrado  48 0.1041667
## Chi-cuadrado p-value: 0.5455 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 4 × 5
##   nivel                OR OR_LI OR_LS p_glm
##   <chr>             <dbl> <dbl> <dbl> <dbl>
## 1 Secundaria        0.829 0.502  1.38 0.467
## 2 Técnico/tecnólogo 0.938 0.607  1.48 0.780
## 3 Pregrado          0.765 0.477  1.25 0.272
## 4 Posgrado          0.521 0.170  1.32 0.204
## 
## 
## ==== Variable: satisfaccion_ambiental ====
##                  f   n  rotacion
## 1 Muy insatisfecho 284 0.2535211
## 2     Insatisfecho 287 0.1498258
## 3       Satisfecho 453 0.1368653
## 4   Muy satisfecho 446 0.1345291
## Chi-cuadrado p-value: 5.123e-05 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 3 × 5
##   nivel             OR OR_LI OR_LS     p_glm
##   <chr>          <dbl> <dbl> <dbl>     <dbl>
## 1 Insatisfecho   0.519 0.339 0.787 0.00221  
## 2 Satisfecho     0.467 0.319 0.681 0.0000801
## 3 Muy satisfecho 0.458 0.312 0.669 0.0000590
## 
## 
## ==== Variable: satisfacion_laboral ====
##                  f   n  rotacion
## 1 Muy insatisfecho 289 0.2283737
## 2       Satisfecho 442 0.1651584
## 3     Insatisfecho 280 0.1642857
## 4   Muy satisfecho 459 0.1132898
## Chi-cuadrado p-value: 0.0005563 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 3 × 5
##   nivel             OR OR_LI OR_LS     p_glm
##   <chr>          <dbl> <dbl> <dbl>     <dbl>
## 1 Insatisfecho   0.664 0.435 1.01  0.0555   
## 2 Satisfecho     0.668 0.461 0.971 0.0339   
## 3 Muy satisfecho 0.432 0.289 0.642 0.0000359
## 
## 
## ==== Variable: rendimiento_laboral ====
##          f    n  rotacion
## 1 Muy alto  226 0.1637168
## 2     Alto 1244 0.1607717
## Chi-cuadrado p-value: 0.9901 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 1 × 5
##   nivel       OR OR_LI OR_LS p_glm
##   <chr>    <dbl> <dbl> <dbl> <dbl>
## 1 Muy alto  1.02 0.688  1.48 0.912
## 
## 
## ==== Variable: equilibrio_trabajo_vida ====
##          f   n  rotacion
## 1 Muy bajo  80 0.3125000
## 2     Alto 153 0.1764706
## 3     Bajo 344 0.1686047
## 4    Medio 893 0.1422172
## Chi-cuadrado p-value: 0.0009726 
## OR por nivel vs. referencia (primer nivel):
## # A tibble: 3 × 5
##   nivel    OR OR_LI OR_LS    p_glm
##   <chr> <dbl> <dbl> <dbl>    <dbl>
## 1 Bajo  0.446 0.259 0.782 0.00407 
## 2 Medio 0.365 0.221 0.615 0.000102
## 3 Alto  0.471 0.251 0.887 0.0192

4) Estimación del modelo logístico (multivariable)

Usamos las 6 variables seleccionadas previamente (top3_cat y top3_num). El modelo se ajusta con glm(family = binomial); para las variables categóricas, R crea dummies automáticamente (nivel de referencia = primer nivel del factor).

sel_num <- top3_num
sel_cat <- top3_cat
vars_modelo <- c("y", sel_cat, sel_num)

df_mod <- rotacion |>
  dplyr::select(dplyr::all_of(vars_modelo)) |>
  tidyr::drop_na()

form <- as.formula(paste("y ~", paste(c(sel_cat, sel_num), collapse = " + ")))
form

## y ~ cargo + horas_extra + estado_civil + anos_experiencia + antiguedad_cargo + 
##     ingreso_mensual

mod <- glm(form, family = binomial(), data = df_mod)

4.1 Coeficientes (OR, IC95% y p-valor)

coefs <- broom::tidy(mod, conf.int = TRUE, exponentiate = TRUE) |>
  dplyr::rename(OR = estimate, LI = conf.low, LS = conf.high) |>
  dplyr::mutate(dplyr::across(c(OR, LI, LS), ~ round(.x, 3))) |>
  dplyr::arrange(dplyr::desc(abs(OR - 1)))

knitr::kable(coefs, caption = "Modelo logístico: OR (IC95%) y significancia (prueba de Wald)")

Modelo logístico: OR (IC95%) y significancia (prueba de Wald)
term	OR	std.error	statistic	p.value	LI	LS
cargoRepresentante_Ventas	24.166	0.9663641	3.2957977	0.0009814	4.157	206.585
cargoRecursos_Humanos	15.948	0.9597271	2.8855494	0.0039073	2.762	134.575
cargoTecnico_Laboratorio	14.607	0.9361533	2.8643831	0.0041782	2.691	119.447
cargoEjecutivo_Ventas	8.946	0.8470058	2.5870039	0.0096814	2.021	64.411
cargoInvestigador_Cientifico	6.756	0.9386043	2.0353642	0.0418142	1.236	55.407
horas_extraSi	4.662	0.1621626	9.4934107	0.0000000	3.400	6.425
cargoRepresentante_Salud	3.258	0.8964787	1.3175351	0.1876593	0.644	24.985
cargoDirector_Manofactura	3.085	0.8918746	1.2632328	0.2065055	0.617	23.521
estado_civilSoltero	2.311	0.1724618	4.8573022	0.0000012	1.651	3.248
cargoGerente	2.065	0.8662947	0.8370202	0.4025812	0.418	15.017
(Intercept)	0.014	1.0139212	-4.2129254	0.0000252	0.002	0.090
estado_civilDivorciado	0.726	0.2327369	-1.3743312	0.1693389	0.455	1.136
antiguedad_cargo	0.915	0.0291595	-3.0466793	0.0023138	0.863	0.968
anos_experiencia	0.975	0.0189616	-1.3530876	0.1760277	0.938	1.011
ingreso_mensual	1.000	0.0000521	0.9294941	0.3526331	1.000	1.000

4.2 Significancia global por término (LR)

anova_glob <- anova(mod, test = "Chisq")

anova_tbl <- anova_glob |>
  as.data.frame() |>
  tibble::rownames_to_column("Término") |>
  dplyr::filter(Término != "NULL") |>
  dplyr::rename(
    `g.l.`            = Df,
    `ΔDeviance`       = Deviance,
    `g.l. resid.`     = `Resid. Df`,
    `Deviance resid.` = `Resid. Dev`,
    `p (LR)`          = `Pr(>Chi)`
  ) |>
  dplyr::mutate(
    `ΔDeviance`        = round(`ΔDeviance`, 3),
    `Deviance resid.`  = round(`Deviance resid.`, 3),
    `p (LR)`           = format.pval(`p (LR)`, digits = 3, eps = 1e-4)
  )

knitr::kable(
  anova_tbl,
  caption = "Prueba LR global por término (modelo logístico, sin modelo nulo)",
  align = "lrrrrr"
)

Prueba LR global por término (modelo logístico, sin modelo nulo)
Término	g.l.	ΔDeviance	g.l. resid.	Deviance resid.	p (LR)
cargo	8	88.909	1461	1209.674	< 1e-04
horas_extra	1	89.679	1460	1119.995	< 1e-04
estado_civil	2	39.986	1458	1080.009	< 1e-04
anos_experiencia	1	6.349	1457	1073.660	0.01175
antiguedad_cargo	1	9.533	1456	1064.127	0.00202
ingreso_mensual	1	0.861	1455	1063.266	0.35338

4.3 ANOVA Tipo II (orden-independiente)

anova_t2 <- car::Anova(mod, test = "LR", type = 2) |>
  as.data.frame() |>
  tibble::rownames_to_column("Término") |>
  dplyr::rename(`Chi²` = `LR Chisq`, `g.l.` = Df, `p (LR)` = `Pr(>Chisq)`)

knitr::kable(
  anova_t2 |>
    dplyr::mutate(`p (LR)` = format.pval(`p (LR)`, digits = 3, eps = 1e-4)),
  caption = "ANOVA Tipo II (LR): significancia global por término (orden-independiente)",
  align = "lrrr"
)

ANOVA Tipo II (LR): significancia global por término (orden-independiente)
Término	Chi²	g.l.	p (LR)
cargo	44.5999581	8	< 1e-04
horas_extra	92.8478127	1	< 1e-04
estado_civil	36.0564094	2	< 1e-04
anos_experiencia	1.8754622	1	0.17085
antiguedad_cargo	9.5974168	1	0.00195
ingreso_mensual	0.8612899	1	0.35338

4.4 Colinealidad (VIF) y tamaño de efecto comparable

v <- car::vif(mod)
vif_tbl <- tibble::tibble(
  Termino = names(v),
  VIF = as.numeric(v),
  df = ifelse(is.matrix(v), v[,2], 1)
) |>
  dplyr::mutate(VIF_ajust = ifelse(df>1, VIF^(1/(2*df)), VIF),
                Nota = dplyr::case_when(
                  VIF_ajust < 3 ~ "Bajo",
                  VIF_ajust < 5 ~ "Moderado",
                  TRUE          ~ "Alto"
                ))

knitr::kable(vif_tbl, digits = 3, caption = "Colinealidad: GVIF y GVIF^(1/(2*Df))")

Colinealidad: GVIF y GVIF^(1/(2*Df))
VIF	df	VIF_ajust	Nota
4.222	8	1.094	Bajo
1.066	8	1.004	Bajo
1.043	8	1.003	Bajo
2.417	8	1.057	Bajo
1.258	8	1.014	Bajo
5.449	8	1.112	Bajo
8.000	8	1.139	Bajo
1.000	8	1.000	Bajo
2.000	8	1.044	Bajo
1.000	8	1.000	Bajo
1.000	8	1.000	Bajo
1.000	8	1.000	Bajo
1.094	8	1.006	Bajo
1.032	8	1.002	Bajo
1.011	8	1.001	Bajo
1.555	8	1.028	Bajo
1.122	8	1.007	Bajo
2.334	8	1.054	Bajo

4.5 OR “estandarizados” (por 1 DE e IQR) para numéricas

num_betas <- broom::tidy(mod) |>
  dplyr::filter(term %in% sel_num) |>
  dplyr::select(term, estimate)

esc <- lapply(sel_num, function(v){
  x <- df_mod[[v]]
  sdv <- stats::sd(x)
  iqr <- as.numeric(stats::IQR(x))
  b   <- num_betas$estimate[num_betas$term == v]
  tibble::tibble(variable=v,
                 SD=sdv, IQR=iqr,
                 OR_por_1SD = exp(b*sdv),
                 OR_por_IQR = exp(b*iqr))
}) |> dplyr::bind_rows()

knitr::kable(dplyr::mutate(esc,
                           dplyr::across(c(OR_por_1SD, OR_por_IQR), ~ round(.x,3))),
             caption="OR por 1 desviación estándar y por IQR (comparables entre numéricas)")

OR por 1 desviación estándar y por IQR (comparables entre numéricas)
variable	SD	IQR	OR_por_1SD	OR_por_IQR
anos_experiencia	7.780782	9	0.819	0.794
antiguedad_cargo	3.623137	5	0.725	0.641
ingreso_mensual	4707.956783	5468	1.256	1.303

4.6 No linealidad e interacción

form_nl <- as.formula(
  paste("y ~", paste(sel_cat, collapse=" + "), "+",
        paste(sprintf("splines::ns(%s, 3)", sel_num), collapse=" + "))
)
mod_nl <- glm(form_nl, family=binomial(), data=df_mod)
comp_nl <- anova(mod, mod_nl, test="Chisq") |> as.data.frame()

form_int <- update(form, . ~ . + cargo:horas_extra)
mod_int  <- glm(form_int, family=binomial(), data=df_mod)
comp_int <- anova(mod, mod_int, test="Chisq") |> as.data.frame()

knitr::kable(comp_nl, caption="¿Mejora permitir curvatura? (LR: base vs. no lineal)")

¿Mejora permitir curvatura? (LR: base vs. no lineal)
Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
1455	1063.266	NA	NA	NA
1449	1031.590	6	31.67606	1.88e-05

knitr::kable(comp_int, caption="¿Aporta la interacción cargo×horas_extra? (LR)")

¿Aporta la interacción cargo×horas_extra? (LR)
Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
1455	1063.266	NA	NA	NA
1447	1052.068	8	11.19745	0.1907607

5) Evaluación (ROC/AUC, CV, calibración y umbral)

5.1 Hold-out 70/30 + ROC/AUC + métricas por umbral

set.seed(9026655)
idx  <- caret::createDataPartition(df_mod$y, p = 0.7, list = FALSE)
train <- df_mod[idx, ]; test <- df_mod[-idx, ]

mod_tr <- glm(form, family=binomial(), data=train)
prob_test <- as.numeric(predict(mod_tr, newdata=test, type="response"))

stopifnot(length(unique(test$y))>=2)
roc_obj <- pROC::roc(response=test$y, predictor=prob_test, quiet=TRUE)
auc_val <- as.numeric(pROC::auc(roc_obj))
plot(roc_obj, main=paste0("ROC — AUC (test) = ", round(auc_val,3)))

opt_coords <- pROC::coords(roc_obj, x="best", best.method="youden",
                           ret=c("threshold","sensitivity","specificity"))
opt_thresh <- as.numeric(opt_coords["threshold"])

clasifica <- function(p, thr) as.integer(as.numeric(p) >= as.numeric(thr))
metricas  <- function(y, yhat){
  y <- as.integer(y); yhat <- as.integer(yhat)
  TP <- sum(y==1 & yhat==1); TN <- sum(y==0 & yhat==0)
  FP <- sum(y==0 & yhat==1); FN <- sum(y==1 & yhat==0)
  tibble::tibble(
    Accuracy=(TP+TN)/length(y),
    Precision=ifelse(TP+FP==0, NA, TP/(TP+FP)),
    Sensibilidad=ifelse(TP+FN==0, NA, TP/(TP+FN)),
    Especificidad=ifelse(TN+FP==0, NA, TN/(TN+FP))
  )
}

pred_05 <- clasifica(prob_test, 0.5)
pred_op <- clasifica(prob_test, opt_thresh)

tab_met <- dplyr::bind_rows(
  metricas(test$y, pred_05) |> dplyr::mutate(Umbral=0.5, .before=1),
  metricas(test$y, pred_op) |> dplyr::mutate(Umbral=round(opt_thresh,3), .before=1)
)
knitr::kable(tab_met, digits=3, caption="Desempeño en test (0.5 vs. Youden)")

Desempeño en test (0.5 vs. Youden)
Umbral	Accuracy	Precision	Sensibilidad	Especificidad
0.500	0.848	0.750	0.123	0.992
0.131	0.644	0.294	0.822	0.609

5.2 AUC con validación cruzada (5×3)

df_cv <- train |>
  dplyr::mutate(resp = factor(ifelse(y==1,"Si","No"), levels=c("No","Si"))) |>
  dplyr::select(-y)

ctrl <- caret::trainControl(
  method = "repeatedcv", number = 5, repeats = 3,
  summaryFunction = caret::twoClassSummary, classProbs = TRUE, savePredictions = "final"
)

set.seed(9026655)
fit_cv <- caret::train(
  stats::as.formula(paste("resp ~", paste(c(sel_cat, sel_num), collapse=" + "))), 
  data = df_cv, method = "glm", family = binomial(),
  trControl = ctrl, metric = "ROC"
)

cv_auc <- mean(fit_cv$results$ROC)
knitr::kable(tibble::tibble(`AUC CV (5x3)` = round(cv_auc,3)),
             caption = "AUC promedio en validación cruzada")

AUC promedio en validación cruzada
AUC CV (5x3)
0.768

5.3 Calibración y Brier score

brier <- mean( (test$y - prob_test)^2 )

calib <- tibble::tibble(y=test$y, p=prob_test) |>
  dplyr::mutate(bin = dplyr::ntile(p, 10)) |>
  dplyr::group_by(bin) |>
  dplyr::summarise(p_prom = mean(p), y_obs = mean(y), n=dplyr::n(), .groups="drop")

ggplot(calib, aes(x=p_prom, y=y_obs)) +
  geom_point() + geom_line() +
  geom_abline(slope=1, intercept=0, linetype=2) +
  labs(x="Prob. predicha (promedio por decil)", y="Frecuencia observada",
       title=paste0("Calibración (test) — Brier=", round(brier,3))) +
  theme_minimal()

5.4 Umbral por costos (sensibilidad a FN vs FP)

cost_fn <- 5   # costo relativo de un falso negativo
cost_fp <- 1   # costo relativo de un falso positivo

grid <- seq(0.05, 0.95, by=0.05)
tbl_cost <- lapply(grid, function(t){
  yhat <- clasifica(prob_test, t)
  y <- test$y
  FN <- sum(y==1 & yhat==0); FP <- sum(y==0 & yhat==1)
  costo <- cost_fn*FN + cost_fp*FP
  tibble::tibble(umbral=t, costo=costo)
}) |> dplyr::bind_rows()

t_opt <- tbl_cost$umbral[which.min(tbl_cost$costo)]
knitr::kable(dplyr::mutate(tbl_cost, costo=round(costo,1)),
             caption=paste("Costo esperado (FN=",cost_fn,", FP=",cost_fp,") — umbral óptimo ≈", round(t_opt,3)))

Costo esperado (FN= 5 , FP= 1 ) — umbral óptimo ≈ 0.15
umbral	costo
0.05	288
0.10	232
0.15	224
0.20	229
0.25	237
0.30	244
0.35	242
0.40	265
0.45	315
0.50	323
0.55	330
0.60	340
0.65	350
0.70	350
0.75	360
0.80	365
0.85	365
0.90	365
0.95	365

6) Predicciones y decisión operativa

6.1 Caso hipotético

new_emp <- tibble::tibble(
  cargo = factor("Ejecutivo_Ventas", levels=levels(df_mod$cargo)),
  horas_extra = factor("Si", levels=levels(df_mod$horas_extra)),
  estado_civil = factor("Soltero", levels=levels(df_mod$estado_civil)),
  ingreso_mensual = 5000,
  anos_experiencia = 3,
  antiguedad_cargo = 1
)

p_new  <- predict(mod_tr, newdata=new_emp, type="response")
decision <- ifelse(p_new >= t_opt, "INTERVENIR (alto riesgo)", "No intervenir")

knitr::kable(tibble::tibble(
  Prob_rotacion = round(as.numeric(p_new),3),
  Umbral = round(t_opt,3),
  Decision = decision
), caption = "Predicción individual y decisión basada en costo mínimo")

Predicción individual y decisión basada en costo mínimo
Prob_rotacion	Umbral	Decision
0.616	0.15	INTERVENIR (alto riesgo)

6.2 Segmentación de riesgo (deciles)

seg <- tibble::tibble(id = 1:nrow(test), y=test$y, p=prob_test) |>
  dplyr::mutate(dec = dplyr::ntile(p, 10)) |>
  dplyr::group_by(dec) |>
  dplyr::summarise(n=dplyr::n(), p_prom=mean(p), tasa_rot=mean(y), .groups="drop") |>
  dplyr::arrange(dplyr::desc(dec))

knitr::kable(dplyr::mutate(seg, dplyr::across(c(p_prom, tasa_rot), ~ round(.x,3))),
             caption="Deciles de riesgo: prob. predicha y tasa observada (test)")

Deciles de riesgo: prob. predicha y tasa observada (test)
dec	n	p_prom	tasa_rot
10	44	0.465	0.636
9	44	0.288	0.250
8	44	0.212	0.159
7	44	0.168	0.159
6	44	0.133	0.159
5	44	0.097	0.114
4	44	0.067	0.023
3	44	0.049	0.045
2	44	0.033	0.045
1	45	0.016	0.067

7) Conclusiones

Resumen ejecutivo (al grano).
- Determinantes (modelo multivariable): horas_extra = Sí y ciertos cargos elevan el riesgo; estado_civil = Soltero se asocia a mayor propensión; antiguedad_cargo y anos_experiencia protegen (OR < 1).
- Poder predictivo: AUC (test) ≈ 0.774; AUC (CV 5×3) ≈ 0.768.
- Umbral operativo: con costos FN:FP = 5:1, el corte óptimo ≈ 0.15.

Plan de acción (vinculado a los hallazgos): - Horas extra (palanca modificable, prioridad alta): limitar y redistribuir turnos; refuerzos en picos; automatizar tareas repetitivas.
- Cargos con mayor propensión (“calientes”): revisar carga/procesos; rutas de desarrollo y upskilling; incentivos de retención focalizados.
- Baja antigüedad en el cargo: onboarding y mentoring 30–60–90 días, feedback quincenal.
- Baja experiencia: itinerarios de aprendizaje, rotación interna planificada, “buddy”.
- Beneficios universales (no segmentar por estado civil): flexibilidad, movilidad, bienestar.

Política de decisión con el modelo: - Intervenir a quienes tengan Prob(rotación) ≥ 0.15 y, si aplica, pertenezcan a cargos calientes o reporten horas extra.
- Recalibrar umbral mensualmente según costos y capacidad operativa (si se prioriza retener talento clave, bajar umbral → ↑ sensibilidad).
- Monitorear equidad (desempeño por subgrupos) y drift trimestral.