# Cargar librerías necesarias
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(cluster)
library(survey)
## Warning: package 'survey' was built under R version 4.4.3
## Cargando paquete requerido: grid
## Cargando paquete requerido: Matrix
## Cargando paquete requerido: survival
## 
## Adjuntando el paquete: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
# Build the employee data set: 150 rows, three strata of 50 employees each.
# Columns:
#   No_Empleados      - running employee number (1..150)
#   Estrato           - stratum label (1, 2 or 3), 50 consecutive rows each
#   Ingreso_Neto      - net income
#   Personas_Cargo    - dependants category; within every stratum the first 12
#                       rows are "Ninguna", the next 25 "Entre 1 y 2" and the
#                       last 13 "Más de 2"
#   Gastos_Familiares - family expenses
#   Gastos_Arriendo   - rent expenses
datos <- data.frame(
  No_Empleados = seq_len(150),
  Estrato = rep(c(1, 2, 3), each = 50),
  Ingreso_Neto = c(895212, 822988, 812288, 627853, 673359, 696733, 807765, 862058, 730068, 712263, 
                   717301, 760384, 863677, 698134, 754593, 623313, 892024, 653688, 689467, 580557, 
                   810195, 818755, 551942, 581640, 674142, 624423, 733533, 618427, 666012, 835102, 
                   706667, 671992, 621673, 583886, 818574, 575680, 601307, 733753, 817579, 591608, 
                   865841, 875263, 798527, 812552, 808481, 661405, 817393, 748897, 797257, 830968, 
                   992000, 1016687, 947507, 1067155, 1171985, 1012223, 1089864, 1188057, 1169672, 
                   1161676, 1094383, 981592, 1086857, 1130201, 1011842, 1125425, 1011184, 1024521, 
                   975849, 1156689, 1071879, 1086091, 978796, 993162, 1039345, 934938, 960173, 
                   947692, 1078029, 1106095, 1127228, 1129931, 1000968, 966183, 965288, 963745, 
                   1014802, 1124355, 991637, 1012785, 1093853, 1107761, 909744, 1050192, 1147289, 
                   1196883, 1099008, 1148256, 1079237, 1160936, 1711165, 1630678, 1486808, 1930872, 
                   1238480, 1491703, 1855658, 1533965, 1715683, 1706631, 1285885, 1913618, 1894917, 
                   1402940, 1286128, 1656382, 1993289, 1375618, 1264670, 1499591, 1611107, 1519154, 
                   1902240, 1573037, 1215203, 1589702, 1208222, 1360446, 1427615, 1960288, 1628354, 
                   1664025, 1302706, 1963135, 1347579, 1364081, 1977855, 1266223, 1546094, 1944575, 
                   1708847, 1768507, 1453808, 1462913, 1319356, 1294243, 1308641, 1280682, 1881446, 
                   1711280),
  # The same 12 / 25 / 13 pattern is repeated once per stratum.
  Personas_Cargo = rep(
    rep(c("Ninguna", "Entre 1 y 2", "Más de 2"), times = c(12, 25, 13)),
    times = 3
  ),
  Gastos_Familiares = c(367037, 288046, 324915, 282534, 282811, 320497, 298873, 344823, 306629, 306273, 
                        329958, 334569, 380018, 376992, 392388, 324123, 401411, 333381, 351628, 307695, 
                        348384, 360252, 287010, 314086, 357295, 324700, 359431, 309214, 346326, 384147, 
                        360400, 329276, 310837, 315298, 384730, 305110, 300654, 440252, 408790, 343133, 
                        424262, 437632, 431205, 422527, 420410, 370387, 425044, 419382, 390656, 407174, 
                        307520, 345674, 360053, 384176, 492234, 374523, 435946, 427701, 409385, 476287, 
                        437753, 333741, 521691, 531194, 435092, 495187, 434809, 471280, 429374, 497376, 
                        471627, 467019, 460034, 456855, 467705, 420722, 422476, 435938, 517454, 519865, 
                        507253, 542367, 430416, 463768, 424727, 424048, 456661, 584665, 515651, 526648, 
                        590681, 587113, 491262, 546100, 596590, 622379, 549504, 608576, 582788, 626905, 
                        641482, 623434, 446389, 652479, 654247, 534410, 612186, 604781, 566791, 524465, 
                        389532, 555255, 544871, 612958, 577229, 662603, 772156, 581769, 584077, 739080, 
                        582449, 531653, 588743, 720060, 585170, 595294, 657666, 789130, 628575, 709964, 
                        844478, 729182, 476568, 498819, 741645, 784733, 701230, 887825, 707525, 851442, 
                        656860, 863625, 657570, 737962, 754389, 811046, 693874, 569852, 864462, 1023314),
  Gastos_Arriendo = c(250659, 222207, 243686, 175799, 195274, 195085, 234252, 258617, 197118, 213679, 
                      215190, 228115, 241830, 209440, 211286, 180761, 267607, 196106, 186156, 168362, 
                      218753, 237439, 165583, 162859, 182018, 168594, 205389, 185528, 186483, 225478, 
                      190800, 188158, 180285, 169327, 229201, 172704, 180392, 198113, 237098, 171566, 
                      242435, 262579, 239558, 243766, 234459, 178579, 220696, 202202, 231205, 240981, 
                      238080, 254172, 227402, 266789, 281276, 253056, 261567, 285134, 292418, 290419, 
                      262652, 245398, 282583, 282550, 252961, 281356, 262908, 245885, 253721, 300739, 
                      257251, 282384, 234911, 258222, 249443, 233735, 249645, 227446, 269507, 265463, 
                      281807, 282483, 260252, 231884, 241322, 240936, 253701, 292332, 257826, 243068, 
                      262525, 276940, 227436, 252046, 275349, 299221, 274752, 287064, 269809, 290234, 
                      415076, 356248, 262582, 380613, 442579, 311739, 367312, 391329, 377861, 268629, 
                      243457, 347034, 286774, 337127, 312016, 380997, 345006, 328177, 322779, 416917, 
                      318960, 259343, 288108, 315026, 312998, 320543, 313174, 434021, 380454, 338846, 
                      451698, 409053, 283365, 267659, 389364, 367322, 306788, 400392, 280340, 455423, 
                      260265, 335854, 305846, 329728, 290150, 347591, 354647, 253268, 349110, 452620)
)

# Legal minimum wage in force for 2020 (SMLV), used as the low-income cutoff
smlv_2020 <- 877803

# 0/1 indicator column: 1 when net income is strictly below the minimum wage.
# Coercing the logical comparison yields the same numeric 1/0 coding.
datos[["Ingreso_Bajo"]] <- as.numeric(datos[["Ingreso_Neto"]] < smlv_2020)

# Overall five-number summary (plus mean) of net income
summary(datos[["Ingreso_Neto"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  551942  817440 1069517 1119099 1340523 1993289
# Per-stratum summary: mean, median and standard deviation of net income,
# and the percentage of employees flagged as earning below the minimum wage.
# group_by() is used (rather than a per-verb grouping) so the result is
# printed as a tibble.
datos |>
  group_by(Estrato) |>
  summarise(
    Media_Ingreso = mean(Ingreso_Neto),
    Mediana_Ingreso = median(Ingreso_Neto),
    Desviacion = sd(Ingreso_Neto),
    Porcentaje_Bajo_SMLV = 100 * mean(Ingreso_Bajo)
  )
## # A tibble: 3 × 5
##   Estrato Media_Ingreso Mediana_Ingreso Desviacion Porcentaje_Bajo_SMLV
##     <dbl>         <dbl>           <dbl>      <dbl>                <dbl>
## 1       1       730544.         731800.     98545.                   96
## 2       2      1058033         1069517      77125.                    0
## 3       3      1568721.        1540030.    245133.                    0
# Histogram of net income (30 bins); the dashed red vertical line marks the
# minimum-wage cutoff.
ggplot(data = datos, mapping = aes(x = Ingreso_Neto)) +
  geom_histogram(bins = 30, fill = "blue", colour = "black") +
  geom_vline(xintercept = smlv_2020, colour = "red", linetype = "dashed") +
  ggtitle("Distribución de Ingresos Netos") +
  xlab("Ingreso Neto (COP)") +
  ylab("Frecuencia") +
  theme_minimal()

# Box plot of net income by stratum; Estrato is converted to a factor so each
# stratum gets its own box. The dashed red horizontal line marks the
# minimum-wage cutoff.
ggplot(data = datos, mapping = aes(x = factor(Estrato), y = Ingreso_Neto)) +
  geom_boxplot(fill = "lightblue") +
  geom_hline(yintercept = smlv_2020, colour = "red", linetype = "dashed") +
  ggtitle("Distribución de Ingresos por Estrato") +
  xlab("Estrato") +
  ylab("Ingreso Neto (COP)") +
  theme_minimal()

# Stratified sampling design.
#
# Estrato is a stratification variable, not a cluster identifier: every
# employee is its own sampling unit (ids = ~1) selected within a stratum
# (strata = ~Estrato). The previous call passed Estrato as the cluster id, so
# svydesign() treated the data as three clusters and estimated the variance
# from only three units. That is what produced the previously recorded
# standard error of 243880 for the mean income and a 95% interval for the
# low-income proportion running from -0.31 to 0.95.
#
# `datos` carries no sampling weights or population stratum sizes, so
# svydesign() still assumes equal selection probabilities (and warns about
# it). Supply `weights =` or `fpc =` here if those figures become available.
design <- svydesign(ids = ~1, strata = ~Estrato, data = datos)

# Design-based estimate of mean net income (with its standard error)
ingreso_promedio <- svymean(~Ingreso_Neto, design)
ingreso_promedio

# Design-based estimate of the proportion earning below the minimum wage
# (Ingreso_Bajo is coded 0/1, so its mean is a proportion)
porcentaje_bajo <- svymean(~Ingreso_Bajo, design)
porcentaje_bajo

# 95% confidence intervals for both estimates
confint(ingreso_promedio, level = 0.95)
confint(porcentaje_bajo, level = 0.95)

# NOTE: the console output that used to be pasted here came from the
# cluster-based design and no longer applies; re-run this section to
# regenerate it.
# One-sample, one-sided t-test on the full sample.
# H0: mean net income equals smlv_2020; H1: mean net income is less than it.
# The test is run on the raw column, i.e. it does not use the survey design
# object. The "data:" line of the printed result echoes the argument
# expression as written.
t.test(datos$Ingreso_Neto, mu = smlv_2020, alternative = "less")
## 
##  One Sample t-test
## 
## data:  datos$Ingreso_Neto
## t = 7.7699, df = 149, p-value = 1
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
##     -Inf 1170500
## sample estimates:
## mean of x 
##   1119099
# One-sample proportion test for the share of employees below the minimum
# wage. The count of flagged rows is compared against a null proportion of
# 0.5, with the one-sided alternative that the true proportion is greater.
prop.test(sum(datos$Ingreso_Bajo), n = nrow(datos), p = 0.5, alternative = "greater")
## 
##  1-sample proportions test with continuity correction
## 
## data:  sum(datos$Ingreso_Bajo) out of nrow(datos), null probability 0.5
## X-squared = 18.727, df = 1, p-value = 1
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.2579117 1.0000000
## sample estimates:
##    p 
## 0.32
# Repeat the descriptive figures and the one-sided t-test within each of the
# three strata, printing a labelled section per stratum.
for (estrato in seq_len(3)) {
  cat("\n--- Análisis para Estrato", estrato, "---\n")

  # Rows of the current stratum. The variable name is kept as `subset_data`
  # because t.test() echoes the argument expression in its "data:" line.
  subset_data <- subset(datos, Estrato == estrato)

  # Descriptive figures: mean income and share below the minimum wage
  cat("\nMedia de ingresos:", with(subset_data, mean(Ingreso_Neto)))
  cat("\nPorcentaje bajo SMLV:", 100 * with(subset_data, mean(Ingreso_Bajo)), "%\n")

  # H1: the stratum's mean net income is below the minimum wage.
  # print() is required because results are not auto-printed inside a loop.
  print(
    t.test(subset_data$Ingreso_Neto, mu = smlv_2020, alternative = "less")
  )
}
## 
## --- Análisis para Estrato 1 ---
## 
## Media de ingresos: 730544
## Porcentaje bajo SMLV: 96 %
## 
##  One Sample t-test
## 
## data:  subset_data$Ingreso_Neto
## t = -10.567, df = 49, p-value = 1.565e-14
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
##    -Inf 753909
## sample estimates:
## mean of x 
##    730544 
## 
## 
## --- Análisis para Estrato 2 ---
## 
## Media de ingresos: 1058033
## Porcentaje bajo SMLV: 0 %
## 
##  One Sample t-test
## 
## data:  subset_data$Ingreso_Neto
## t = 16.524, df = 49, p-value = 1
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
##     -Inf 1076319
## sample estimates:
## mean of x 
##   1058033 
## 
## 
## --- Análisis para Estrato 3 ---
## 
## Media de ingresos: 1568721
## Porcentaje bajo SMLV: 0 %
## 
##  One Sample t-test
## 
## data:  subset_data$Ingreso_Neto
## t = 19.93, df = 49, p-value = 1
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
##     -Inf 1626842
## sample estimates:
## mean of x 
##   1568721