# Cargar librerías necesarias
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(cluster)
library(survey)
## Warning: package 'survey' was built under R version 4.4.3
## Cargando paquete requerido: grid
## Cargando paquete requerido: Matrix
## Cargando paquete requerido: survival
##
## Adjuntando el paquete: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
# Build the employee data set: 150 rows, three strata of 50 employees each.
# Columns:
#   No_Empleados      - running employee number, 1..150 (integer)
#   Estrato           - stratum label 1, 2 or 3 (stored as double)
#   Ingreso_Neto      - net income
#   Personas_Cargo    - dependents category; the 12/25/13 pattern repeats in
#                       every stratum
#   Gastos_Familiares - family expenses
#   Gastos_Arriendo   - rent expenses
datos <- data.frame(
  No_Empleados = seq_len(150),
  Estrato = rep(c(1, 2, 3), each = 50),
  Ingreso_Neto = c(
    895212, 822988, 812288, 627853, 673359, 696733, 807765, 862058, 730068, 712263,
    717301, 760384, 863677, 698134, 754593, 623313, 892024, 653688, 689467, 580557,
    810195, 818755, 551942, 581640, 674142, 624423, 733533, 618427, 666012, 835102,
    706667, 671992, 621673, 583886, 818574, 575680, 601307, 733753, 817579, 591608,
    865841, 875263, 798527, 812552, 808481, 661405, 817393, 748897, 797257, 830968,
    992000, 1016687, 947507, 1067155, 1171985, 1012223, 1089864, 1188057, 1169672,
    1161676, 1094383, 981592, 1086857, 1130201, 1011842, 1125425, 1011184, 1024521,
    975849, 1156689, 1071879, 1086091, 978796, 993162, 1039345, 934938, 960173,
    947692, 1078029, 1106095, 1127228, 1129931, 1000968, 966183, 965288, 963745,
    1014802, 1124355, 991637, 1012785, 1093853, 1107761, 909744, 1050192, 1147289,
    1196883, 1099008, 1148256, 1079237, 1160936, 1711165, 1630678, 1486808, 1930872,
    1238480, 1491703, 1855658, 1533965, 1715683, 1706631, 1285885, 1913618, 1894917,
    1402940, 1286128, 1656382, 1993289, 1375618, 1264670, 1499591, 1611107, 1519154,
    1902240, 1573037, 1215203, 1589702, 1208222, 1360446, 1427615, 1960288, 1628354,
    1664025, 1302706, 1963135, 1347579, 1364081, 1977855, 1266223, 1546094, 1944575,
    1708847, 1768507, 1453808, 1462913, 1319356, 1294243, 1308641, 1280682, 1881446,
    1711280
  ),
  Personas_Cargo = rep(
    rep(c("Ninguna", "Entre 1 y 2", "Más de 2"), times = c(12, 25, 13)),
    times = 3
  ),
  Gastos_Familiares = c(
    367037, 288046, 324915, 282534, 282811, 320497, 298873, 344823, 306629, 306273,
    329958, 334569, 380018, 376992, 392388, 324123, 401411, 333381, 351628, 307695,
    348384, 360252, 287010, 314086, 357295, 324700, 359431, 309214, 346326, 384147,
    360400, 329276, 310837, 315298, 384730, 305110, 300654, 440252, 408790, 343133,
    424262, 437632, 431205, 422527, 420410, 370387, 425044, 419382, 390656, 407174,
    307520, 345674, 360053, 384176, 492234, 374523, 435946, 427701, 409385, 476287,
    437753, 333741, 521691, 531194, 435092, 495187, 434809, 471280, 429374, 497376,
    471627, 467019, 460034, 456855, 467705, 420722, 422476, 435938, 517454, 519865,
    507253, 542367, 430416, 463768, 424727, 424048, 456661, 584665, 515651, 526648,
    590681, 587113, 491262, 546100, 596590, 622379, 549504, 608576, 582788, 626905,
    641482, 623434, 446389, 652479, 654247, 534410, 612186, 604781, 566791, 524465,
    389532, 555255, 544871, 612958, 577229, 662603, 772156, 581769, 584077, 739080,
    582449, 531653, 588743, 720060, 585170, 595294, 657666, 789130, 628575, 709964,
    844478, 729182, 476568, 498819, 741645, 784733, 701230, 887825, 707525, 851442,
    656860, 863625, 657570, 737962, 754389, 811046, 693874, 569852, 864462, 1023314
  ),
  Gastos_Arriendo = c(
    250659, 222207, 243686, 175799, 195274, 195085, 234252, 258617, 197118, 213679,
    215190, 228115, 241830, 209440, 211286, 180761, 267607, 196106, 186156, 168362,
    218753, 237439, 165583, 162859, 182018, 168594, 205389, 185528, 186483, 225478,
    190800, 188158, 180285, 169327, 229201, 172704, 180392, 198113, 237098, 171566,
    242435, 262579, 239558, 243766, 234459, 178579, 220696, 202202, 231205, 240981,
    238080, 254172, 227402, 266789, 281276, 253056, 261567, 285134, 292418, 290419,
    262652, 245398, 282583, 282550, 252961, 281356, 262908, 245885, 253721, 300739,
    257251, 282384, 234911, 258222, 249443, 233735, 249645, 227446, 269507, 265463,
    281807, 282483, 260252, 231884, 241322, 240936, 253701, 292332, 257826, 243068,
    262525, 276940, 227436, 252046, 275349, 299221, 274752, 287064, 269809, 290234,
    415076, 356248, 262582, 380613, 442579, 311739, 367312, 391329, 377861, 268629,
    243457, 347034, 286774, 337127, 312016, 380997, 345006, 328177, 322779, 416917,
    318960, 259343, 288108, 315026, 312998, 320543, 313174, 434021, 380454, 338846,
    451698, 409053, 283365, 267659, 389364, 367322, 306788, 400392, 280340, 455423,
    260265, 335854, 305846, 329728, 290150, 347591, 354647, 253268, 349110, 452620
  )
)
# Legal minimum wage in force for 2020, used as the low-income threshold
smlv_2020 <- 877803
# Low-income indicator: 1 when net income is strictly below the minimum wage,
# 0 otherwise (the logical comparison is converted to numeric 0/1)
datos$Ingreso_Bajo <- as.numeric(datos$Ingreso_Neto < smlv_2020)
# Overall summary of net income (min, quartiles, median, mean, max)
summary(datos[["Ingreso_Neto"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 551942 817440 1069517 1119099 1340523 1993289
# Per-stratum summary of net income: mean, median, standard deviation, and the
# percentage of employees whose income is below the minimum wage
summarise(
  group_by(datos, Estrato),
  Media_Ingreso = mean(Ingreso_Neto),
  Mediana_Ingreso = median(Ingreso_Neto),
  Desviacion = sd(Ingreso_Neto),
  Porcentaje_Bajo_SMLV = 100 * mean(Ingreso_Bajo)
)
## # A tibble: 3 × 5
## Estrato Media_Ingreso Mediana_Ingreso Desviacion Porcentaje_Bajo_SMLV
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 730544. 731800. 98545. 96
## 2 2 1058033 1069517 77125. 0
## 3 3 1568721. 1540030. 245133. 0
# Histogram of net income (30 bins); the dashed red vertical line marks the
# minimum wage threshold
ggplot(data = datos, mapping = aes(x = Ingreso_Neto)) +
  geom_histogram(bins = 30, fill = "blue", colour = "black") +
  geom_vline(xintercept = smlv_2020, colour = "red", linetype = "dashed") +
  ggtitle("Distribución de Ingresos Netos") +
  xlab("Ingreso Neto (COP)") +
  ylab("Frecuencia") +
  theme_minimal()

# Box plot of net income for each stratum; Estrato is converted to a factor so
# it is drawn as discrete groups, and the dashed red horizontal line marks the
# minimum wage threshold
ggplot(data = datos, mapping = aes(x = factor(Estrato), y = Ingreso_Neto)) +
  geom_boxplot(fill = "lightblue") +
  geom_hline(yintercept = smlv_2020, colour = "red", linetype = "dashed") +
  ggtitle("Distribución de Ingresos por Estrato") +
  xlab("Estrato") +
  ylab("Ingreso Neto (COP)") +
  theme_minimal()

# Stratified sampling design.
# Estrato partitions the sample into three strata, so it is declared through
# `strata`; `ids = ~1` states that there is no cluster stage. Passing Estrato
# as the cluster id instead makes the survey package treat the whole sample as
# only three sampling units, so the standard errors reflect nothing but the
# spread between strata (that set-up gave an interval for a proportion that
# extended below zero).
# No weights or probabilities are supplied, so svydesign() assumes equal
# probability and emits a warning saying so.
design <- svydesign(ids = ~1, strata = ~Estrato, data = datos)
# Estimated mean net income with its standard error
ingreso_promedio <- svymean(~Ingreso_Neto, design)
ingreso_promedio
# Estimated share of employees below the minimum wage; Ingreso_Bajo is a 0/1
# indicator, so its mean is a proportion on the 0-1 scale
porcentaje_bajo <- svymean(~Ingreso_Bajo, design)
porcentaje_bajo
# 95% confidence intervals for both estimates
confint(ingreso_promedio, level = 0.95)
confint(porcentaje_bajo, level = 0.95)
# NOTE: the console output previously recorded here was produced by the
# clustered specification and no longer applies; re-run this section to
# regenerate it.
# One-sided, one-sample t-test: the alternative hypothesis is that the mean net
# income is below the minimum wage
t.test(x = datos$Ingreso_Neto, alternative = "less", mu = smlv_2020)
##
## One Sample t-test
##
## data: datos$Ingreso_Neto
## t = 7.7699, df = 149, p-value = 1
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
## -Inf 1170500
## sample estimates:
## mean of x
## 1119099
# One-sample proportion test: the alternative hypothesis is that more than half
# of the employees earn less than the minimum wage
prop.test(
  x = sum(datos$Ingreso_Bajo),
  n = nrow(datos),
  alternative = "greater",
  p = 0.5
)
##
## 1-sample proportions test with continuity correction
##
## data: sum(datos$Ingreso_Bajo) out of nrow(datos), null probability 0.5
## X-squared = 18.727, df = 1, p-value = 1
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.2579117 1.0000000
## sample estimates:
## p
## 0.32
# Per-stratum analysis: for every stratum present in the data, print the mean
# net income, the percentage of employees below the minimum wage, and a
# one-sided t-test of the stratum mean against the minimum wage.
# The strata are taken from the data rather than hard-coded, so the loop keeps
# working if a stratum is added, removed or relabelled.
for (estrato in sort(unique(datos$Estrato))) {
  # The name `subset_data` is kept because t.test() echoes the expression it
  # receives in the "data:" line of its printed report.
  subset_data <- datos[datos$Estrato == estrato, ]
  cat("\n--- Análisis para Estrato", estrato, "---\n")
  # Descriptive statistics
  cat("\nMedia de ingresos:", mean(subset_data$Ingreso_Neto))
  cat("\nPorcentaje bajo SMLV:", mean(subset_data$Ingreso_Bajo) * 100, "%\n")
  # One-sided t-test for this stratum; print() is required because results are
  # not auto-printed inside a loop
  print(t.test(subset_data$Ingreso_Neto, mu = smlv_2020, alternative = "less"))
}
##
## --- Análisis para Estrato 1 ---
##
## Media de ingresos: 730544
## Porcentaje bajo SMLV: 96 %
##
## One Sample t-test
##
## data: subset_data$Ingreso_Neto
## t = -10.567, df = 49, p-value = 1.565e-14
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
## -Inf 753909
## sample estimates:
## mean of x
## 730544
##
##
## --- Análisis para Estrato 2 ---
##
## Media de ingresos: 1058033
## Porcentaje bajo SMLV: 0 %
##
## One Sample t-test
##
## data: subset_data$Ingreso_Neto
## t = 16.524, df = 49, p-value = 1
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
## -Inf 1076319
## sample estimates:
## mean of x
## 1058033
##
##
## --- Análisis para Estrato 3 ---
##
## Media de ingresos: 1568721
## Porcentaje bajo SMLV: 0 %
##
## One Sample t-test
##
## data: subset_data$Ingreso_Neto
## t = 19.93, df = 49, p-value = 1
## alternative hypothesis: true mean is less than 877803
## 95 percent confidence interval:
## -Inf 1626842
## sample estimates:
## mean of x
## 1568721