En este notebook se presentan ejemplos de análisis de la Encuesta de Caracterización Socioeconómica Nacional (CASEN) usando R, con la ayuda del paquete survey, del cual se pueden obtener mayores antecedentes en:
https://cran.r-project.org/web/packages/survey/survey.pdf
http://r-survey.r-forge.r-project.org/survey/
En particular, se estimará la pobreza por ingreso (pobres, no pobres y tasa de pobreza por ingresos), por Región.
Importante: Este documento no es una publicación oficial y se hace solo con fines ilustrativos. No obstante, la exactitud de los resultados se puede comprobar con los cuadros estadísticos publicados para este producto en https://observatorio.ministeriodesarrollosocial.gob.cl/encuesta-casen-2022.
Mi información de contacto se puede encontrar en mi web personal.
En primer lugar, se cargan las librerías.
>
>
> library(dplyr)
> library(survey)
> library (haven)
> library(openxlsx)
> library(readxl)
> library(ggplot2)
Se trabajará con el conjunto de datos correspondiente a 2022, disponible en https://observatorio.ministeriodesarrollosocial.gob.cl/encuesta-casen-2022
En este caso, se descargarán los archivos, que están en formato zip, se extraerán sus contenidos y finalmente se leerán y guardarán en objetos de R.
> # Download URLs for the CASEN 2022 Stata files (zip archives).
> url_casen <- "https://observatorio.ministeriodesarrollosocial.gob.cl/storage/docs/casen/2022/Base%20de%20datos%20Casen%202022%20STATA.dta.zip"
> url_casen_provincia_comuna <- "https://observatorio.ministeriodesarrollosocial.gob.cl/storage/docs/casen/2022/Base%20de%20datos%20provincia%20y%20comuna%20Casen%202022%20STATA.dta.zip"
>
> # Download a zip archive, extract it into the working directory and read one
> # of the extracted Stata files.
> #   url:         address of the zip archive.
> #   archivo_dta: name of the .dta file expected inside the archive.
> # Returns the object produced by read_dta(); stops with an error when the
> # expected file is not present after extraction.
> leer_casen_zip <- function(url, archivo_dta) {
+   zip_temporal <- tempfile(fileext = ".zip")
+   # Remove the temporary archive even when a later step fails.
+   on.exit(unlink(zip_temporal), add = TRUE)
+   # Request a binary transfer explicitly so the archive is not altered on
+   # platforms that distinguish text from binary downloads.
+   download.file(url, zip_temporal, mode = "wb")
+   unzip(zipfile = zip_temporal, exdir = getwd())
+   if (!file.exists(archivo_dta)) {
+     stop("No se encontró el archivo esperado: ", archivo_dta, call. = FALSE)
+   }
+   read_dta(archivo_dta)
+ }
>
> ### CASEN main database
> CASEN <- leer_casen_zip(url_casen, "Base de datos Casen 2022 STATA.dta")
>
> ### CASEN province and commune database
> CASEN_PROVINCIA_COMUNA <- leer_casen_zip(
+   url_casen_provincia_comuna,
+   "Base de datos provincia y comuna Casen 2022 STATA.dta"
+ )
>
>
>
>
>
>
Se examinan los nombres de las variables de la base CASEN 2022.
>
> # List every column name available in the CASEN data set.
> colnames(CASEN)
[1] "id_vivienda" "folio" "id_persona"
[4] "region" "area" "cod_upm"
[7] "nse" "estrato" "hogar"
[10] "expr" "expr_osig" "varstrat"
[13] "varunit" "fecha_entrev" "p1"
[16] "p2" "p3" "p4"
[19] "p9" "p10" "p11"
[22] "tot_per_h" "h1" "edad"
[25] "mes_nac_nna" "ano_nac_nna" "sexo"
[28] "pco1_a" "pco1_b" "pco1"
[31] "h5_cp" "h5_sp" "h5_b1_1"
[34] "h5_b1_2" "h5a_2" "h5_b2_1"
[37] "h5_b2_2" "h5a_3" "h5_b3_1"
[40] "h5_b3_2" "h5a_4" "h5b"
[43] "ecivil" "h5_10" "h5_1a"
[46] "h5_1b" "h5_20" "h5_2"
[49] "n_nucleos" "nucleo" "pco2_a"
[52] "pco2_b" "pco2" "h7a"
[55] "h7b" "h7c" "h7d"
[58] "h7e" "h7f" "informante"
[61] "e1" "e3" "e4a"
[64] "e4a_esp" "e5a" "e5a_esp"
[67] "e5b" "e6a_asiste" "e6a_no_asiste"
[70] "e6a" "e6b_asiste" "e6b_no_asiste"
[73] "e6b" "e6c_completo" "e6d_preg"
[76] "e6d_postg" "e7" "cinef13_area"
[79] "cinef13_subarea" "e8" "e9nom"
[82] "e9dir" "e9com_cod" "e9pais_cod"
[85] "e9rbd" "e9rbd_sup" "e9dv"
[88] "e9depen" "e10" "e11"
[91] "e12a" "e12b" "e12c"
[94] "e12d" "e12e" "e13a"
[97] "e13b_1" "e13b_2" "e13b_3"
[100] "e13b_4" "e13b_5" "e13b_6"
[103] "e13b_7" "e13b_8" "e13b_9"
[106] "e13b_10" "e13b_11" "e13b1"
[109] "e13b2" "e13b_esp1" "e13b_esp2"
[112] "e14a" "e14b" "e14c"
[115] "e14d" "e14e" "e16"
[118] "e18" "o1" "o2"
[121] "o3" "o4" "o5"
[124] "o6" "o7" "o7_esp"
[127] "o8" "o9a" "o9b"
[130] "oficio1_08" "oficio4_08" "o10"
[133] "o11" "o12" "o14"
[136] "o15" "o16" "o19"
[139] "o18" "o20" "o21"
[142] "o22" "o23" "o24"
[145] "rama1_sub" "rama4_sub" "rama1"
[148] "rama4" "o25" "o26a"
[151] "o26b" "o26c" "o26d"
[154] "o28a_hr" "o28a_min" "o28b"
[157] "o28c" "o28c_esp" "o28d"
[160] "o28e" "o29" "o30"
[163] "o31" "o32" "o32_esp"
[166] "o32b" "y1" "y2_dias"
[169] "y2_hrs" "y3a_preg" "y3b_preg"
[172] "y3c_preg" "y3d_preg" "y3e_preg"
[175] "y3f_preg" "y3a" "y3ap"
[178] "y3b" "y3bp" "y3c"
[181] "y3cp" "y3d" "y3dp"
[184] "y3e" "y3ep" "y3f_esp"
[187] "y3f" "y3fp" "y4a_preg"
[190] "y4b_preg" "y4c_preg" "y4d_preg"
[193] "y4a" "y4b" "y4c"
[196] "y4d_esp" "y4d" "y5a_preg"
[199] "y5b_preg" "y5c_preg" "y5d_preg"
[202] "y5e_preg" "y5f_preg" "y5g_preg"
[205] "y5h_preg" "y5i_preg" "y5j_preg"
[208] "y5k_preg" "y5l_preg" "y5a"
[211] "y5b" "y5c" "y5d"
[214] "y5e" "y5f" "y5g"
[217] "y5h" "y5i" "y5j"
[220] "y5k" "y5l" "y6"
[223] "y7" "y8" "y9"
[226] "y10" "y11_preg" "y11"
[229] "y12a_preg" "y12a" "y12b_preg"
[232] "y12b" "y13a_preg" "y13a"
[235] "y13b_preg" "y13b" "y13c_preg"
[238] "y13c" "y14a_preg" "y14a"
[241] "y14b_preg" "y14b" "y14c_preg"
[244] "y14c" "y15a_preg" "y15a"
[247] "y15b_preg" "y15b" "y15c_preg"
[250] "y15c" "y16a_preg" "y16a"
[253] "y16b_preg" "y16b" "y17_preg"
[256] "y17" "y18a_preg" "y18a"
[259] "y18b_preg" "y18b" "y18c_preg"
[262] "y18c" "y18d_preg" "y18d_esp"
[265] "y18d" "y19" "y19t"
[268] "y19n" "y20a" "y20b"
[271] "y20c" "y20d" "y20e"
[274] "y20amonto" "y20bmonto" "y20cmonto"
[277] "y20dmonto" "y20emonto" "y21_canasta"
[280] "y22_preg" "y22" "y22amonto"
[283] "y22bmonto" "y22cmonto" "y22dmonto"
[286] "y23a_preg" "y23a" "y23b"
[289] "y23c" "y23bmonto" "y23cmonto"
[292] "y24_preg" "y24" "y25a_preg"
[295] "y25a" "y25amonto" "y25b_preg"
[298] "y25b" "y25bmonto" "y25c"
[301] "y25cmonto" "y25d" "y25dmonto"
[304] "y25ep" "y25e" "y25fp"
[307] "y25f" "y25g_preg" "y25g"
[310] "y25h_preg" "y25hp" "y25h"
[313] "y25i_preg" "y25imonto" "y25ip"
[316] "y25j_preg" "y25j" "y25jmonto"
[319] "y26d_hog" "y26d_preg" "y26d_integrantes"
[322] "y26d_monto" "y27_preg" "y27_esp"
[325] "y27" "y28_1b" "y28_1c"
[328] "y28_1d" "y28_1dmonto" "y28_1e"
[331] "y28_1f" "y28_1g" "y28_1h"
[334] "y28_1i" "y28_1j" "y28j_esp"
[337] "y28_2b1" "y28_2b2" "y28_3b"
[340] "y28_4b" "y28_1c1" "y28_1c2"
[343] "y28_1c2monto" "y28_2c1" "y28_2c2"
[346] "y28_2c" "y28_3c" "y28_4c"
[349] "y28_2e1" "y28_2e2" "y28_3e"
[352] "y28_4e" "y28_2f" "y28_3f"
[355] "y28_4f" "y28_1g1" "y28_2g1"
[358] "y28_2g2" "y28_2g" "y28_3g"
[361] "y28_4g" "y28_2h" "y28_3h"
[364] "y28_4h" "y28_1i1" "y28_2i1"
[367] "y28_2i2" "y28_2i" "y28_2j"
[370] "y28_3j" "y28_4j" "s2"
[373] "s2c" "s3_1" "s3_2"
[376] "s3_3" "s3_4" "s3_5"
[379] "s3_6" "s3_7" "s3_8"
[382] "s3_88" "s3a1" "s3a2"
[385] "s4" "s5" "s6"
[388] "s7" "s7_meses" "s8"
[391] "s9a" "s9b" "s10"
[394] "s11a" "s11b" "s12"
[397] "s13" "s13_fonasa" "s15"
[400] "s16" "s17" "s17b"
[403] "s18" "s18_esp" "s19a"
[406] "s19b" "s19c" "s19d"
[409] "s19e" "s20a_preg" "s20a"
[412] "s20b" "s21a_preg" "s21a"
[415] "s21b" "s22a_preg" "s22a"
[418] "s22b" "s23a_preg" "s23a"
[421] "s23b" "s24a_preg" "s24a"
[424] "s24b" "s25a1_preg" "s25b1"
[427] "s25a2_preg" "s25b2" "s26a"
[430] "s26b_1" "s26b_2" "s26b_3"
[433] "s26b_4" "s26b_5" "s26b_6"
[436] "s26b_7" "s26b_8" "s26b_88"
[439] "s26b_esp" "s26u" "s26c"
[442] "s27a" "s27b" "s27c"
[445] "s28" "s28_esp" "s29"
[448] "s30" "s30_esp" "s31_1"
[451] "s31_2" "s31_3" "s31_4"
[454] "s31_5" "s31_6" "s31_7"
[457] "s32a" "s32b" "s32c"
[460] "s32d" "s32e" "s32f"
[463] "s32g" "s32h" "s32i"
[466] "s32j" "s33a" "s33b"
[469] "s33c" "s33d" "s33e"
[472] "s33f" "s33g" "s33h"
[475] "s33i" "s33j" "s34a"
[478] "s34b" "s34c" "r1a"
[481] "r1a_esp" "r1a_esp_cod" "r1b"
[484] "r1b_comuna_esp" "r1b_comuna_esp_cod" "r1b_pais_esp"
[487] "r1b_pais_esp_cod" "r1c" "r1cp"
[490] "r2" "r2_comuna_esp" "r2_comuna_esp_cod"
[493] "r2_pais_esp" "r2_pais_esp_cod" "r3"
[496] "r4" "r5" "r6"
[499] "r7a" "r7b" "r7c"
[502] "r7d" "r7e" "r7f"
[505] "r7g" "r7h" "r7i"
[508] "r7j" "r7k" "r8a"
[511] "r8b" "r8c" "r8d"
[514] "r8e" "r8f" "r8g"
[517] "r8h" "r9a" "r9b"
[520] "r9c" "r9d" "r9e"
[523] "r9f" "r9g" "r9h"
[526] "r9i" "r9j" "r9k"
[529] "r9l" "r9m" "r9n"
[532] "r9o" "r9p" "r9q"
[535] "r9r" "r9s" "r9t"
[538] "r9_esp" "r11" "r12a"
[541] "r12b" "r13a" "r13b"
[544] "r14" "r15" "r17a"
[547] "r17b" "r17c" "r17d"
[550] "r17e" "r18" "v1"
[553] "v2" "v3" "v4"
[556] "v5" "v6" "v7"
[559] "v9" "v10" "v11_o1"
[562] "v11_o2" "v12" "v12mt"
[565] "v13" "v13_propia" "v13_arrendada"
[568] "v13_cedida" "v13b_1" "v13b_2"
[571] "v13b_3" "v13b_4" "v13b_5"
[574] "v13b_6" "v13b_7" "v14"
[577] "v15" "v16" "v17"
[580] "v18" "v19" "v20"
[583] "v20_esp" "v20_red" "v21"
[586] "v22" "v23" "v23_sistema"
[589] "v23_cajon" "v24" "v25"
[592] "v26" "v27a" "v27b"
[595] "v28" "v29a" "v29b"
[598] "v30" "v31" "v32"
[601] "v33" "v34a" "v34b"
[604] "v34c" "v35a" "v35b"
[607] "v35c" "v35d" "v35e"
[610] "v35f" "v35g" "v35h"
[613] "v35i" "v36a" "v36b"
[616] "v36c" "v36d" "v36e"
[619] "v37a" "v37b" "v37c"
[622] "v37d" "v37e" "v37f"
[625] "v37g" "v38" "os_presente"
[628] "os1" "os1_esp" "genero"
[631] "genero_esp" "trans" "y0101"
[634] "y0301" "y0302" "y0303"
[637] "y0304" "y0305" "y0306"
[640] "y0401" "y0402" "y0403"
[643] "y0404" "y0501" "y0502"
[646] "y0503" "y0504" "y0505"
[649] "y0506" "y0507" "y0508"
[652] "y0509" "y0510" "y0511"
[655] "y0512" "yosa" "y0701"
[658] "y0801" "y0901" "yosi"
[661] "y1101" "yre1" "yama"
[664] "ymes" "yfa1" "yfa2"
[667] "ytro" "yta1" "yta2"
[670] "ydes" "yah1" "yah2"
[673] "yrut" "yre2" "yre3"
[676] "yac2" "yids" "ydon"
[679] "ydim" "yotr" "yfam"
[682] "y2001" "y2002" "y2003"
[685] "y2004" "y2005" "y2101"
[688] "y2201" "y2202" "y2203"
[691] "y2204" "y2301" "y2302"
[694] "y2303" "y2401" "y2501"
[697] "y2502" "y2503" "y2504"
[700] "y2505" "y2506" "y2507"
[703] "y2508p" "y2508" "y2509"
[706] "y2510" "y2604" "y2701"
[709] "y2804" "y280201" "y280202"
[712] "y280101" "y280301" "y280302"
[715] "y2803" "yinv0101" "yinv0102"
[718] "yinv02" "ymon0101" "ymon0102"
[721] "ymon02" "yorf" "yesp0101"
[724] "yesp0102" "yesp" "yotp"
[727] "yaut" "ysub1" "ysub2"
[730] "ysub" "ytot" "y0101h"
[733] "y0301h" "y0302h" "y0303h"
[736] "y0304h" "y0305h" "y0306h"
[739] "y0401h" "y0402h" "y0403h"
[742] "y0404h" "y0501h" "y0502h"
[745] "y0503h" "y0504h" "y0505h"
[748] "y0506h" "y0507h" "y0508h"
[751] "y0509h" "y0510h" "y0511h"
[754] "y0512h" "yosah" "y0701h"
[757] "y0801h" "y0901h" "yosih"
[760] "y1101h" "yre1h" "yamah"
[763] "ymesh" "yfa1h" "yfa2h"
[766] "ytroh" "yta1h" "yta2h"
[769] "ydesh" "yah1h" "yah2h"
[772] "yruth" "yre2h" "yre3h"
[775] "yac2h" "yidsh" "ydonh"
[778] "ydimh" "yotrh" "yfamh"
[781] "y2001h" "y2002h" "y2003h"
[784] "y2004h" "y2005h" "y2101h"
[787] "y2201h" "y2202h" "y2203h"
[790] "y2204h" "y2301h" "y2302h"
[793] "y2303h" "y2401h" "y2501h"
[796] "y2502h" "y2503h" "y2504h"
[799] "y2505h" "y2506h" "y2507h"
[802] "y2508h" "y2509h" "y2510h"
[805] "y2604h" "y2701h" "y2804h"
[808] "y280201h" "y280202h" "y280101h"
[811] "y280301h" "y280302h" "y2803h"
[814] "yinv0101h" "yinv0102h" "yinv02h"
[817] "ymon0101h" "ymon0102h" "ymon02h"
[820] "yorfh" "yesp0101h" "yesp0102h"
[823] "yesph" "yotph" "yauth"
[826] "ysub1h" "ysub2h" "ysubh"
[829] "yaimh" "ytoth" "ypch"
[832] "y0101c" "y0701c" "y280201c"
[835] "y280301c" "y2803c" "yautcor"
[838] "ytotcor" "y0101ch" "y0701ch"
[841] "y280201ch" "y280301ch" "y2803ch"
[844] "yautcorh" "yaimcorh" "ytotcorh"
[847] "ypc" "li" "lp"
[850] "nae" "yae" "pobreza"
[853] "yoprcor" "yoprcorh" "ytrabajocor"
[856] "ytrabajocorh" "ymonecorh" "ypchtrabcor"
[859] "ypchautcor" "dau" "qaut"
[862] "dautr" "qautr" "hh_d_asis"
[865] "hh_d_rez" "hh_d_esc" "hh_d_mal"
[868] "hh_d_prevs" "hh_d_acc" "hh_d_act"
[871] "hh_d_cot" "hh_d_jub" "hh_d_hacina"
[874] "hh_d_estado" "hh_d_habitab" "hh_d_servbas"
[877] "hh_d_medio" "hh_d_equipo" "hh_d_tiempo"
[880] "hh_d_accesi" "hh_d_entorno" "hh_d_hapoyo"
[883] "hh_d_part" "hh_d_tsocial" "hh_d_seg"
[886] "hh_d_appart" "pobreza_multi_5d" "pobreza_multi_4d"
[889] "disc_wg" "esc" "desercion"
[892] "rezago" "asiste" "educ"
[895] "depen" "activ" "asal"
[898] "contrato" "cotiza" "lugar_nac"
[901] "pueblos_indigenas" "n_ocupados" "n_desocupados"
[904] "n_inactivos" "conyuge_jh" "numper"
[907] "numnuc" "men18c" "may60c"
[910] "tipohogar" "tot_hog" "ind_hacina"
[913] "indsan" "ten_viv" "ten_viv_f"
[916] "allega_ext" "allega_int"
Es muy importante tener a mano los antecedentes metodológicos y el libro de códigos. Allí se puede ver que la variable pobreza tiene lo que nos interesa.
> # Show the value labels attached to the poverty variable.
> attr(CASEN[["pobreza"]], "labels", exact = TRUE)
Pobreza extrema Pobreza no extrema No pobreza
1 2 3
Factor de expansión
En este caso se debe usar el factor de expansión regional expr.
El diseño muestral de CASEN es del tipo diseño complejo; por ello se utilizará el paquete survey. Dicho paquete permite obtener estimaciones y su variabilidad, lo que hace posible evaluar la calidad de dichas estimaciones.
Por conveniencia, se crearán variables binarias que permitirán identificar casos de pobreza extrema, pobreza no extrema y no pobres.
>
> # Add one 0/1 indicator per poverty category (codes 1, 2 and 3 of `pobreza`).
> # Rows whose `pobreza` is missing get 0 in all three indicators.
> CASEN <- CASEN %>%
+   mutate(
+     pobreza_extrema = if_else(pobreza == 1, 1, 0, missing = 0),
+     pobreza_no_extrema = if_else(pobreza == 2, 1, 0, missing = 0),
+     no_pobreza = if_else(pobreza == 3, 1, 0, missing = 0)
+   )
>
Se debe crear el diseño complejo (para conocer las variables necesarias se debe consultar los documentos metodológicos que están publicados)
>
> # Declare the complex sample design used by all later estimates.
> disenio <- svydesign(
+   ids = ~varunit,        # PSU (cluster) identifiers
+   strata = ~varstrat,    # sampling strata
+   weights = ~expr,       # regional expansion factor
+   data = CASEN,
+   check.strata = TRUE    # verify that clusters are nested within strata
+ )
>
> # Choose how the survey package treats strata that contain a single PSU.
> options(survey.lonely.psu = "remove")
>
A continuación, se utilizan las funciones del paquete survey para obtener la cantidad de pobres extremos, pobres no extremos y no pobres. Esto se guarda finalmente en el objeto llamado resumen_2.
> # Estimated population totals by region for each poverty indicator.
> #
> # total_por_region() wraps svyby() so the three estimates share exactly the
> # same settings:
> #   variable: one-sided formula naming the 0/1 indicator to total.
> # Returns the svyby() result, with one row per region plus the standard error
> # and coefficient of variation of each total.
> # The former `data = CASEN` argument is dropped: svyby() takes its data from
> # the design object, so that argument was only swallowed by `...`.
> total_por_region <- function(variable) {
+   svyby(
+     formula = variable,
+     by = ~region,
+     design = disenio,
+     FUN = svytotal,
+     vartype = c("se", "cv"),
+     drop.empty.groups = FALSE,
+     na.rm.all = FALSE
+   )
+ }
>
> ## people in extreme poverty
> pobres_extremos <- total_por_region(~pobreza_extrema)
>
> ## people in non-extreme poverty
> pobres_no_extremos <- total_por_region(~pobreza_no_extrema)
>
> ## people not in poverty
> no_pobres <- total_por_region(~no_pobreza)
>
>
>
> # Gather the three regional totals into a single data frame, one row per
> # region. The three svyby() results use the same design and grouping, so
> # their rows are combined by position.
> resumen_2 <- data.frame(
+   region = as.numeric(pobres_extremos$region),
+   pobreza_extrema = pobres_extremos$pobreza_extrema,
+   no_extrema = pobres_no_extremos$pobreza_no_extrema,
+   no_pobres = no_pobres$no_pobreza
+ )
>
> resumen_2
region pobreza_extrema no_extrema no_pobres
1 1 17635 26226 354636
2 2 19618 34593 656641
3 3 9039 17060 292250
4 4 21689 46590 794311
5 5 38850 92721 1869211
6 6 22911 48195 941244
7 7 29810 70157 1056754
8 8 38972 87238 1551426
9 9 33991 84591 906665
10 10 19336 43755 840796
11 11 1142 3190 103705
12 12 1885 4371 175219
13 13 108743 259237 7954598
14 14 5099 18931 385878
15 15 7070 16567 234183
16 16 21818 40794 455317
Finalmente, se calcula la tasa de pobreza por ingresos por región.
>
>
> # Income poverty rate (%) per region: people in extreme plus non-extreme
> # poverty divided by the sum of the three categories.
> resumen_2$tasa_pobreza <- with(resumen_2, {
+   pobres <- pobreza_extrema + no_extrema
+   pobres / (pobres + no_pobres) * 100
+ })
>
>
> resumen_2
region pobreza_extrema no_extrema no_pobres tasa_pobreza
1 1 17635 26226 354636 11.006607
2 2 19618 34593 656641 7.626201
3 3 9039 17060 292250 8.198235
4 4 21689 46590 794311 7.915580
5 5 38850 92721 1869211 6.575979
6 6 22911 48195 941244 7.023855
7 7 29810 70157 1056754 8.642274
8 8 38972 87238 1551426 7.523086
9 9 33991 84591 906665 11.566188
10 10 19336 43755 840796 6.979965
11 11 1142 3190 103705 4.009737
12 12 1885 4371 175219 3.447307
13 13 108743 259237 7954598 4.421467
14 14 5099 18931 385878 5.862291
15 15 7070 16567 234183 9.168024
16 16 21818 40794 455317 12.088916
Estos resultados se pueden corroborar con el set de cuadros estadísticos disponibles en [https://observatorio.ministeriodesarrollosocial.gob.cl/encuesta-casen-2022](https://observatorio.ministeriodesarrollosocial.gob.cl/encuesta-casen-2022), sección Estadísticas.