rm(list=ls())
setwd("/home/daniel/Dropbox/Papers-ponencias/Segregacion2020-isaforum/Analisis")
library(readr)
library(DBI)
library(RSQLite)
library(dplyr)
library(dbplyr)

Para calcular el Índice de disimilitud de Duncan (IDC) se emplearon los microdatos del Censo 2017, nivel personas.

Para hacer posible analizar una base de datos masiva, en un computador modesto y con el software R, se aplicó el código publicado por Julián Cabezas.

1. Cargar el Censo 2017 en R.

El primer paso es conocer la estructura de los microdatos personas Censo 2017.

delim_file <- "/home/daniel/Dropbox/Papers-ponencias/Segregacion2020-isaforum/Microdato_Censo2017-Personas/censo.csv"
readLines(delim_file, n=2)
## [1] "REGION;PROVINCIA;COMUNA;DC;AREA;ZC_LOC;ID_ZONA_LOC;NVIV;NHOGAR;PERSONAN;P07;P08;P09;P10;P10COMUNA;P10PAIS;P11;P11COMUNA;P11PAIS;P12;P12COMUNA;P12PAIS;P12A_LLEGADA;P12A_TRAMO;P13;P14;P15;P15A;P16;P16A;P16A_OTRO;P17;P18;P19;P20;P21M;P21A;P10PAIS_GRUPO;P11PAIS_GRUPO;P12PAIS_GRUPO;ESCOLARIDAD;P16A_GRUPO;REGION_15R;PROVINCIA_15R;COMUNA_15R;P10COMUNA_15R;P11COMUNA_15R;P12COMUNA_15R"
## [2] "15;152;15202;1;2;6;13225;1;1;1;1;1;73;1;98;998;3;15101;998;1;98;998;9998;98;2;4;6;2;1;2;98;7;98;98;98;98;9998;998;998;998;4;2;15;152;15202;98;15101;98"
total_records <- system2("wc", args = c("-l", delim_file), stdout = TRUE) %>%
 sub(normalizePath(delim_file), "", .) %>%
 as.integer() %>%
       {
         . - 1
       }
total_records
## [1] 17574003

Se concluye que:

table_name_censo <- "censo2017"
sqlite_file_censo <- "censo2017.sqlite3"
read_delim2sqlite <- function(delim_file, delim, sqlite_file, table_name, batch_size = 10000, OS = "Unix") {

  ## establish a connection to the database
  condb <- dbConnect(SQLite(), sqlite_file)

  ## get the total number of records in the file
  # in Unix
  if (OS == "Unix") {
    total_records <- system2("wc", args = c("-l", delim_file), stdout = TRUE) %>%
      sub(normalizePath(delim_file), "", .) %>%
      as.integer() %>%
      {
        . - 1
      }
  } else {
    # In windows
    total_records <- system2("powershell", args = c("Get-content", delim_file, "|", "Measure-Object", "–Line"), stdout = TRUE)
    total_records <- as.numeric(gsub("[^0-9]", "", paste(total_records, collapse = ""))) - 1
  }

  message("Total records: ", total_records)

  ## find the number of passes needed based on size of each batch
  passes <- total_records %/% batch_size
  remaining <- total_records %% batch_size

  message("Total Passes to complete: ", passes)

  ## first pass determines header and column types
  dat <- read_delim(delim_file, delim, n_max = batch_size, progress = FALSE) %>% as.data.frame()
  if (nrow(problems(dat)) > 0) print(problems(dat))
  col_names <- colnames(dat)
  col_types <- c(character = "c", numeric = "d", integer = "i", logical = "l", Date = "c") %>%
    .[sapply(dat, class)] %>%
    paste0(collapse = "")

  ## write to database table
  dbWriteTable(condb, table_name, dat, overwrite = TRUE)

  ## multiple passes
  for (p in 2:passes) {
    message("Pass number: ", p, ", Progress:", round(p / passes, 2) * 100, "%")
    read_delim(delim_file, delim,
      col_names = col_names, col_types = col_types,
      skip = (p - 1) * batch_size + 1, n_max = batch_size, progress = FALSE
    ) %>%
      as.data.frame() %>%
      dbWriteTable(condb, table_name, ., append = TRUE)
  }

  if (remaining) {
    read_delim(delim_file, delim,
      col_names = col_names, col_types = col_types,
      skip = p * batch_size + 1, n_max = remaining, progress = FALSE
    ) %>%
      as.data.frame() %>%
      dbWriteTable(condb, table_name, ., append = TRUE)
  }

  ## close the database connection
  dbDisconnect(condb)
}

Luego, se carga la base de datos en formato sqlite3 para hacerla manejable.

read_delim2sqlite(delim_file, delim=";", sqlite_file = sqlite_file_censo, 
                  table_name = table_name_censo, batch_size=500000)
con <- dbConnect(SQLite(), sqlite_file_censo)
con
## <SQLiteConnection>
##   Path: /home/daniel/Dropbox/Papers-ponencias/Segregacion2020-isaforum/Analisis/censo2017.sqlite3
##   Extensions: TRUE
dbListTables(con)
## [1] "censo2017"
datos_db <- tbl(con, "censo2017")
datos_db
## # Source:   table<censo2017> [?? x 48]
## # Database: sqlite 3.34.1
## #   [/home/daniel/Dropbox/Papers-ponencias/Segregacion2020-isaforum/Analisis/censo2017.sqlite3]
##    REGION PROVINCIA COMUNA    DC  AREA ZC_LOC ID_ZONA_LOC  NVIV NHOGAR PERSONAN
##     <dbl>     <dbl>  <dbl> <dbl> <dbl>  <dbl>       <dbl> <dbl>  <dbl>    <dbl>
##  1     15       152  15202     1     2      6       13225     1      1        1
##  2     15       152  15202     1     2      6       13225     3      1        1
##  3     15       152  15202     1     2      6       13225     3      1        2
##  4     15       152  15202     1     2      6       13225     3      1        3
##  5     15       152  15202     1     2      6       13225     3      1        4
##  6     15       152  15202     1     2      6       13225     9      1        1
##  7     15       152  15202     1     2      6       13225     9      1        2
##  8     15       152  15202     1     2      6       13225     9      1        3
##  9     15       152  15202     1     2      6       13225     9      1        4
## 10     15       152  15202     1     2      6       13225    10      1        1
## # … with more rows, and 38 more variables: P07 <dbl>, P08 <dbl>, P09 <dbl>,
## #   P10 <dbl>, P10COMUNA <dbl>, P10PAIS <dbl>, P11 <dbl>, P11COMUNA <dbl>,
## #   P11PAIS <dbl>, P12 <dbl>, P12COMUNA <dbl>, P12PAIS <dbl>,
## #   P12A_LLEGADA <dbl>, P12A_TRAMO <dbl>, P13 <dbl>, P14 <dbl>, P15 <dbl>,
## #   P15A <dbl>, P16 <dbl>, P16A <dbl>, P16A_OTRO <dbl>, P17 <dbl>, P18 <chr>,
## #   P19 <dbl>, P20 <dbl>, P21M <dbl>, P21A <dbl>, P10PAIS_GRUPO <dbl>,
## #   P11PAIS_GRUPO <dbl>, P12PAIS_GRUPO <dbl>, ESCOLARIDAD <dbl>,
## #   P16A_GRUPO <dbl>, REGION_15R <dbl>, PROVINCIA_15R <dbl>, COMUNA_15R <dbl>,
## #   P10COMUNA_15R <dbl>, P11COMUNA_15R <dbl>, P12COMUNA_15R <dbl>

2. Procesamiento de datos

Primero, se calculó la cantidad de jefes de hogar, que viven en zonas urbanas, a nivel comunal.

jefes_comuna <- datos_db %>%
  filter(P07==1 & AREA ==1) %>%
  group_by(COMUNA_15R) %>%
  tally()

jefes_comuna <- collect(jefes_comuna)
jefes_comuna <- as.data.frame(jefes_comuna)

Luego, se calculó la cantidad de jefes de hogar, que viven en zonas urbanas, con menos de 12 años de escolaridad a nivel comunal.

jefes_comuna_esc11 <- datos_db %>%
  filter(P07==1 & AREA ==1 & ESCOLARIDAD < 12) %>%
  group_by(COMUNA_15R) %>%
  tally()

jefes_comuna_esc11 <- collect(jefes_comuna_esc11)
jefes_comuna_esc11 <- as.data.frame(jefes_comuna_esc11)

En tercer lugar, se calculó la cantidad de jefes de hogar urbanos por zona censal.

jefes_zc <- datos_db %>%
  filter(P07==1 & AREA ==1) %>%
  group_by(COMUNA_15R, DC, ZC_LOC) %>%
  tally()

jefes_zc <- collect(jefes_zc)
jefes_zc <- as.data.frame(jefes_zc)

Finalmente, la cantidad de jefes de hogar urbanos, con menos de 12 años de escolaridad, por zona censal.

jefes_zc_esc11 <- datos_db %>%
  filter(P07==1 & AREA ==1 & ESCOLARIDAD < 12) %>%
  group_by(COMUNA_15R, DC, ZC_LOC) %>%
  tally()

jefes_zc_esc11 <- collect(jefes_zc_esc11)
jefes_zc_esc11 <- as.data.frame(jefes_zc_esc11)

3. Cálculo de Índice de disimilitud de Duncan (IDC)

Empleando la siguiente fórmula se calculó el IDC:

\[\frac{1}{2}*\sum^n_{i=1}\left|\frac{X_i}{X}-\frac{Y_i}{Y}\right|\]

Donde:

-\(X_i\) corresponde a la cantidad de jefes de hogar con menos de 12 años de escolaridad en la zona censal i de la comuna;

-\(X\) es la cantidad de jefes de hogar con menos de 12 años de escolaridad en la comuna;

-\(Y_i\) corresponde a la cantidad de jefes de hogar con 12 años de escolaridad o más en la zona censal i de la comuna;

-\(Y\) es la cantidad de jefes de hogar con 12 años de escolaridad o más en la comuna.

indice <- full_join(jefes_zc, jefes_zc_esc11, by = c("COMUNA_15R", "DC", "ZC_LOC"))
indice <- indice %>%
  group_by(COMUNA_15R) %>%
  mutate(jefes_comuna = sum(n.x, na.rm = TRUE))
indice <- indice %>%
  group_by(COMUNA_15R) %>%
  mutate(jefes_comuna_esc11 = sum(n.y, na.rm = TRUE))
indice <- rename(indice, jefes_zc = n.x , jefes_zc_esc11 = n.y)
indice <- as.data.frame(indice)
indice$jefes_zc_esc12 <- indice$jefes_zc - indice$jefes_zc_esc11
indice$jefes_comuna_esc12 <- indice$jefes_comuna - indice$jefes_comuna_esc11
indice$DC[indice$DC==99] <- NA
indice$DC[indice$ZC_LOC==999] <- NA
indice<-na.omit(indice)

indice <- indice %>%
  group_by(COMUNA_15R) %>%
  mutate(IDC = round((1/2)*sum(abs((jefes_zc_esc11/jefes_comuna_esc11)-
                                     ((jefes_zc_esc12/jefes_comuna_esc12)))), 
                     digits = 3))

indice <- indice %>%
  group_by(COMUNA_15R) %>%
  summarise(unique(IDC))
indice <- rename(indice, IDC = "unique(IDC)")

3. Cálculo de Índice de acumulación de desventajas sociales.

Empleando la siguiente fórmula se calculó el IDS:

\[\frac{X_i}{X}\]

Donde:

-\(X_i\) corresponde a la cantidad de jefes de hogar con menos de 12 años de escolaridad en la comuna;

-\(X\) es la cantidad de jefes de hogar en la comuna.

indice$IDS <- round(jefes_comuna_esc11$n/jefes_comuna$n,3) 

Para terminar, un poco de orden a la variable comuna.

etiquetas <- c("iquique", "alto hospicio", "pozo almonte", "camiña", "colchane", "huara", "pica",
               "antofagasta","mejillones", "sierra gorda", "taltal", "calama", "ollagüe",
               "san pedro de atacama", "tocopilla", "maría elena", "copiapó", "caldera", 
               "tierra amarilla", "chañaral", "diego de almagro", "vallenar", "alto del carmen",
               "freirina", "huasco", "la serena", "coquimbo", "andacollo", "la higuera", "paiguano",
               "vicuña", "illapel", "canela", "los vilos", "salamanca", "ovalle", "combarbalá",
               "monte patria", "punitaqui", "río hurtado", "valparaíso", "casablanca", "concón",
               "juan fernández", "puchuncaví", "quintero", "viña del mar", "isla de pascua",
               "los andes", "calle larga", "rinconada", "san esteban", "la ligua", "cabildo",
               "papudo", "petorca", "zapallar", "quillota", "calera", "hijuelas", "la cruz",
               "nogales", "san antonio", "algarrobo", "cartagena", "el quisco", "el tabo",
               "santo domingo", "san felipe", "catemu", "llaillay", "panquehue", "putaendo",
               "santa maría", "quilpué", "limache", "olmué", "villa alemana", "rancagua",
               "codegua", "coinco", "coltauco", "doñihue", "graneros", "las cabras", "machalí",
               "malloa", "mostazal", "olivar", "peumo", "pichidegua", "quinta de tilcoco",
               "rengo", "requínoa", "san vicente", "pichilemu", "la estrella", "litueche",
               "marchihue", "navidad", "paredones", "san fernando", "chépica", "chimbarongo",
               "lolol", "nancagua", "palmilla", "peralillo", "placilla", "pumanque","santa cruz",
               "talca", "constitución", "curepto", "empedrado", "maule", "pelarco", "pencahue",
               "río claro", "san clemente", "san rafael", "cauquenes", "chanco", "pelluhue",
               "curicó", "hualañé", "licantén", "molina", "rauco", "romeral", "sagrada familia",
               "teno", "vichuquén", "linares", "colbún", "longaví", "parral", "retiro", "san javier",
               "villa alegre", "yerbas buenas", "concepción", "coronel", "chiguayante", "florida",
               "hualqui", "lota", "penco", "san pedro de la paz", "santa juana", "talcahuano",
               "tomé", "hualpén", "lebu", "arauco", "cañete", "contulmo", "curanilahue", 
               "los álamos", "tirúa", "los ángeles", "antuco", "cabrero", "laja", "mulchén",
               "nacimiento", "negrete", "quilaco", "quilleco", "san rosendo", "santa bárbara",
               "tucapel", "yumbel", "alto biobío", "chillán", "bulnes", "cobquecura", "coelemu",
               "coihueco", "chillán viejo", "el carmen", "ninhue", "ñiquén", "pemuco", "pinto",
               "portezuelo", "quillón", "quirihue", "ránquil", "san carlos", "san fabián",
               "san ignacio", "san nicolás", "treguaco", "yungay", "temuco", "carahue", "cunco",
               "curarrehue", "freire", "galvarino", "gorbea", "lautaro", "loncoche", "melipeuco",
               "nueva imperial", "padre las casas", "perquenco", "pitrufquén", "pucón", "saavedra",
               "teodoro schmidt", "toltén", "vilcún", "villarrica", "cholchol", "angol", 
               "collipulli", "curacautín", "ercilla", "lonquimay", "los sauces", "lumaco", 
               "purén", "renaico", "traiguén", "victoria", "puerto montt", "calbuco", "cochamó",
               "fresia", "frutillar", "los muermos", "llanquihue", "maullín", "puerto varas",
               "castro", "ancud", "chonchi", "curaco de vélez", "dalcahue", "puqueldón", "queilén",
               "quellón", "quemchi", "quinchao", "osorno", "puerto octay", "purranque", "puyehue",
               "río negro", "san juan de la costa", "san pablo", "chaitén", "futaleufú", "hualaihué",
               "palena", "coyhaique", "lago verde", "aysén", "cisnes", "guaitecas", "cochrane",
               "o'higgins", "tortel", "chile chico", "río ibáñez", "punta arenas", "laguna blanca",
               "río verde", "san gregorio", "cabo de hornos", "antártica", "porvenir", "primavera",
               "timaukel", "natales", "torres del paine", "santiago", "cerrillos", "cerro navia",
               "conchalí", "el bosque", "estación central", "huechuraba", "independencia", 
               "la cisterna", "la florida", "la granja", "la pintana", "la reina", "las condes",
               "lo barnechea", "lo espejo", "lo prado", "macul", "maipú", "ñuñoa", 
               "pedro aguirre cerda", "peñalolén", "providencia", "pudahuel", "quilicura", 
               "quinta normal", "recoleta", "renca", "san joaquín", "san miguel", "san ramón",
               "vitacura", "puente alto", "pirque", "san josé de maipo", "colina", "lampa",
               "tiltil", "san bernardo", "buin", "calera de tango", "paine", "melipilla", "alhué",
               "curacaví", "maría pinto", "san pedro", "talagante", "el monte", "isla de maipo", 
               "padre hurtado", "peñaflor", "valdivia", "corral", "lanco", "los lagos", "máfil",
               "mariquina", "paillaco", "panguipulli", "la unión", "futrono", "lago ranco",
               "río bueno", "arica", "camarones", "putre", "general lagos")
FirstCap <- function(x) {
    s <- strsplit(x, " ")[[1]]
    paste(toupper(substring(s, 1,1)), 
          substring(s, 2),
          sep="", collapse=" ")
}
etiquetas <- unlist(lapply(etiquetas, FUN=FirstCap))

indice$COMUNA_15R <- factor(indice$COMUNA_15R, 
                            levels= c(1101, 1107, 1401, 1402, 1403, 1404, 1405, 2101, 2102, 2103, 
                                      2104, 2201, 2202, 2203, 2301, 2302, 3101, 3102, 3103, 3201,
                                      3202, 3301, 3302, 3303, 3304, 4101, 4102, 4103, 4104, 4105,
                                      4106, 4201, 4202, 4203, 4204, 4301, 4302, 4303, 4304, 4305,
                                      5101, 5102, 5103, 5104, 5105, 5107, 5109, 5201, 5301, 5302,
                                      5303, 5304, 5401, 5402, 5403, 5404, 5405, 5501, 5502, 5503,
                                      5504, 5506, 5601, 5602, 5603, 5604, 5605, 5606, 5701, 5702,
                                      5703, 5704, 5705, 5706, 5801, 5802, 5803, 5804, 6101, 6102,
                                      6103, 6104, 6105, 6106, 6107, 6108, 6109, 6110, 6111, 6112,
                                      6113, 6114, 6115, 6116, 6117, 6201, 6202, 6203, 6204, 6205,
                                      6206, 6301, 6302, 6303, 6304, 6305, 6306, 6307, 6308, 6309,
                                      6310, 7101, 7102, 7103, 7104, 7105, 7106, 7107, 7108, 7109,
                                      7110, 7201, 7202, 7203, 7301, 7302, 7303, 7304, 7305, 7306,
                                      7307, 7308, 7309, 7401, 7402, 7403, 7404, 7405, 7406, 7407,
                                      7408, 8101, 8102, 8103, 8104, 8105, 8106, 8107, 8108, 8109,
                                      8110, 8111, 8112, 8201, 8202, 8203, 8204, 8205, 8206, 8207,
                                      8301, 8302, 8303, 8304, 8305, 8306, 8307, 8308, 8309, 8310,
                                      8311, 8312, 8313, 8314, 8401, 8402, 8403, 8404, 8405, 8406,
                                      8407, 8408, 8409, 8410, 8411, 8412, 8413, 8414, 8415, 8416,
                                      8417, 8418, 8419, 8420, 8421, 9101, 9102, 9103, 9104, 9105,
                                      9106, 9107, 9108, 9109, 9110, 9111, 9112, 9113, 9114, 9115,
                                      9116, 9117, 9118, 9119, 9120, 9121, 9201, 9202, 9203, 9204,
                                      9205, 9206, 9207, 9208, 9209, 9210, 9211, 10101, 10102,
                                      10103, 10104, 10105, 10106, 10107, 10108, 10109, 10201,
                                      10202, 10203, 10204, 10205, 10206, 10207, 10208, 10209,
                                      10210, 10301, 10302, 10303, 10304, 10305, 10306, 10307,
                                      10401, 10402, 10403, 10404, 11101, 11102, 11201, 11202,
                                      11203, 11301, 11302, 11303, 11401, 11402, 12101, 12102,
                                      12103, 12104, 12201, 12202, 12301, 12302, 12303, 12401,
                                      12402, 13101, 13102, 13103, 13104, 13105, 13106, 13107,
                                      13108, 13109, 13110, 13111, 13112, 13113, 13114, 13115,
                                      13116, 13117, 13118, 13119, 13120, 13121, 13122, 13123,
                                      13124, 13125, 13126, 13127, 13128, 13129, 13130, 13131,
                                      13132, 13201, 13202, 13203, 13301, 13302, 13303, 13401,
                                      13402, 13403, 13404, 13501, 13502, 13503, 13504, 13505,
                                      13601, 13602, 13603, 13604, 13605, 14101, 14102, 14103,
                                      14104, 14105, 14106, 14107, 14108, 14201, 14202, 14203,
                                      14204, 15101, 15102, 15201, 15202), 
                            labels=etiquetas)

save(indice,file="segregacion2017.rdata")
indice2<-select(indice,-COMUNA_15R)
summary(indice2)
##       IDC              IDS        
##  Min.   :0.0000   Min.   :0.0540  
##  1st Qu.:0.0475   1st Qu.:0.4265  
##  Median :0.1180   Median :0.5130  
##  Mean   :0.1372   Mean   :0.4939  
##  3rd Qu.:0.1985   3rd Qu.:0.5810  
##  Max.   :0.5750   Max.   :0.7010
rm(indice2)