Dataset available at http://portal.inep.gov.br/microdados

Init

#pacman supplies p_load, used on the next line to bring in the analysis packages in one call
library(pacman)
#NOTE(review): read_csv, mutate, ggplot etc. are used below without an explicit tidyverse load;
#sessionInfo() lists tidyverse as attached, presumably pulled in by kirkegaard -- confirm
p_load(kirkegaard, sf, rms, caret, doFuture)
#limit printed numbers to roughly 3 significant digits
options(digits = 3)

Parallel

#register the future framework as the backend for foreach's %dopar%
doFuture::registerDoFuture()
#the multiprocess strategy is deprecated (and later defunct) in the future package;
#multisession (background R sessions) is the portable replacement and works on every OS
plan(multisession, workers = 4)
#remove the size cap on globals exported to the workers
options(future.globals.maxSize = Inf)

Ad hoc functions

#copy the column names of x to the clipboard as a one-column table
#write_clipboard is not defined in this file (presumably from kirkegaard); print = FALSE is passed through to it
#FALSE is spelled out because F is an ordinary variable that can be reassigned
names2clip = function(x) {
  tibble(colnames(x)) %>% write_clipboard(print = FALSE)
}

#set every element of x that lies outside [lower, upper] to NA
#both bounds are inclusive and default to no limit; elements that are already NA stay NA
na_outside = function(x, lower = -Inf, upper = Inf) {
  too_low = x < lower
  too_high = x > upper
  replace(x, too_low | too_high, NA)
}

#quick check: values below 2 or above 7 become NA
na_outside(1:10, lower = 2, upper = 7)

##  [1] NA  2  3  4  5  6  7 NA NA NA

Data

Microdata

#read the large individual-level ANA 2014 file (it is averaged per municipality further down)
#no col_types are given, so read_csv guesses them; the guessed specification is printed below
ana = read_csv("data/microdados_ana_2014/DADOS/TS_ALUNO.csv")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   CO_CADERNO_PROVA_LP = col_character(),
##   CO_CADERNO_PROVA_MT = col_character(),
##   TX_RESPOSTA_LP = col_character(),
##   TX_RESPOSTA_MT = col_character(),
##   CONCEITO_Q1 = col_character(),
##   CONCEITO_Q2 = col_character(),
##   CONCEITO_Q3_ORTOGRAFIA = col_character(),
##   CONCEITO_Q3_COESAO = col_character(),
##   CONCEITO_Q3_SEGMENTACAO = col_character(),
##   CONCEITO_Q3_PONTUACAO = col_character(),
##   CONCEITO_Q3_PROGRESSAO_TEMATICA = col_character(),
##   CONCEITO_Q3_ELEMENTOS_NARRATIVA = col_character()
## )

## See spec(...) for full column specifications.

#there are precomputed result files as well; this one is the municipality sheet
#skip = 3 drops the first three rows of the sheet before reading
#readxl reports blank header cells and auto-names them (...1, ...2, ...), so the columns are renamed afterwards
ana_munis1 = readxl::read_excel("data/microdados_ana_2014/PLANILHAS DE RESULTADOS/TS_MUNICIPIO.xlsx", skip = 3)

## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * … and 12 more problems

#name the columns by hand, since the sheet's own header cells were blank
#paste0() builds reading1..reading4, writing1..writing5 and math1..math4;
#in base R `"reading" + 1:4` is an error and only works when a loaded package masks `+`
#for strings (presumably kirkegaard -- verify), so the standard function is used instead
colnames(ana_munis1) = c(
  "state", "id", "name", "subset1", "subset2",
  paste0("reading", 1:4),
  paste0("writing", 1:5),
  paste0("math", 1:4)
)

Spatial data

#municipality shapefile, downloaded from:
#ftp://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2016/Brasil/BR/
munis_raw = read_sf("data/spatial/br_municipios/BRMUE250GC_SIR.shp")
#numeric version of the municipality code, used as the key when joining with the ANA tables
munis_raw$id = as.numeric(munis_raw$CD_GEOCMU)

Recode

ANA

#give the three proficiency columns short names and build a composite score g:
#the sum of the three standardized scores, standardized again
ana = mutate(
  ana,
  reading = PROFICIENCIA_LPO,
  writing = PROFICIENCIA_LPD,
  math = PROFICIENCIA_MT,
  g = standardize(standardize(reading) + standardize(writing) + standardize(math))
)

#correlation matrix of the three domain scores and the composite
wtd.cors(select(ana, reading:g))

##         reading writing  math     g
## reading   1.000   0.712 0.756 0.917
## writing   0.712   1.000 0.646 0.876
## math      0.756   0.646 1.000 0.894
## g         0.917   0.876 0.894 1.000

By municipality

#build summary scores and a composite g from the precomputed per-level columns
#each domain score adds up every level column except level 1
#NOTE(review): standardize() runs over all rows here, before the later filter to the "Todos" rows -- confirm this is intended
ana_munis1 = ana_munis1 %>%
  mutate(
    reading = reading2 + reading3 + reading4,
    writing = writing2 + writing3 + writing4 + writing5,
    math = math2 + math3 + math4,
    g = standardize(standardize(reading) + standardize(writing) + standardize(math))
  )

#per-municipality aggregates of the individual-level data:
#row count (n) plus the wtd_mean of each score; the _i suffix marks values derived from individuals
#the grouping column is named id directly so it matches the key used in the other tables
ana_munis2 = ana %>%
  group_by(id = ID_MUNICIPIO) %>%
  summarise(
    n = n(),
    reading_i = wtd_mean(reading),
    writing_i = wtd_mean(writing),
    math_i = wtd_mean(math),
    g_i = wtd_mean(g)
  )

#merge: keep the rows where both subset columns are "Todos" and the municipality exists in the
#shapefile, then attach the aggregates computed from individual scores
#the join key is given explicitly so the join cannot silently pick up any other shared column name
ana_munis = ana_munis1 %>%
  filter(subset1 == "Todos", subset2 == "Todos", id %in% munis_raw$id) %>%
  full_join(ana_munis2, by = "id")

## Joining, by = "id"

#sanity check: TRUE if any municipality id appears more than once after the merge
any(duplicated(ana_munis$id))

## [1] FALSE

#correlation matrix across the per-level columns, the summary scores, n and the individual-based averages
wtd.cors(select(ana_munis, reading1:g_i))

##           reading1 reading2 reading3 reading4 writing1 writing2 writing3
## reading1    1.0000  0.27054 -0.90107  -0.7304   0.9272   0.7794  0.51059
## reading2    0.2705  1.00000 -0.49851  -0.6880   0.2096   0.3671  0.34622
## reading3   -0.9011 -0.49851  1.00000   0.6365  -0.8291  -0.7648 -0.53775
## reading4   -0.7304 -0.68798  0.63653   1.0000  -0.6488  -0.6260 -0.45589
## writing1    0.9272  0.20962 -0.82908  -0.6488   1.0000   0.7218  0.42625
## writing2    0.7794  0.36707 -0.76482  -0.6260   0.7218   1.0000  0.41833
## writing3    0.5106  0.34622 -0.53775  -0.4559   0.4262   0.4183  1.00000
## writing4   -0.8361 -0.19942  0.78641   0.5430  -0.8378  -0.8412 -0.49740
## writing5   -0.6127 -0.45309  0.59964   0.6412  -0.5737  -0.5818 -0.58430
## math1       0.9602  0.34723 -0.89138  -0.7447   0.8976   0.7738  0.53092
## math2       0.3774  0.71585 -0.44391  -0.6862   0.3028   0.4007  0.36878
## math3      -0.7338 -0.16252  0.71639   0.4307  -0.6876  -0.5917 -0.40365
## math4      -0.8249 -0.62385  0.79845   0.8872  -0.7458  -0.7123 -0.53596
## reading    -1.0000 -0.27054  0.90107   0.7304  -0.9272  -0.7794 -0.51059
## writing    -0.9272 -0.20963  0.82909   0.6488  -1.0000  -0.7218 -0.42625
## math       -0.9602 -0.34722  0.89138   0.7447  -0.8976  -0.7738 -0.53092
## g          -0.9865 -0.28330  0.89584   0.7260  -0.9642  -0.7774 -0.50196
## n           0.0187  0.00112 -0.00995  -0.0193   0.0214   0.0188 -0.00107
## reading_i  -0.9539 -0.51421  0.89611   0.8770  -0.8791  -0.7813 -0.53549
## writing_i  -0.9208 -0.37401  0.87040   0.7302  -0.9136  -0.8858 -0.59921
## math_i     -0.9242 -0.51243  0.87452   0.8542  -0.8494  -0.7698 -0.55571
## g_i        -0.9536 -0.47845  0.90060   0.8386  -0.9000  -0.8329 -0.57819
##           writing4 writing5   math1  math2    math3   math4 reading
## reading1   -0.8361  -0.6127  0.9602  0.377 -0.73380 -0.8249 -1.0000
## reading2   -0.1994  -0.4531  0.3472  0.716 -0.16252 -0.6239 -0.2705
## reading3    0.7864   0.5996 -0.8914 -0.444  0.71639  0.7985  0.9011
## reading4    0.5430   0.6412 -0.7447 -0.686  0.43070  0.8872  0.7304
## writing1   -0.8378  -0.5737  0.8976  0.303 -0.68756 -0.7458 -0.9272
## writing2   -0.8412  -0.5818  0.7738  0.401 -0.59173 -0.7123 -0.7794
## writing3   -0.4974  -0.5843  0.5309  0.369 -0.40365 -0.5360 -0.5106
## writing4    1.0000   0.3032 -0.8136 -0.245  0.66861  0.6441  0.8361
## writing5    0.3032   1.0000 -0.6272 -0.516  0.40042  0.7023  0.6127
## math1      -0.8136  -0.6272  1.0000  0.367 -0.74394 -0.8541 -0.9602
## math2      -0.2452  -0.5161  0.3672  1.000 -0.28991 -0.7344 -0.3774
## math3       0.6686   0.4004 -0.7439 -0.290  1.00000  0.4720  0.7338
## math4       0.6441   0.7023 -0.8541 -0.734  0.47199  1.0000  0.8249
## reading     0.8361   0.6127 -0.9602 -0.377  0.73380  0.8249  1.0000
## writing     0.8378   0.5737 -0.8977 -0.303  0.68756  0.7458  0.9272
## math        0.8136   0.6272 -1.0000 -0.367  0.74394  0.8541  0.9602
## g           0.8495   0.6198 -0.9767 -0.358  0.74007  0.8288  0.9865
## n          -0.0167  -0.0122  0.0181  0.033 -0.00198 -0.0329 -0.0187
## reading_i   0.7817   0.6735 -0.9439 -0.547  0.67416  0.9160  0.9539
## writing_i   0.8361   0.7619 -0.9109 -0.445  0.68164  0.8314  0.9208
## math_i      0.7519   0.6940 -0.9543 -0.599  0.65583  0.9587  0.9242
## g_i         0.8087   0.7271 -0.9577 -0.544  0.68478  0.9235  0.9536
##           writing    math       g        n reading_i writing_i  math_i
## reading1  -0.9272 -0.9602 -0.9865  0.01870   -0.9539   -0.9208 -0.9242
## reading2  -0.2096 -0.3472 -0.2833  0.00112   -0.5142   -0.3740 -0.5124
## reading3   0.8291  0.8914  0.8958 -0.00995    0.8961    0.8704  0.8745
## reading4   0.6488  0.7447  0.7260 -0.01929    0.8770    0.7302  0.8542
## writing1  -1.0000 -0.8976 -0.9642  0.02141   -0.8791   -0.9136 -0.8494
## writing2  -0.7218 -0.7738 -0.7774  0.01883   -0.7813   -0.8858 -0.7698
## writing3  -0.4263 -0.5309 -0.5020 -0.00107   -0.5355   -0.5992 -0.5557
## writing4   0.8378  0.8136  0.8495 -0.01671    0.7817    0.8361  0.7519
## writing5   0.5737  0.6272  0.6198 -0.01220    0.6735    0.7619  0.6940
## math1     -0.8977 -1.0000 -0.9767  0.01807   -0.9439   -0.9109 -0.9543
## math2     -0.3028 -0.3672 -0.3583  0.03300   -0.5469   -0.4454 -0.5989
## math3      0.6876  0.7439  0.7401 -0.00198    0.6742    0.6816  0.6558
## math4      0.7458  0.8541  0.8288 -0.03285    0.9160    0.8314  0.9587
## reading    0.9272  0.9602  0.9865 -0.01870    0.9539    0.9208  0.9242
## writing    1.0000  0.8976  0.9642 -0.02142    0.8791    0.9136  0.8494
## math       0.8976  1.0000  0.9767 -0.01807    0.9439    0.9109  0.9543
## g          0.9642  0.9767  1.0000 -0.01986    0.9490    0.9378  0.9324
## n         -0.0214 -0.0181 -0.0199  1.00000   -0.0136   -0.0157 -0.0218
## reading_i  0.8791  0.9439  0.9490 -0.01359    1.0000    0.9135  0.9647
## writing_i  0.9136  0.9109  0.9378 -0.01570    0.9135    1.0000  0.8989
## math_i     0.8494  0.9543  0.9324 -0.02180    0.9647    0.8989  1.0000
## g_i        0.9000  0.9576  0.9607 -0.01632    0.9822    0.9610  0.9779
##               g_i
## reading1  -0.9536
## reading2  -0.4784
## reading3   0.9006
## reading4   0.8386
## writing1  -0.9000
## writing2  -0.8329
## writing3  -0.5782
## writing4   0.8087
## writing5   0.7271
## math1     -0.9577
## math2     -0.5437
## math3      0.6848
## math4      0.9235
## reading    0.9536
## writing    0.9000
## math       0.9576
## g          0.9607
## n         -0.0163
## reading_i  0.9822
## writing_i  0.9610
## math_i     0.9779
## g_i        1.0000

#descriptive statistics of the per-municipality row counts
as.matrix(describe(ana_munis2$n))

##    vars    n mean   sd median trimmed mad min    max  range skew kurtosis
## X1    1 5847  463 2203    157     221 163  10 124603 124593 36.7     1848
##      se
## X1 28.8

Maps

#merge the scores onto the map; left_join keeps every row of munis_raw
#the key is named explicitly so no other shared column can be picked up as a join key
munis = left_join(munis_raw, ana_munis, by = "id")

## Joining, by = "id"

#sanity check: TRUE if the join produced any repeated municipality id
any(duplicated(munis$id))

## [1] FALSE

#plain map of the municipality borders, no fill
ggplot(munis) +
  geom_sf(lwd = 0.1) +
  theme_classic()

#map with each municipality filled by g_i, the average of the individual composite scores
ggplot(munis) +
  geom_sf(mapping = aes(fill = g_i), lwd = 0.1) +
  scale_fill_continuous("Intelligence level (z-score)") +
  theme_classic()

Write data

#save the per-municipality aggregates of the individual scores; missing values are written as empty cells
write_csv(ana_munis2, "data/ANA2014_munis.csv", na = "")

Versions

#record the R version and the attached/loaded package versions used for this run
sessionInfo()

## R version 3.5.3 (2019-03-11)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.1
## 
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doFuture_0.8.0     iterators_1.0.10   foreach_1.4.4     
##  [4] future_1.13.0      globals_0.12.4     caret_6.0-84      
##  [7] rms_5.1-3.1        SparseM_1.77       sf_0.7-5          
## [10] kirkegaard_2018.05 metafor_2.1-0      Matrix_1.2-17     
## [13] psych_1.8.12       magrittr_1.5       assertthat_0.2.1  
## [16] weights_1.0        mice_3.5.0         gdata_2.18.0      
## [19] Hmisc_4.2-0        Formula_1.2-3      survival_2.44-1.1 
## [22] lattice_0.20-38    forcats_0.4.0      stringr_1.4.0     
## [25] dplyr_0.8.1        purrr_0.3.2        readr_1.3.1       
## [28] tidyr_0.8.3        tibble_2.1.3       ggplot2_3.2.0     
## [31] tidyverse_1.2.1    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.0-10      minqa_1.2.4         colorspace_1.4-1   
##  [4] class_7.3-15        htmlTable_1.13.1    base64enc_0.1-3    
##  [7] rstudioapi_0.10     listenv_0.7.0       MatrixModels_0.4-1 
## [10] prodlim_2018.04.18  mvtnorm_1.0-10      lubridate_1.7.4    
## [13] xml2_1.2.0          codetools_0.2-16    splines_3.5.3      
## [16] mnormt_1.5-5        knitr_1.23          jsonlite_1.6       
## [19] nloptr_1.2.1        broom_0.5.2         cluster_2.0.9      
## [22] compiler_3.5.3      httr_1.4.0          backports_1.1.4    
## [25] lazyeval_0.2.2      cli_1.1.0           acepack_1.4.1      
## [28] htmltools_0.3.6     quantreg_5.38       tools_3.5.3        
## [31] gtable_0.3.0        glue_1.3.1          reshape2_1.4.3     
## [34] Rcpp_1.0.1          cellranger_1.1.0    nlme_3.1-140       
## [37] timeDate_3043.102   xfun_0.7            gower_0.2.1        
## [40] lme4_1.1-21         rvest_0.3.4         gtools_3.8.1       
## [43] polspline_1.1.14    pan_1.6             MASS_7.3-51.4      
## [46] zoo_1.8-6           scales_1.0.0        ipred_0.9-9        
## [49] hms_0.4.2           sandwich_2.5-1      RColorBrewer_1.1-2 
## [52] yaml_2.2.0          gridExtra_2.3       rpart_4.1-15       
## [55] latticeExtra_0.6-28 stringi_1.4.3       e1071_1.7-2        
## [58] checkmate_1.9.3     boot_1.3-22         lava_1.6.5         
## [61] rlang_0.3.4         pkgconfig_2.0.2     evaluate_0.14      
## [64] labeling_0.3        recipes_0.1.5       htmlwidgets_1.3    
## [67] tidyselect_0.2.5    plyr_1.8.4          R6_2.4.0           
## [70] generics_0.0.2      mitml_0.3-7         multcomp_1.4-10    
## [73] DBI_1.0.0           pillar_1.4.1        haven_2.1.0        
## [76] foreign_0.8-70      withr_2.1.2         units_0.6-3        
## [79] nnet_7.3-12         modelr_0.1.4        crayon_1.3.4       
## [82] jomo_2.6-8          KernSmooth_2.23-15  rmarkdown_1.13     
## [85] grid_3.5.3          readxl_1.3.1        data.table_1.12.2  
## [88] ModelMetrics_1.2.2  digest_0.6.19       classInt_0.3-3     
## [91] stats4_3.5.3        munsell_0.5.0

Brazilian ANA 2014 dataset: example analysis

Init

Parallel

Ad hoc functions

Data

Microdata

Spatial data

Recode

ANA

By municipality

Maps

Write data

Versions