# Dataset available at http://portal.inep.gov.br/microdados
library(pacman)
p_load(kirkegaard, sf, rms, caret, doFuture)
# Session options: print numbers with 3 significant digits and remove the size
# limit on globals exported to future workers.
options(digits = 3, future.globals.maxSize = Inf)
# Register the doFuture backend and set the future plan to 4 workers.
registerDoFuture()
plan(multiprocess(workers = 4))
# Put the column names of `x` into a one-column tibble and send it to the
# clipboard via write_clipboard().
# `print = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned; presumably it suppresses console echo (confirm against the
# write_clipboard() documentation).
names2clip <- function(x) {
  tibble(colnames(x)) %>% write_clipboard(print = FALSE)
}
#' Set values outside a closed interval to NA
#'
#' @param x Vector of values to check.
#' @param lower Smallest value to keep (inclusive); defaults to -Inf.
#' @param upper Largest value to keep (inclusive); defaults to Inf.
#' @return `x` with every element below `lower` or above `upper` replaced by
#'   NA. Elements that are already NA stay NA.
na_outside <- function(x, lower = -Inf, upper = Inf) {
  is_outside <- x < lower | x > upper
  x[is_outside] <- NA
  x
}
# Quick check: keep only the values in [2, 7].
na_outside(1:10, lower = 2, upper = 7)
## [1] NA 2 3 4 5 6 7 NA NA NA
# Read the student-level file (TS_ALUNO.csv); column types are guessed by
# read_csv().
ana <- read_csv(file = "data/microdados_ana_2014/DADOS/TS_ALUNO.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## CO_CADERNO_PROVA_LP = col_character(),
## CO_CADERNO_PROVA_MT = col_character(),
## TX_RESPOSTA_LP = col_character(),
## TX_RESPOSTA_MT = col_character(),
## CONCEITO_Q1 = col_character(),
## CONCEITO_Q2 = col_character(),
## CONCEITO_Q3_ORTOGRAFIA = col_character(),
## CONCEITO_Q3_COESAO = col_character(),
## CONCEITO_Q3_SEGMENTACAO = col_character(),
## CONCEITO_Q3_PONTUACAO = col_character(),
## CONCEITO_Q3_PROGRESSAO_TEMATICA = col_character(),
## CONCEITO_Q3_ELEMENTOS_NARRATIVA = col_character()
## )
## See spec(...) for full column specifications.
# Precomputed municipality-level results shipped with the data; the first
# 3 rows of the sheet are skipped.
ana_munis1 <- readxl::read_excel(
  path = "data/microdados_ana_2014/PLANILHAS DE RESULTADOS/TS_MUNICIPIO.xlsx",
  skip = 3
)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * … and 12 more problems
# Name the 17 columns of the municipality table: five identifier/subset
# columns, then numbered reading (1-4), writing (1-5) and math (1-4) columns.
# paste0() builds the numbered names because base R's `+` is not defined for
# character operands; the original only worked with an overridden `+`.
colnames(ana_munis1) <- c(
  "state", "id", "name", "subset1", "subset2",
  paste0("reading", 1:4),
  paste0("writing", 1:5),
  paste0("math", 1:4)
)
# Municipality shapefile, downloaded from:
#ftp://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2016/Brasil/BR/
munis_raw <- read_sf("data/spatial/br_municipios/BRMUE250GC_SIR.shp")
# Numeric copy of CD_GEOCMU, used as the join key `id` further down.
munis_raw$id <- as.numeric(munis_raw$CD_GEOCMU)
# Copy the three proficiency columns to short names and add a composite `g`:
# the standardized sum of the three standardized scores.
ana <- ana %>%
  mutate(
    reading = PROFICIENCIA_LPO,
    writing = PROFICIENCIA_LPD,
    math = PROFICIENCIA_MT,
    g = standardize(
      standardize(reading) + standardize(writing) + standardize(math)
    )
  )
# Correlations among the student-level scores and the composite.
ana %>%
  select(reading:g) %>%
  wtd.cors()
## reading writing math g
## reading 1.000 0.712 0.756 0.917
## writing 0.712 1.000 0.646 0.876
## math 0.756 0.646 1.000 0.894
## g 0.917 0.876 0.894 1.000
# Build per-subject summary scores from the precomputed municipality table by
# adding up every numbered column except the first (presumably the shares of
# pupils above the lowest level -- confirm against the INEP codebook), then
# combine them into a composite `g` as the standardized sum of the
# standardized subject scores.
ana_munis1 <- mutate(
  ana_munis1,
  reading = reading2 + reading3 + reading4,
  writing = writing2 + writing3 + writing4 + writing5,
  math = math2 + math3 + math4,
  g = standardize(
    standardize(reading) + standardize(writing) + standardize(math)
  )
)
# Aggregate the student-level data to municipalities: number of students and
# the wtd_mean() of each score, keyed by `id` (the ID_MUNICIPIO code).
ana_munis2 <- ana %>%
  group_by(id = ID_MUNICIPIO) %>%
  summarise(
    n = n(),
    reading_i = wtd_mean(reading),
    writing_i = wtd_mean(writing),
    math_i = wtd_mean(math),
    g_i = wtd_mean(g)
  )
# Keep the "Todos"/"Todos" rows of the precomputed table for municipalities
# present in the shapefile, then full-join the student-based aggregates.
ana_munis <- ana_munis1 %>%
  filter(
    subset1 == "Todos",
    subset2 == "Todos",
    id %in% munis_raw$id
  ) %>%
  full_join(ana_munis2)
## Joining, by = "id"
# Sanity check: each municipality id should appear only once.
any(duplicated(ana_munis$id))
## [1] FALSE
# Correlation matrix across the precomputed columns, the derived summary
# scores, the student count and the student-based averages.
wtd.cors(select(ana_munis, reading1:g_i))
## reading1 reading2 reading3 reading4 writing1 writing2 writing3
## reading1 1.0000 0.27054 -0.90107 -0.7304 0.9272 0.7794 0.51059
## reading2 0.2705 1.00000 -0.49851 -0.6880 0.2096 0.3671 0.34622
## reading3 -0.9011 -0.49851 1.00000 0.6365 -0.8291 -0.7648 -0.53775
## reading4 -0.7304 -0.68798 0.63653 1.0000 -0.6488 -0.6260 -0.45589
## writing1 0.9272 0.20962 -0.82908 -0.6488 1.0000 0.7218 0.42625
## writing2 0.7794 0.36707 -0.76482 -0.6260 0.7218 1.0000 0.41833
## writing3 0.5106 0.34622 -0.53775 -0.4559 0.4262 0.4183 1.00000
## writing4 -0.8361 -0.19942 0.78641 0.5430 -0.8378 -0.8412 -0.49740
## writing5 -0.6127 -0.45309 0.59964 0.6412 -0.5737 -0.5818 -0.58430
## math1 0.9602 0.34723 -0.89138 -0.7447 0.8976 0.7738 0.53092
## math2 0.3774 0.71585 -0.44391 -0.6862 0.3028 0.4007 0.36878
## math3 -0.7338 -0.16252 0.71639 0.4307 -0.6876 -0.5917 -0.40365
## math4 -0.8249 -0.62385 0.79845 0.8872 -0.7458 -0.7123 -0.53596
## reading -1.0000 -0.27054 0.90107 0.7304 -0.9272 -0.7794 -0.51059
## writing -0.9272 -0.20963 0.82909 0.6488 -1.0000 -0.7218 -0.42625
## math -0.9602 -0.34722 0.89138 0.7447 -0.8976 -0.7738 -0.53092
## g -0.9865 -0.28330 0.89584 0.7260 -0.9642 -0.7774 -0.50196
## n 0.0187 0.00112 -0.00995 -0.0193 0.0214 0.0188 -0.00107
## reading_i -0.9539 -0.51421 0.89611 0.8770 -0.8791 -0.7813 -0.53549
## writing_i -0.9208 -0.37401 0.87040 0.7302 -0.9136 -0.8858 -0.59921
## math_i -0.9242 -0.51243 0.87452 0.8542 -0.8494 -0.7698 -0.55571
## g_i -0.9536 -0.47845 0.90060 0.8386 -0.9000 -0.8329 -0.57819
## writing4 writing5 math1 math2 math3 math4 reading
## reading1 -0.8361 -0.6127 0.9602 0.377 -0.73380 -0.8249 -1.0000
## reading2 -0.1994 -0.4531 0.3472 0.716 -0.16252 -0.6239 -0.2705
## reading3 0.7864 0.5996 -0.8914 -0.444 0.71639 0.7985 0.9011
## reading4 0.5430 0.6412 -0.7447 -0.686 0.43070 0.8872 0.7304
## writing1 -0.8378 -0.5737 0.8976 0.303 -0.68756 -0.7458 -0.9272
## writing2 -0.8412 -0.5818 0.7738 0.401 -0.59173 -0.7123 -0.7794
## writing3 -0.4974 -0.5843 0.5309 0.369 -0.40365 -0.5360 -0.5106
## writing4 1.0000 0.3032 -0.8136 -0.245 0.66861 0.6441 0.8361
## writing5 0.3032 1.0000 -0.6272 -0.516 0.40042 0.7023 0.6127
## math1 -0.8136 -0.6272 1.0000 0.367 -0.74394 -0.8541 -0.9602
## math2 -0.2452 -0.5161 0.3672 1.000 -0.28991 -0.7344 -0.3774
## math3 0.6686 0.4004 -0.7439 -0.290 1.00000 0.4720 0.7338
## math4 0.6441 0.7023 -0.8541 -0.734 0.47199 1.0000 0.8249
## reading 0.8361 0.6127 -0.9602 -0.377 0.73380 0.8249 1.0000
## writing 0.8378 0.5737 -0.8977 -0.303 0.68756 0.7458 0.9272
## math 0.8136 0.6272 -1.0000 -0.367 0.74394 0.8541 0.9602
## g 0.8495 0.6198 -0.9767 -0.358 0.74007 0.8288 0.9865
## n -0.0167 -0.0122 0.0181 0.033 -0.00198 -0.0329 -0.0187
## reading_i 0.7817 0.6735 -0.9439 -0.547 0.67416 0.9160 0.9539
## writing_i 0.8361 0.7619 -0.9109 -0.445 0.68164 0.8314 0.9208
## math_i 0.7519 0.6940 -0.9543 -0.599 0.65583 0.9587 0.9242
## g_i 0.8087 0.7271 -0.9577 -0.544 0.68478 0.9235 0.9536
## writing math g n reading_i writing_i math_i
## reading1 -0.9272 -0.9602 -0.9865 0.01870 -0.9539 -0.9208 -0.9242
## reading2 -0.2096 -0.3472 -0.2833 0.00112 -0.5142 -0.3740 -0.5124
## reading3 0.8291 0.8914 0.8958 -0.00995 0.8961 0.8704 0.8745
## reading4 0.6488 0.7447 0.7260 -0.01929 0.8770 0.7302 0.8542
## writing1 -1.0000 -0.8976 -0.9642 0.02141 -0.8791 -0.9136 -0.8494
## writing2 -0.7218 -0.7738 -0.7774 0.01883 -0.7813 -0.8858 -0.7698
## writing3 -0.4263 -0.5309 -0.5020 -0.00107 -0.5355 -0.5992 -0.5557
## writing4 0.8378 0.8136 0.8495 -0.01671 0.7817 0.8361 0.7519
## writing5 0.5737 0.6272 0.6198 -0.01220 0.6735 0.7619 0.6940
## math1 -0.8977 -1.0000 -0.9767 0.01807 -0.9439 -0.9109 -0.9543
## math2 -0.3028 -0.3672 -0.3583 0.03300 -0.5469 -0.4454 -0.5989
## math3 0.6876 0.7439 0.7401 -0.00198 0.6742 0.6816 0.6558
## math4 0.7458 0.8541 0.8288 -0.03285 0.9160 0.8314 0.9587
## reading 0.9272 0.9602 0.9865 -0.01870 0.9539 0.9208 0.9242
## writing 1.0000 0.8976 0.9642 -0.02142 0.8791 0.9136 0.8494
## math 0.8976 1.0000 0.9767 -0.01807 0.9439 0.9109 0.9543
## g 0.9642 0.9767 1.0000 -0.01986 0.9490 0.9378 0.9324
## n -0.0214 -0.0181 -0.0199 1.00000 -0.0136 -0.0157 -0.0218
## reading_i 0.8791 0.9439 0.9490 -0.01359 1.0000 0.9135 0.9647
## writing_i 0.9136 0.9109 0.9378 -0.01570 0.9135 1.0000 0.8989
## math_i 0.8494 0.9543 0.9324 -0.02180 0.9647 0.8989 1.0000
## g_i 0.9000 0.9576 0.9607 -0.01632 0.9822 0.9610 0.9779
## g_i
## reading1 -0.9536
## reading2 -0.4784
## reading3 0.9006
## reading4 0.8386
## writing1 -0.9000
## writing2 -0.8329
## writing3 -0.5782
## writing4 0.8087
## writing5 0.7271
## math1 -0.9577
## math2 -0.5437
## math3 0.6848
## math4 0.9235
## reading 0.9536
## writing 0.9000
## math 0.9576
## g 0.9607
## n -0.0163
## reading_i 0.9822
## writing_i 0.9610
## math_i 0.9779
## g_i 1.0000
# Descriptive statistics for the number of students per municipality.
as.matrix(ana_munis2$n %>% describe())
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 5847 463 2203 157 221 163 10 124603 124593 36.7 1848
## se
## X1 28.8
# Attach the municipality scores to the map geometries; every row of the
# shapefile is kept.
munis <- munis_raw %>% left_join(ana_munis)
## Joining, by = "id"
# Sanity check: the join must not have duplicated any municipality.
any(duplicated(munis$id))
## [1] FALSE
# Plain outline map of the municipalities.
ggplot(munis) +
  geom_sf(lwd = 0.1) +
  theme_classic()

# Same map, filled by the student-based composite score g_i.
ggplot(munis) +
  geom_sf(aes(fill = g_i), lwd = 0.1) +
  scale_fill_continuous("Intelligence level (z-score)") +
  theme_classic()
# Export the student-based municipality aggregates; missing values are
# written as empty fields.
write_csv(ana_munis2, "data/ANA2014_munis.csv", na = "")
# Record the R version and package versions used for this run.
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doFuture_0.8.0 iterators_1.0.10 foreach_1.4.4
## [4] future_1.13.0 globals_0.12.4 caret_6.0-84
## [7] rms_5.1-3.1 SparseM_1.77 sf_0.7-5
## [10] kirkegaard_2018.05 metafor_2.1-0 Matrix_1.2-17
## [13] psych_1.8.12 magrittr_1.5 assertthat_0.2.1
## [16] weights_1.0 mice_3.5.0 gdata_2.18.0
## [19] Hmisc_4.2-0 Formula_1.2-3 survival_2.44-1.1
## [22] lattice_0.20-38 forcats_0.4.0 stringr_1.4.0
## [25] dplyr_0.8.1 purrr_0.3.2 readr_1.3.1
## [28] tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
## [31] tidyverse_1.2.1 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 minqa_1.2.4 colorspace_1.4-1
## [4] class_7.3-15 htmlTable_1.13.1 base64enc_0.1-3
## [7] rstudioapi_0.10 listenv_0.7.0 MatrixModels_0.4-1
## [10] prodlim_2018.04.18 mvtnorm_1.0-10 lubridate_1.7.4
## [13] xml2_1.2.0 codetools_0.2-16 splines_3.5.3
## [16] mnormt_1.5-5 knitr_1.23 jsonlite_1.6
## [19] nloptr_1.2.1 broom_0.5.2 cluster_2.0.9
## [22] compiler_3.5.3 httr_1.4.0 backports_1.1.4
## [25] lazyeval_0.2.2 cli_1.1.0 acepack_1.4.1
## [28] htmltools_0.3.6 quantreg_5.38 tools_3.5.3
## [31] gtable_0.3.0 glue_1.3.1 reshape2_1.4.3
## [34] Rcpp_1.0.1 cellranger_1.1.0 nlme_3.1-140
## [37] timeDate_3043.102 xfun_0.7 gower_0.2.1
## [40] lme4_1.1-21 rvest_0.3.4 gtools_3.8.1
## [43] polspline_1.1.14 pan_1.6 MASS_7.3-51.4
## [46] zoo_1.8-6 scales_1.0.0 ipred_0.9-9
## [49] hms_0.4.2 sandwich_2.5-1 RColorBrewer_1.1-2
## [52] yaml_2.2.0 gridExtra_2.3 rpart_4.1-15
## [55] latticeExtra_0.6-28 stringi_1.4.3 e1071_1.7-2
## [58] checkmate_1.9.3 boot_1.3-22 lava_1.6.5
## [61] rlang_0.3.4 pkgconfig_2.0.2 evaluate_0.14
## [64] labeling_0.3 recipes_0.1.5 htmlwidgets_1.3
## [67] tidyselect_0.2.5 plyr_1.8.4 R6_2.4.0
## [70] generics_0.0.2 mitml_0.3-7 multcomp_1.4-10
## [73] DBI_1.0.0 pillar_1.4.1 haven_2.1.0
## [76] foreign_0.8-70 withr_2.1.2 units_0.6-3
## [79] nnet_7.3-12 modelr_0.1.4 crayon_1.3.4
## [82] jomo_2.6-8 KernSmooth_2.23-15 rmarkdown_1.13
## [85] grid_3.5.3 readxl_1.3.1 data.table_1.12.2
## [88] ModelMetrics_1.2.2 digest_0.6.19 classInt_0.3-3
## [91] stats4_3.5.3 munsell_0.5.0