Introduction

This is the R notebook for the paper.

The notebook contains nearly all the code and output from the statistical analyses.

Initialize

First we load the packages and data, and do some recoding.

# libs --------------------------------------------------------------------
library(pacman)
p_load(psych, plyr, dplyr, stringr, weights, ggplot2, magrittr, XLConnect, tidyr, readr, VIM, lavaan, lavaan.survey, purrr, choroplethrAdmin1, choroplethr, grid, haven, polycor, lsr, forcats, kirkegaard, mirt)

#UTF-8
options(encoding = "UTF-8", digits = 2)

# controls -----------------------------------------------------------------
#some variables that control the behavior of functions
#primary variables
v_primary_vars = c("CA", "S", "European", "White", "Skin_brightness", "European_SIRE", "Latitude", "Temperature")

#demographic variables
v_demo = c("European_SIRE", "White", "Skin_brightness")

#cognitive items
v_all_items = c("gi1", "gi2", "gi3", "gi4", "gix4", "gi5")


# custom functions --------------------------------------------------------

source("R/functions.R")

# knoema data --------------------------------------------------------------------
#wiki data
# d_wiki = read.csv("data/Wikipedia_data.csv", encoding = "UTF-8", stringsAsFactors = F)
# rownames(d_wiki) = d_wiki$Knoema_name

#knoema data
d_knoema = readWorksheetFromFile("data/Argentina_data.xlsx", sheet = "tidy_data")

#clean colnames
colnames(d_knoema) %<>% str_replace_all(pattern = "^X..", replacement = "") %>%
  str_replace_all(pattern = "\\.\\.+", replacement = ".")

#rownames to abbreviations
rownames(d_knoema) = d_knoema$Province %>% pu_translate(superunit = "ARG", messages = F)


# ONE data ----------------------------------------------------------------

d_CA = readWorksheetFromFile("data/ONEresults.xlsx", sheet = "data")

# LAPOP data --------------------------------------------------------------
l_lapop = list("2008" = haven::read_spss("data/LAPOP/Argentina LAPOP 2008.sav", user_na=F),
               "2010" = haven::read_spss("data/LAPOP/Argentina LAPOP 2010.sav", user_na=F),
               "2012" = haven::read_spss("data/LAPOP/Argentina LAPOP 2012.sav", user_na=F),
               "2014" = haven::read_spss("data/LAPOP/Argentina LAPOP 2014.sav", user_na=F)
)

#convert to one dataset
d_lapop = ldply(l_lapop, .fun = identity, .id = "year")

#fill in missing year data
d_lapop$year %<>% mapvalues(NA, "2014") %>% as.factor

#id var
d_lapop$.id = 1:nrow(d_lapop)

#rename
d_lapop$municipality = d_lapop$municipio %>% factor
d_lapop$province = d_lapop$prov %>% factor
d_lapop$district = ((d_lapop$municipality %>% as.character()) + "_" + d_lapop$argdistrito) %>% factor
d_lapop$sex = d_lapop$q1 %>% factor(labels = c("Male", "Female"))
d_lapop$age = d_lapop$q2 %>% as.numeric
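#note: the `+` used for string concatenation in this notebook comes from the
#kirkegaard package, not base R; a hypothetical stand-in (assumption: it
#dispatches to paste0 when either argument is character) would be:
# "+" = function(e1, e2) {
#   if (is.character(e1) || is.character(e2)) paste0(e1, e2)
#   else .Primitive("+")(e1, e2)
# }
# "district" + "_" + "1" #"district_1"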

#SIRE data
d_lapop$SIRE = d_lapop$etid %>% as.numeric
sire_labels = attr(d_lapop$etid, "labels")
d_lapop$SIRE %<>% mapvalues(c(sire_labels, NA), c(names(sire_labels), "not stated"), warn_missing = F) %>% factor
d_lapop$White = d_lapop$etid == 1
d_lapop$Mestizo = d_lapop$etid == 2
d_lapop$Indigenous = d_lapop$etid == 3
d_lapop$Black = d_lapop$etid == 4
d_lapop$Mulatto = d_lapop$etid == 5
d_lapop$Other = d_lapop$etid == 7

#skin brightness
d_lapop$colorr_num = d_lapop$colorr %>%  as.numeric() %>% mapvalues(97, NA)
d_lapop$Skin_brightness = kirkegaard::reverse_scale(d_lapop$colorr_num)
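#reverse_scale comes from the kirkegaard package; a hypothetical stand-in,
#assuming it simply mirrors values within the observed range (so 1..10 -> 10..1):
reverse_scale_sketch = function(x) min(x, na.rm = T) + max(x, na.rm = T) - x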
table(d_lapop$Skin_brightness)
## 
##    1    2    3    4    5    6    7    8    9   10 
##    3    6   40  101  306  542  998 1342  749  332
#SIRE-based admixture
#John's coding: full weight for White, half weight for Mestizo and Mulatto
d_lapop %<>% mutate(European_SIRE1 = White + .5*Mestizo + .5 * Mulatto)
#preferred coding: an assumed European ancestry proportion for every SIRE group
d_lapop %<>% mutate(European_SIRE2 = .8*White + .6*Mestizo + .6*Mulatto + .2*Black + .2*Indigenous + .2*Other)
#primary
d_lapop$European_SIRE = d_lapop$European_SIRE2

#get rid of 'labelled' for items / score items
for (var in v_all_items) {
  d_lapop[[var]] = (d_lapop[[var]] == 1)
}

#abbreviate
d_lapop$province_abbrev = pu_translate(d_lapop$province %>% as.character(), superunit = "ARG", messages = 0)
#nice names
d_lapop$province_name = pu_translate(d_lapop$province_abbrev, reverse = T, messages = 0)

#examine units
table(d_lapop$estratopri) #region https://en.wikipedia.org/wiki/Regions_of_Argentina
## 
## 1701 1702 1703 1704 1705 1706 1707 
## 2242 1104  550  701  451  314  558
table(d_lapop$upm) #primary sampling unit, unclear
## 
##   10   12   15   16   21   25   31   32   35   38   43   45   49   52   53 
##   36   36   36   36   36   36   36   54   54   36   36   36   36   36   36 
##   59   60   62   69   74   78   79   82   83   84   85   86   87   88   89 
##   36   36   36   36   36   36   36   36   36   36   36   36   36   36   36 
##   90   91   92   93   94   95   96   97   98   99  100  101  102  103  104 
##   36   36   36   36   36   36   36   36   36   36   36   36   36   36   36 
##  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119 
##   36   36   36   36   36   36   36   36   36   36   36   36   36   36   36 
##  120  121  122  123  124  125  126  127  128  129  130  131  132  201  202 
##   36   36   36   36   36   36   36   36   36   36   36   36   36   42   48 
##  203  204  205  206  207  321  322  323  351  352  353 1701 1702 1703 1704 
##   36   48   42   18   18   18   18   18   18   18   18   44  247   21   22 
## 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 
##    8   29   33   15   14   22   97   27   56   17   35   28   30   41   12 
## 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 
##   25   22   28   28   17  126  139   49   23   34   39  124  125  123   14 
## 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 
##  118   33   42   48   39    8   67    7   41   14   71    4   33   15   67 
## 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 
##   16   29   14   58   15   25   14   32   12   24   24   27   10   14   24 
## 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1776 1777 1778 1780 1781 
##   14   16   10   21   11   34    4   40    7   37   26    5    1    6    5
table(d_lapop$prov) #province https://en.wikipedia.org/wiki/Provinces_of_Argentina
## 
## 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 
## 2242  429  493  128   74  111  328   37  186  168  269   26   52  312   84 
## 1716 1717 1718 1719 1720 1721 1722 1723 
##   29   26   34   87  170   23  504  108
table(d_lapop$municipio) #municipality seems to be third-level unit https://en.wikipedia.org/wiki/Administrative_divisions_of_Argentina
## 
##       2      10      12      15      16      21      25      31      32 
##     126      18      18      18      18      18      18      18      54 
##      35      38      43      45      49      52      53      59      60 
##      54      18      18      18      18      18      18      18      18 
##      62      69      74      78      79      82      83      84      85 
##      18      18      18      18      18      18      18      18      18 
##      86      87      88      89      90      91      92      93      94 
##      18      18      18      18      18      18      18      18      18 
##      95      96      97      98      99     100     101     102     103 
##      18      18      18      18      18      18      18      18      18 
##     104     105     106     107     108     109     110     111     112 
##      18      18      18      18      18      18      18      18      18 
##     113     114     115     116     117     118     119     120     121 
##      18      18      18      18      18      18      18      18      18 
##     122     123     124     125     126     127     128     129     130 
##      18      18      18      18      18      18      18      18      18 
##     131     132    1701    1702    1703    1704    1705    1706    1707 
##      18      18      19     113      11      11       4      14      16 
##    1708    1709    1710    1711    1712    1713    1714    1715    1716 
##       7       7      11      44       9      28       6      18      13 
##    1717    1718    1719    1720    1721    1722    1723    1724    1725 
##      13      19       6      15      11      13      11       5      63 
##    1726    1727    1728    1729    1730    1731    1732    1733    1735 
##      71      25      12      17      19      14     106      14      81 
##    1736    1737    1738    1739    1741    1742    1743    1744    1745 
##      23      14      37       8      34       1      29       7      35 
##    1746    1747    1748    1749    1750    1751    1752    1753    1754 
##       2      14       4      34       6      19      11      35      11 
##    1755    1756    1757    1758    1759    1760    1761    1762    1763 
##      10       3      21       4      15      12      12       7       8 
##    1764    1765    1766    1767    1768    1769    1770    1771    1772 
##      12      12       4       8      13       2      10       2      15 
##    1773    1774    1776    1777    1778    1780    1781   17001   17002 
##       2      25      26       5       1       6       5      25     134 
##   17003   17004   17005   17006   17007   17008   17009   17010   17011 
##      10      11       4      15      17       8       7      11      53 
##   17012   17013   17014   17015   17016   17017   17018   17019   17020 
##      18      28      11      17      15      17      22       6      10 
##   17021   17022   17023   17024   17025   17026   17027   17028   17029 
##      11      15      17      12      63      68      24      11      17 
##   17030   17031   17032   17033   17034   17035   17036   17037   17038 
##      20     110      19     109      14      37      10      28      11 
##   17039   17040   17041   17042   17043   17044   17045   17046   17047 
##      31       8      33       6      12       7      36       2      19 
##   17048   17049   17050   17051   17052   17053   17054   17055   17056 
##      11      33      10      10       3      23       4      15      11 
##   17057   17058   17059   17060   17061   17062   17063   17064   17065 
##      11       8       9      12      15       3       6      12       2 
##   17066   17067   17068   17069   17070   17071   17072   17073   17074 
##      12       2       8       9      24       2      25       5      12 
## 1700002 1700010 1700012 1700015 1700016 1700021 1700025 1700031 1700032 
##     126      18      18      18      18      18      18      18      54 
## 1700035 1700038 1700043 1700045 1700049 1700052 1700053 1700059 1700060 
##      54      18      18      18      18      18      18      18      18 
## 1700062 1700069 1700074 1700078 1700079 1700082 1700083 1700084 1700085 
##      18      18      18      18      18      18      18      18      18 
## 1700086 1700087 1700088 1700089 1700090 1700091 1700092 1700093 1700094 
##      18      18      18      18      18      18      18      18      18 
## 1700095 1700096 1700097 1700098 1700099 1700100 1700101 1700102 1700103 
##      18      18      18      18      18      18      18      18      18 
## 1700104 1700105 1700106 1700107 1700108 1700109 1700110 1700111 1700112 
##      18      18      18      18      18      18      18      18      18 
## 1700113 1700114 1700115 1700116 1700117 1700118 1700119 1700120 1700121 
##      18      18      18      18      18      18      18      18      18 
## 1700122 1700123 1700124 1700125 1700126 1700127 1700128 1700129 1700130 
##      18      18      18      18      18      18      18      18      18 
## 1700131 1700132 
##      18      18
table(d_lapop$argdistrito) #district ?
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  122 1572  396  172  144  164  212   65   60   44   28   56   61   24   23 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##   20   38   64   38   56   20   93   24  158   20   21   39   22   22   59 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##   20   20   20   24   18   18   19   18   72   18   36   36   18   18   72 
##   45   46   47   48   49   50 
##   18   18   36   36   18   18
# missing data ------------------------------------------------------------
miss_amount(d_knoema)
## cases with missing data  vars with missing data cells with missing data 
##                  0.2500                  0.1600                  0.0082
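#miss_amount is a kirkegaard helper; a rough stand-in matching the three
#proportions printed above (incomplete cases, variables with any NA, NA cells):
miss_amount_sketch = function(df) {
  c("cases with missing data" = mean(!complete.cases(df)),
    "vars with missing data" = mean(sapply(df, anyNA)),
    "cells with missing data" = mean(is.na(as.matrix(df))))
}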
miss_plot(d_knoema)

#impute
d_knoema = VIM::irmi(d_knoema, noise = F)
## Warning in irmi_work(x, eps, maxit, mixed, mixed.constant, count, step, :
## At least one character variable is converted into a factor
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## (this warning was repeated 10 times)
#rownames again
rownames(d_knoema) = pu_translate(d_knoema$Province %>% as.character(), superunit = "ARG", messages = 0)

# merge and average across years ----------------------------------------------------------
d_knoema_long = df_gather_by_pattern(d_knoema[-1], pattern = "\\.(\\d\\d\\d\\d)", key_col = "year", id_col = "Province")
## Warning: Too few values at 384 locations: 1057, 1058, 1059, 1060, 1061,
## 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073,
## 1074, 1075, 1076, ...
#warning is because a few measurements have no year info
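#df_gather_by_pattern is a custom helper; a rough tidyr sketch of the reshaping
#assumed here (gather year-suffixed columns, split off the year, then spread the
#variables back out so each row is a Province x year combination):
# d_knoema_long_sketch = d_knoema[-1] %>%
#   gather(key = "var_year", value = "value", matches("\\.\\d{4}$")) %>%
#   tidyr::extract(var_year, into = c("variable", "year"), regex = "(.*)\\.(\\d{4})$") %>%
#   spread(variable, value)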
d_knoema_averaged = ddply(d_knoema_long, .variables = "Province", .fun = function(block) {
  colMeans(block[-c(1:2)], na.rm=T) #get the means, ignoring the first two columns, which have non-numeric data
})

#rownames
rownames(d_knoema_averaged) = d_knoema_averaged$Province

#rename
d_knoema_averaged$pop = d_knoema_averaged$Total.population


# load genomic ancestry data -----------------------------------------------------------
d_genoancest = readWorksheetFromFile("data/Supplemental File 1 Argentina Provincial Admixture Estimates.xlsx", sheet = "clean_data")
d_genoancest$Province %<>% pu_translate(superunit="ARG", messages = 0)

# merge_datasets ------------------------------------------------

d_main = dplyr::full_join(d_knoema_averaged, d_genoancest, by = c("Province" = "Province"))

#keep names
d_main$name = d_main$Province %>% pu_translate(reverse = T, messages = 0)

#rename variables for ease of use
d_main$pop = d_main$Total.population
d_main$Temperature = d_main$Mean.C

#translations
d_main$province_abbrev = d_main$Province
d_main$province_name = d_main$Province %>% pu_translate(reverse = T, messages = 0)

LAPOP analyses

The LAPOP data must be analyzed first at the individual level and then aggregated to the higher-level units.

S data

First we analyze the S data.

### LAPOP S FACTOR ANALYSIS
#individual-level data

v_lapop_s_indi = c("aoj11", "ls3", "ed", "q10", "vic1", "r" + c("1", "3", "4", "4a", "5", "6", "7", "8", "12", "14", "15"))

#indicator names
v_lapop_s_indi_names = c("Feel safe", "Life satisfaction", "Education", "Family income", "Victim of crime last 12 months", "Have: " + c("TV", "Refrigerator", "Landline phone", "Cell phone", "Car(s)", "Washing machine", "Microwave", "Motorcycle", "Drinking water in house", "Bathroom indoors", "Computer"))


# extract S data ----------------------------------------------------------
#note: we have to rename two items where present: vic1ext to vic1 and q10new to q10
#these are the same questions with more clarifying text

d_lapop_s = ldply(l_lapop, .fun = function(block) {
  #rename if necessary
  block %<>% df_rename(current_names = c("vic1ext", "q10new"), c("vic1", "q10"))

  #extract available indicators
  block = block[c(v_lapop_s_indi)]
  
  #get rid of labelled class
  block[] = lapply(block, as.factor)
  
  block
}, .id = "year")
## Warning: Variable vic1ext was not in the data.
## Warning: Variable q10new was not in the data.

## Warning: Variable q10new was not in the data.
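#df_rename is a kirkegaard helper that renames columns where present and warns
#otherwise (hence the warnings above); a hypothetical stand-in:
df_rename_sketch = function(df, current_names, new_names) {
  idx = match(current_names, names(df))
  for (i in seq_along(idx)) {
    if (is.na(idx[i])) warning("Variable ", current_names[i], " was not in the data.")
    else names(df)[idx[i]] = new_names[i]
  }
  df
}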
#reverse life satisfaction variable
d_lapop_s$ls3 %<>% fct_rev()


# control indicators for age/sex/year ------------------------------------------
#controller function for easy reuse
controller = function(x, age, sex, year) {
  #local df
  d = data_frame(var = x, age = age, sex = sex, year = year)
  
  #regress out age
  fit = loess("var ~ age", data = d, na.action = na.exclude)
  d$var2 = resid(fit)
  
  #regress out sex and year
  fit2 = lm("var2 ~ sex + year", data = d, na.action = na.exclude)
  
  #return resids
  resid(fit2)
}

#control indicators
d_lapop_s_con = lapply(d_lapop_s[-1], FUN = function(var) {
  controller(var %>% as.numeric, age = d_lapop$age, sex = d_lapop$sex, year = d_lapop$year)
}) %>% as.data.frame()
d_lapop_s_con = cbind(year = d_lapop_s$year, d_lapop_s_con)


# impute missing data -----------------------------------------------------

#examine miss
d_lapop_s_con %>% miss_amount
## cases with missing data  vars with missing data cells with missing data 
##                   0.320                   0.940                   0.029
#impute
#this takes a while, so we load from disk to save time
if (file.exists("data/s_data_imp_corrected.rds") && file.exists("data/s_data_imp.rds")) {
  d_lapop_s_con = read_rds("data/s_data_imp_corrected.rds")
  d_lapop_s = read_rds("data/s_data_imp.rds")
} else {
  d_lapop_s_con %<>% VIM::irmi(noise = F)
  d_lapop_s %<>% VIM::irmi(noise = F)
  write_rds(d_lapop_s_con, "data/s_data_imp_corrected.rds")
  write_rds(d_lapop_s, "data/s_data_imp.rds")
}

# rename indicators -------------------------------------------------------

names(d_lapop_s)[-1] = v_lapop_s_indi_names
names(d_lapop_s_con)[-1] = v_lapop_s_indi_names

# factor analyze ----------------------------------------------------------
#latent correlations of uncontrolled indicators
lapop_s_lcor = silence(hetcor(d_lapop_s[-1]))

#compare
l_fa_lapop_s = list("Pearson" = fa(d_lapop_s_con[-1]),
                    "Latent" = fa(lapop_s_lcor$correlations))

fa_plot_loadings(l_fa_lapop_s)

ggsave("figures/individual/S_method.png")
## Saving 7 x 5 in image
#congruence coefs
fa_congruence_matrix(l_fa_lapop_s) %>% MAT_half() %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##       0.94       0.94       0.94       0.94       0.94       0.94 
##   midrange 
##       0.94
# robustness checks --------------------------------------------------------

#indicator sampling variation
lapop_s_fa_rel = fa_splitsample_repeat(d_lapop_s_con[-1], progress = F)
GG_denhist(lapop_s_fa_rel)
## Warning in GG_denhist(lapop_s_fa_rel): received a data frame but no var:
## used the only available column
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lapop_s_fa_rel %>% unlist %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##       0.64       0.64       0.64       0.69       0.66       0.65 
##   midrange 
##       0.58
#factor method variance
lapop_s_fa_mv = fa_all_methods(d_lapop_s_con[-1], messages = F)
lapop_s_fa_mv$scores %>% wtd.cors %>% MAT_half() %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##          1          1          1          1          1          1 
##   midrange 
##          1
#by survey year
l_fa_s_year = dlply(d_lapop_s_con, .variables = "year", .fun = function(block) {
  fa(block[-1])
})

#plot
fa_plot_loadings(l_fa_s_year)

ggsave("figures/individual/S_year.png")
## Saving 7 x 5 in image
#congruence
fa_congruence_matrix(l_fa_s_year) %>% MAT_half() %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##       0.98       0.98       0.98       0.99       0.98       0.98 
##   midrange 
##       0.97

CA data

Second, we analyze the CA data.

### SCORE COGNITIVE ABILITY FROM POLITICAL KNOWLEDGE QUESTIONS

# percent correct by item by year by province -----------------------------
d_mean_item_correct_by_year_province = aggregate_ca_by_year_unit("province_abbrev", NA_to_F = T)

d_mean_item_correct_by_province_agr = aggregate_ca_by_unit(d_mean_item_correct_by_year_province, "province_abbrev")
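#aggregate_ca_by_year_unit is defined in R/functions.R; a minimal sketch under
#the assumption that it computes per-item pass rates and sample sizes by year
#and unit, optionally scoring skipped items as wrong:
aggregate_ca_by_year_unit_sketch = function(unit, NA_to_F = T) {
  ddply(d_lapop, c("year", unit), function(block) {
    items = block[v_all_items]
    ns = colSums(!is.na(items)) #per-item sample sizes before any NA filling
    names(ns) = paste0("n_", v_all_items)
    if (NA_to_F) items[is.na(items)] = F #assumption: skipped = wrong
    c(colMeans(items, na.rm = T), ns)
  })
}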

#impute missing
d_mean_item_correct_by_province_agr = VIM::irmi(d_mean_item_correct_by_province_agr, noise = F)
## Warning in irmi_work(x, eps, maxit, mixed, mixed.constant, count, step, :
## At least one character variable is converted into a factor
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## (this warning was repeated 4 times)
#cors
cor_matrix(d_mean_item_correct_by_province_agr) %>% round(2)
##       gi1  gi2  gi3  gi4 gix4  gi5
## gi1  1.00 0.49 0.46 0.44 0.55 0.34
## gi2  0.49 1.00 0.46 0.48 0.27 0.50
## gi3  0.46 0.46 1.00 0.61 0.42 0.06
## gi4  0.44 0.48 0.61 1.00 0.18 0.42
## gix4 0.55 0.27 0.42 0.18 1.00 0.10
## gi5  0.34 0.50 0.06 0.42 0.10 1.00
cor_matrix(d_mean_item_correct_by_province_agr) %>% MAT_half() %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##       0.39       0.33       0.25       0.49       0.44       0.39 
##   midrange 
##       0.33
#factor analyze
fa_ca_prov = fa(d_mean_item_correct_by_province_agr[-1], scores = "Bartlett")

#plot
fa_plot_loadings(fa_ca_prov)

#make df for merging
d_fa_province_level = data_frame(fa_ca_prov$scores %>% as.vector,
                                 d_mean_item_correct_by_province_agr$province_abbrev)
colnames(d_fa_province_level) = c("CA_lapop_aggr", "Province")

#merge
d_main = dplyr::left_join(d_main, d_fa_province_level, by = c("Province" = "Province"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
# pass rates by year ------------------------------------------------------
item_pass_rates_by_year = ddply(d_mean_item_correct_by_year_province, .variables = c("year"), .fun = function(block) {
  # browser()
  data.frame(gi1 = wtd_mean(block$gi1, block$n_gi1, error = F),
             gi2 = wtd_mean(block$gi2, block$n_gi2, error = F),
             gi3 = wtd_mean(block$gi3, block$n_gi3, error = F),
             gi4 = wtd_mean(block$gi4, block$n_gi4, error = F),
             gix4 = wtd_mean(block$gix4, block$n_gix4, error = F),
             gi5 = wtd_mean(block$gi5, block$n_gi5, error = F))
})
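#wtd_mean is a kirkegaard helper; with error = F it is assumed to return NA
#rather than throw when a cell has no data -- a hypothetical stand-in:
wtd_mean_sketch = function(x, w = rep(1, length(x)), error = T) {
  ok = !is.na(x) & !is.na(w)
  if (!any(ok)) {
    if (error) stop("no non-missing values") else return(NA_real_)
  }
  sum(x[ok] * w[ok]) / sum(w[ok])
}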

cbind(item_pass_rates_by_year, mean = item_pass_rates_by_year[-1] %>% rowMeans(na.rm = T)) %>% write_clipboard()
##   year  gi1  gi2  gi3  gi4 gix4  gi5 mean
## 1 2008 0.83 0.39 0.50 0.86      0.80 0.67
## 2 2010 0.80      0.58 0.75           0.71
## 3 2012 0.76           0.86           0.81
## 4 2014 0.77           0.78 0.47      0.67
# combined data ---------------------------------------------------
#can handle NA, but perhaps not well

d_items_scored = ldply(l_lapop, .fun = function(block) {
  #subset
  d_items = df_flexsubset(block, vars = c("gi" + c(1:4, "x4", 5)), messages = F)
  
  d_items[] = lapply(d_items, function(x) {
    (x == 1)
  })
  
  d_items
})

d_items_scored = df_rename(d_items_scored, ".id", "year")

#examine
#TODO: what is the error?
# wtd.cors(d_items_scored[-1])
# hetcor(d_items_scored[-1], use = "pairwise.complete.obs") %>% 
#   `[[`(1) %>% 
#   write_clipboard()

#some item combinations never appear
#we fix this by combining items gix4 and gi5 which have similar pass rates

d_items_scored2 = d_items_scored
d_items_scored2$gi45 = rowMeans(d_items_scored[c("gix4", "gi5")], na.rm=T) %>% as.logical
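#note: as.logical() maps any nonzero mean to TRUE, so getting either of the two
#combined items right counts as passing the combined item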
d_items_scored2$gi5 = NULL
d_items_scored2$gix4 = NULL


#examine
hetcor(d_items_scored2[-1]) %>% `[[`(1) %>% write_clipboard()
##       gi1  gi2  gi3  gi4 gi45
## gi1  1.00 0.64 0.52 0.58 0.65
## gi2  0.64 1.00 0.62 0.52 0.59
## gi3  0.52 0.62 1.00 0.62 0.47
## gi4  0.58 0.52 0.62 1.00 0.60
## gi45 0.65 0.59 0.47 0.60 1.00
#NA-filled version (note: the double assignment also overwrites d_items_scored with the NA-filled scoring)
d_items_scored_fillna = d_items_scored = ldply(l_lapop, .fun = function(block) {
  #subset
  d_items = df_flexsubset(block, vars = c("gi" + c(1:4, "x4", 5)), messages = F)
  
  #convert NAs
  d_items = map_df(d_items, ~mapvalues(., NA, F, warn_missing = F))
  
  #score
  d_items[] = lapply(d_items, function(x) {
    (x == 1)
  })
  
  d_items
})

# irt fill NA -----------------------------------------------------------

#score within years because that's easier
irt_score_fill_na = dlply(d_items_scored2, "year", function(d) {
  #get rid of NA vars
  d = df_remove_NA_vars(d)
  
  #fill in NA with F
  d = df_colFunc(d, function(x) mapvalues(x, NA, F, warn_missing = F))
  
  #to data matrix
  d = d[-1] %>% data.matrix
  
  #irt
  y = list(irt = irt.fa(d, sort = F))
  
  #scores
  y$scores = scoreIrt(y$irt, d)
  y$scores$theta1_std = y$scores$theta1 %>% standardize
  y$reliability = irt_reliability(y$irt)
  y$items = d
  
  y
})

#scores: bind the per-year score frames into one
irt_score_fill_na_scores = map_df(irt_score_fill_na, "scores")


#regress year out
irt_score_fill_na_scores$year = d_lapop$year
irt_score_fill_na_scores$theta1_rm_yr = lm(theta1 ~ year, data = irt_score_fill_na_scores) %>% resid

#reliability
irt_score_fill_na_rel = map_dbl(irt_score_fill_na, "reliability")

#filled items for reuse
lapop_filled_items = map_df(irt_score_fill_na, ~.[["items"]] %>% as.data.frame)


# alternative scoring methods ---------------------------------------------

if (F) {
  # IRT ---------------------------------------------------------------------
  #IRT
  ca_irt = irt.fa(d_items_scored2[-1], sort = F)
  
  png("figures/individual/IRT_discrim.png")
  plot(ca_irt)
  dev.off()
  
  #score
  irt_score = scoreIrt(ca_irt, d_items_scored2[-1] %>% df_colFunc(func = as.numeric))
  irt_score$year = d_lapop$year
  
  #plot distribution
  GG_denhist(irt_score$theta1) + xlab("Theta score")
  #ggsave("figures/individual/IRT_dist.png")
  GG_denhist(irt_score, "theta1", group = "year")
  #ggsave("figures/individual/IRT_dist_year.png")
  
  #IRT did not seem to deal entirely with the year-to-year differences
  #will need to regress out year too
  
  fit = lm("theta1 ~ year", data = irt_score, na.action = na.exclude)
  (fit_sum = MOD_summary(fit, progress = F))
  fit_sum %>% `[[`(1) %>% write_clipboard()
  #noticeable effects of year, i.e., differences in item composition
  
  #save residuals + standardize
  irt_score$theta_cor = resid(fit) %>% standardize()
  
  #standardize within years
  #this also equates the means, but should do a better job with the dispersion, maybe
  irt_score$theta_cor2 = irt_score %>% 
    group_by(year) %>% 
    do(standardize(.$theta1) %>% 
         as.data.frame) %>% 
    `[[`(2)
  
  wtd.cors(irt_score) %>% round(2)
  
  #output for examination
  cbind(d_items_scored2, irt_score) %>% write_csv("data/cognitive_scoring.csv", na = "")
  
  
  # IRT within years --------------------------------------------------------
  #to see how this affects dealing with missing data
  #to estimate the reliability, perform IRT by year
  
  #score within years
  lapop_ca_irt_year = dlply(d_items_scored2, "year", function(d) {
    #to numeric
    d = df_colFunc(d, as.numeric)
    
    #remove na cols
    d = df_remove_NA_vars(d)
    
    #irt
    .fa = irt.fa(d[-1], sort = F)
    
    #score
    .scores = scoreIrt(.fa, d[-1])
    
    list(irt = .fa,
         scores = .scores
    )
  })
  
  #estimate IRT reliability and true score correlations
  lapop_ca_irt_rel = sapply(lapop_ca_irt_year, function(x) {
    x$irt %>% irt_reliability
  })
  
  #collect scores
  irt_score2 = rbindlist(map(lapop_ca_irt_year, "scores"), fill = T) %>% 
    cbind(year = d_lapop$year)
  
  #plot
  GG_denhist(irt_score2, "theta1", group = "year")
  GG_denhist(irt_score, "theta1", group = "year")
  
  #descriptive within years
  ddply(irt_score2, "year", function(x) {
    psych::describe(x[1:3])
  })
  
  ddply(irt_score, "year", function(x) {
    psych::describe(x)
  })
  
  #regress year out
  irt_score2$theta1_rm_yr = lm("theta1 ~ year", data = irt_score2, na.action = na.exclude) %>% resid
  
  
  # John method -------------------------------------------------------------
  #score each person as mean item correct
  #regress out the effect of year
  
  d_item_simple = data_frame(year = d_lapop$year,
                             mean_score = rowMeans(d_items_scored[-1], na.rm=T))
  
  #regress year out
  #note that year is a categorical variable here, not a numeric!
  
  fit = lm("mean_score ~ year", data = d_item_simple, na.action = na.exclude)
  (fit_sum = MOD_summary(fit, progress = F))
  fit_sum %>% `[[`(1) %>% write_clipboard()
  #large year effects, almost entirely due to item composition
  
  #save residuals
  irt_score$simple = resid(fit)
  irt_score$simple_std = resid(fit) %>% standardize()
  
  
  # mirt package ------------------------------------------------------------
  
  #filter missing first
  #necessary to enable matching up the data afterwards
  #mirt does not support passing through cases with all NA
  d_items_scored3 = cbind(
    .id = d_lapop$.id,
    d_items_scored2
  ) %>% 
    miss_filter(3, reverse = T)
  
  #fit
  mirt = mirt::mirt(d_items_scored3[-(1:2)] %>% data.matrix, model = 1)
  
  #score
  mirt_scores = data_frame(
    .id = d_items_scored3$.id,
    year = d_items_scored3$year,
    s_tog_mirt = fscores(mirt) %>% as.vector
  )
  
  #regress out year
  mirt_scores$s_tog_rm_yr_mirt = lm("s_tog_mirt ~ year", data = mirt_scores) %>% resid
  
  # simple fill NA ----------------------------------------------------------
  #why do people skip items? We assume it's when they don't know the answer, not at random
  #the simple method above ignores unanswered items in the calculations,
  #so if you skip 2 out of 3 items and get the last one right, you get a perfect score
  
  #score within years because that's easier
  sim_score_fill_na = ddply(d_items_scored2, "year", function(d) {
    #get rid of NA vars
    d = df_remove_NA_vars(d)
    
    #fill in NA with F
    d = df_colFunc(d, function(x) mapvalues(x, NA, F, warn_missing = F))
    
    #to data matrix
    d = d[-1] %>% data.matrix
    
    #score
    data_frame(s_sim_fillna = rowMeans(d),
               s_sim_fillna_stwy = rowMeans(d) %>% standardize())
  })
  
  #regress year out
  sim_score_fill_na$year = d_lapop$year
  sim_score_fill_na$s_sim_fillna_rm_yr = lm(s_sim_fillna ~ year, data = sim_score_fill_na) %>% resid
  
  
  # compare scores ----------------------------------------------------------
  
  cog_scores = cbind(
    .id = 1:nrow(d_lapop),
    d_items_scored2,
    hash = df_rowhash(d_items_scored2),
    S = l_fa_lapop_s$Pearson$scores %>% as.vector,
    s_sim = d_item_simple$mean_score,
    s_sim_ry = irt_score$simple,
    s_irt_tog = irt_score$theta1,
    s_irt_tog_ry = irt_score$theta_cor,
    s_irt_sep = irt_score2$theta1,
    s_irt_sep_ry = irt_score2$theta1_rm_yr,
    s_sim_fillna = sim_score_fill_na$s_sim_fillna,
    s_sim_fillna_rm_yr = sim_score_fill_na$s_sim_fillna_rm_yr,
    s_sim_fillna_stwy = sim_score_fill_na$s_sim_fillna_stwy,
    s_irt_fillna = irt_score_fill_na_scores$theta1,
    s_irt_fillna_ry = irt_score_fill_na_scores$theta1_rm_yr,
    s_irt_fillna_stwy = irt_score_fill_na_scores$theta1_std
  )
  
  #merge in mirt
  cog_scores = dplyr::left_join(cog_scores, mirt_scores[-2])
  
  #aggregate to unique answer patterns
  cog_scores_patterns = df_merge_rows(cog_scores, key = "hash", func = wtd_mean, error = F)
  
  #correlate CA with S within years
  cog_scores %>% ddply("year", function(x) {
    x %>%
      df_subset_by_pattern("^[Ss]") %>% 
      wtd.cors() %>% 
      `[`(1, -1)
  }) %>% 
    rbind(cog_scores %>%
            df_subset_by_pattern("^[Ss]") %>% 
            wtd.cors %>% 
            `[`(1, -1) %>% 
            c(NA, .)
    )
  
  cog_scores %>% wtd.cors()
  
  GG_scatter(cog_scores_patterns, "s_irt_fillna_stwy", "S")
  
  #distribution of final scores
  GG_denhist(cog_scores, "s_irt_fillna_stwy", group = "year")
  ggsave("figures/individual/IRT_dist_year.png")
  GG_denhist(cog_scores, "s_irt_fillna_stwy")
  ggsave("figures/individual/IRT_dist.png")
  cog_scores$s_irt_fillna_stwy %>% psych::describe()
  
  
  # compare before and after psych update -----------------------------------
  
  old_psych = read_csv("data/cognitive_scoring_1.6.11.csv") %>% df_add_affix(suffix = "_old")
  new_psych = read_csv("data/cognitive_scoring_1.7.1.csv") %>% df_add_affix(suffix = "_new")
  
  #join & fix
  psych_compare = cbind(id = 1:nrow(d_lapop),
                        d_items_scored2,
                        old_psych[c("theta1", "total1", "fit1") + "_old"],
                        new_psych[c("theta1", "total1", "fit1") + "_new"])
  psych_compare$year = d_lapop$year %>% factor()
  psych_compare$S = d_lapop$S
  
  #get only the unique rows for the cognitive data
  psych_compare_distinct = psych_compare %>% 
    dplyr::distinct(year, gi1, gi2, gi3, gi4, gi45, .keep_all = T)
  
  #plots
  GG_scatter(psych_compare, "theta1_old", "theta1_new", case_names = F)
  
  GG_scatter(psych_compare %>% dplyr::filter(year == "2012"), "theta1_old", "theta1_new", case_names = F)
  
  ggplot(psych_compare, aes(theta1_old, theta1_new, color = year)) +
    geom_point() +
    geom_smooth(method = lm)
  
  
  #output
  write_csv(psych_compare, "data/psych_scortIrt_compare.csv", na = "")
  write_csv(psych_compare_distinct, "data/psych_scortIrt_compare_distinct.csv", na = "")
}

District-level analyses

Third, we aggregate the data to the district level and analyze it there.

### DISTRICTS
#aggregate data at the district level and correlate



# score ca ----------------------------------------------------------------
#by year + unit
d_mean_item_correct_by_year_dist = aggregate_ca_by_year_unit("district", NA_to_F = T)

#by unit
d_mean_item_correct_by_dist = aggregate_ca_by_unit(d_mean_item_correct_by_year_dist, "district")

#samples
dist_ca_n = aggregate_n_by_unit(d_mean_item_correct_by_year_dist, unit = "district")
dist_ca_n$n %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##        6.7        3.8        2.5        7.2        4.0        4.5 
##   midrange 
##       59.3
#impute missing
d_mean_item_correct_by_dist %<>% df_remove_NA_vars()
d_mean_item_correct_by_dist = VIM::irmi(d_mean_item_correct_by_dist[-1], noise = F)

#factor analyze
dist_ca_fa = fa(d_mean_item_correct_by_dist[-1], scores = "Bartlett", weight = dist_ca_n$n %>% sqrt, fm = "wls")
wtd.cors(d_mean_item_correct_by_dist[-1]) %>% round(2)
##      gi2  gi3  gi4  gi5
## gi2 1.00 0.49 0.26 0.47
## gi3 0.49 1.00 0.15 0.29
## gi4 0.26 0.15 1.00 0.36
## gi5 0.47 0.29 0.36 1.00
wtd.cors(d_mean_item_correct_by_dist[-1], weight = dist_ca_n$n %>% sqrt) %>% round(2)
##      gi2  gi3  gi4  gi5
## gi2 1.00 0.52 0.25 0.50
## gi3 0.52 1.00 0.16 0.32
## gi4 0.25 0.16 1.00 0.37
## gi5 0.50 0.32 0.37 1.00
# score S -----------------------------------------------------------------
#get data
dist_s_indi = aggregate_s_by_unit(unit = "district")

#impute
dist_s_indi = VIM::irmi(dist_s_indi, noise = F)
## No missings in x. Nothing to impute
## Warning in kNN_work(as.data.table(data), variable, metric, k, dist_var, :
## Nothing to impute, because no NA are present (also after using makeNA)
#factor analyze
dist_s_fa = fa(dist_s_indi[-1], fm = "wls", weight = dist_ca_n$n %>% sqrt)


# demo --------------------------------------------------------------------

dist_demo = aggregate_demo_by_unit("district")

# analysis ----------------------------------------------------------------

d_dist = data_frame(n = dist_ca_n$n,
                    S = dist_s_fa$scores %>% as.vector,
                    CA = dist_ca_fa$scores %>% as.vector) %>% 
  cbind(dist_demo[-1])
nrow(d_dist)
## [1] 437
#cors, weighted by sqrt(n) as a compromise between equal weighting and full sample-size weighting
wtd.cors(d_dist[-1], weight = d_dist$n %>% sqrt) %>% write_clipboard()
##                    S   CA European SIRE White Skin brightness
## S               1.00 0.52          0.28  0.32            0.43
## CA              0.52 1.00          0.14  0.13            0.28
## European SIRE   0.28 0.14          1.00  0.89            0.44
## White           0.32 0.13          0.89  1.00            0.48
## Skin brightness 0.43 0.28          0.44  0.48            1.00
#plot
GG_scatter(d_dist, "CA", "S", weights = d_dist$n %>% sqrt, case_names = F)

ggsave("figures/district/CA_S.png")
## Saving 7 x 5 in image
GG_scatter(d_dist, "European_SIRE", "CA", weights = d_dist$n %>% sqrt, case_names = F)

ggsave("figures/district/Euro_CA.png")
## Saving 7 x 5 in image
GG_scatter(d_dist, "European_SIRE", "S", weights = d_dist$n %>% sqrt, case_names = F)

ggsave("figures/district/Euro_S.png")
## Saving 7 x 5 in image

Municipal-level analyses

Fourth, we aggregate the data to the municipal level and analyze it there.

### MUNICIPALITIES
#aggregate data at the municipality level and correlate


# score ca ----------------------------------------------------------------
#by year + unit
d_mean_item_correct_by_year_muni = aggregate_ca_by_year_unit("municipality", NA_to_F = T)

#by unit
d_mean_item_correct_by_muni = aggregate_ca_by_unit(d_mean_item_correct_by_year_muni, "municipality")

#samples
muni_ca_n = aggregate_n_by_unit(d_mean_item_correct_by_year_muni, unit = "municipality")
muni_ca_n$n %>% averages()
## arithmetic  geometric   harmonic       mode     median    trimmed 
##       10.7        7.9        5.9        6.0        8.5        8.2 
##   midrange 
##       56.1
#impute missing
d_mean_item_correct_by_muni = VIM::irmi(d_mean_item_correct_by_muni[-1], noise = F)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## (this pair of warnings was repeated 3 times)
#factor analyze
muni_ca_fa = fa(d_mean_item_correct_by_muni, scores = "Bartlett", weight = muni_ca_n$n %>% sqrt, fm = "wls")
## The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
wtd.cors(d_mean_item_correct_by_muni)
##       gi1  gi2  gi3  gi4 gix4  gi5
## gi1  1.00 0.48 0.31 0.38 0.82 0.68
## gi2  0.48 1.00 0.55 0.13 0.66 0.66
## gi3  0.31 0.55 1.00 0.32 0.67 0.45
## gi4  0.38 0.13 0.32 1.00 0.45 0.39
## gix4 0.82 0.66 0.67 0.45 1.00 0.74
## gi5  0.68 0.66 0.45 0.39 0.74 1.00
wtd.cors(d_mean_item_correct_by_muni, weight = muni_ca_n$n %>% sqrt)
##       gi1   gi2  gi3   gi4 gix4  gi5
## gi1  1.00 0.436 0.31 0.390 0.80 0.65
## gi2  0.44 1.000 0.58 0.097 0.64 0.66
## gi3  0.31 0.581 1.00 0.277 0.68 0.48
## gi4  0.39 0.097 0.28 1.000 0.44 0.38
## gix4 0.80 0.643 0.68 0.436 1.00 0.72
## gi5  0.65 0.660 0.48 0.384 0.72 1.00
# score S -----------------------------------------------------------------
#get data
muni_s_indi = aggregate_s_by_unit(unit = "municipality")

#impute
muni_s_indi = VIM::irmi(muni_s_indi, noise = F)
## No missings in x. Nothing to impute
## Warning in kNN_work(as.data.table(data), variable, metric, k, dist_var, :
## Nothing to impute, because no NA are present (also after using makeNA)
#factor analyze
muni_s_fa = fa(muni_s_indi[-1], fm = "wls", weight = muni_ca_n$n %>% sqrt)


# demo --------------------------------------------------------------------

muni_demo = aggregate_demo_by_unit("municipality")

# analysis ----------------------------------------------------------------

d_muni = data_frame(n = muni_ca_n$n,
                    S = muni_s_fa$scores %>% as.vector,
                    CA = muni_ca_fa$scores %>% as.vector
                    ) %>% cbind(muni_demo[-1])

wtd.cors(d_muni[-1], weight = d_muni$n %>% sqrt) %>% write_clipboard()
##                    S   CA European SIRE White Skin brightness
## S               1.00 0.66          0.33  0.33            0.53
## CA              0.66 1.00          0.20  0.19            0.46
## European SIRE   0.33 0.20          1.00  0.90            0.45
## White           0.33 0.19          0.90  1.00            0.46
## Skin brightness 0.53 0.46          0.45  0.46            1.00
#plot
GG_scatter(d_muni, "CA", "S", weights = d_muni$n %>% sqrt, case_names = F)

ggsave("figures/municipal/CA_S.png")
## Saving 7 x 5 in image
GG_scatter(d_muni, "European_SIRE", "CA", weights = d_muni$n %>% sqrt, case_names = F)

ggsave("figures/municipal/Euro_CA.png")
## Saving 7 x 5 in image
GG_scatter(d_muni, "European_SIRE", "S", weights = d_muni$n %>% sqrt, case_names = F)

ggsave("figures/municipal/Euro_S.png")
## Saving 7 x 5 in image

Aggregate to the province level

Fifth, we aggregate the data to the province level.

### Province level analysis using the LAPOP data
#aggregate the individual-level LAPOP data to provinces


# insert data into main lapop dataset -------------------------------------
#here we gather the variables from the other analysis into the main lapop dataset
d_lapop$S = l_fa_lapop_s$Pearson$scores %>% as.vector

#best estimates
d_lapop$CA = irt_score_fill_na_scores$theta1_std

#best reliability
lapop_ca_rel = irt_score_fill_na_rel



# aggregate individual level scores to province level ----------------------------------------------------

d_lapop_province = ddply(d_lapop, .variables = "province_abbrev", .fun = function(x) {
  c("S_lapop" = wtd_mean(x$S),
    "CA_lapop" = wtd_mean(x$CA),
    "n_S" = count_NA(x$S, reverse = T),
    "n_CA" = count_NA(x$CA, reverse = T),
    "n_mean" = wtd_mean(count_NA(x$S, reverse = T), count_NA(x$CA, reverse = T))
    )
})

#quick calc
wtd.cors(d_lapop_province[c("S_lapop", "CA_lapop")], weight = d_lapop_province$n_mean %>% sqrt) %>% round(2)
##          S_lapop CA_lapop
## S_lapop     1.00     0.75
## CA_lapop    0.75     1.00
wtd.cors(d_lapop_province[c("S_lapop", "CA_lapop")]) %>% round(2)
##          S_lapop CA_lapop
## S_lapop     1.00     0.63
## CA_lapop    0.63     1.00
#join to main
d_main = dplyr::full_join(d_main, d_lapop_province, by = c("Province" = "province_abbrev"))


# scores by year and province ---------------------------------------------

d_lapop_province_year = ddply(d_lapop, .variables = c("province_abbrev", "year"), .fun = function(x) {
  c("S_lapop" = wtd_mean(x$S),
    "CA_lapop" = wtd_mean(x$CA),
    "n_S" = count_NA(x$S, reverse = T),
    "n_CA" = count_NA(x$CA, reverse = T),
    "n_mean" = wtd_mean(count_NA(x$S, reverse = T), count_NA(x$CA, reverse = T))
  )
})

ggplot(d_lapop_province_year, aes(CA_lapop, S_lapop, group = year, color = year)) + geom_point() + geom_smooth(method=lm, se=F)

daply(d_lapop_province_year, .variables = "year", .fun = function(x) {
  c(unwtd = wtd.cors(x$CA_lapop, x$S_lapop),
    wtd = wtd.cors(x$CA_lapop, x$S_lapop, x$n_mean)
    )
  
}) %>% round(2)
##       
## year   unwtd  wtd
##   2008  0.62 0.74
##   2010  0.46 0.44
##   2012  0.47 0.66
##   2014  0.77 0.83
# demographic -------------------------------------------------------------

prov_demo = aggregate_demo_by_unit("province_abbrev")

d_main = dplyr::full_join(d_main, prov_demo, by = "province_abbrev")
## Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector

Compare S at multiple levels

Sixth, we compare the S factor at the four levels of analysis.

### Compare factor structure at different levels using LAPOP data
#using the aggregate-level indicators

# aggregate-level ----------------------------------------------------------
#average indicators by PU and then FA
#compare with individual-level analysis

d_lapop_s_prov = aggregate_s_by_unit("province_abbrev")

#analyze
lapop_s_aggr = list(individual = l_fa_lapop_s$Pearson,
                    province = fa(d_lapop_s_prov[-1]),
                    municipality = muni_s_fa,
                    district = dist_s_fa
)

fa_plot_loadings(lapop_s_aggr)

ggsave("figures/province/S_lapop_loadings.png")
## Saving 7 x 5 in image
fa_congruence_matrix(lapop_s_aggr) %>% MAT_half() %>% averages
## arithmetic  geometric   harmonic       mode     median    trimmed 
##       0.96       0.96       0.96       0.98       0.97       0.96 
##   midrange 
##       0.96
#merge scores to main
tmp = data_frame(S_lapop_aggr = lapop_s_aggr$province$scores %>% as.vector,
                 Province = d_lapop_s_prov$province)

d_main = full_join(d_main, tmp, by = c("Province" = "Province"))
## Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector

Province-level analyses

Here we combine all the data at the province level.

Cognitive data

First we clean the data.

### CLEAN CA DATA

#these data are comma-formatted and sometimes the comma is missing
#fix this to be dot-formatted, numeric, and free of errors

v_CA_vars = c("Low", "Medium", "High")

# remove commas -----------------------------------------------------------

d_CA$Low %<>% str_replace(pattern = ",", replacement = "")
d_CA$Medium %<>% str_replace(pattern = ",", replacement = "")
d_CA$High %<>% str_replace(pattern = ",", replacement = "")

#check lengths
#if too short, it's because there were omitted 0's
m_length = d_CA[v_CA_vars] %>% sapply(FUN = str_length) %>% is_less_than(3)
#fill in NAs with Fs
m_length[is.na(m_length)] = F

table(m_length)
## m_length
## FALSE  TRUE 
##  2679   201
# pad 0's -----------------------------------------------------------------
#which values need padding?

d_CA[v_CA_vars][m_length]
##   [1] "99" "68" "53" "64" "97" "69" "46" "51" "58" "89" "97" "53" "58" "65"
##  [15] "28" "48" "64" "35" "57" "76" "35" "83" "45" "51" "72" "63" "98" "61"
##  [29] "89" "60" "83" "95" "71" "91" "95" "77" "96" "86" "37" "46" "30" "14"
##  [43] "54" "52" "34" "38" "65" "31" "65" "96" "29" "60" "99" "40" "48" "28"
##  [57] "18" "62" "50" "56" "40" "58" "25" "58" "92" "32" "57" "87" "58" "71"
##  [71] "47" "51" "65" "92" "32" "88" "32" "43" "80" "99" "72" "90" "80" "81"
##  [85] "80" "64" "85" "54" "46" "70" "66" "73" "56" "88" "42" "69" "90" "30"
##  [99] "72" "82" "80" "61" "84" "46" "50" "68" "68" "53" "99" "74" "37" "69"
## [113] "82" "41" "97" "79" "86" "96" "43" "48" "36" "33" "53" "66" "40" "63"
## [127] "89" "48" "65" "76" "29" "97" "62" "60" "86" "62" "90" "71" "90" "97"
## [141] "91" "94" "88" "69" "43" "41" "33" "61" "45" "49" "33" "38" "64" "32"
## [155] "56" "77" "24" "45" "61" "95" "94" "62" "85" "70" "65" "56" "91" "77"
## [169] "80" "66" "71" "98" "49" "89" "99" "54" "63" "83" "75" "61" "72" "59"
## [183] "77" "63" "97" "47" "97" "86" "96" "78" "85" "81" "99" "99" "67" "68"
## [197] "94" "95" "89" "91" "91"
d_CA[v_CA_vars][m_length] = d_CA[v_CA_vars][m_length] %>% str_pad(width = 3, side = "left", pad = "0")


# insert dots, change to numeric -------------------------------------------------------------
#then we insert the dots at the 3rd position and change to numeric
dot_inserter = function(str) (str_sub(str, start = 1, end = 2) + "." + str_sub(str, start = 3, end = 3)) %>% as.numeric()

d_CA$Low %<>% sapply(dot_inserter)
d_CA$Medium %<>% sapply(dot_inserter)
d_CA$High %<>% sapply(dot_inserter)
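#quick check of the conversion (relies on the kirkegaard string `+` operator):
dot_inserter("466") #46.6
dot_inserter("099") #9.9, a padded short value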


# verify totals -----------------------------------------------------------
#these should be 100 or close to it (rounding error)

d_CA$Total = aaply(.data = d_CA[v_CA_vars], .margins = 1, .fun = sum, .expand = F)
table(d_CA$Total)
## 
##  99.9   100 100.1 
##    46   743   159
#throw an error loudly
if (!d_CA$Total %>% is_between(a = 99.8, b = 100.2) %>% all(na.rm=T)) stop("ERROR IN THE DATA!")

#get rid of totals again
d_CA$Total = NULL

# deal with aggregate units and names -------------------------------------
d_CA = d_CA[!d_CA$Name %>% str_detect(pattern = "Total|Region"), ]

#translate to abbreviations
d_CA$abrev = pu_translate(d_CA$Name, superunit = "ARG", messages = F)

#crude check for translation
if (!table(d_CA$abrev) %>% all_elements_the_same()) stop("There was an unequal number of provinces! Probably this means there is an abbreviation error.")

Then we score the data.

### CALCULATE COGNITIVE ABILITY BY UNIT
#This code calculates the cognitive ability using the ONE data


# descriptive -------------------------------------------------------------

table(d_CA$Year)
## 
## 2005 2007 2010 
##  192  288  288
table(d_CA$Grade)
## 
##   6   8   9  12 
## 288 192  96 192
table(d_CA$Test)
## 
## Language     Math  Natural   Social 
##      192      192      192      192
# functions ---------------------------------------------------------------


make_combinations = function(i) {
  # lst = c(lapply(1:i, FUN = function(x) seq(from = 1, to = x))[-i],
  #         lapply(i:1, FUN = function(x) seq(from = i, to = x))[-i])
  
  #note that the mean of columns 1:2 carries the same information as column 3 alone
  #(the proportions sum to 100), so only one direction of combinations is needed
  lst = lapply(1:i, FUN = function(x) seq(from = 1, to = x))[-i]
  
  lst
}
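
#illustration: with the three proportion columns this yields the cumulative
#column sets 1 and 1:2; the full set 1:3 is dropped because the proportions sum
#to 100, so its row means are constant and cannot be standardized
make_combinations(3)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 1 2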

convert_proportions_to_z = function(data) {
  
  #investigate unusual Neuquen score
  # if(anyNA(data)) browser()
  
  #make the column combinations needed
  l_combs = make_combinations(ncol(data))
  
  #make the scores
  d_scores = lapply(l_combs, FUN = function(cols) {
    # browser()
    #get mean by row
    v = data[cols] %>% rowMeans(na.rm = T)
  }) %>% as.data.frame()
  
  #standardize
  d_scores_z = df_standardize(d_scores)
  
  #mean by row, then standardize
  v_scores_z = rowMeans(d_scores_z, na.rm = T) %>% 
    standardize()
  
  v_scores_z
}
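
#toy check (assumes standardize/df_standardize are the kirkegaard z-scoring
#helpers): rows with more students in the Low band get higher z scores, which
#is why the final scores are reversed further below
toy = data.frame(Low = c(60, 40, 20), Medium = c(30, 40, 40), High = c(10, 20, 40))
convert_proportions_to_z(toy) #positive for row 1, negative for row 3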



# cognitive ability -------------------------------------------------------

#calculate standard scores
d_CA_z_t = ddply(d_CA, .variables = c("Year", "Grade", "Test"), .fun = function(block) {
  #skip if empty
  if (nrow(block) == 0) return(NULL)
  
  #to z form
  convert_proportions_to_z(block[v_CA_vars])
})

#subset data, remove other vars
d_CA_z = d_CA_z_t[-(1:3)] %>% df_t()

wtd.cors(d_CA_z) %>% round(2)
##       1    2    3    4    5    6    7    8    9   10   11   12   13   14
## 1  1.00 0.96 0.84 0.94 0.85 0.78 0.69 0.79 0.85 0.86 0.70 0.73 0.77 0.79
## 2  0.96 1.00 0.84 0.93 0.84 0.80 0.70 0.78 0.88 0.89 0.73 0.72 0.76 0.79
## 3  0.84 0.84 1.00 0.93 0.73 0.73 0.68 0.71 0.76 0.76 0.77 0.81 0.63 0.72
## 4  0.94 0.93 0.93 1.00 0.79 0.75 0.65 0.73 0.83 0.81 0.77 0.80 0.69 0.76
## 5  0.85 0.84 0.73 0.79 1.00 0.95 0.90 0.95 0.86 0.85 0.72 0.63 0.87 0.86
## 6  0.78 0.80 0.73 0.75 0.95 1.00 0.94 0.91 0.86 0.85 0.77 0.66 0.86 0.91
## 7  0.69 0.70 0.68 0.65 0.90 0.94 1.00 0.91 0.80 0.79 0.70 0.61 0.88 0.90
## 8  0.79 0.78 0.71 0.73 0.95 0.91 0.91 1.00 0.84 0.83 0.71 0.59 0.85 0.84
## 9  0.85 0.88 0.76 0.83 0.86 0.86 0.80 0.84 1.00 0.97 0.87 0.77 0.85 0.86
## 10 0.86 0.89 0.76 0.81 0.85 0.85 0.79 0.83 0.97 1.00 0.87 0.75 0.84 0.87
## 11 0.70 0.73 0.77 0.77 0.72 0.77 0.70 0.71 0.87 0.87 1.00 0.80 0.69 0.76
## 12 0.73 0.72 0.81 0.80 0.63 0.66 0.61 0.59 0.77 0.75 0.80 1.00 0.52 0.65
## 13 0.77 0.76 0.63 0.69 0.87 0.86 0.88 0.85 0.85 0.84 0.69 0.52 1.00 0.89
## 14 0.79 0.79 0.72 0.76 0.86 0.91 0.90 0.84 0.86 0.87 0.76 0.65 0.89 1.00
## 15 0.79 0.73 0.68 0.73 0.89 0.88 0.87 0.85 0.82 0.83 0.72 0.62 0.90 0.91
## 16 0.85 0.78 0.66 0.75 0.87 0.80 0.78 0.86 0.85 0.84 0.72 0.60 0.91 0.82
## 17 0.74 0.71 0.60 0.65 0.86 0.87 0.89 0.87 0.82 0.81 0.66 0.52 0.94 0.93
## 18 0.79 0.78 0.70 0.73 0.90 0.95 0.93 0.87 0.87 0.88 0.76 0.64 0.91 0.97
## 19 0.77 0.73 0.59 0.67 0.87 0.88 0.88 0.83 0.87 0.88 0.74 0.59 0.92 0.92
## 20 0.80 0.76 0.59 0.70 0.91 0.89 0.90 0.90 0.87 0.87 0.72 0.57 0.93 0.92
## 21 0.82 0.85 0.67 0.78 0.83 0.83 0.75 0.81 0.94 0.90 0.80 0.69 0.85 0.81
## 22 0.82 0.87 0.74 0.80 0.86 0.87 0.83 0.88 0.95 0.94 0.85 0.69 0.86 0.88
## 23 0.76 0.77 0.80 0.81 0.80 0.79 0.76 0.78 0.89 0.85 0.89 0.73 0.79 0.77
## 24 0.84 0.84 0.77 0.85 0.83 0.80 0.73 0.84 0.91 0.87 0.86 0.68 0.81 0.79
## 25 0.85 0.87 0.62 0.75 0.90 0.87 0.83 0.88 0.91 0.91 0.68 0.60 0.88 0.88
## 26 0.80 0.85 0.65 0.75 0.90 0.91 0.86 0.87 0.90 0.91 0.73 0.61 0.87 0.90
## 27 0.80 0.79 0.58 0.71 0.89 0.84 0.86 0.88 0.88 0.86 0.66 0.57 0.88 0.88
## 28 0.87 0.86 0.70 0.81 0.92 0.88 0.85 0.90 0.92 0.89 0.78 0.66 0.91 0.89
## 29 0.87 0.86 0.63 0.75 0.92 0.88 0.86 0.89 0.91 0.91 0.71 0.62 0.90 0.90
## 30 0.84 0.86 0.66 0.75 0.92 0.93 0.90 0.89 0.91 0.91 0.73 0.64 0.90 0.93
## 31 0.82 0.81 0.67 0.75 0.92 0.90 0.89 0.88 0.91 0.91 0.78 0.67 0.93 0.93
## 32 0.82 0.80 0.62 0.72 0.91 0.89 0.89 0.91 0.89 0.88 0.74 0.63 0.93 0.91
##      15   16   17   18   19   20   21   22   23   24   25   26   27   28
## 1  0.79 0.85 0.74 0.79 0.77 0.80 0.82 0.82 0.76 0.84 0.85 0.80 0.80 0.87
## 2  0.73 0.78 0.71 0.78 0.73 0.76 0.85 0.87 0.77 0.84 0.87 0.85 0.79 0.86
## 3  0.68 0.66 0.60 0.70 0.59 0.59 0.67 0.74 0.80 0.77 0.62 0.65 0.58 0.70
## 4  0.73 0.75 0.65 0.73 0.67 0.70 0.78 0.80 0.81 0.85 0.75 0.75 0.71 0.81
## 5  0.89 0.87 0.86 0.90 0.87 0.91 0.83 0.86 0.80 0.83 0.90 0.90 0.89 0.92
## 6  0.88 0.80 0.87 0.95 0.88 0.89 0.83 0.87 0.79 0.80 0.87 0.91 0.84 0.88
## 7  0.87 0.78 0.89 0.93 0.88 0.90 0.75 0.83 0.76 0.73 0.83 0.86 0.86 0.85
## 8  0.85 0.86 0.87 0.87 0.83 0.90 0.81 0.88 0.78 0.84 0.88 0.87 0.88 0.90
## 9  0.82 0.85 0.82 0.87 0.87 0.87 0.94 0.95 0.89 0.91 0.91 0.90 0.88 0.92
## 10 0.83 0.84 0.81 0.88 0.88 0.87 0.90 0.94 0.85 0.87 0.91 0.91 0.86 0.89
## 11 0.72 0.72 0.66 0.76 0.74 0.72 0.80 0.85 0.89 0.86 0.68 0.73 0.66 0.78
## 12 0.62 0.60 0.52 0.64 0.59 0.57 0.69 0.69 0.73 0.68 0.60 0.61 0.57 0.66
## 13 0.90 0.91 0.94 0.91 0.92 0.93 0.85 0.86 0.79 0.81 0.88 0.87 0.88 0.91
## 14 0.91 0.82 0.93 0.97 0.92 0.92 0.81 0.88 0.77 0.79 0.88 0.90 0.88 0.89
## 15 1.00 0.91 0.92 0.92 0.91 0.93 0.80 0.82 0.80 0.79 0.85 0.84 0.85 0.90
## 16 0.91 1.00 0.89 0.85 0.88 0.92 0.86 0.84 0.79 0.84 0.87 0.81 0.83 0.92
## 17 0.92 0.89 1.00 0.96 0.95 0.96 0.82 0.85 0.74 0.78 0.88 0.86 0.90 0.90
## 18 0.92 0.85 0.96 1.00 0.96 0.94 0.85 0.88 0.78 0.81 0.87 0.90 0.88 0.89
## 19 0.91 0.88 0.95 0.96 1.00 0.96 0.84 0.85 0.76 0.80 0.88 0.87 0.88 0.88
## 20 0.93 0.92 0.96 0.94 0.96 1.00 0.86 0.88 0.77 0.83 0.93 0.91 0.94 0.93
## 21 0.80 0.86 0.82 0.85 0.84 0.86 1.00 0.95 0.89 0.94 0.91 0.90 0.88 0.93
## 22 0.82 0.84 0.85 0.88 0.85 0.88 0.95 1.00 0.91 0.95 0.92 0.94 0.91 0.94
## 23 0.80 0.79 0.74 0.78 0.76 0.77 0.89 0.91 1.00 0.94 0.77 0.80 0.78 0.87
## 24 0.79 0.84 0.78 0.81 0.80 0.83 0.94 0.95 0.94 1.00 0.84 0.86 0.84 0.91
## 25 0.85 0.87 0.88 0.87 0.88 0.93 0.91 0.92 0.77 0.84 1.00 0.97 0.96 0.95
## 26 0.84 0.81 0.86 0.90 0.87 0.91 0.90 0.94 0.80 0.86 0.97 1.00 0.94 0.93
## 27 0.85 0.83 0.90 0.88 0.88 0.94 0.88 0.91 0.78 0.84 0.96 0.94 1.00 0.95
## 28 0.90 0.92 0.90 0.89 0.88 0.93 0.93 0.94 0.87 0.91 0.95 0.93 0.95 1.00
## 29 0.87 0.89 0.92 0.91 0.92 0.96 0.90 0.92 0.77 0.85 0.98 0.95 0.96 0.95
## 30 0.86 0.85 0.92 0.95 0.94 0.95 0.90 0.92 0.78 0.84 0.95 0.96 0.94 0.92
## 31 0.90 0.89 0.93 0.95 0.97 0.97 0.87 0.90 0.81 0.85 0.92 0.91 0.92 0.92
## 32 0.88 0.91 0.94 0.93 0.95 0.98 0.88 0.90 0.77 0.84 0.94 0.91 0.94 0.94
##      29   30   31   32
## 1  0.87 0.84 0.82 0.82
## 2  0.86 0.86 0.81 0.80
## 3  0.63 0.66 0.67 0.62
## 4  0.75 0.75 0.75 0.72
## 5  0.92 0.92 0.92 0.91
## 6  0.88 0.93 0.90 0.89
## 7  0.86 0.90 0.89 0.89
## 8  0.89 0.89 0.88 0.91
## 9  0.91 0.91 0.91 0.89
## 10 0.91 0.91 0.91 0.88
## 11 0.71 0.73 0.78 0.74
## 12 0.62 0.64 0.67 0.63
## 13 0.90 0.90 0.93 0.93
## 14 0.90 0.93 0.93 0.91
## 15 0.87 0.86 0.90 0.88
## 16 0.89 0.85 0.89 0.91
## 17 0.92 0.92 0.93 0.94
## 18 0.91 0.95 0.95 0.93
## 19 0.92 0.94 0.97 0.95
## 20 0.96 0.95 0.97 0.98
## 21 0.90 0.90 0.87 0.88
## 22 0.92 0.92 0.90 0.90
## 23 0.77 0.78 0.81 0.77
## 24 0.85 0.84 0.85 0.84
## 25 0.98 0.95 0.92 0.94
## 26 0.95 0.96 0.91 0.91
## 27 0.96 0.94 0.92 0.94
## 28 0.95 0.92 0.92 0.94
## 29 1.00 0.98 0.96 0.98
## 30 0.98 1.00 0.97 0.97
## 31 0.96 0.97 1.00 0.98
## 32 0.98 0.97 0.98 1.00
wtd.cors(d_CA_z) %>% MAT_half() %>% psych::describe()
##    vars   n mean   sd median trimmed  mad  min  max range  skew kurtosis
## X1    1 496 0.84 0.09   0.86    0.85 0.08 0.52 0.98  0.46 -0.92     0.34
##    se
## X1  0
silence(psych::alpha(d_CA_z))
## 
## Reliability analysis   
## Call: psych::alpha(x = d_CA_z)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N
##       0.99      0.99       1      0.85 175
## 
##  Reliability if an item is dropped:
##    raw_alpha std.alpha G6(smc) average_r S/N
## 1       0.99      0.99       1      0.85 171
## 2       0.99      0.99       1      0.85 171
## 3       0.99      0.99       1      0.85 181
## 4       0.99      0.99       1      0.85 175
## 5       0.99      0.99       1      0.84 167
## 6       0.99      0.99       1      0.84 167
## 7       0.99      0.99       1      0.85 170
## 8       0.99      0.99       1      0.84 168
## 9       0.99      0.99       1      0.84 166
## 10      0.99      0.99       1      0.84 167
## 11      0.99      0.99       1      0.85 177
## 12      0.99      0.99       1      0.86 186
## 13      0.99      0.99       1      0.84 169
## 14      0.99      0.99       1      0.84 168
## 15      0.99      0.99       1      0.84 168
## 16      0.99      0.99       1      0.84 169
## 17      0.99      0.99       1      0.85 169
## 18      0.99      0.99       1      0.84 167
## 19      0.99      0.99       1      0.84 167
## 20      0.99      0.99       1      0.84 165
## 21      0.99      0.99       1      0.84 168
## 22      0.99      0.99       1      0.84 166
## 23      0.99      0.99       1      0.85 172
## 24      0.99      0.99       1      0.85 170
## 25      0.99      0.99       1      0.84 167
## 26      0.99      0.99       1      0.84 167
## 27      0.99      0.99       1      0.84 168
## 28      0.99      0.99       1      0.84 165
## 29      0.99      0.99       1      0.84 165
## 30      0.99      0.99       1      0.84 165
## 31      0.99      0.99       1      0.84 165
## 32      0.99      0.99       1      0.84 166
## 
##  Item statistics 
##       r r.cor r.drop
## 1  0.90  0.90   0.89
## 2  0.90  0.90   0.89
## 3  0.78  0.78   0.77
## 4  0.85  0.85   0.84
## 5  0.95  0.95   0.95
## 6  0.94  0.94   0.94
## 7  0.91  0.91   0.90
## 8  0.93  0.93   0.93
## 9  0.96  0.96   0.95
## 10 0.95  0.95   0.94
## 11 0.83  0.83   0.82
## 12 0.73  0.73   0.71
## 13 0.93  0.93   0.92
## 14 0.94  0.94   0.94
## 15 0.94  0.94   0.93
## 16 0.93  0.93   0.92
## 17 0.92  0.92   0.92
## 18 0.95  0.95   0.95
## 19 0.94  0.94   0.94
## 20 0.97  0.97   0.97
## 21 0.93  0.93   0.92
## 22 0.96  0.96   0.96
## 23 0.88  0.88   0.88
## 24 0.92  0.92   0.91
## 25 0.95  0.95   0.95
## 26 0.95  0.95   0.95
## 27 0.94  0.94   0.93
## 28 0.97  0.97   0.97
## 29 0.97  0.97   0.96
## 30 0.97  0.97   0.97
## 31 0.97  0.97   0.97
## 32 0.96  0.96   0.96
#one overall estimate for each province, then reverse the scale
v_CA = rowMeans(d_CA_z, na.rm = T) %>% standardize() %>% multiply_by(-1)

#as df
d_CA_final = data.frame(CA = v_CA,
                        Province = d_CA$abrev[1:length(v_CA)],
                        stringsAsFactors = F)

#check for duplicates
if (any(duplicated(d_CA_final$Province))) stop("There were duplicate provinces! Must be an error somewhere.")

#join to main using ISO
d_main = dplyr::full_join(d_main, d_CA_final, by = c("Province" = "Province"))
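#sanity check (a sketch, not part of the original analysis): since the
#composite is the reversed mean of the z-scored items, it should correlate
#negatively with the individual items
cor(v_CA, d_CA_z[[1]], use = "pairwise")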

S data

Next, we analyze the S data.

### S FACTOR ANALYSIS
#province level S factor analysis based on Knoema and combined datasets


# subset variables --------------------------------------------------------

v_S_vars = c(
  "Adolescent.fertility.rate",
  "Fertility.rate",
  "crime.rate",
  "Dwellings.deficient",
  "Total.households.with.unsatisfied.basic.needs",
  "Employment.rate.Urban.areas",
  "Expendeture.on.science.and.technology",
  "Expenditure.on.research.and.development",
  "Homes.with.running.water",
  "illiteracy.rate.of.the.total.population.age.10.and.older",
  "of.population.65.years.and.over.illiterate",
  "Infant.mortality.rate",
  "Life.expectancy.at.1.year.of.age",
  "Life.expectancy.at.15.year.of.age",
  "Life.expectancy.at.birth",
  "Maternal.mortality.rates",
  "of.the.population.5.years.old.attending.and.education.institution",
  "of.population.12.17.years.of.age.attending.an.educational.institution",
  "of.population.15.years.and.over.with.incipient.or.no.education",
  "students.repeating.by.level.of.education.secondary.basic.cycle"
)
d_S = d_main[v_S_vars]


# redundancy --------------------------------------------------------------

d_S_orig = d_S
d_S = remove_redundant_vars(d_S)
## The following variable pairs had stronger intercorrelations than |0.9|:
##                                      Var1
## 147 Expendeture.on.science.and.technology
## 294     Life.expectancy.at.15.year.of.age
##                                        Var2    r
## 147 Expenditure.on.research.and.development 1.00
## 294                Life.expectancy.at.birth 0.98
## The following variables were excluded:
## Expenditure.on.research.and.development, Life.expectancy.at.birth
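#for transparency, the flagged pairs can also be found with base tools --
#a minimal sketch of what remove_redundant_vars does, assuming a |r| > .90 rule:
m_cors = cor(d_S_orig, use = "pairwise")
which(abs(m_cors) > .90 & upper.tri(m_cors), arr.ind = T)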
# factor analyze ----------------------------------------------------------

#method variance
fa_method_var = fa_all_methods(d_S, messages = F)
wtd.cors(fa_method_var$scores) %>% MAT_half() %>% psych::describe()
##    vars   n mean   sd median trimmed  mad  min max range  skew kurtosis se
## X1    1 276 0.99 0.01   0.99    0.99 0.01 0.97   1  0.03 -0.88     -0.2  0
#factor analyze
l_fa = list("standard" = fa(d_S, scores = "Bartlett"),
            "rank" = fa(df_rank(d_S), scores = "Bartlett"),
            "standard_wtd" = fa(d_S, fm="wls", weight = d_main$pop %>% sqrt, scores = "Bartlett"),
            "rank_wtd" = fa(df_rank(d_S), fm="wls", weight = d_main$pop %>% sqrt, scores = "Bartlett"))

#factor similarity
fa_congruence_matrix(l_fa)
##       MR1  MR1 WLS1 WLS1
## MR1  1.00 0.98 1.00 0.99
## MR1  0.98 1.00 0.97 0.97
## WLS1 1.00 0.97 1.00 0.99
## WLS1 0.99 0.97 0.99 1.00
fa_congruence_matrix(l_fa) %>% MAT_half() %>% psych::describe()
##    vars n mean   sd median trimmed  mad  min max range skew kurtosis se
## X1    1 6 0.98 0.01   0.98    0.98 0.01 0.97   1  0.03 0.04     -1.9  0
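#for reference, the congruence coefficient for two loading vectors x, y is
#sum(x*y) / sqrt(sum(x^2) * sum(y^2)) (Tucker); psych has a standalone
#implementation, which fa_congruence_matrix presumably wraps:
psych::factor.congruence(l_fa$standard$loadings, l_fa$rank$loadings)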
#indicator sampling error
if (F) {
  set.seed(1)
  fa_ise = fa_splitsample_repeat(d_S, runs = 1000, messages = F)
  GG_denhist(fa_ise %>% unlist())
  GG_denhist(fa_ise %>% unlist() %>% abs())
  fa_ise %>% unlist %>% abs %>% psych::describe()
}
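#the split-sample block above is skipped by default (if (F)); flip it to
#if (T) to rerun the 1000-run indicator sampling error analysis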


#plot
fa_plot_loadings(l_fa, reverse_vector = c(-1, -1, -1, -1)) + scale_y_continuous(limits = c(-1, 1))

ggsave("figures/province/S_knoema_loadings.png")
## Saving 7 x 5 in image
#save scores
d_main$S_unwtd = l_fa$standard$scores %>% as.vector() %>% multiply_by(-1)
d_main$S_rank_unwtd = l_fa$rank$scores %>% as.vector() %>% multiply_by(-1)
d_main$S_wtd = l_fa$standard_wtd$scores %>% as.vector() %>% multiply_by(-1)
d_main$S_rank_wtd = l_fa$rank_wtd$scores %>% as.vector() %>% multiply_by(-1)


# combined Knoema + LAPOP data --------------------------------------------
#join
d_S2 = full_join(cbind(d_S_orig, province = d_knoema_averaged$Province %>% as.character()), d_lapop_s_prov,
                 by = c("province" = "province_abbrev"))
## Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
#set rownames, remove province col
rownames(d_S2) = d_S2$province
d_S2$province = NULL

#exclude overlap
d_S2 %<>% remove_redundant_vars()
## The following variable pairs had stronger intercorrelations than |0.9|:
##                                      Var1
## 259 Expendeture.on.science.and.technology
## 518     Life.expectancy.at.15.year.of.age
##                                        Var2    r
## 259 Expenditure.on.research.and.development 1.00
## 518                Life.expectancy.at.birth 0.98
## The following variables were excluded:
## Expenditure.on.research.and.development, Life.expectancy.at.birth
#impute missing
d_S2 = VIM::irmi(d_S2, noise = F)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## [the warning above was emitted 16 times in total; duplicates omitted]
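#the repeated rank-deficiency warnings are expected if there are more
#indicators than provinces (~24 rows), in which case irmi's internal
#regressions cannot be of full rank -- a quick check (our interpretation):
dim(d_S2)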
#analyze
fa_s_prov = fa(d_S2, scores = "Bartlett", weight = d_knoema_averaged$pop %>% sqrt)
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was
## done
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was
## done
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was
## done
#plot
fa_plot_loadings(fa_s_prov) + ylim(-1, 1)

ggsave("figures/province/S_comb_loadings.png")
## Saving 7 x 5 in image
#save scores
d_main$S_comb = fa_s_prov$scores %>% as.vector()
d_main$S = d_main$S_comb #best S

Main analyses

Now we come to the main analyses.

Does mining activity confound the results?

First, we examine whether the S analyses are robust to adjustment for mining activity.

### EXPLORE MINING DATA TO SEE IF IT CONFOUNDS ASSOCIATIONS


# mining data -------------------------------------------------------------
d_mining = d_main[c("Active_mines", "Mining_volume", "Mining_value", "Mining_employees")]

wtd.cors(d_mining)
##                  Active_mines Mining_volume Mining_value Mining_employees
## Active_mines             1.00          0.38         0.62             0.53
## Mining_volume            0.38          1.00         0.63             0.81
## Mining_value             0.62          0.63         1.00             0.83
## Mining_employees         0.53          0.81         0.83             1.00
# per capita --------------------------------------------------------------
d_mining_pc = d_mining / d_main$pop

wtd.cors(d_mining_pc)
##                  Active_mines Mining_volume Mining_value Mining_employees
## Active_mines             1.00          0.15         0.17             0.27
## Mining_volume            0.15          1.00         0.32             0.76
## Mining_value             0.17          0.32         1.00             0.80
## Mining_employees         0.27          0.76         0.80             1.00
fa_mining = fa(d_mining_pc)

d_main$mining = fa_mining$scores %>% as.vector



# residualize -------------------------------------------------------------------

wtd.cors(d_main[c(v_primary_vars, "mining")]) %>% round(2)
##                    CA     S European White Skin_brightness European_SIRE
## CA               1.00  0.87     0.44  0.49            0.65          0.50
## S                0.87  1.00     0.42  0.43            0.71          0.50
## European         0.44  0.42     1.00  0.72            0.22          0.69
## White            0.49  0.43     0.72  1.00            0.41          0.97
## Skin_brightness  0.65  0.71     0.22  0.41            1.00          0.48
## European_SIRE    0.50  0.50     0.69  0.97            0.48          1.00
## Latitude         0.48  0.64     0.12  0.25            0.56          0.38
## Temperature     -0.49 -0.61     0.08 -0.08           -0.57         -0.19
## mining          -0.10 -0.05    -0.38 -0.42            0.09         -0.35
##                 Latitude Temperature mining
## CA                  0.48       -0.49  -0.10
## S                   0.64       -0.61  -0.05
## European            0.12        0.08  -0.38
## White               0.25       -0.08  -0.42
## Skin_brightness     0.56       -0.57   0.09
## European_SIRE       0.38       -0.19  -0.35
## Latitude            1.00       -0.93   0.16
## Temperature        -0.93        1.00  -0.20
## mining              0.16       -0.20   1.00
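#mining shows at most weak zero-order relations to CA (-.10) and S (-.05),
#so strong confounding is unlikely; we verify below by residualizing the
#S indicators on mining and re-extracting the S factor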
d_S_mining = d_S2
d_S_mining$mining = d_main$mining

#name vars
#otherwise the residualization function fails
colnames(d_S_mining) %<>% make.names

d_S_mining = df_residualize(d_S_mining, resid.vars = "mining", return.resid.vars = F)

#set names back
colnames(d_S_mining) = colnames(d_S2)
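#what df_residualize does here, in essence (a minimal sketch; the real
#function also handles factors and missing-data bookkeeping): each
#indicator is replaced by its residuals from a regression on mining
d_resid_sketch = as.data.frame(lapply(d_S2, function(x) {
  resid(lm(x ~ d_main$mining, na.action = na.exclude))
}))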


# factor analyze ----------------------------------------------------------

mining_s_fa = list(standard = fa_s_prov, mining_controlled = fa(d_S_mining, scores = "Bartlett", weight = d_main$pop %>% sqrt))
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was
## done

## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was
## done
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was
## done
fa_plot_loadings(mining_s_fa)

fa_congruence_matrix(mining_s_fa)
##     MR1 MR1
## MR1   1   1
## MR1   1   1
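#the congruence coefficients are ~1.00: the S factor is essentially
#unchanged by the mining control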

Individual-level analyses

Second, we turn to the main individual-level analyses.

### INDIVIDUAL-LEVEL ANALYSES
#Here we analyze the relationship between CA, S, SIRE and skin color


# some initial plots ------------------------------------------------------

GG_group_means(d_lapop, var = "Skin_brightness", groupvar = "SIRE", type = "violin") +
  ylab("Skin brightness")
## Missing values were removed.

ggsave("figures/individual/Skin_brightness_by_SIRE.png")
## Saving 7 x 5 in image
#table
psych::describeBy(d_lapop$Skin_brightness, d_lapop$SIRE, mat = T) %>% 
  as_data_frame() %>% 
  dplyr::select(group1, n, mean, sd) %>% 
  write_clipboard()
##         group1       n mean   sd
## X11     Blanca 2845.00 7.93 1.26
## X12   Indígena   51.00 5.73 1.64
## X13    Mestiza 1246.00 6.73 1.48
## X14     Mulata    7.00 6.29 2.29
## X15      Negra   37.00 4.73 1.61
## X16 not stated  173.00 7.25 1.66
## X17       Otro   60.00 6.55 1.29
GG_scatter(d_lapop, "CA", "S", case_names = F) +
  xlab("Cognitive ability (political knowledge proxy)") +
  ylab("General socioeconomic status (15 indicators)")

ggplot(d_lapop, aes(CA, S, color = year)) +
  geom_point() +
  geom_smooth(method = lm) +
  xlab("Cognitive ability (political knowledge proxy)") +
  ylab("S, general socioeconomic factor (15 indicators)") +
  theme_bw()

ggsave("figures/individual/CA_S_year.png")
## Saving 7 x 5 in image
count.pairwise(d_lapop[c("CA", "S")])
##      CA    S
## CA 5920 5920
## S  5920 5920
# compare cognitive scoring methods ---------------------------------------
#we find out which scoring method is best by comparing their within-year correlations with S
d_ca_s_cors = ddply(d_lapop, .variables = "year", .fun = function(block) {
  cors = wtd.cors(block[c("CA", "S")]) %>% round(2)
  data_frame("CA_S" = cors[1, 2])
})

#add reliability
d_ca_s_cors$reliability = lapop_ca_rel
lapop_ca_rel
## 2008 2010 2012 2014 
## 0.67 0.69 0.43 0.82
#estimated true score cors
d_ca_s_cors$CA_S_true = aaply(d_ca_s_cors, .margins = 1, .fun = function(x) {
  x[["CA_S"]] / sqrt(x[["reliability"]])
}, .expand = F)

#output
d_ca_s_cors %>% write_clipboard()
##   year CA S reliability CA S true
## 1 2008 0.52        0.67      0.64
## 2 2010 0.44        0.69      0.53
## 3 2012 0.33        0.43      0.50
## 4 2014 0.48        0.82      0.53
mean(d_ca_s_cors$CA_S_true) %>% round(2)
## [1] 0.55
#pooled estimate
wtd.cors(d_lapop$S, d_lapop$CA)
##      [,1]
## [1,] 0.44
mean(lapop_ca_rel)
## [1] 0.65
wtd.cors(d_lapop$S, d_lapop$CA) / sqrt(mean(lapop_ca_rel))
##      [,1]
## [1,] 0.55
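#the correction for attenuation used above (Spearman, 1904):
#r_true = r_observed / sqrt(rel_x * rel_y), with S treated as error-free
disattenuate = function(r_xy, rel_x, rel_y = 1) r_xy / sqrt(rel_x * rel_y)
disattenuate(.44, .65) %>% round(2) #reproduces the pooled .55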
# raw correlations --------------------------------------------------------
#correct.cor puts the disattenuated correlations above the diagonal and the
#reliabilities on the diagonal; the raw correlations are below the diagonal
v_reliabilities = c(mean(lapop_ca_rel), 1, 1)

d_lapop[c("CA", "S", "Skin_brightness")] %>% wtd.cors() %>% correct.cor(v_reliabilities) %>% write_clipboard()
##                   CA    S Skin brightness
## CA              0.65 0.55            0.20
## S               0.44 1.00            0.26
## Skin brightness 0.16 0.26            1.00
# skin and CA/S within SIRE -------------------------------------------------

ggplot(d_lapop, aes(Skin_brightness, CA, color = SIRE)) +
  geom_point() +
  geom_smooth(method = lm, se = F)
## Warning: Removed 1501 rows containing non-finite values (stat_smooth).
## Warning: Removed 1501 rows containing missing values (geom_point).

#nothing too useful to be seen here

#correlations within SIREs with CIs
cors_SIRE_CI = ddply(d_lapop, .variables = "SIRE", .fun = function(dat) {
  cors = cor_matrix(dat[c("CA", "S", "Skin_brightness")], CI = .95, reliabilities = v_reliabilities)

  data.frame(SB_CA = cors[3, 1],
             SB_S = cors[3, 2],
             CA_S = cors[1, 2],
             n = count.pairwise(dat[c("CA", "S", "Skin_brightness")]) %>% MAT_half %>% max
               )
})
cors_SIRE_CI %>% write_clipboard()
##         SIRE              SB CA              SB S               CA S
## 1     Blanca   0.21 [0.17 0.24]  0.20 [0.17 0.24]   0.55 [0.53 0.57]
## 2   Indígena -0.09 [-0.36 0.19]  0.34 [0.07 0.56]   0.58 [0.41 0.71]
## 3    Mestiza   0.12 [0.06 0.17]  0.22 [0.17 0.27]   0.52 [0.48 0.55]
## 4     Mulata -0.29 [-0.86 0.59] 0.65 [-0.21 0.94] -0.21 [-0.65 0.34]
## 5      Negra  0.03 [-0.30 0.35] 0.09 [-0.24 0.41]   0.38 [0.12 0.59]
## 6 not stated   0.22 [0.08 0.36]  0.35 [0.21 0.47]   0.56 [0.47 0.65]
## 7       Otro  0.20 [-0.05 0.43]  0.49 [0.26 0.66]   0.47 [0.27 0.63]
##         n
## 1 3869.00
## 2   77.00
## 3 1596.00
## 4   15.00
## 5   54.00
## 6  236.00
## 7   73.00
cors_SIRE_CI %>% write_tsv("output/within_SIRE_cors.tsv")

#weighted mean
#recalculate without CIs, and get all pairwise n's
cors_SIRE = ddply(d_lapop, .variables = "SIRE", .fun = function(dat) {
  cors = cor_matrix(dat[c("CA", "S", "Skin_brightness")], reliabilities = v_reliabilities)
  cors_n = count.pairwise(dat[c("CA", "S", "Skin_brightness")])
  
  data.frame(SB_CA = cors[3, 1],
             SB_S = cors[3, 2],
             CA_S = cors[1, 2],
             n_SB_CA = cors_n[3, 1],
             n_SB_S = cors_n[3, 2],
             n_CA_S = cors_n[1, 2]
  ) 
})
wtd_mean(cors_SIRE$SB_CA, w = cors_SIRE$n_SB_CA)
## [1] 0.18
wtd_mean(cors_SIRE$SB_S, w = cors_SIRE$n_SB_S)
## [1] 0.22
wtd_mean(cors_SIRE$CA_S, w = cors_SIRE$n_CA_S)
## [1] 0.54
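#wtd_mean is, in essence, the n-weighted average (restated as a check;
#this assumes the standard weighted-mean definition):
sum(cors_SIRE$SB_CA * cors_SIRE$n_SB_CA) / sum(cors_SIRE$n_SB_CA)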
#models -- does SIRE have incremental validity above SB and CA?
(fit = lm("CA ~ SIRE", data = d_lapop) %>% MOD_summary(progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##                   Beta    SE CI_lower CI_upper
## SIRE: Blanca      0.00    NA       NA       NA
## SIRE: Indígena   -0.56 0.114    -0.78   -0.333
## SIRE: Mestiza    -0.15 0.030    -0.21   -0.094
## SIRE: Mulata     -0.66 0.257    -1.17   -0.160
## SIRE: Negra      -0.68 0.136    -0.95   -0.418
## SIRE: not stated -0.38 0.067    -0.51   -0.247
## SIRE: Otro       -0.34 0.117    -0.57   -0.113
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 5920 5913 0.017   0.016 0.014
## 
## 
## Etas from analysis of variance
##       Eta Eta_partial
## SIRE 0.13        0.13
(fit = lm("CA ~ SIRE + Skin_brightness", data = d_lapop) %>% MOD_summary(progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## SIRE: Blanca      0.000    NA       NA       NA
## SIRE: Indígena   -0.273 0.141   -0.550   0.0041
## SIRE: Mestiza     0.025 0.036   -0.045   0.0958
## SIRE: Mulata     -0.567 0.373   -1.298   0.1650
## SIRE: Negra      -0.190 0.167   -0.517   0.1364
## SIRE: not stated -0.292 0.077   -0.444  -0.1401
## SIRE: Otro       -0.244 0.129   -0.497   0.0098
## Skin_brightness   0.153 0.016    0.121   0.1850
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 4419 4411 0.032    0.03 0.027
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## SIRE            0.076       0.077
## Skin_brightness 0.138       0.139
(fit = lm("S ~ SIRE", data = d_lapop) %>% MOD_summary(progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##                   Beta    SE CI_lower CI_upper
## SIRE: Blanca      0.00    NA       NA       NA
## SIRE: Indígena   -0.71 0.113    -0.93   -0.492
## SIRE: Mestiza    -0.39 0.029    -0.45   -0.336
## SIRE: Mulata     -0.43 0.253    -0.93    0.063
## SIRE: Negra      -0.74 0.134    -1.00   -0.474
## SIRE: not stated -0.45 0.066    -0.57   -0.317
## SIRE: Otro       -0.45 0.116    -0.67   -0.218
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1       S 5920 5913 0.043   0.042  0.04
## 
## 
## Etas from analysis of variance
##       Eta Eta_partial
## SIRE 0.21        0.21
(fit = lm("S ~ SIRE + Skin_brightness", data = d_lapop) %>% MOD_summary(progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##                   Beta    SE CI_lower CI_upper
## SIRE: Blanca      0.00    NA       NA       NA
## SIRE: Indígena   -0.14 0.138    -0.41    0.134
## SIRE: Mestiza    -0.13 0.035    -0.20   -0.060
## SIRE: Mulata      0.15 0.365    -0.56    0.870
## SIRE: Negra      -0.19 0.163    -0.51    0.131
## SIRE: not stated -0.19 0.076    -0.34   -0.041
## SIRE: Otro       -0.23 0.126    -0.48    0.020
## Skin_brightness   0.24 0.016     0.21    0.270
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1       S 4419 4411 0.074   0.073 0.071
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## SIRE            0.065       0.067
## Skin_brightness 0.216       0.219
(fit = lm("S ~ SIRE + Skin_brightness + CA", data = d_lapop) %>% MOD_summary(progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## SIRE: Blanca      0.000    NA       NA       NA
## SIRE: Indígena   -0.032 0.127    -0.28    0.217
## SIRE: Mestiza    -0.139 0.032    -0.20   -0.075
## SIRE: Mulata      0.373 0.335    -0.28    1.031
## SIRE: Negra      -0.115 0.150    -0.41    0.179
## SIRE: not stated -0.077 0.070    -0.21    0.060
## SIRE: Otro       -0.135 0.116    -0.36    0.093
## Skin_brightness   0.180 0.015     0.15    0.209
## CA                0.385 0.014     0.36    0.412
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 4419 4410 0.22    0.22  0.21
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## SIRE            0.061       0.069
## Skin_brightness 0.161       0.179
## CA              0.379       0.394
#output
write_clipboard(fit$coefs)
##                   Beta   SE CI lower CI upper
## SIRE: Blanca      0.00                       
## SIRE: Indígena   -0.03 0.13    -0.28     0.22
## SIRE: Mestiza    -0.14 0.03    -0.20    -0.08
## SIRE: Mulata      0.37 0.34    -0.28     1.03
## SIRE: Negra      -0.11 0.15    -0.41     0.18
## SIRE: not stated -0.08 0.07    -0.21     0.06
## SIRE: Otro       -0.13 0.12    -0.36     0.09
## Skin brightness   0.18 0.01     0.15     0.21
## CA                0.39 0.01     0.36     0.41
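#a complementary check of SIRE's incremental validity (a sketch, not run
#in the original analysis): a nested-model F-test on the R2 gain from
#adding SIRE to the skin brightness + CA model
fit_base = lm(S ~ Skin_brightness + CA, data = d_lapop)
fit_full = lm(S ~ Skin_brightness + CA + SIRE, data = d_lapop)
anova(fit_base, fit_full)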
# correlations within provinces -------------------------------------------

ggplot(d_lapop, aes(Skin_brightness, CA, color = province_name)) +
  geom_point() +
  geom_smooth(method = lm, se = F)
## Warning: Removed 1501 rows containing non-finite values (stat_smooth).

## Warning: Removed 1501 rows containing missing values (geom_point).

#nothing too useful to be seen here

#correlations within provinces with CIs
cors_prov_CI = ddply(d_lapop, .variables = "province_name", .fun = function(dat) {
  cors = cor_matrix(dat[c("CA", "S", "Skin_brightness")], CI = .95, reliabilities = v_reliabilities)
  
  data.frame(SB_CA = cors[3, 1],
             SB_S = cors[3, 2],
             CA_S = cors[1, 2],
             n = count.pairwise(dat[c("CA", "S", "Skin_brightness")]) %>% MAT_half %>% max
  ) 
})
cors_prov_CI %>% write_clipboard(print=T)
##             province name               SB CA               SB S
## 1            Buenos Aires    0.27 [0.19 0.35]   0.30 [0.22 0.38]
## 2  Buenos Aires City (DC)    0.14 [0.09 0.19]   0.17 [0.12 0.22]
## 3               Catamarca -0.56 [-0.85 -0.01]  0.45 [-0.13 0.80]
## 4                   Chaco   0.08 [-0.03 0.19]   0.21 [0.09 0.31]
## 5                  Chubut  -0.22 [-0.63 0.29] -0.22 [-0.63 0.29]
## 6                 Córdoba   0.06 [-0.05 0.18]  0.05 [-0.07 0.16]
## 7              Corrientes   0.20 [-0.03 0.41]   0.28 [0.05 0.48]
## 8              Entre Ríos  -0.15 [-0.36 0.07]  0.13 [-0.09 0.34]
## 9                 Formosa  -0.39 [-0.72 0.10] -0.10 [-0.54 0.38]
## 10                  Jujuy   0.30 [-0.11 0.62]  0.07 [-0.34 0.45]
## 11               La Pampa    0.21 [0.02 0.38]  0.11 [-0.08 0.29]
## 12               La Rioja  -0.02 [-0.59 0.56] -0.09 [-0.63 0.51]
## 13                Mendoza    0.16 [0.03 0.28]   0.30 [0.18 0.41]
## 14               Misiones  -0.03 [-0.36 0.31]  0.00 [-0.33 0.34]
## 15                Neuquén  -0.11 [-0.35 0.15]  0.13 [-0.12 0.37]
## 16              Río Negro    0.28 [0.12 0.43]   0.23 [0.07 0.39]
## 17                  Salta   0.14 [-0.03 0.29] -0.12 [-0.28 0.04]
## 18               San Juan    0.52 [0.30 0.68]   0.41 [0.18 0.60]
## 19               San Luis   0.41 [-0.13 0.76] -0.46 [-0.79 0.07]
## 20               Santa Fe    0.21 [0.11 0.30]   0.26 [0.17 0.36]
## 21    Santiago del Estero  -0.05 [-0.21 0.12]   0.18 [0.02 0.34]
## 22       Tierra del Fuego  -0.15 [-0.69 0.49] -0.12 [-0.67 0.52]
## 23                Tucumán   0.05 [-0.08 0.18]   0.25 [0.13 0.37]
##                 CA S       n
## 1   0.46 [0.39 0.53]  504.00
## 2   0.48 [0.45 0.52] 2242.00
## 3  0.04 [-0.35 0.42]   26.00
## 4   0.60 [0.53 0.67]  328.00
## 5   0.43 [0.11 0.67]   34.00
## 6   0.59 [0.52 0.65]  429.00
## 7   0.92 [0.88 0.94]  111.00
## 8   0.56 [0.43 0.67]  128.00
## 9   0.60 [0.34 0.77]   37.00
## 10  0.62 [0.41 0.76]   52.00
## 11  0.58 [0.44 0.70]  108.00
## 12  0.87 [0.72 0.94]   26.00
## 13  0.60 [0.53 0.67]  312.00
## 14  0.71 [0.58 0.81]   74.00
## 15  0.51 [0.33 0.65]   87.00
## 16  0.40 [0.27 0.52]  170.00
## 17  0.38 [0.25 0.50]  186.00
## 18  0.66 [0.51 0.76]   84.00
## 19 0.14 [-0.24 0.48]   29.00
## 20  0.52 [0.45 0.58]  493.00
## 21  0.53 [0.41 0.63]  168.00
## 22  0.46 [0.05 0.73]   23.00
## 23  0.36 [0.26 0.46]  269.00
#weighted mean
#recalculate without CIs, and get all pairwise n's
cors_prov = ddply(d_lapop, .variables = "province_name", .fun = function(dat) {
  cors = cor_matrix(dat[c("CA", "S", "Skin_brightness")], reliabilities = v_reliabilities)
  cors_n = count.pairwise(dat[c("CA", "S", "Skin_brightness")])
  
  data.frame(SB_CA = cors[3, 1],
             SB_S = cors[3, 2],
             CA_S = cors[1, 2],
             n_SB_CA = cors_n[3, 1],
             n_SB_S = cors_n[3, 2],
             n_CA_S = cors_n[1, 2]
  ) 
})
wtd_mean(cors_prov$SB_CA, w = cors_prov$n_SB_CA)
## [1] 0.14
wtd_mean(cors_prov$SB_S, w = cors_prov$n_SB_S)
## [1] 0.19
wtd_mean(cors_prov$CA_S, w = cors_prov$n_CA_S)
## [1] 0.51
#models -- does prov have incremental validity above SB and CA?
(fit = lm("CA ~ province_name", data = d_lapop) %>% MOD_summary(progress = F))
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                         Beta    SE CI_lower CI_upper
## province_name: Buenos Aires            0.000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.135 0.048    0.040   0.2296
## province_name: Catamarca              -0.250 0.197   -0.637   0.1364
## province_name: Chaco                  -0.507 0.070   -0.644  -0.3710
## province_name: Chubut                 -0.384 0.174   -0.725  -0.0436
## province_name: Córdoba                 0.075 0.064   -0.052   0.2010
## province_name: Corrientes             -0.093 0.103   -0.294   0.1090
## province_name: Entre Ríos             -0.102 0.097   -0.293   0.0880
## province_name: Formosa                -0.427 0.167   -0.755  -0.1000
## province_name: Jujuy                  -0.564 0.143   -0.844  -0.2839
## province_name: La Pampa               -0.143 0.104   -0.347   0.0608
## province_name: La Rioja               -0.105 0.197   -0.491   0.2820
## province_name: Mendoza                 0.109 0.071   -0.029   0.2477
## province_name: Misiones               -0.049 0.122   -0.288   0.1904
## province_name: Neuquén                 0.310 0.114    0.086   0.5327
## province_name: Río Negro               0.022 0.087   -0.148   0.1927
## province_name: Salta                   0.100 0.084   -0.065   0.2645
## province_name: San Juan               -0.130 0.116   -0.357   0.0962
## province_name: San Luis                0.324 0.187   -0.043   0.6916
## province_name: Santa Fe               -0.113 0.062   -0.235   0.0088
## province_name: Santiago del Estero    -0.553 0.087   -0.724  -0.3814
## province_name: Tierra del Fuego        0.345 0.209   -0.065   0.7550
## province_name: Tucumán                -0.148 0.074   -0.293  -0.0031
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 5920 5897 0.042   0.038 0.033
## 
## 
## Etas from analysis of variance
##               Eta Eta_partial
## province_name 0.2         0.2
(fit = lm("CA ~ province_name + Skin_brightness + SIRE", data = d_lapop) %>% MOD_summary(progress = F))
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                          Beta    SE CI_lower CI_upper
## province_name: Buenos Aires            0.0000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.1097 0.050   0.0113    0.208
## province_name: Catamarca              -0.1671 0.272  -0.7004    0.366
## province_name: Chaco                  -0.4319 0.072  -0.5737   -0.290
## province_name: Chubut                 -0.3934 0.239  -0.8612    0.074
## province_name: Córdoba                 0.2312 0.071   0.0919    0.370
## province_name: Corrientes             -0.1602 0.123  -0.4021    0.082
## province_name: Entre Ríos             -0.0601 0.117  -0.2885    0.168
## province_name: Formosa                -0.5264 0.232  -0.9819   -0.071
## province_name: Jujuy                  -0.7543 0.198  -1.1431   -0.366
## province_name: La Pampa               -0.1570 0.103  -0.3588    0.045
## province_name: La Rioja                0.2257 0.283  -0.3292    0.781
## province_name: Mendoza                 0.0838 0.076  -0.0644    0.232
## province_name: Misiones                0.3073 0.169  -0.0249    0.640
## province_name: Neuquén                 0.3495 0.131   0.0918    0.607
## province_name: Río Negro               0.0538 0.093  -0.1293    0.237
## province_name: Salta                   0.2585 0.092   0.0784    0.439
## province_name: San Juan                0.0270 0.132  -0.2325    0.286
## province_name: San Luis                0.5098 0.254   0.0109    1.009
## province_name: Santa Fe               -0.1889 0.067  -0.3199   -0.058
## province_name: Santiago del Estero    -0.4518 0.095  -0.6373   -0.266
## province_name: Tierra del Fuego        0.5697 0.295  -0.0083    1.148
## province_name: Tucumán                -0.1228 0.078  -0.2757    0.030
## Skin_brightness                        0.1014 0.017   0.0682    0.135
## SIRE: Blanca                           0.0000    NA       NA       NA
## SIRE: Indígena                        -0.3589 0.140  -0.6325   -0.085
## SIRE: Mestiza                         -0.0081 0.036  -0.0792    0.063
## SIRE: Mulata                          -0.6620 0.369  -1.3848    0.061
## SIRE: Negra                           -0.2123 0.164  -0.5346    0.110
## SIRE: not stated                      -0.3079 0.076  -0.4579   -0.158
## SIRE: Otro                            -0.2381 0.128  -0.4889    0.013
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 4419 4389 0.073   0.066 0.058
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## province_name   0.202       0.206
## Skin_brightness 0.087       0.090
## SIRE            0.079       0.081
(fit = lm("S ~ province_name", data = d_lapop) %>% MOD_summary(progress = F))
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                          Beta    SE CI_lower CI_upper
## province_name: Buenos Aires            0.0000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.0564 0.047   -0.036    0.148
## province_name: Catamarca              -0.9334 0.192   -1.309   -0.558
## province_name: Chaco                  -0.7419 0.068   -0.874   -0.609
## province_name: Chubut                 -0.0167 0.169   -0.347    0.314
## province_name: Córdoba                 0.0088 0.063   -0.114    0.131
## province_name: Corrientes             -0.2764 0.100   -0.472   -0.081
## province_name: Entre Ríos             -0.2326 0.094   -0.417   -0.048
## province_name: Formosa                -0.5336 0.162   -0.852   -0.216
## province_name: Jujuy                  -0.8279 0.139   -1.100   -0.556
## province_name: La Pampa               -0.0016 0.101   -0.200    0.196
## province_name: La Rioja               -0.4986 0.192   -0.874   -0.123
## province_name: Mendoza                 0.0896 0.069   -0.045    0.224
## province_name: Misiones               -0.3903 0.119   -0.623   -0.158
## province_name: Neuquén                -0.0391 0.111   -0.256    0.178
## province_name: Río Negro              -0.0594 0.084   -0.225    0.106
## province_name: Salta                  -0.4606 0.082   -0.621   -0.300
## province_name: San Juan               -0.3587 0.112   -0.579   -0.139
## province_name: San Luis               -0.6560 0.182   -1.013   -0.299
## province_name: Santa Fe               -0.0465 0.060   -0.165    0.072
## province_name: Santiago del Estero    -1.2435 0.085   -1.410   -1.077
## province_name: Tierra del Fuego        0.3639 0.203   -0.034    0.762
## province_name: Tucumán                -0.4234 0.072   -0.564   -0.282
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1       S 5920 5897 0.096   0.093 0.089
## 
## 
## Etas from analysis of variance
##                Eta Eta_partial
## province_name 0.31        0.31
(fit = lm("S ~ province_name + Skin_brightness", data = d_lapop) %>% MOD_summary(progress = F))
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                         Beta    SE CI_lower CI_upper
## province_name: Buenos Aires            0.000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.057 0.048   -0.037    0.152
## province_name: Catamarca              -1.104 0.261   -1.615   -0.592
## province_name: Chaco                  -0.571 0.069   -0.707   -0.435
## province_name: Chubut                 -0.207 0.229   -0.656    0.242
## province_name: Córdoba                 0.021 0.068   -0.112    0.155
## province_name: Corrientes             -0.259 0.118   -0.490   -0.029
## province_name: Entre Ríos             -0.122 0.111   -0.341    0.096
## province_name: Formosa                -0.163 0.223   -0.599    0.274
## province_name: Jujuy                  -0.893 0.190   -1.266   -0.520
## province_name: La Pampa                0.022 0.099   -0.172    0.215
## province_name: La Rioja               -0.124 0.271   -0.656    0.408
## province_name: Mendoza                 0.076 0.072   -0.066    0.218
## province_name: Misiones               -0.079 0.162   -0.398    0.239
## province_name: Neuquén                 0.037 0.126   -0.210    0.284
## province_name: Río Negro               0.031 0.090   -0.145    0.206
## province_name: Salta                  -0.434 0.087   -0.605   -0.263
## province_name: San Juan               -0.329 0.127   -0.578   -0.081
## province_name: San Luis               -0.816 0.243   -1.293   -0.339
## province_name: Santa Fe               -0.064 0.064   -0.190    0.061
## province_name: Santiago del Estero    -1.138 0.091   -1.315   -0.960
## province_name: Tierra del Fuego        0.741 0.283    0.186    1.295
## province_name: Tucumán                -0.286 0.075   -0.432   -0.139
## Skin_brightness                        0.189 0.015    0.161    0.218
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 4419 4395 0.14    0.14  0.13
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## province_name   0.27        0.28
## Skin_brightness 0.18        0.19
(fit = lm("S ~ province_name + Skin_brightness + CA", data = d_lapop) %>% MOD_summary(progress = F))
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                          Beta    SE CI_lower CI_upper
## province_name: Buenos Aires            0.0000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.0157 0.044   -0.071  0.10297
## province_name: Catamarca              -1.0533 0.241   -1.526 -0.58011
## province_name: Chaco                  -0.4175 0.064   -0.544 -0.29115
## province_name: Chubut                 -0.0731 0.212   -0.489  0.34239
## province_name: Córdoba                -0.0622 0.063   -0.186  0.06140
## province_name: Corrientes             -0.2130 0.109   -0.426  0.00024
## province_name: Entre Ríos             -0.1025 0.103   -0.305  0.09975
## province_name: Formosa                 0.0159 0.206   -0.388  0.42020
## province_name: Jujuy                  -0.6267 0.176   -0.972 -0.28094
## province_name: La Pampa                0.0691 0.091   -0.110  0.24797
## province_name: La Rioja               -0.2181 0.251   -0.710  0.27420
## province_name: Mendoza                 0.0379 0.067   -0.094  0.16933
## province_name: Misiones               -0.1953 0.150   -0.490  0.09941
## province_name: Neuquén                -0.0966 0.117   -0.325  0.13215
## province_name: Río Negro               0.0095 0.083   -0.153  0.17191
## province_name: Salta                  -0.5314 0.081   -0.690 -0.37303
## province_name: San Juan               -0.3494 0.117   -0.580 -0.11909
## province_name: San Luis               -0.9953 0.225   -1.437 -0.55380
## province_name: Santa Fe                0.0043 0.059   -0.112  0.12034
## province_name: Santiago del Estero    -0.9776 0.084   -1.142 -0.81277
## province_name: Tierra del Fuego        0.5258 0.262    0.012  1.03944
## province_name: Tucumán                -0.2475 0.069   -0.383 -0.11195
## Skin_brightness                        0.1470 0.014    0.120  0.17396
## CA                                     0.3634 0.013    0.337  0.38961
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 4419 4394 0.27    0.26  0.26
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## province_name   0.23        0.26
## Skin_brightness 0.14        0.16
## CA              0.35        0.38
#full models
lm("CA ~ SIRE + province_name + Skin_brightness", data = d_lapop) %>% MOD_summary(progress = F)
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                          Beta    SE CI_lower CI_upper
## SIRE: Blanca                           0.0000    NA       NA       NA
## SIRE: Indígena                        -0.3589 0.140  -0.6325   -0.085
## SIRE: Mestiza                         -0.0081 0.036  -0.0792    0.063
## SIRE: Mulata                          -0.6620 0.369  -1.3848    0.061
## SIRE: Negra                           -0.2123 0.164  -0.5346    0.110
## SIRE: not stated                      -0.3079 0.076  -0.4579   -0.158
## SIRE: Otro                            -0.2381 0.128  -0.4889    0.013
## province_name: Buenos Aires            0.0000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.1097 0.050   0.0113    0.208
## province_name: Catamarca              -0.1671 0.272  -0.7004    0.366
## province_name: Chaco                  -0.4319 0.072  -0.5737   -0.290
## province_name: Chubut                 -0.3934 0.239  -0.8612    0.074
## province_name: Córdoba                 0.2312 0.071   0.0919    0.370
## province_name: Corrientes             -0.1602 0.123  -0.4021    0.082
## province_name: Entre Ríos             -0.0601 0.117  -0.2885    0.168
## province_name: Formosa                -0.5264 0.232  -0.9819   -0.071
## province_name: Jujuy                  -0.7543 0.198  -1.1431   -0.366
## province_name: La Pampa               -0.1570 0.103  -0.3588    0.045
## province_name: La Rioja                0.2257 0.283  -0.3292    0.781
## province_name: Mendoza                 0.0838 0.076  -0.0644    0.232
## province_name: Misiones                0.3073 0.169  -0.0249    0.640
## province_name: Neuquén                 0.3495 0.131   0.0918    0.607
## province_name: Río Negro               0.0538 0.093  -0.1293    0.237
## province_name: Salta                   0.2585 0.092   0.0784    0.439
## province_name: San Juan                0.0270 0.132  -0.2325    0.286
## province_name: San Luis                0.5098 0.254   0.0109    1.009
## province_name: Santa Fe               -0.1889 0.067  -0.3199   -0.058
## province_name: Santiago del Estero    -0.4518 0.095  -0.6373   -0.266
## province_name: Tierra del Fuego        0.5697 0.295  -0.0083    1.148
## province_name: Tucumán                -0.1228 0.078  -0.2757    0.030
## Skin_brightness                        0.1014 0.017   0.0682    0.135
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 4419 4389 0.073   0.066 0.058
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## SIRE            0.079       0.081
## province_name   0.202       0.206
## Skin_brightness 0.087       0.090
lm("CA ~ SIRE + province_name + Skin_brightness", data = d_lapop) %>% MOD_summary(progress = F)
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                          Beta    SE CI_lower CI_upper
## SIRE: Blanca                           0.0000    NA       NA       NA
## SIRE: Indígena                        -0.3589 0.140  -0.6325   -0.085
## SIRE: Mestiza                         -0.0081 0.036  -0.0792    0.063
## SIRE: Mulata                          -0.6620 0.369  -1.3848    0.061
## SIRE: Negra                           -0.2123 0.164  -0.5346    0.110
## SIRE: not stated                      -0.3079 0.076  -0.4579   -0.158
## SIRE: Otro                            -0.2381 0.128  -0.4889    0.013
## province_name: Buenos Aires            0.0000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.1097 0.050   0.0113    0.208
## province_name: Catamarca              -0.1671 0.272  -0.7004    0.366
## province_name: Chaco                  -0.4319 0.072  -0.5737   -0.290
## province_name: Chubut                 -0.3934 0.239  -0.8612    0.074
## province_name: Córdoba                 0.2312 0.071   0.0919    0.370
## province_name: Corrientes             -0.1602 0.123  -0.4021    0.082
## province_name: Entre Ríos             -0.0601 0.117  -0.2885    0.168
## province_name: Formosa                -0.5264 0.232  -0.9819   -0.071
## province_name: Jujuy                  -0.7543 0.198  -1.1431   -0.366
## province_name: La Pampa               -0.1570 0.103  -0.3588    0.045
## province_name: La Rioja                0.2257 0.283  -0.3292    0.781
## province_name: Mendoza                 0.0838 0.076  -0.0644    0.232
## province_name: Misiones                0.3073 0.169  -0.0249    0.640
## province_name: Neuquén                 0.3495 0.131   0.0918    0.607
## province_name: Río Negro               0.0538 0.093  -0.1293    0.237
## province_name: Salta                   0.2585 0.092   0.0784    0.439
## province_name: San Juan                0.0270 0.132  -0.2325    0.286
## province_name: San Luis                0.5098 0.254   0.0109    1.009
## province_name: Santa Fe               -0.1889 0.067  -0.3199   -0.058
## province_name: Santiago del Estero    -0.4518 0.095  -0.6373   -0.266
## province_name: Tierra del Fuego        0.5697 0.295  -0.0083    1.148
## province_name: Tucumán                -0.1228 0.078  -0.2757    0.030
## Skin_brightness                        0.1014 0.017   0.0682    0.135
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 4419 4389 0.073   0.066 0.058
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## SIRE            0.079       0.081
## province_name   0.202       0.206
## Skin_brightness 0.087       0.090
lm("S ~ SIRE + province_name + Skin_brightness + CA", data = d_lapop) %>% MOD_summary(progress = F)
## The model data contains characters. These were automatically converteed but you should probably do this before calling this function.
## 
##     ---- Model summary ----    
## Model coefficients
##                                          Beta    SE CI_lower CI_upper
## SIRE: Blanca                           0.0000    NA       NA       NA
## SIRE: Indígena                         0.0034 0.124   -0.240   0.2464
## SIRE: Mestiza                         -0.1137 0.032   -0.177  -0.0506
## SIRE: Mulata                           0.4913 0.327   -0.150   1.1330
## SIRE: Negra                           -0.1327 0.146   -0.419   0.1534
## SIRE: not stated                      -0.1246 0.068   -0.258   0.0088
## SIRE: Otro                            -0.1695 0.114   -0.392   0.0532
## province_name: Buenos Aires            0.0000    NA       NA       NA
## province_name: Buenos Aires City (DC)  0.0096 0.045   -0.078   0.0970
## province_name: Catamarca              -1.0147 0.241   -1.488  -0.5413
## province_name: Chaco                  -0.4248 0.064   -0.551  -0.2984
## province_name: Chubut                 -0.0546 0.212   -0.470   0.3607
## province_name: Córdoba                -0.0579 0.063   -0.182   0.0659
## province_name: Corrientes             -0.2642 0.110   -0.479  -0.0495
## province_name: Entre Ríos             -0.1336 0.103   -0.336   0.0692
## province_name: Formosa                -0.0334 0.206   -0.438   0.3711
## province_name: Jujuy                  -0.6241 0.176   -0.970  -0.2784
## province_name: La Pampa                0.0472 0.091   -0.132   0.2265
## province_name: La Rioja               -0.1809 0.251   -0.673   0.3116
## province_name: Mendoza                 0.0301 0.067   -0.102   0.1617
## province_name: Misiones               -0.2283 0.150   -0.523   0.0667
## province_name: Neuquén                -0.1116 0.117   -0.340   0.1173
## province_name: Río Negro               0.0028 0.083   -0.160   0.1653
## province_name: Salta                  -0.5051 0.082   -0.665  -0.3451
## province_name: San Juan               -0.3538 0.117   -0.584  -0.1235
## province_name: San Luis               -1.0478 0.226   -1.491  -0.6047
## province_name: Santa Fe                0.0011 0.059   -0.115   0.1175
## province_name: Santiago del Estero    -0.9862 0.084   -1.151  -0.8211
## province_name: Tierra del Fuego        0.5513 0.262    0.038   1.0647
## province_name: Tucumán                -0.2590 0.069   -0.395  -0.1232
## Skin_brightness                        0.1266 0.015    0.097   0.1562
## CA                                     0.3626 0.013    0.336   0.3889
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 4419 4388 0.27    0.26  0.26
## 
## 
## Etas from analysis of variance
##                   Eta Eta_partial
## SIRE            0.055       0.065
## province_name   0.227       0.257
## Skin_brightness 0.108       0.126
## CA              0.349       0.378

Province-level analyses

Third, we turn to the main province-level analyses.

### PREDICTOR ANALYSIS


# aliases/vars -----------------------------------------------------------------

#aliases
d_main$S_knoema = d_main$S_wtd
d_main$CA_ONE = d_main$CA


# correlations ------------------------------------------------------------

v_primary_vars = c("CA_ONE", "CA_lapop", "CA_lapop_aggr", "S_knoema", "S_lapop", "S_lapop_aggr", "S_comb", "HDI2011", "HDI1996", "European", "European_SIRE", "White", "Skin_brightness", "Latitude", "Temperature")

wtd.cors(d_main[v_primary_vars]) %>% round(2)
##                 CA_ONE CA_lapop CA_lapop_aggr S_knoema S_lapop
## CA_ONE            1.00     0.56          0.60     0.81    0.76
## CA_lapop          0.56     1.00          0.97     0.64    0.63
## CA_lapop_aggr     0.60     0.97          1.00     0.66    0.69
## S_knoema          0.81     0.64          0.66     1.00    0.68
## S_lapop           0.76     0.63          0.69     0.68    1.00
## S_lapop_aggr      0.75     0.68          0.75     0.66    0.99
## S_comb            0.87     0.71          0.75     0.95    0.86
## HDI2011           0.82     0.61          0.64     0.88    0.78
## HDI1996           0.76     0.52          0.56     0.83    0.65
## European          0.44     0.31          0.40     0.36    0.43
## European_SIRE     0.50     0.40          0.52     0.31    0.66
## White             0.49     0.35          0.47     0.26    0.58
## Skin_brightness   0.65     0.52          0.55     0.62    0.68
## Latitude          0.48     0.50          0.53     0.58    0.69
## Temperature      -0.49    -0.55         -0.54    -0.55   -0.66
##                 S_lapop_aggr S_comb HDI2011 HDI1996 European European_SIRE
## CA_ONE                  0.75   0.87    0.82    0.76     0.44          0.50
## CA_lapop                0.68   0.71    0.61    0.52     0.31          0.40
## CA_lapop_aggr           0.75   0.75    0.64    0.56     0.40          0.52
## S_knoema                0.66   0.95    0.88    0.83     0.36          0.31
## S_lapop                 0.99   0.86    0.78    0.65     0.43          0.66
## S_lapop_aggr            1.00   0.86    0.76    0.63     0.47          0.69
## S_comb                  0.86   1.00    0.90    0.82     0.42          0.50
## HDI2011                 0.76   0.90    1.00    0.87     0.18          0.36
## HDI1996                 0.63   0.82    0.87    1.00     0.38          0.33
## European                0.47   0.42    0.18    0.38     1.00          0.69
## European_SIRE           0.69   0.50    0.36    0.33     0.69          1.00
## White                   0.60   0.43    0.28    0.30     0.72          0.97
## Skin_brightness         0.68   0.71    0.68    0.54     0.22          0.48
## Latitude                0.69   0.64    0.70    0.57     0.12          0.38
## Temperature            -0.65  -0.61   -0.71   -0.49     0.08         -0.19
##                 White Skin_brightness Latitude Temperature
## CA_ONE           0.49            0.65     0.48       -0.49
## CA_lapop         0.35            0.52     0.50       -0.55
## CA_lapop_aggr    0.47            0.55     0.53       -0.54
## S_knoema         0.26            0.62     0.58       -0.55
## S_lapop          0.58            0.68     0.69       -0.66
## S_lapop_aggr     0.60            0.68     0.69       -0.65
## S_comb           0.43            0.71     0.64       -0.61
## HDI2011          0.28            0.68     0.70       -0.71
## HDI1996          0.30            0.54     0.57       -0.49
## European         0.72            0.22     0.12        0.08
## European_SIRE    0.97            0.48     0.38       -0.19
## White            1.00            0.41     0.25       -0.08
## Skin_brightness  0.41            1.00     0.56       -0.57
## Latitude         0.25            0.56     1.00       -0.93
## Temperature     -0.08           -0.57    -0.93        1.00
wtd.cors(d_main[v_primary_vars], weight = d_main$n_mean %>% sqrt) %>% round(2)
##                 CA_ONE CA_lapop CA_lapop_aggr S_knoema S_lapop
## CA_ONE            1.00     0.65          0.71     0.91    0.77
## CA_lapop          0.65     1.00          0.97     0.69    0.75
## CA_lapop_aggr     0.71     0.97          1.00     0.72    0.81
## S_knoema          0.91     0.69          0.72     1.00    0.74
## S_lapop           0.77     0.75          0.81     0.74    1.00
## S_lapop_aggr      0.75     0.75          0.83     0.70    0.99
## S_comb            0.92     0.76          0.81     0.97    0.88
## HDI2011           0.92     0.69          0.73     0.93    0.77
## HDI1996           0.86     0.53          0.59     0.87    0.59
## European          0.58     0.32          0.43     0.56    0.52
## European_SIRE     0.53     0.45          0.59     0.46    0.70
## White             0.54     0.40          0.55     0.46    0.64
## Skin_brightness   0.75     0.65          0.68     0.76    0.79
## Latitude          0.50     0.50          0.54     0.56    0.66
## Temperature      -0.37    -0.54         -0.51    -0.40   -0.59
##                 S_lapop_aggr S_comb HDI2011 HDI1996 European European_SIRE
## CA_ONE                  0.75   0.92    0.92    0.86     0.58          0.53
## CA_lapop                0.75   0.76    0.69    0.53     0.32          0.45
## CA_lapop_aggr           0.83   0.81    0.73    0.59     0.43          0.59
## S_knoema                0.70   0.97    0.93    0.87     0.56          0.46
## S_lapop                 0.99   0.88    0.77    0.59     0.52          0.70
## S_lapop_aggr            1.00   0.85    0.74    0.56     0.53          0.73
## S_comb                  0.85   1.00    0.93    0.82     0.58          0.58
## HDI2011                 0.74   0.93    1.00    0.89     0.43          0.49
## HDI1996                 0.56   0.82    0.89    1.00     0.55          0.39
## European                0.53   0.58    0.43    0.55     1.00          0.69
## European_SIRE           0.73   0.58    0.49    0.39     0.69          1.00
## White                   0.67   0.56    0.46    0.41     0.75          0.97
## Skin_brightness         0.77   0.82    0.77    0.60     0.40          0.57
## Latitude                0.65   0.62    0.56    0.44     0.29          0.44
## Temperature            -0.57  -0.48   -0.44   -0.19     0.06         -0.15
##                 White Skin_brightness Latitude Temperature
## CA_ONE           0.54            0.75     0.50       -0.37
## CA_lapop         0.40            0.65     0.50       -0.54
## CA_lapop_aggr    0.55            0.68     0.54       -0.51
## S_knoema         0.46            0.76     0.56       -0.40
## S_lapop          0.64            0.79     0.66       -0.59
## S_lapop_aggr     0.67            0.77     0.65       -0.57
## S_comb           0.56            0.82     0.62       -0.48
## HDI2011          0.46            0.77     0.56       -0.44
## HDI1996          0.41            0.60     0.44       -0.19
## European         0.75            0.40     0.29        0.06
## European_SIRE    0.97            0.57     0.44       -0.15
## White            1.00            0.53     0.34       -0.05
## Skin_brightness  0.53            1.00     0.52       -0.46
## Latitude         0.34            0.52     1.00       -0.84
## Temperature     -0.05           -0.46    -0.84        1.00
wtd.cors(d_main[v_primary_vars], weight = d_main$pop %>% sqrt) %>% round(2)
##                 CA_ONE CA_lapop CA_lapop_aggr S_knoema S_lapop
## CA_ONE            1.00     0.60          0.65     0.85    0.76
## CA_lapop          0.60     1.00          0.97     0.64    0.67
## CA_lapop_aggr     0.65     0.97          1.00     0.65    0.72
## S_knoema          0.85     0.64          0.65     1.00    0.72
## S_lapop           0.76     0.67          0.72     0.72    1.00
## S_lapop_aggr      0.75     0.70          0.77     0.69    0.99
## S_comb            0.88     0.71          0.75     0.96    0.88
## HDI2011           0.87     0.62          0.65     0.86    0.72
## HDI1996           0.81     0.49          0.54     0.80    0.57
## European          0.48     0.35          0.44     0.49    0.53
## European_SIRE     0.47     0.39          0.53     0.33    0.62
## White             0.46     0.35          0.49     0.32    0.58
## Skin_brightness   0.69     0.56          0.57     0.69    0.72
## Latitude          0.47     0.46          0.50     0.57    0.66
## Temperature      -0.43    -0.52         -0.48    -0.50   -0.61
##                 S_lapop_aggr S_comb HDI2011 HDI1996 European European_SIRE
## CA_ONE                  0.75   0.88    0.87    0.81     0.48          0.47
## CA_lapop                0.70   0.71    0.62    0.49     0.35          0.39
## CA_lapop_aggr           0.77   0.75    0.65    0.54     0.44          0.53
## S_knoema                0.69   0.96    0.86    0.80     0.49          0.33
## S_lapop                 0.99   0.88    0.72    0.57     0.53          0.62
## S_lapop_aggr            1.00   0.86    0.71    0.55     0.56          0.67
## S_comb                  0.86   1.00    0.87    0.77     0.54          0.48
## HDI2011                 0.71   0.87    1.00    0.86     0.26          0.37
## HDI1996                 0.55   0.77    0.86    1.00     0.42          0.32
## European                0.56   0.54    0.26    0.42     1.00          0.67
## European_SIRE           0.67   0.48    0.37    0.32     0.67          1.00
## White                   0.62   0.46    0.32    0.31     0.72          0.97
## Skin_brightness         0.71   0.76    0.68    0.51     0.33          0.48
## Latitude                0.65   0.61    0.58    0.46     0.24          0.36
## Temperature            -0.59  -0.55   -0.54   -0.30     0.02         -0.07
##                 White Skin_brightness Latitude Temperature
## CA_ONE           0.46            0.69     0.47       -0.43
## CA_lapop         0.35            0.56     0.46       -0.52
## CA_lapop_aggr    0.49            0.57     0.50       -0.48
## S_knoema         0.32            0.69     0.57       -0.50
## S_lapop          0.58            0.72     0.66       -0.61
## S_lapop_aggr     0.62            0.71     0.65       -0.59
## S_comb           0.46            0.76     0.61       -0.55
## HDI2011          0.32            0.68     0.58       -0.54
## HDI1996          0.31            0.51     0.46       -0.30
## European         0.72            0.33     0.24        0.02
## European_SIRE    0.97            0.48     0.36       -0.07
## White            1.00            0.44     0.26        0.00
## Skin_brightness  0.44            1.00     0.51       -0.50
## Latitude         0.26            0.51     1.00       -0.88
## Temperature      0.00           -0.50    -0.88        1.00
#output: combined matrix (weighted and unweighted correlations in opposite triangles), copied to the clipboard
combine_upperlower(wtd.cors(d_main[v_primary_vars], weight = d_main$pop %>% sqrt),
                   wtd.cors(d_main[v_primary_vars])) %>% 
  write_clipboard()
##                 CA ONE CA lapop CA lapop aggr S knoema S lapop
## CA ONE                     0.60          0.65     0.85    0.76
## CA lapop          0.56                   0.97     0.64    0.67
## CA lapop aggr     0.60     0.97                   0.65    0.72
## S knoema          0.81     0.64          0.66             0.72
## S lapop           0.76     0.63          0.69     0.68        
## S lapop aggr      0.75     0.68          0.75     0.66    0.99
## S comb            0.87     0.71          0.75     0.95    0.86
## HDI2011           0.82     0.61          0.64     0.88    0.78
## HDI1996           0.76     0.52          0.56     0.83    0.65
## European          0.44     0.31          0.40     0.36    0.43
## European SIRE     0.50     0.40          0.52     0.31    0.66
## White             0.49     0.35          0.47     0.26    0.58
## Skin brightness   0.65     0.52          0.55     0.62    0.68
## Latitude          0.48     0.50          0.53     0.58    0.69
## Temperature      -0.49    -0.55         -0.54    -0.55   -0.66
##                 S lapop aggr S comb HDI2011 HDI1996 European European SIRE
## CA ONE                  0.75   0.88    0.87    0.81     0.48          0.47
## CA lapop                0.70   0.71    0.62    0.49     0.35          0.39
## CA lapop aggr           0.77   0.75    0.65    0.54     0.44          0.53
## S knoema                0.69   0.96    0.86    0.80     0.49          0.33
## S lapop                 0.99   0.88    0.72    0.57     0.53          0.62
## S lapop aggr                   0.86    0.71    0.55     0.56          0.67
## S comb                  0.86           0.87    0.77     0.54          0.48
## HDI2011                 0.76   0.90            0.86     0.26          0.37
## HDI1996                 0.63   0.82    0.87             0.42          0.32
## European                0.47   0.42    0.18    0.38                   0.67
## European SIRE           0.69   0.50    0.36    0.33     0.69              
## White                   0.60   0.43    0.28    0.30     0.72          0.97
## Skin brightness         0.68   0.71    0.68    0.54     0.22          0.48
## Latitude                0.69   0.64    0.70    0.57     0.12          0.38
## Temperature            -0.65  -0.61   -0.71   -0.49     0.08         -0.19
##                 White Skin brightness Latitude Temperature
## CA ONE           0.46            0.69     0.47       -0.43
## CA lapop         0.35            0.56     0.46       -0.52
## CA lapop aggr    0.49            0.57     0.50       -0.48
## S knoema         0.32            0.69     0.57       -0.50
## S lapop          0.58            0.72     0.66       -0.61
## S lapop aggr     0.62            0.71     0.65       -0.59
## S comb           0.46            0.76     0.61       -0.55
## HDI2011          0.32            0.68     0.58       -0.54
## HDI1996          0.31            0.51     0.46       -0.30
## European         0.72            0.33     0.24        0.02
## European SIRE    0.97            0.48     0.36       -0.07
## White                            0.44     0.26        0.00
## Skin brightness  0.41                     0.51       -0.50
## Latitude         0.25            0.56                -0.88
## Temperature     -0.08           -0.57    -0.93
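#to see how much the weighting changes the picture, the weighted and
#unweighted matrices can also be differenced directly (a quick sketch, not
#part of the original analysis):
(wtd.cors(d_main[v_primary_vars], weight = d_main$pop %>% sqrt) -
   wtd.cors(d_main[v_primary_vars])) %>% round(2)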
# OLS ---------------------------------------------------------------------
#note: there is an unresolved discrepancy between the standardized betas and the zero-order correlations
#CA
(fit = lm("CA ~ European + Temperature", data = d_main) %>% 
  MOD_summary(runs = 200, progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##              Beta   SE CI_lower CI_upper
## European     0.48 0.16     0.15     0.81
## Temperature -0.53 0.16    -0.86    -0.20
## 
## 
## Model meta-data
##   outcome  N df   R2 R2-adj. R2-cv
## 1      CA 24 21 0.47    0.42 -0.85
## 
## 
## Etas from analysis of variance
##              Eta Eta_partial
## European    0.48        0.55
## Temperature 0.53        0.59
(fit = lm("CA ~ European + Temperature", data = d_main, weights = d_main$pop %>% sqrt) %>% 
  MOD_summary(runs = 200, progress = F))
## Warning in sqrt(.): NaNs produced
## 
##     ---- Model summary ----    
## Model coefficients
##             Beta   SE CI_lower CI_upper
## European     0.5 0.17     0.15     0.85
## Temperature -0.6 0.22    -1.06    -0.13
## 
## 
## Model meta-data
##   outcome  N df   R2 R2-adj. R2-cv
## 1      CA 24 21 0.43    0.37  -2.3
## 
## 
## Etas from analysis of variance
##             Eta Eta_partial
## European    NaN           1
## Temperature NaN           1
fit %>% `[[`("coefs") %>% write_clipboard(print = T, clean_names = T)
##              Beta   SE CI lower CI upper
## European     0.50 0.17     0.15     0.85
## Temperature -0.60 0.22    -1.06    -0.13
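#the NaN etas above likely come from sqrt() hitting a negative intermediate
#quantity inside MOD_summary's ANOVA step when weights are supplied; a minimal
#sanity check on the weight vector itself (assuming pop is the raw population
#count):
summary(d_main$pop %>% sqrt)
#if the weights are all finite and nonnegative, the NaNs arise inside the eta
#computation, not from the weights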
#S
(fit = lm("S ~ CA + European + Temperature", data = d_main) %>% 
  MOD_summary(runs = 200, progress = F))
## 
##     ---- Model summary ----    
## Model coefficients
##              Beta   SE CI_lower CI_upper
## CA           0.64 0.13     0.37    0.920
## European     0.16 0.12    -0.08    0.401
## Temperature -0.31 0.12    -0.56   -0.063
## 
## 
## Model meta-data
##   outcome  N df   R2 R2-adj. R2-cv
## 1       S 24 20 0.82    0.79   0.3
## 
## 
## Etas from analysis of variance
##              Eta Eta_partial
## CA          0.47        0.74
## European    0.13        0.30
## Temperature 0.25        0.50
(fit = lm("S ~ CA + European + Temperature", data = d_main, weights = d_main$pop %>% sqrt) %>% 
  MOD_summary(runs = 200, progress = F))
## Warning in sqrt(.): NaNs produced
## 
##     ---- Model summary ----    
## Model coefficients
##              Beta   SE CI_lower CI_upper
## CA           0.65 0.12    0.408    0.900
## European     0.24 0.11    0.013    0.465
## Temperature -0.39 0.14   -0.678   -0.093
## 
## 
## Model meta-data
##   outcome  N df   R2 R2-adj. R2-cv
## 1       S 24 20 0.84    0.82   0.4
## 
## 
## Etas from analysis of variance
##             Eta Eta_partial
## CA          NaN           1
## European    NaN           1
## Temperature NaN           1
fit %>% `[[`("coefs") %>% write_clipboard(print = T, clean_names = T)
##              Beta   SE CI lower CI upper
## CA           0.65 0.12     0.41     0.90
## European     0.24 0.11     0.01     0.47
## Temperature -0.39 0.14    -0.68    -0.09
# LASSO -------------------------------------------------------------------

#CA
LASSO_CA = MOD_LASSO(d_main, dependent = "CA", predictors = c("European", "Temperature"),
                     runs = 500, weights_ = d_main$pop %>% sqrt, progress = F, nfolds = 5)
## Data standardized.
MOD_summarize_models(LASSO_CA) %>% write_clipboard()
##                 European Temperature
## mean                0.04       -0.04
## median              0.00        0.00
## sd                  0.08        0.08
## mad                 0.00        0.00
## fraction zeroNA     0.74        0.77
#S
LASSO_S = MOD_LASSO(d_main, dependent = "S", predictors = c("CA", "European", "Temperature"),
                    runs = 500, weights_ = d_main$pop %>% sqrt, progress = F, nfolds = 5)
## Data standardized.
MOD_summarize_models(LASSO_S) %>% write_clipboard()
##                   CA European Temperature
## mean            0.64     0.13       -0.24
## median          0.64     0.14       -0.25
## sd              0.01     0.05        0.06
## mad             0.00     0.04        0.05
## fraction zeroNA 0.00     0.01        0.00
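#for reference, a minimal cross-validated LASSO done directly with glmnet
#(a sketch assuming complete cases; alpha = 1 is glmnet's default, i.e. pure
#LASSO, so the coefficients should be comparable to MOD_LASSO's):
p_load(glmnet)
d_cc = d_main[c("S", "CA", "European", "Temperature", "pop")] %>% na.omit()
x = d_cc[c("CA", "European", "Temperature")] %>% scale()
y = d_cc$S %>% scale() %>% as.vector()
cv_fit = cv.glmnet(x, y, weights = sqrt(d_cc$pop), nfolds = 5)
coef(cv_fit, s = "lambda.min")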
#OLS fit for comparison (Mean.C is the mean temperature in °C)
fit = lm("S ~ CA + European + Mean.C", data = d_main, weights = d_main$pop %>% sqrt) %>%
  MOD_summary(runs = 200, progress = F)
## Warning in sqrt(.): NaNs produced
fit$coefs %>% write_clipboard(clean_names = T, print = T)
##           Beta   SE CI lower CI upper
## CA        0.65 0.12     0.41     0.90
## European  0.24 0.11     0.01     0.47
## Mean C   -0.39 0.14    -0.68    -0.09
# path model -------------------------------------------------------------
wtd.cors(d_main[c("S", "CA", "Temperature", "Latitude", "European")])
##                 S    CA Temperature Latitude European
## S            1.00  0.87      -0.612     0.64    0.418
## CA           0.87  1.00      -0.489     0.48    0.440
## Temperature -0.61 -0.49       1.000    -0.93    0.084
## Latitude     0.64  0.48      -0.933     1.00    0.120
## European     0.42  0.44       0.084     0.12    1.000
#make path model
#simple (kept for reference; overwritten by the complex model below, which is the one fitted)
str_mod = "
S ~ CA + European
CA ~ European
"

#complex
str_mod = "
S ~ CA + European + Temperature
CA ~ European + Temperature
European ~ Temperature
"

#fit unwtd, make design, fit wtd
fit = sem(model = str_mod, data = d_main, std.ov = T)
parameterestimates(fit)
##            lhs op         rhs    est    se     z pvalue ci.lower ci.upper
## 1            S  ~          CA  0.645 0.120  5.37  0.000    0.409    0.881
## 2            S  ~    European  0.161 0.105  1.53  0.127   -0.045    0.367
## 3            S  ~ Temperature -0.310 0.108 -2.87  0.004   -0.523   -0.098
## 4           CA  ~    European  0.484 0.149  3.25  0.001    0.192    0.776
## 5           CA  ~ Temperature -0.529 0.149 -3.55  0.000   -0.821   -0.237
## 6     European  ~ Temperature  0.084 0.203  0.41  0.681   -0.315    0.482
## 7            S ~~           S  0.176 0.051  3.46  0.001    0.076    0.275
## 8           CA ~~          CA  0.507 0.146  3.46  0.001    0.220    0.793
## 9     European ~~    European  0.952 0.275  3.46  0.001    0.413    1.490
## 10 Temperature ~~ Temperature  0.958 0.000    NA     NA    0.958    0.958
#weighted
design = svydesign(ids = ~1,
                   data = d_main,
                   weights = d_main$pop %>% sqrt)

fit_wtd = lavaan.survey(fit, design)
## Warning in lavData(data = data, group = group, cluster = cluster, ov.names
## = ov.names, : lavaan WARNING: std.ov argument is ignored if only sample
## statistics are provided.
parameterestimates(fit_wtd, standardized = T)
##            lhs op         rhs    est    se     z pvalue ci.lower ci.upper
## 1            S  ~          CA  0.628 0.098  6.41  0.000    0.436    0.820
## 2            S  ~    European  1.494 0.677  2.21  0.027    0.167    2.821
## 3            S  ~ Temperature -0.092 0.027 -3.45  0.001   -0.145   -0.040
## 4           CA  ~    European  3.246 1.558  2.08  0.037    0.192    6.301
## 5           CA  ~ Temperature -0.149 0.047 -3.16  0.002   -0.241   -0.057
## 6     European  ~ Temperature  0.001 0.007  0.18  0.857   -0.012    0.015
## 7            S ~~           S  0.143 0.033  4.30  0.000    0.078    0.208
## 8           CA ~~          CA  0.557 0.222  2.51  0.012    0.122    0.992
## 9     European ~~    European  0.022 0.010  2.33  0.020    0.004    0.041
## 10 Temperature ~~ Temperature  8.493 0.000    NA     NA    8.493    8.493
## 11           S ~1              0.728 0.470  1.55  0.122   -0.194    1.650
## 12          CA ~1              0.655 1.268  0.52  0.606   -1.830    3.140
## 13    European ~1              0.628 0.128  4.92  0.000    0.378    0.878
## 14 Temperature ~1             17.550 0.000    NA     NA   17.550   17.550
##    std.lv std.all std.nox
## 1   0.628   0.645   0.645
## 2   1.494   0.232   0.232
## 3  -0.092  -0.281  -0.096
## 4   3.246   0.492   0.492
## 5  -0.149  -0.441  -0.151
## 6   0.001   0.025   0.008
## 7   0.143   0.156   0.156
## 8   0.557   0.575   0.575
## 9   0.022   0.999   0.999
## 10  8.493   1.000   8.493
## 11  0.728   0.759   0.759
## 12  0.655   0.665   0.665
## 13  0.628   4.214   4.214
## 14 17.550   6.022  17.550
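#note that the fully recursive model above is saturated (df = 0), so its
#global fit is trivially perfect; fit indices only become informative for a
#restricted version, e.g. dropping the direct European -> S path (a sketch,
#not part of the original analysis):
str_mod_restricted = "
S ~ CA + Temperature
CA ~ European + Temperature
European ~ Temperature
"
fit_restricted = sem(model = str_mod_restricted, data = d_main, std.ov = T)
fitMeasures(fit_restricted, c("chisq", "df", "pvalue", "cfi", "rmsea", "srmr"))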
# plots -------------------------------------------------------------------

GG_scatter(d_main, "CA", "S", weights = d_main$pop %>% sqrt, case_names = d_main$province_name) +
  xlab("Cognitive ability (mean of multiple scholastic tests)") +
  ylab("S, General socioeconomic factor (34 indicators)")

ggsave("figures/province/CA_S.png")
## Saving 7 x 5 in image
GG_scatter(d_main, "European", "CA", weights = d_main$pop %>% sqrt, case_names = d_main$province_name) +
  xlab("European genomic ancestry (estimated from multiple studies)") +
  ylab("Cognitive ability (mean of multiple scholastic tests)")

ggsave("figures/province/Euro_CA.png")
## Saving 7 x 5 in image
GG_scatter(d_main, "European", "S", weights = d_main$pop %>% sqrt, case_names = d_main$province_name) +
  xlab("European genomic ancestry (estimated from multiple studies)") +
  ylab("S, General socioeconomic factor (34 indicators)")

ggsave("figures/province/Euro_S.png")
## Saving 7 x 5 in image
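#ggsave() defaults to the current device size, hence the "Saving 7 x 5 in
#image" messages; the dimensions can be pinned explicitly, e.g.:
ggsave("figures/province/Euro_S.png", width = 7, height = 5)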
# multiple ancestries -----------------------------------------------------

MOD_APSLM("CA", predictors = c("African", "Amerindian", "European"), data = d_main, .weights = d_main$pop %>% sqrt)
## $beta_matrix
##   African Amerindian European AIC BIC   r2 r2.adj. r2_cv    R R.adj.  N
## 1   -0.46         NA       NA  72  75 0.18    0.14 -0.63 0.43   0.38 24
## 2      NA      -0.40       NA  73  76 0.15    0.11 -1.37 0.39   0.34 24
## 3      NA         NA     0.49  70  74 0.23    0.20 -1.95 0.48   0.44 24
## 4   -0.42      -0.36       NA  70  75 0.30    0.23 -2.07 0.55   0.48 24
## 5   -0.31         NA     0.38  70  75 0.30    0.23 -2.06 0.55   0.48 24
## 6      NA       0.99     1.43  70  75 0.30    0.23 -2.06 0.55   0.48 24
## 7   -0.52      -0.70    -0.36  72  78 0.30    0.20 -3.36 0.55   0.44 24
## 
## $all_models
## $all_models[[1]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African  
##       0.064       -0.457  
## 
## 
## $all_models[[2]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)   Amerindian  
##      0.0743      -0.4011  
## 
## 
## $all_models[[3]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)     European  
##      0.0355       0.4881  
## 
## 
## $all_models[[4]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African   Amerindian  
##     0.00199     -0.41717     -0.35900  
## 
## 
## $all_models[[5]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African     European  
##     0.00234     -0.30639      0.37973  
## 
## 
## $all_models[[6]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)   Amerindian     European  
##     0.00336      0.99051      1.42778  
## 
## 
## $all_models[[7]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African   Amerindian     European  
##     0.00167     -0.52190     -0.69833     -0.35899
MOD_APSLM("S", predictors = c("African", "Amerindian", "European"), data = d_main, .weights = d_main$pop %>% sqrt)
## $beta_matrix
##   African Amerindian European AIC BIC   r2 r2.adj.  r2_cv    R R.adj.  N
## 1   -0.38         NA       NA  74  78 0.12   0.082 -0.333 0.35   0.29 24
## 2      NA      -0.49       NA  71  75 0.22   0.185 -0.099 0.47   0.43 24
## 3      NA         NA     0.55  69  73 0.29   0.254 -0.106 0.54   0.50 24
## 4   -0.33      -0.46       NA  70  75 0.31   0.246 -0.321 0.56   0.50 24
## 5   -0.19         NA     0.48  70  75 0.31   0.247 -0.318 0.56   0.50 24
## 6      NA       0.62     1.14  70  75 0.31   0.248 -0.317 0.56   0.50 24
## 7    1.75       6.27     7.12  72  78 0.32   0.213 -1.906 0.56   0.46 24
## 
## $all_models
## $all_models[[1]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African  
##       0.106       -0.381  
## 
## 
## $all_models[[2]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)   Amerindian  
##      0.0843      -0.4905  
## 
## 
## $all_models[[3]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)     European  
##      0.0478       0.5513  
## 
## 
## $all_models[[4]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African   Amerindian  
##       0.027       -0.331       -0.457  
## 
## 
## $all_models[[5]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African     European  
##      0.0273      -0.1892       0.4843  
## 
## 
## $all_models[[6]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)   Amerindian     European  
##      0.0276       0.6209       1.1403  
## 
## 
## $all_models[[7]]
## 
## Call:
## lm(formula = models[model.idx], data = data, weights = .weights)
## 
## Coefficients:
## (Intercept)      African   Amerindian     European  
##      0.0333       1.7466       6.2730       7.1201
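#note: the three ancestry proportions sum to approximately 1, so entering all
#of them at once makes the predictors nearly collinear; this is the likely
#cause of the unstable betas in the full models above; a quick check:
rowSums(d_main[c("African", "Amerindian", "European")]) %>% summary()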

Maps

Fourth, we create maps of the main variables for visual presentation.

### CREATE MAPS OF VARIABLES

#https://cran.r-project.org/web/packages/choroplethr/vignettes/i-creating-admin1-maps.html

# load regions ------------------------------------------------------------
d_argentina_admin1 = get_admin1_regions("argentina")

#add data
d_argentina_admin1$Province = str_replace(d_argentina_admin1$region, pattern = "provincia de ", replacement = "") %>% 
  str_replace(pattern = "provincia del ", replacement = "") %>% pu_translate(superunit="ARG", messages = 0)

#attempt to match using fuzzymatch
# fuzzyjoin::stringdist_left_join(x = d_argentina_admin1,
#                                 y = d_main[1],
#                                 by = c("region_short" = "Wiki_name"),
#                                 method = "lcs",
#                                 max_dist = 7)
#in practice this never works better than the manual mapping, so we use the manual approach

#sort by short choro name
# sort_df(d_argentina_admin1, vars = "region_short")

d_argentina_admin1 = dplyr::left_join(d_argentina_admin1, d_main[c("Province", "name", "S", "European", "CA", "mining", "Temperature")], by = c("Province" = "Province"))


# plot --------------------------------------------------------------------

#neutral
gg = admin1_map("argentina")

#get midrange lat/lon per admin-1 region (used to place the province labels)
d_geo = ddply(gg$data, .variables = "region", .fun = function(block) {
  c("long" = averages(block$long, types = "midrange") %>% unname,
    "lat" = averages(block$lat, types = "midrange") %>% unname
    )
})

#merge these into the main dataset
d_argentina_admin1 = dplyr::full_join(d_argentina_admin1, d_geo, by = c("region" = "region"))
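#verify the joins: every region should now have coordinates and data attached
#(a minimal check, not in the original code)
sum(!complete.cases(d_argentina_admin1[c("long", "lat", "CA")]))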

#CA
d_argentina_admin1$value = d_argentina_admin1$CA
admin1_choropleth(country.name = "argentina", 
                  df           = d_argentina_admin1, 
                  legend       = "Cognitive ability (Z-scale)", 
                  num_colors   = 1) +
  geom_text(data = d_argentina_admin1, aes(long, lat, label = name, group = NULL), size = 2.5)

ggsave("figures/province/maps/map_CA.png")
## Saving 7 x 5 in image
#S
d_argentina_admin1$value = d_argentina_admin1$S
admin1_choropleth(country.name = "argentina", 
                  df           = d_argentina_admin1, 
                  legend       = "General socioeconomic factor (Z-scale)", 
                  num_colors   = 1) +
  geom_text(data = d_argentina_admin1, aes(long, lat, label = name, group = NULL), size = 2.5)

ggsave("figures/province/maps/map_S.png")
## Saving 7 x 5 in image
#euro
d_argentina_admin1$value = d_argentina_admin1$European
admin1_choropleth(country.name = "argentina", 
                  df           = d_argentina_admin1, 
                  legend       = "European genetic ancestry (proportion)", 
                  num_colors   = 1) +
  geom_text(data = d_argentina_admin1, aes(long, lat, label = name, group = NULL), size = 2.5)

ggsave("figures/province/maps/map_euro.png")
## Saving 7 x 5 in image
#temperature (control)
d_argentina_admin1$value = d_argentina_admin1$Temperature
admin1_choropleth(country.name = "argentina", 
                  df           = d_argentina_admin1, 
                  legend       = "Mean temperature (C)", 
                  num_colors   = 1) +
  geom_text(data = d_argentina_admin1, aes(long, lat, label = name, group = NULL), size = 2.5)

ggsave("figures/province/maps/map_temp.png")
## Saving 7 x 5 in image
#mining (control)
d_argentina_admin1$value = d_argentina_admin1$mining
admin1_choropleth(country.name = "argentina", 
                  df           = d_argentina_admin1, 
                  legend       = "Mining (factor score)", 
                  num_colors   = 1) +
  geom_text(data = d_argentina_admin1, aes(long, lat, label = name, group = NULL), size = 2.5)

ggsave("figures/province/maps/map_mining.png")
## Saving 7 x 5 in image
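#the five map blocks above repeat the same plot-and-save steps; a small helper
#would remove the duplication (a sketch with the same behavior):
plot_province_map = function(var, legend_title, filename) {
  d = d_argentina_admin1
  d$value = d[[var]]
  p = admin1_choropleth(country.name = "argentina",
                        df           = d,
                        legend       = legend_title,
                        num_colors   = 1) +
    geom_text(data = d, aes(long, lat, label = name, group = NULL), size = 2.5)
  ggsave(filename, plot = p)
  p
}
#e.g.: plot_province_map("CA", "Cognitive ability (Z-scale)", "figures/province/maps/map_CA.png")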
#output data for reprex
write_rds(d_argentina_admin1, "data/map_data.rds")

Output data for reuse

Finally, we output the datasets for reuse.

#output data
write_rds(d_lapop, "data/reuse/cases.rds")
write_rds(d_main, "data/reuse/provinces.rds")
write_rds(d_muni, "data/reuse/municipals.rds")
write_rds(d_dist, "data/reuse/districts.rds")
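#the files can be read back with readr, e.g.:
d_provinces = read_rds("data/reuse/provinces.rds")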