library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr': set_names
## The following object is masked from 'package:tidyr': extract
## Loading required package: weights
## Loading required package: assertthat
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble': has_name
## Loading required package: psych
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2': %+%, alpha
## Loading required package: robustbase
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych': rescale
## The following object is masked from 'package:assertthat': are_equal
## The following object is masked from 'package:purrr': is_logical
## The following object is masked from 'package:base': +
load_packages(
googlesheets4,
ggrepel,
lme4, performance, ggfortify, ggeffects, effectsize, fixest, mediation, metafor, #models
sf, rnaturalearth, countrycode, rnaturalearthhires, tmap, geodata, spatialreg, spdep, #spatial
)
## Loading required package: Matrix
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr': expand, pack, unpack
## Attaching package: 'effectsize'
## The following object is masked from 'package:kirkegaard': standardize
## The following object is masked from 'package:psych': phi
## Attaching package: 'fixest'
## The following object is masked from 'package:performance': r2
## Loading required package: MASS
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr': select
## Loading required package: mvtnorm
## Attaching package: 'mvtnorm'
## The following object is masked from 'package:effectsize': standardize
## The following object is masked from 'package:kirkegaard': standardize
## Loading required package: sandwich
## mediation: Causal Mediation Analysis
## Version: 4.5.1
## Attaching package: 'mediation'
## The following object is masked from 'package:psych': mediate
## Loading required package: metadat
## Loading required package: numDeriv
## Loading the 'metafor' package (version 4.8-0). For an introduction to the package please type: help(metafor)
## Attaching package: 'metafor'
## The following object is masked from 'package:fixest': se
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
## Loading required package: terra
## terra 1.8.70
## Attaching package: 'terra'
## The following object is masked from 'package:MASS': area
## The following object is masked from 'package:fixest': panel
## The following object is masked from 'package:kirkegaard': rescale
## The following objects are masked from 'package:psych': describe, distance, rescale
## The following object is masked from 'package:assertthat': noNA
## The following objects are masked from 'package:magrittr': extract, inset
## The following object is masked from 'package:tidyr': extract
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge package with:
## `install.packages('spDataLarge', repos='https://nowosad.github.io/drat/', type='source')`
## Attaching package: 'spData'
## The following object is masked from 'package:ggeffects': coffee_data
## Attaching package: 'spdep'
## The following objects are masked from 'package:spatialreg':
##   get.ClusterOption, get.coresOption, get.mcOption, get.VerboseOption,
##   get.ZeroPolicyOption, set.ClusterOption, set.coresOption, set.mcOption,
##   set.VerboseOption, set.ZeroPolicyOption
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
#preferred functions
standardize = kirkegaard::standardize
select = dplyr::select
# Define a palette with 13+ colors (ColorBrewer-inspired, distinct for light/dark themes)
large_palette <- c(
"#1F77B4", "#FF7F0E", "#2CA02C", "#D62728", "#9467BD",
"#8C564B", "#E377C2", "#7F7F7F", "#BCBD22", "#17BECF",
"#FFBB78", "#98DF8A", "#FF9896", "#C5B0D5", "#C49C94"
)
#fix a list column: collapse a cell to a single character value
fix_list_col = function(x) {
#if x is a list, return the first element
if (is.list(x)) {
return(x[[1]] %>% as.character())
} else {
return(x %>% as.character())
}
}
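#quick check of fix_list_col behavior (illustrative inputs, not from the data):
fix_list_col(list("0b", "1")) #"0b": first element of a list cell
fix_list_col(2) #"2": atomic input is just cast to character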
#rescale within-country S to the global scale
rescale_S_to_global = function(dd, L_level) {
# browser()
#level variable
if (L_level == "1") {
level_var = "has_L1_subunits"
} else if (L_level == "2") {
level_var = "has_L2_subunits"
} else {
stop("bad input")
}
#rescale values
#first select appropriate subset
dd %>%
select(
L0_name, L1_name, L0_name, Level, has_L1_subunits, has_L2_subunits, POP, HDI_latest, ISO3,
matches("^S_\\d$")) %>%
filter(
(.data[[level_var]] & Level == "0") | Level == L_level
) %>%
plyr::ddply(c("L0_name"), function(dd) {
  #subset to subunits only (the first row is assumed to be the country-level unit)
  dd_subs = dd[-1, ]
message(dd_subs$L0_name[1])
#z score S variables
dd_subs_S_z = dd_subs %>%
select(matches("^S_\\d$")) %>%
df_standardize(exclude_range_01 = F)
#drop variables with too much missing data
miss = miss_by_var(dd_subs_S_z, prop = T)
#keep only variables with less than 30% missing data
dd_subs_S_z2 = dd_subs_S_z %>%
select(-which(miss >= 0.3))
#impute missing data if there are at least 3 variables
if (ncol(dd_subs_S_z2) >= 3) {
#check if any data is missing
if (any(is.na(dd_subs_S_z2))) {
dd_subs_S_z2_imp = dd_subs_S_z2 %>% miss_impute()
} else {
#if no missing data, use as is
dd_subs_S_z2_imp = dd_subs_S_z2
}
} else {
#if fewer than 3 variables, use them as they are
dd_subs_S_z2_imp = dd_subs_S_z2
}
#ensure all cors positive, otherwise we need to reverse the sign
S_cors = cor(bind_cols(
dd_subs %>% select(contains("HDI")),
dd_subs_S_z2_imp
),
use = "pairwise.complete.obs")
#reverse variables that correlate negatively with HDI
S_reversed = which(S_cors[1, ] < 0) - 1 #-1 because first column is HDI
if (length(S_reversed) > 0) {
dd_subs_S_z2_imp[, S_reversed] = -dd_subs_S_z2_imp[, S_reversed]
}
#compute mean z score
dd_subs$S_mean_within = dd_subs_S_z2_imp %>%
rowMeans(na.rm = T) %>%
standardize()
#scale to international S norms
#do this by making the population-weighted mean equal the country mean
#and the within-country S SD equal to the subnational HDI SD / international HDI SD ratio
#first we need to compute the subnational HDI SD relative to the global HDI SD
hdi_sd = sd(dd_subs %>% pull(contains("HDI")), na.rm = T)
hdi_sd_global = global_hdi_stats$sd[4] #2020 HDI SD
hdi_sd_ratio = hdi_sd / hdi_sd_global
#then calculate the weighted mean of the subnational S scores
S_mean_weighted = weighted.mean(dd_subs$S_mean_within, dd_subs$POP, na.rm = T) %>% as.numeric() #as.numeric() strips the dw_transformer attribute that standardize() attaches
#set weighted mean to 0
dd_subs$S_mean_within = dd_subs$S_mean_within - S_mean_weighted
#then convert to international norms
dd_subs$S_global = dd_subs$S_mean_within * hdi_sd_ratio + dd[1, "S_1"]
#check that the weighted mean is now equal to the country mean
means_diff = dd[1, "S_1"] - weighted.mean(dd_subs$S_global, dd_subs$POP, na.rm = T)
if (is.na(means_diff)) {
browser()
}
if (abs(means_diff) > 0.01) {
stop("Weighted mean of subnational S scores does not match country mean.")
}
dd_subs
})
}
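#toy illustration of the rescaling identity above (made-up numbers, not real data):
#shift the within-country z scores to weighted mean 0, scale by the HDI SD ratio,
#and re-center on the country-level S; the weighted mean then equals the country S
toy_S = c(-1, 0, 1) #within-country z scores
toy_w = c(1, 2, 1) #population weights
toy_S = toy_S - weighted.mean(toy_S, toy_w)
toy_sd_ratio = 0.5 #assumed subnational/global HDI SD ratio
toy_country_S = 1.2 #assumed country-level S on the global scale
all.equal(weighted.mean(toy_S * toy_sd_ratio + toy_country_S, toy_w), toy_country_S) #TRUE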
#last non-NA value per row of a data frame
get_last_value = function(df) {
df %>%
rowwise() %>%
mutate(HDI_latest = {
vals <- c_across(HDI_2005:HDI_2022_PUND)
# get last non-NA value
last_val <- rev(vals)[which(!is.na(rev(vals)))[1]]
last_val
}) %>%
ungroup() %>%
pull(HDI_latest)
}
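#minimal check of get_last_value on a toy frame; only the two boundary columns are
#needed here since HDI_2005:HDI_2022_PUND selects whatever lies between them
tibble(HDI_2005 = c(0.7, NA), HDI_2022_PUND = c(NA, 0.8)) %>% get_last_value() #0.7 0.8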
#summarizer for spatialreg models
summarize_spatial_models = function(model_list) {
signif_stars <- function(p) {
if (p < 0.001) "***"
else if (p < 0.01) "**"
else if (p < 0.05) "*"
else ""
}
summarize_one <- function(model) {
tidy_coef <- broom::tidy(model)
tidy_coef <- tidy_coef[!duplicated(tidy_coef$term), ]
tidy_coef$formatted <- sprintf("%0.2f (%0.3f%s)",
tidy_coef$estimate,
tidy_coef$std.error,
sapply(tidy_coef$p.value, signif_stars))
spatial_param_name <- if (!is.null(model$rho)) "rho" else if (!is.null(model$lambda)) "lambda" else NA_character_
spatial_val <- if (spatial_param_name == "rho") model$rho else if (spatial_param_name == "lambda") model$lambda else NA_real_
spatial_se <- if (spatial_param_name == "rho") model$rho.se else if (spatial_param_name == "lambda") model$lambda.se else NA_real_
spatial_p <- 2 * (1 - pnorm(abs(spatial_val / spatial_se)))
spatial_str <- sprintf("%0.2f (%0.3f%s)", spatial_val, spatial_se, signif_stars(spatial_p))
tidy_coef <- tidy_coef[tidy_coef$term != spatial_param_name, ]
pred_tbl <- tidy_coef %>%
select(term, formatted) %>%
bind_rows(tibble(term = "spatial", formatted = spatial_str))
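#pseudo-R² from the residual variance below; sarlm fits don't report a standard R²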
resid <- residuals(model)
y <- model$y
r_squared <- 1 - var(resid) / var(y)
n <- length(y)
p <- length(coef(model))
adj_r_squared <- 1 - (1 - r_squared) * ((n - 1) / (n - p))
stats_tbl <- tibble(
term = c("R2 adj.", "N"),
formatted = c(sprintf("%.3f", adj_r_squared), as.character(n))
)
bind_rows(pred_tbl, stats_tbl)
}
summaries <- lapply(model_list, summarize_one)
all_terms <- unique(unlist(lapply(summaries, function(df) df$term)))
result_mat <- sapply(summaries, function(df) {
formatted_vec <- setNames(df$formatted, df$term)
unname(formatted_vec[all_terms])
})
stats_terms <- c("R2 adj.", "N")
term_rank <- ifelse(all_terms %in% stats_terms, 2, 1)
ordered_terms <- all_terms[order(term_rank, all_terms)]
df <- as.data.frame(result_mat[match(ordered_terms, all_terms), , drop = FALSE],
stringsAsFactors = FALSE)
df$term <- ordered_terms
df <- df[, c(ncol(df), 1:(ncol(df) - 1))]
rownames(df) <- NULL
as_tibble(df)
}
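#used below on the spatialreg model lists, e.g.: sa_lag_models_S %>% summarize_spatial_models()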
#mediation function
fit_mediation_by_group <- function(data, outcome, treatment, mediator, group_var,
covariates_m = NULL, covariates_y = NULL, weight_var = NULL,
sims = 1000) {
groups <- unique(data[[group_var]])
results_list <- lapply(groups, function(g) {
d_sub <- data %>% dplyr::filter(.data[[group_var]] == g)
# Build formulas more safely
rhs_m <- c(treatment, covariates_m)
rhs_y <- c(mediator, treatment, covariates_y)
formula_m <- as.formula(paste(mediator, "~", paste(rhs_m, collapse = " + ")))
formula_y <- as.formula(paste(outcome, "~", paste(rhs_y, collapse = " + ")))
# Fit mediator and outcome models (with optional weights)
if (!is.null(weight_var)) {
w <- d_sub[[weight_var]]
model_m <- lm(formula_m, data = d_sub, weights = w)
model_y <- lm(formula_y, data = d_sub, weights = w)
} else {
model_m <- lm(formula_m, data = d_sub)
model_y <- lm(formula_y, data = d_sub)
}
# Perform mediation analysis
med_fit <- mediate(model.m = model_m,
model.y = model_y,
treat = treatment,
mediator = mediator,
sims = sims)
# Tidy summary of mediation results with group column
med_sum <- summary(med_fit)
#approximate SE for prop. mediated: SD of the simulation draws, clamped to [-1, 1] since the ratio is unstable when the total effect is near zero
prop_sims <- med_fit$n0.sims
prop_sims <- prop_sims[is.finite(prop_sims)]
tibble::tibble(
group = g,
ACME = med_sum$d0,
ACME_p = med_sum$d0.p,
ADE = med_sum$z0,
ADE_p = med_sum$z0.p,
TotalEffect = med_sum$tau.coef,
TotalEffect_p = med_sum$tau.p,
PropMediated = med_sum$n0,
PropMediated_p = med_sum$n0.p,
PropMediated_se = sd(pmin(pmax(prop_sims, -1), 1))
)
})
dplyr::bind_rows(results_list)
}
#use the live version or the local version
if (T) {
#download live version
gs4_deauth()
d = read_sheet("https://docs.google.com/spreadsheets/d/1_YjHil-uOFy6mPor1tU0ZM-aXI0HxlD3SQWY45E95u8/edit?gid=0#gid=0", sheet = 2)
d_NIQ_S = read_sheet("https://docs.google.com/spreadsheets/d/1cReoeIZLlbxOrN4_wnj52Q6UWFepXSJGi3Gj5m6ZAZg/edit?gid=528842851#gid=528842851", sheet = "Seb2024b")
#save local version in case we need it
d %>% write_rds("data/main_file.rds")
d_NIQ_S %>% write_rds("data/d_NIQ_S.rds")
} else {
d = read_rds("data/main_file.rds")
d_NIQ_S = read_rds("data/d_NIQ_S.rds")
}
## ✔ Reading from "Summary Admix Data File".
## ✔ Range ''Tab 1: Summary data''.
## ✔ Reading from "National IQ datasets".
## ✔ Range ''Seb2024b''.
## New names:
## • `` -> `...63`
## • `` -> `...95`
#HDI data
d_hdi = read_csv("data/human-development-index/human-development-index.csv") %>%
rename(
HDI = `Human Development Index`,
Region = `World regions according to OWID`,
ISO3 = Code
)
## Rows: 6682 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Entity, Code, World regions according to OWID
## dbl (2): Year, Human Development Index
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#ISO for countries
d %<>% mutate(
row_i = 1:n(), #for preserving the spreadsheet row order
L0_ISO2 = countrycode::countrycode(L0_ISO3, origin = "iso3c", destination = "iso2c"),
#combine region ISO codes if present
ISO3 = L0_ISO3 + ifelse(!is.na(L1_ISO), "-" + L1_ISO, "") + ifelse(!is.na(L2_ISO), "-" + L2_ISO, ""),
ISO2 = L0_ISO2 + ifelse(!is.na(L1_ISO), "-" + L1_ISO, "") + ifelse(!is.na(L2_ISO), "-" + L2_ISO, ""),
Level = Level %>% map_chr(fix_list_col) %>% as.character() %>% str_remove("\\.000000"),
#smallest unit name for plotting
lowest_name = miss_fill(L2_name, L1_name, L0_name),
#latest available HDI value; better than the average since measurement years differ
HDI_latest = d %>% select(HDI_2005:HDI_2022_PUND) %>% get_last_value(),
#CA on z scale
CA = Summary_ACHQ,
CA_z = (CA-100)/15
)
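#note: string concatenation with `+` works because kirkegaard masks base `+` (see the loading messages above)
#a plain base-R equivalent of the ISO3 construction would be (sketch):
# paste0(d$L0_ISO3,
#        ifelse(!is.na(d$L1_ISO), paste0("-", d$L1_ISO), ""),
#        ifelse(!is.na(d$L2_ISO), paste0("-", d$L2_ISO), ""))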
#manually override Puerto Rico ISO
d$ISO3[d$L0_name == "Puerto Rico"] = "PRI"
d$ISO2[d$L0_name == "Puerto Rico"] = "US-PR"
#assert no dup ISOs
assert_that(
!any(duplicated(d$ISO3)),
msg = "Duplicate ISO codes found in the dataset."
)
## [1] TRUE
#assert no dup ISO2 codes
assert_that(
!any(duplicated(d$ISO2)),
msg = "Duplicate ISO2 codes found in the dataset."
)
## [1] TRUE
#determine relationships among levels
#loop across rows and look up other cases to determine relationships
d = map_dfr(1:nrow(d), function(i) {
#get row
dd = d[i, ]
#level 0b units never have subunits
if (dd$Level == "0b") {
y = dd %>%
mutate(
L1_subunits = 0,
has_L1_subunits = FALSE,
L2_subunits = 0,
has_L2_subunits = FALSE,
L1_group = "Others",
L2_group = "Others",
)
return(y)
}
#if level 2, it doesn't have subunits either
if (dd$Level == "2") {
y = dd %>%
mutate(
L1_subunits = 0,
has_L1_subunits = FALSE,
L2_subunits = 0,
has_L2_subunits = FALSE,
L1_group = dd$L0_name,
L2_group = dd$L1_name,
)
return(y)
}
#if level 0
if (dd$Level == "0") {
#filter to see if any L1 and L2 units exist
dd_L1 = d %>% filter(L0_name == dd$L0_name, Level == "1")
dd_L2 = d %>% filter(L0_name == dd$L0_name, Level == "2")
y = dd %>% mutate(
L1_subunits = nrow(dd_L1),
has_L1_subunits = L1_subunits > 0,
L2_subunits = nrow(dd_L2),
has_L2_subunits = L2_subunits > 0,
L1_group = if_else(has_L1_subunits, true = dd$L0_name, false = "Others"),
L2_group = if_else(has_L2_subunits, true = dd$L0_name, false = "Others")
)
return(y)
}
#if level 1
if (dd$Level == "1") {
#filter to see if any L1 and L2 units exist
dd_L2 = d %>% filter(L0_name == dd$L0_name, L1_name == dd$L1_name, Level == "2")
y = dd %>% mutate(
L1_subunits = 0,
has_L1_subunits = F,
L2_subunits = nrow(dd_L2),
has_L2_subunits = L2_subunits > 0,
L1_group = dd$L0_name,
L2_group = NA
)
return(y)
}
#anything else is an error
browser()
})
#check results
d %>% filter(has_L1_subunits) %>% select(L0_name, L1_subunits)
d %>% filter(has_L2_subunits) %>% select(L0_name, L1_name, L2_subunits)
#flag units located in the Americas (i.e., exclude the European home countries)
d$Americans = !d$L0_name %in% c("France", "Denmark", "Netherlands", "United Kingdom")
#set the European units aside, they cause a lot of headaches
d_euros = d %>%
filter(
!Americans
)
#get rid of non-Americas units
d_orig = d
d %<>% filter(Americans)
#set subnational S to the HDI scale
#first compute the global HDI mean/SD, then the mean/SD for countries in our sample
#then group the data for countries with L1 units, compute the SD among those units, set their weighted mean to the country mean, and set the subnational/national S SD ratio equal to the subnational/national HDI SD ratio
global_hdi_stats = d_hdi %>%
select(-Region) %>%
filter(
!is.na(ISO3),
Year %in% c(2005, 2010, 2015, 2020)
) %>%
#wide format
pivot_wider(
names_from = Year,
values_from = HDI
) %>%
describe2()
#international S scale
#get international data
global_S_stats = d_NIQ_S$SDI %>% describe2()
American_S_stats = d %>%
filter(
S_1_source == "Kirkegaard_Seb_Jensen"
) %>%
select(S_1) %>%
describe2()
#aggregate and rescale the data for L1 units
d_L1_rescaled = rescale_S_to_global(d, L_level = "1")
## Argentina
## Bolivia
## Brazil
## Canada
## Chile
## Colombia
## Costa Rica
## Cuba
## Ecuador
## Guatemala
## Mexico
## Panama
## Peru
## USA
## Venezuela
d_L2_rescaled = rescale_S_to_global(d, L_level = "2")
## Chile
# d %>% filter(Level == "0", has_L1_subunits) %>% GG_scatter("HDI_2020", "S_1", case_names = "Region_1")
#join these values back to main dataset
d$S_global = NULL
d = full_join(
d,
bind_rows(
d_L1_rescaled,
d_L2_rescaled
) %>% select(ISO3, S_global),
by = "ISO3"
)
#restore the original row order
d %<>% arrange(row_i)
#combine S_1 and S_global
#use S_global when present, otherwise use S_1
d %<>% mutate(
S = ifelse(!is.na(S_global), S_global, S_1)
)
#normalize the ancestries to sum = 1
d %<>% mutate(across(ends_with("_fin"), ~ . / (Genetic_West_Eurasian_fin + Genetic_AMER_fin + Genetic_AFR_fin + Genetic_OTHER_fin)))
#confirm it is correct
d %>% select(Genetic_West_Eurasian_fin, Genetic_AMER_fin, Genetic_AFR_fin, Genetic_OTHER_fin) %>% rowSums() %>% table2()
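#worked example of the normalization with hypothetical proportions:
toy_anc = c(EUR = 0.50, AMR = 0.30, AFR = 0.10, OTR = 0.05) #sums to 0.95
sum(toy_anc / sum(toy_anc)) #exactly 1 after dividing by the total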
#rename ancestries for ease of use
d %<>% mutate(
EUR = Genetic_West_Eurasian_fin,
AMR = Genetic_AMER_fin,
AFR = Genetic_AFR_fin,
OTR = Genetic_OTHER_fin
)
#check that the sum is 1
d %>% select(matches("_fin$")) %>%
  rowSums() %>%
  subtract(1) %>%
  abs() %>% #absolute deviation, so sums below 1 also fail the check
  is_less_than(0.0001) %>%
  all(na.rm = T) %>%
  assert_that()
## [1] TRUE
#rename some variables for ease of use
# d %<>% mutate(
# CA = Summary_ACHQ,
# )
#code to export the recomputed S to the spreadsheet via clipboard, preserving the original row order
if (F) {
left_join(
d_orig %>% select(row_i, ISO3),
d %>% select(row_i, S)
) %>%
arrange(row_i) %>%
write_clipboard()
}
#fill in S for any units without it that have HDI
S_pred_fit = lm(S ~ HDI_latest, data = d %>% filter(Level == "0"))
S_pred_fit %>% summary()
##
## Call:
## lm(formula = S ~ HDI_latest, data = d %>% filter(Level == "0"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2893 -0.1258 0.0053 0.1020 0.4127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.627 0.278 -16.6 <2e-16 ***
## HDI_latest 6.233 0.364 17.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.165 on 32 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.901, Adjusted R-squared: 0.898
## F-statistic: 292 on 1 and 32 DF, p-value: <2e-16
d$S_pred = predict(S_pred_fit, newdata = d %>% select(HDI_latest))
d %>% filter(Level == "0") %>% GG_scatter("S_pred", "S", case_names = "L0_name")
## `geom_smooth()` using formula = 'y ~ x'
d %>% filter(Level == "0b") %>% select(lowest_name, S_pred, HDI_latest)
d %>% filter(Level == "0b") %>% GG_scatter("S_pred", "HDI_latest", case_names = "L0_name")
## `geom_smooth()` using formula = 'y ~ x'
d %>% filter(Level == "1") %>% GG_scatter("S_pred", "S", color = "L1_group", case_names = "L0_name")
## `geom_smooth()` using formula = 'y ~ x'
d %>% filter(Level == "1", L0_name == "Guatemala") %>% GG_scatter("S", "HDI_latest", case_names = "lowest_name")
## `geom_smooth()` using formula = 'y ~ x'
#fill in missing S values
d %<>% mutate(
S = ifelse(is.na(S), S_pred, S)
)
We must do this step after the above.
L0 and L0b units.
#countries we have subnational data for
nats_with_L1_ISO3 = d %>% filter(has_L1_subunits) %>% pull(ISO3)
nats_with_L1_ISO2 <- countrycode::countrycode(nats_with_L1_ISO3, origin = "iso3c", destination = "iso2c")
#spatial
sp_nat = ne_countries(type = "countries") %>%
mutate(
ISO3 = adm0_a3,
ISO2 = iso_a2
)
#drop data we don't need
sp_nat %<>%
filter(
ISO3 %in% d$ISO3
) %>%
select(name, ISO2, ISO3)
#repair invalid geometries
sp_nat %<>%
st_make_valid()
#a few countries are inexplicably missing ISO2 codes, so we need to fix them
#more are missing, but these are the important ones, though we don't need them for this study
#the remaining missing ones are Somaliland and Northern Cyprus
sp_nat$ISO2[sp_nat$ISO3 == "NOR"] = "NO"
sp_nat$ISO2[sp_nat$ISO3 == "FRA"] = "FR"
sp_nat$ISO2[sp_nat$ISO3 == "KOS"] = "XK"
# sp_extras = rnaturalearth::ne_download(scale = 10, type = "admin_0_scale_rank_minor_islands", returnclass = "sf", destdir = "data")
sp_extras = read_sf("data/ne_10m_admin_0_scale_rank_minor_islands/ne_10m_admin_0_scale_rank_minor_islands.shp") %>%
mutate(
ISO3 = sr_su_a3,
# ISO2 = countrycode::countrycode(sr_adm0_a3, origin = "iso3c", destination = "iso2c")
name = sr_subunit
) %>%
#drop data we don't need
select(
name, ISO3, sr_adm0_a3, sr_su_a3
)
#drop units without ISO codes
sp_extras %<>% filter(!is.na(ISO3))
#recode Antigua & Barbuda to ATG
sp_extras$ISO3[sp_extras$sr_adm0_a3 == "ATG"] = "ATG"
#and the US Virgin Islands to our special code
sp_extras$ISO3[sp_extras$sr_adm0_a3 == "VIR"] = "USA-VI"
#Puerto Rico as well
sp_extras$ISO3[sp_extras$sr_adm0_a3 == "PRI"] = "USA-PR"
#Netherlands Caribbean = Bonaire, Sint Eustatius and Saba
sp_extras$ISO3[sp_extras$name == "Caribbean Netherlands"] = "BES"
#are we missing anything that isn't L1 or L2?
missing_mini_countries = d %>%
filter(Level %in% c("0", "0b"), Americans) %>%
filter(!ISO3 %in% sp_nat$ISO3) %>% select(
L0_name, ISO3
) %>%
mutate(
#add ISO2 for easier joining later
adm0_match = ISO3 %in% sp_extras$sr_adm0_a3,
sr_su_match = ISO3 %in% sp_extras$sr_su_a3
)
#add these from the extras file
#but first we need to filter and merge them
sp_extras_missing = sp_extras %>%
filter(
#some use their own ISO codes, but others only have a sub-code, and we need to find them in the subunit ISO
ISO3 %in% missing_mini_countries$ISO3
) %>%
select(ISO3, name, geometry) %>%
group_by(ISO3) %>%
summarise(
name = first(na.omit(name)),
ISO3 = first(ISO3),
geometry = st_union(geometry)
) %>%
ungroup()
sp_extras_missing %>% tm_shape() + tm_polygons()
#do we have all of them?
nrow(sp_extras_missing)
## [1] 23
nrow(missing_mini_countries)
## [1] 23
#what's missing?
missing_mini_countries_check = missing_mini_countries %>%
filter(!ISO3 %in% c(sp_extras_missing$ISO3))
missing_mini_countries_check
#add them to main maps
sp_nat2 = bind_rows(
sp_nat,
sp_extras_missing
)
#validate and simplify geometries
sp_nat2 %<>%
#repair
st_make_valid() %>%
st_simplify(preserveTopology = TRUE, dTolerance = 0.01) %>%
#add centroids and lat/lon
mutate(centroid = st_centroid(geometry)) %>%
mutate(lon = st_coordinates(centroid)[,1], lat = st_coordinates(centroid)[,2])
#check all nations are there again
missing_mini_countries_check = d %>%
filter(Level %in% c("0", "0b")) %>%
filter(!ISO3 %in% sp_nat2$ISO3) %>%
select(
L0_name, ISO3
)
assert_that(nrow(missing_mini_countries_check) == 0,
msg = "Some countries are missing from the spatial data.")
## [1] TRUE
#no dups
assert_that(!sp_nat2$ISO3 %>% anyDuplicated())
## [1] TRUE
#test map by plotting names
sp_nat2 %>%
ggplot(aes(x = lon, y = lat, label = name)) +
geom_sf() +
geom_text_repel(size = 3,
max.overlaps = Inf,
min.segment.length = 0,
force = 1,
force_pull = 1,
point.padding = 0.5,
segment.color = "black",
segment.size = 0.5) +
theme_minimal()
GG_save("figs/maps/nat names.png")
#join data to spatial
sp_nat3 = left_join(
sp_nat2,
d %>% select(ISO3, CA, S, ends_with("_fin")),
by = c("ISO3" = "ISO3")
)
#which have missing data?
miss_covars = sp_nat3 %>%
filter(is.na(CA) | is.na(S)) %>%
select(ISO3, name, CA, S) %>%
arrange(ISO3)
#confirm with main file
d %>% select(L0_name, ISO3, CA, S, HDI_latest, ends_with("_fin")) %>% filter(ISO3 %in% miss_covars$ISO3)
L1 units.
#L1 units
sp_subdiv = read_sf("data/ne_10m_admin_1_states_provinces/ne_10m_admin_1_states_provinces.shp") %>%
mutate(ISO2 = iso_3166_2,
L0_ISO3 = adm0_a3,
L0_ISO2 = iso_a2,
ISO3 = L0_ISO3 + (ISO2 %>% str_remove("^[^-]+"))
)
sp_subdiv %>% select(name, ISO2, ISO3, L0_ISO2, L0_ISO3)
#there are some errors we need to fix
#first drop units without names
sp_subdiv %<>% filter(!is.na(name))
#then fix the issues
sp_subdiv$ISO3[sp_subdiv$name == "Bogota"] = "COL-DC"
sp_subdiv$ISO3[sp_subdiv$ISO3 == "MEX-DIF"] = "MEX-CMX"
#Panama data is out of date, so we download from another source
sp_panama <- gadm(country = "PAN", level = 1, path = "data") %>% st_as_sf()
#it has missing ISO codes, so we add them
sp_panama$ISO3 = c(
"PAN-1",
"PAN-4",
"PAN-2",
"PAN-3",
"PAN-5",
"PAN-EM",
"PAN-6",
"PAN-KY",
"PAN-7",
"PAN-NB",
"PAN-8",
"PAN-10",
"PAN-9"
)
#Puerto Rico and US Virgin Islands
#but first merge their geometries
sp_USA_PR_VI = sp_extras %>%
filter(ISO3 %in% c("USA-PR", "USA-VI")) %>%
select(ISO3, name, geometry) %>%
group_by(ISO3) %>%
summarise(
name = first(na.omit(name)),
ISO3 = first(ISO3),
geometry = st_union(geometry)
) %>%
ungroup()
#subnational divs
sp_subdiv2 <- bind_rows(
sp_nat3 %>%
#then remove those that have L1 units
filter(!ISO3 %in% nats_with_L1_ISO3) %>%
#only keep the vars we need
select(name, ISO3),
#add most subunits, but exclude Panama
sp_subdiv %>% filter(L0_ISO3 %in% nats_with_L1_ISO3) %>% filter(L0_ISO3 != "PAN"),
#and then Panama
sp_panama %>% rename(name = NAME_1) %>% select(name, ISO3),
#USA territories
sp_USA_PR_VI
)
sp_subdiv2$name[!st_is_valid(sp_subdiv2)]
## [1] "Goiás"
#make valid
sp_subdiv2 %<>%
st_make_valid()
#Lima Peru has 2 parts, we need to merge them
sp_subdiv2 = sp_subdiv2 %>%
group_by(ISO3) %>%
summarise(
name = first(na.omit(name)),
ISO3 = first(ISO3),
geometry = st_union(geometry)
) %>%
ungroup()
#check for duplicates
sp_subdiv2$ISO3 %>% anyDuplicated() %>% not() %>% assert_that()
## [1] TRUE
#make valid and add lon lat
sp_subdiv2 %<>%
#repair
st_make_valid() %>% # Repair invalid geometries
st_simplify(preserveTopology = TRUE, dTolerance = 0.01) %>% # Simplify to remove duplicates
#add centroids and lat/lon
mutate(centroid = st_centroid(geometry)) %>%
mutate(lon = st_coordinates(centroid)[,1], lat = st_coordinates(centroid)[,2])
#check for missing units
missing_subunits = d %>%
select(L0_name, L1_name, ISO2, ISO3, Level) %>%
mutate(
matches = ISO3 %in% sp_subdiv2$ISO3
) %>%
filter(
!ISO3 %in% nats_with_L1_ISO3,
!matches,
Level != "2"
)
#assert none
assert_that(nrow(missing_subunits) == 0)
## [1] TRUE
#test map by plotting names
sp_subdiv2 %>%
ggplot(aes(x = lon, y = lat, label = name)) +
geom_sf() +
geom_text(size = 1) +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal()
GG_save("figs/maps/subdiv names.png")
#join in the data
sp_subdiv3 = left_join(
sp_subdiv2,
d %>% select(ISO3, L0_ISO3, CA, S,
EUR, AMR, AFR, OTR),
by = "ISO3"
)
#which have missing data?
miss_covars = sp_subdiv3 %>%
filter(is.na(CA) | is.na(S)) %>%
select(ISO3, name, CA, S) %>%
arrange(ISO3)
miss_covars
library(lwgeom)
library(igraph)
#start from the sf polygons in sp_subdiv3
#turn off s2 for the topology operations
sf_use_s2(FALSE)
#reproject to a metric CRS (Web Mercator EPSG:3857 is fine for global work)
sp <- st_transform(sp_subdiv3, 3857)
#clean and snap to a small grid
sp <- sp |>
  st_make_valid() |>
  st_set_precision(1) |>
  st_snap_to_grid(size = 1)
#connection distance in meters; tune 300–1500 if needed
conn_dist <- 750
# adjacency allowing tiny gaps
nb_list <- st_is_within_distance(sp, sp, dist = conn_dist)
nb_list <- lapply(seq_along(nb_list), function(i) setdiff(nb_list[[i]], i)) # drop self
g <- graph_from_adj_list(nb_list, mode = "all") |> as_undirected("collapse")
sp$landmass <- components(g)$membership
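#to make the landmass grouping concrete, a minimal toy version of the same
#within-distance adjacency + graph-components trick (unit squares, planar coordinates):
toy_sq = function(x) st_polygon(list(rbind(c(x, 0), c(x + 1, 0), c(x + 1, 1), c(x, 1), c(x, 0))))
toy_geo = st_sfc(toy_sq(0), toy_sq(1.0005), toy_sq(5)) #two near-touching squares + one far away
toy_nb = st_is_within_distance(toy_geo, toy_geo, dist = 0.01)
toy_nb = lapply(seq_along(toy_nb), function(i) setdiff(toy_nb[[i]], i))
components(graph_from_adj_list(toy_nb, mode = "all"))$membership #1 1 2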
# union per landmass on original (unbuffered) sp
landmass_union <- sp |>
group_by(landmass) |>
summarise(geometry = st_union(geometry), .groups = "drop") |>
st_make_valid()
# keep holes (lakes are OK)
landmass_coast <- landmass_union |>
mutate(coast = st_boundary(geometry)) |>
select(landmass, coast)
# coastal proportion: share of each polygon's boundary lying on its landmass coastline
poly_boundary <- st_boundary(sp)
coast_by_poly <- landmass_coast$coast[ match(sp$landmass, landmass_coast$landmass) ]
len_intersection <- function(i) {
seg <- st_intersection(st_geometry(poly_boundary)[i], st_geometry(coast_by_poly)[i])
if (length(seg) == 0 || all(st_is_empty(seg))) return(0)
as.numeric(sum(st_length(seg)))
}
coastal_lengths <- vapply(seq_len(nrow(sp)), len_intersection, numeric(1))
total_lengths <- as.numeric(st_length(poly_boundary))
sp$coastal_proportion <- ifelse(total_lengths > 0, pmin(coastal_lengths/total_lengths, 1), NA_real_)
#back to lat lon for plotting; label points via st_point_on_surface (stays inside polygons)
sp_plot <- st_transform(sp, 4326)
sp_labels <- sp_plot %>%
  st_point_on_surface() %>%
  mutate(coords = st_coordinates(geometry),
         x = coords[,1], y = coords[,2])
#plot the coastal proportion map
sp_plot %>%
ggplot() +
geom_sf(aes(fill = coastal_proportion), color = "black", size = 0.2) +
# geom_text(data = sp_labels, aes(x = x, y = y, label = round(coastal_proportion, 2))) +
scale_fill_gradient(low = "white", high = "red", name = "Coastal proportion",
limits = c(0, 1), labels = scales::percent_format(accuracy = 1)) +
theme_minimal() +
labs(title = "Coastal Proportion by Unit") +
theme(legend.position = "right") +
  coord_sf(
    xlim = c(-170, -10), # longitudes (west to east)
    ylim = c(-60, 80), # latitudes (south to north; 90 is the maximum valid latitude)
    expand = FALSE # don't add extra margins
  )
GG_save("figs/maps/water access.png")
From the map data, we derived latitude and longitude, which we want for the analysis.
#compile the data we need
d_latlon = bind_rows(
sp_subdiv3 %>% select(ISO3, lat, lon) %>% st_drop_geometry(),
sp_nat3 %>% select(ISO3, lat, lon) %>% filter(!ISO3 %in% sp_subdiv3$ISO3) %>% st_drop_geometry()
)
d = left_join(
d,
d_latlon,
by = "ISO3"
)
#add spatial lags
d = spatial_knn(
d,
vars = c("S", "CA"),
k = 3
)
d %>% spatial_lag_cors()
## S CA
## 0.863 0.885
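#spatial_knn() is a kirkegaard helper; conceptually, a variable's lag is its mean over
#each unit's k nearest neighbors. A rough spdep equivalent (sketch, not the exact
#implementation):
# lw = knearneigh(cbind(d$lon, d$lat), k = 3) %>% knn2nb() %>% nb2listw(style = "W")
# S_lag_manual = lag.listw(lw, d$S, NAOK = TRUE)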
#join water %
# d = left_join(
# d,
# sp_plot %>% st_drop_geometry() %>% select(ISO3, coastal_proportion),
# by = "ISO3"
# )
#no dups
d$ISO3 %>% anyDuplicated()
## [1] 0
#Country maps
sp_nat3 %>%
ggplot(aes(fill = CA)) +
scale_fill_gradient(
low = "red",
high = "blue",
na.value = "grey80"
) +
geom_sf() +
geom_text_repel(aes(x = lon, y = lat, label = floor(CA)),
size = 3,
max.overlaps = Inf,
min.segment.length = 0,
force = 1,
force_pull = 1,
point.padding = 0.5,
segment.color = "black",
segment.size = 0.5) +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
GG_save("figs/maps/nat CA.png")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
#do the same for S
sp_nat3 %>%
ggplot(aes(fill = S)) +
scale_fill_gradient(
low = "red",
high = "blue",
na.value = "grey80"
) +
geom_sf() +
geom_text_repel(aes(x = lon, y = lat, label = round(S, 2)),
size = 3,
max.overlaps = Inf,
min.segment.length = 0,
force = 1,
force_pull = 1,
point.padding = 0.5,
segment.color = "black",
segment.size = 0.5) +
theme_minimal()
GG_save("figs/maps/nat S.png")
#Subnational maps
sp_subdiv3 %>%
#since Cuba and Venezuela have missing data for L1, remove them and use the L0
filter(!L0_ISO3 %in% c("CUB", "VEN")) %>%
bind_rows(
sp_nat3 %>%
filter(ISO3 %in% c("CUB", "VEN"))
) %>%
ggplot(aes(fill = CA)) +
scale_fill_gradient(
low = "red",
high = "blue",
na.value = "grey80"
) +
geom_sf() +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal()
GG_save("figs/maps/subdiv CA.png")
sp_subdiv3 %>%
ggplot(aes(fill = S)) +
scale_fill_gradient(
low = "red",
high = "blue",
na.value = "grey80"
) +
geom_sf() +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal()
GG_save("figs/maps/subdiv S.png")
#euro
sp_subdiv3 %>%
ggplot(aes(fill = EUR)) +
scale_fill_gradient(
low = "white",
high = "blue",
na.value = "grey80",
labels = scales::percent,
limits = c(0, 1)
) +
geom_sf() +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal() +
labs(
fill = "West Eurasian ancestry %"
)
GG_save("figs/maps/subdiv EUR.png")
#african
sp_subdiv3 %>%
ggplot(aes(fill = AFR)) +
scale_fill_gradient(
low = "white",
high = "black",
na.value = "grey80",
labels = scales::percent,
limits = c(0, 1)
) +
geom_sf() +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal() +
labs(
fill = "African ancestry %"
)
GG_save("figs/maps/subdiv AFR.png")
#amerindian
sp_subdiv3 %>%
ggplot(aes(fill = AMR)) +
scale_fill_gradient(
low = "white",
high = "red",
na.value = "grey80",
labels = scales::percent,
limits = c(0, 1)
) +
geom_sf() +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal() +
labs(
fill = "Amerindian ancestry %"
)
GG_save("figs/maps/subdiv AMR.png")
#other
sp_subdiv3 %>%
ggplot(aes(fill = OTR)) +
scale_fill_gradient(
low = "white",
high = "green",
na.value = "grey80",
labels = scales::percent,
limits = c(0, 1)
) +
geom_sf() +
coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) + # Adjusted range for Americas
theme_minimal() +
labs(
fill = "Other ancestry %"
)
GG_save("figs/maps/subdiv OTR.png")
d %>%
filter(
Level != "2",
!has_L1_subunits,
) %>%
GG_scatter("EUR", "S", case_names = "lowest_name", weights = "SQRT_POP", color = "L1_group") +
scale_x_continuous(
labels = scales::percent,
limits = c(0, 1)
) +
labs(
x = "West Eurasian ancestry %",
y = "Socioeconomic development",
color = "Country group"
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter subnat S~EUR.png")
## `geom_smooth()` using formula = 'y ~ x'
d %>%
filter(
Level != "2",
!has_L1_subunits,
) %>%
GG_scatter("EUR", "CA", case_names = "lowest_name", weights = "SQRT_POP", color = "L1_group") +
labs(
x = "West Eurasian ancestry %",
y = "Cognitive ability",
color = "Country group"
) +
scale_x_continuous(
labels = scales::percent,
limits = c(0, 1)
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter subnat CA~EUR.png")
## `geom_smooth()` using formula = 'y ~ x'
d %>%
filter(
Level != "2",
!has_L1_subunits,
) %>%
GG_scatter("CA", "S", case_names = "lowest_name", weights = "SQRT_POP", color = "L1_group") +
labs(
x = "Cognitive ability",
y = "Socioeconomic development",
color = "Country group"
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter subnat S~CA.png")
## `geom_smooth()` using formula = 'y ~ x'
#L1 units split by country
#EUR x S
d %>%
filter(!has_L1_subunits,
!is.na(EUR),
Level != "2",
) %>%
ggplot(aes(EUR, S, color = L1_group)) +
geom_point(aes(size = SQRT_POP), show.legend = FALSE) +
geom_smooth(method = "lm") +
facet_wrap(~L1_group, scales = "free") +
labs(
x = "West Eurasian ancestry %",
y = "Socioeconomic development",
color = "Country group"
) +
scale_x_continuous(
labels = scales::percent,
limits = c(0, 1)
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter by country S~EUR.png")
## `geom_smooth()` using formula = 'y ~ x'
#EUR x CA
d %>%
filter(!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
Level != "2",
) %>%
ggplot(aes(EUR, CA, color = L1_group)) +
geom_point(aes(size = SQRT_POP), show.legend = F) +
geom_smooth(method = "lm") +
facet_wrap(~L1_group, scales = "free") +
labs(
x = "West Eurasian ancestry %",
y = "Cognitive ability",
color = "Country group"
) +
scale_x_continuous(
labels = scales::percent,
limits = c(0, 1)
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter by country CA~EUR.png")
## `geom_smooth()` using formula = 'y ~ x'
#CA x S
d %>%
filter(!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
Level != "2",
) %>%
ggplot(aes(CA, S, color = L1_group)) +
geom_point(aes(size = SQRT_POP), show.legend = F) +
geom_smooth(method = "lm") +
facet_wrap(~L1_group, scales = "free") +
labs(
x = "Cognitive ability",
y = "Socioeconomic development",
color = "Country group"
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter by country S~CA.png")
## `geom_smooth()` using formula = 'y ~ x'
#prep data subset
d_ols_models_data = d %>%
filter(!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
Level != "2",
) %>%
spatial_knn(
vars = c("S", "CA"),
k = 3
)
d_ols_models_data %>% spatial_lag_cors()
## S CA
## 0.856 0.893
#fit models
#S
ols_models_S = list(
#no country effects
lm(S ~ CA, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ EUR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ CA + EUR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ AMR + AFR + OTR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
#fixed effects for countries
lm(S ~ CA + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ EUR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ CA + EUR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)
#summarize models
ols_models_S %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))
#CA
#fit models
ols_models_CA = list(
#no country effects
lm(CA ~ EUR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(CA ~ AMR + AFR + OTR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
#fixed effects for countries
lm(CA ~ EUR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(CA ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)
#summarize models
ols_models_CA %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))
#mediation
ols_models_S_mediation <- mediation::mediate(ols_models_CA[[1]], ols_models_S[[3]], treat = "EUR", mediator = "CA", sims = 1000)
summary(ols_models_S_mediation)
##
## Causal Mediation Analysis
##
## Quasi-Bayesian Confidence Intervals
##
## Estimate 95% CI Lower 95% CI Upper p-value
## ACME 1.527 1.333 1.728 <2e-16 ***
## ADE 0.658 0.504 0.802 <2e-16 ***
## Total Effect 2.185 1.974 2.398 <2e-16 ***
## Prop. Mediated 0.698 0.636 0.765 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Sample Size Used: 334
##
##
## Simulations: 1000
#add spatial lags
#S
ols_models_S_lag = list(
#no country effects
lm(S ~ CA + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ EUR + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ CA + EUR + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ AMR + AFR + OTR + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
#fixed effects for countries
lm(S ~ CA + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ EUR + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ CA + EUR + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(S ~ AMR + AFR + OTR + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)
#summarize models
ols_models_S_lag %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))
#CA
ols_models_CA_lag = list(
#no country effects
lm(CA ~ EUR + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(CA ~ AMR + AFR + OTR + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
#fixed effects for countries
lm(CA ~ EUR + L1_group + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
lm(CA ~ AMR + AFR + OTR + L1_group + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)
#summarize models
ols_models_CA_lag %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))
#mediation
ols_models_S_lag_mediation <- mediation::mediate(ols_models_CA_lag[[1]], ols_models_S_lag[[3]], treat = "EUR", mediator = "CA", sims = 1000)
summary(ols_models_S_lag_mediation)
##
## Causal Mediation Analysis
##
## Quasi-Bayesian Confidence Intervals
##
## Estimate 95% CI Lower 95% CI Upper p-value
## ACME 0.210 0.120 0.302 <2e-16 ***
## ADE 0.465 0.317 0.614 <2e-16 ***
## Total Effect 0.675 0.505 0.855 <2e-16 ***
## Prop. Mediated 0.312 0.192 0.441 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Sample Size Used: 334
##
##
## Simulations: 1000
#mediation by country group
set.seed(1)
ols_models_S_L1 = fit_mediation_by_group(
data = d_ols_models_data,
outcome = "S",
treatment = "EUR",
mediator = "CA",
group_var = "L1_group",
weight_var = "SQRT_POP"
)
#meta-analysis
rma(
yi = ols_models_S_L1$PropMediated,
sei = ols_models_S_L1$PropMediated_se
)
##
## Random-Effects Model (k = 14; tau^2 estimator: REML)
##
## tau^2 (estimated amount of total heterogeneity): 0.0801 (SE = 0.0542)
## tau (square root of estimated tau^2 value): 0.2830
## I^2 (total heterogeneity / total variability): 70.89%
## H^2 (total variability / sampling variability): 3.43
##
## Test for Heterogeneity:
## Q(df = 13) = 84.7379, p-val < .0001
##
## Model Results:
##
## estimate se zval pval ci.lb ci.ub
## 0.6271 0.1032 6.0787 <.0001 0.4249 0.8293 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#mediation by country group with lags
set.seed(1)
ols_models_S_lag_L1 = fit_mediation_by_group(
data = d_ols_models_data,
outcome = "S",
treatment = "EUR",
mediator = "CA",
group_var = "L1_group",
weight_var = "SQRT_POP",
covariates_m = "CA_lag",
covariates_y = "S_lag"
)
#meta-analysis
rma(
yi = ols_models_S_lag_L1$PropMediated,
sei = ols_models_S_lag_L1$PropMediated_se
)
##
## Random-Effects Model (k = 14; tau^2 estimator: REML)
##
## tau^2 (estimated amount of total heterogeneity): 0.0658 (SE = 0.0556)
## tau (square root of estimated tau^2 value): 0.2566
## I^2 (total heterogeneity / total variability): 52.47%
## H^2 (total variability / sampling variability): 2.10
##
## Test for Heterogeneity:
## Q(df = 13) = 36.3729, p-val = 0.0005
##
## Model Results:
##
## estimate se zval pval ci.lb ci.ub
## 0.5598 0.1041 5.3794 <.0001 0.3558 0.7637 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#get distance matrix for spatial models
d_ols_models_dists = st_as_sf(d_ols_models_data, coords = c("lon", "lat"), crs = 4326) %>% st_distance()
quantile(d_ols_models_dists, probs = seq(0, 1, 0.1))
## Units: [m]
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0 1102412 1831422 2429527 3020888 3586622 4196866 4912233
## 80% 90% 100%
## 5944567 7420748 15127223
#neighbors
d_ols_models_nbs = knearneigh(d_ols_models_data %>% select(lat, lon), k=4)
d_ols_models_nbs_listw = d_ols_models_nbs %>% knn2nb() %>% nb2listw(style = "W")
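#style = "W" row-standardizes the weights, so each unit's lag is the mean over its k neighbors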
#fit models
#S
sa_lag_models_S = list(
#no country effects
lagsarlm(S ~ CA, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(S ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(S ~ CA + EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(S ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
#fixed effects for countries
lagsarlm(S ~ CA + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(S ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(S ~ CA + EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(S ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw)
)
#summarize
sa_lag_models_S %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))
#impacts (direct and indirect effects)
sa_lag_models_S %>% map(\(x) impacts(x, listw = d_ols_models_nbs_listw))
## [[1]]
## Impact measures (lag, exact):
## Direct Indirect Total
## CA dy/dx 0.0491 0.0268 0.0759
##
## [[2]]
## Impact measures (lag, exact):
## Direct Indirect Total
## EUR dy/dx 1.03 1.34 2.37
##
## [[3]]
## Impact measures (lag, exact):
## Direct Indirect Total
## CA dy/dx 0.0433 0.0187 0.062
## EUR dy/dx 0.4190 0.1807 0.600
##
## [[4]]
## Impact measures (lag, exact):
## Direct Indirect Total
## AMR dy/dx -1.1814 -1.2345 -2.4159
## AFR dy/dx -0.9169 -0.9581 -1.8749
## OTR dy/dx -0.0464 -0.0485 -0.0949
##
## [[5]]
## Impact measures (lag, exact):
## Direct Indirect Total
## CA dy/dx 0.0549 -0.000560 0.0544
## L1_groupBolivia dy/dx -0.7731 0.007879 -0.7652
## L1_groupBrazil dy/dx -0.1683 0.001715 -0.1665
## L1_groupCanada dy/dx 0.1580 -0.001610 0.1563
## L1_groupChile dy/dx 0.0918 -0.000936 0.0909
## L1_groupColombia dy/dx -0.3795 0.003868 -0.3756
## L1_groupCosta Rica dy/dx -0.1314 0.001339 -0.1301
## L1_groupEcuador dy/dx -0.5314 0.005416 -0.5260
## L1_groupGuatemala dy/dx -0.7640 0.007786 -0.7562
## L1_groupMexico dy/dx -0.3569 0.003637 -0.3532
## L1_groupOthers dy/dx -0.3085 0.003145 -0.3054
## L1_groupPanama dy/dx 0.0242 -0.000247 0.0240
## L1_groupPeru dy/dx -0.3632 0.003702 -0.3595
## L1_groupUSA dy/dx 0.0414 -0.000422 0.0410
##
## [[6]]
## Impact measures (lag, exact):
## Direct Indirect Total
## EUR dy/dx 0.9808 0.23689 1.2176
## L1_groupBolivia dy/dx -0.5473 -0.13220 -0.6795
## L1_groupBrazil dy/dx -0.2818 -0.06807 -0.3499
## L1_groupCanada dy/dx 0.5713 0.13798 0.7092
## L1_groupChile dy/dx 0.2595 0.06267 0.3221
## L1_groupColombia dy/dx -0.2412 -0.05825 -0.2994
## L1_groupCosta Rica dy/dx 0.0190 0.00459 0.0236
## L1_groupEcuador dy/dx -0.2228 -0.05382 -0.2766
## L1_groupGuatemala dy/dx -0.4766 -0.11511 -0.5917
## L1_groupMexico dy/dx 0.0828 0.02000 0.1028
## L1_groupOthers dy/dx 0.1153 0.02784 0.1431
## L1_groupPanama dy/dx 0.0332 0.00802 0.0412
## L1_groupPeru dy/dx -0.0509 -0.01230 -0.0632
## L1_groupUSA dy/dx 0.5407 0.13060 0.6713
##
## [[7]]
## Impact measures (lag, exact):
## Direct Indirect Total
## CA dy/dx 0.0527 -0.000910 0.0518
## EUR dy/dx 0.1502 -0.002595 0.1476
## L1_groupBolivia dy/dx -0.7389 0.012767 -0.7262
## L1_groupBrazil dy/dx -0.1821 0.003147 -0.1790
## L1_groupCanada dy/dx 0.1755 -0.003032 0.1725
## L1_groupChile dy/dx 0.1087 -0.001878 0.1068
## L1_groupColombia dy/dx -0.3602 0.006223 -0.3540
## L1_groupCosta Rica dy/dx -0.1233 0.002130 -0.1211
## L1_groupEcuador dy/dx -0.4950 0.008552 -0.4865
## L1_groupGuatemala dy/dx -0.7277 0.012572 -0.7151
## L1_groupMexico dy/dx -0.3173 0.005482 -0.3118
## L1_groupOthers dy/dx -0.2649 0.004577 -0.2603
## L1_groupPanama dy/dx 0.0607 -0.001049 0.0597
## L1_groupPeru dy/dx -0.3103 0.005362 -0.3050
## L1_groupUSA dy/dx 0.0549 -0.000949 0.0540
##
## [[8]]
## Impact measures (lag, exact):
## Direct Indirect Total
## AMR dy/dx -1.47346 -0.275338 -1.74880
## AFR dy/dx -0.75553 -0.141182 -0.89671
## OTR dy/dx -0.59698 -0.111554 -0.70853
## L1_groupBolivia dy/dx -0.38566 -0.072067 -0.45773
## L1_groupBrazil dy/dx -0.41190 -0.076970 -0.48887
## L1_groupCanada dy/dx 0.47689 0.089114 0.56600
## L1_groupChile dy/dx 0.33275 0.062179 0.39493
## L1_groupColombia dy/dx -0.22818 -0.042638 -0.27082
## L1_groupCosta Rica dy/dx 0.00473 0.000885 0.00562
## L1_groupEcuador dy/dx -0.10085 -0.018846 -0.11970
## L1_groupGuatemala dy/dx -0.31179 -0.058262 -0.37005
## L1_groupMexico dy/dx 0.19831 0.037058 0.23537
## L1_groupOthers dy/dx -0.12112 -0.022634 -0.14376
## L1_groupPanama dy/dx 0.12139 0.022684 0.14408
## L1_groupPeru dy/dx 0.13992 0.026146 0.16606
## L1_groupUSA dy/dx 0.40895 0.076418 0.48537
#CA
sa_lag_models_CA = list(
#no country effects
lagsarlm(CA ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(CA ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
#fixed effects for countries
lagsarlm(CA ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
lagsarlm(CA ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw)
)
sa_lag_models_CA %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))
#spatial error model
#S
sa_error_models_S = list(
#no country effects
errorsarlm(S ~ CA, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(S ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(S ~ CA + EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(S ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
#fixed effects for countries
errorsarlm(S ~ CA + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(S ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(S ~ CA + EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(S ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP)
)
sa_error_models_S %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))
#CA
sa_error_models_CA = list(
#no country effects
errorsarlm(CA ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(CA ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
#fixed effects for countries
errorsarlm(CA ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
errorsarlm(CA ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP)
)
sa_error_models_CA %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))
#inspect the modeling data; z-scaled variables are needed for the random effects models below
d_ols_models_data
#European ancestry as the only predictor
lmer_S_eur_ri = lmer(
S ~ EUR + (1 | L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
lmer_S_eur_ri %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ EUR + (1 | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 222
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.847 -0.421 0.014 0.496 2.880
##
## Random effects:
## Groups Name Variance Std.Dev.
## L1_group (Intercept) 0.168 0.41
## Residual 94.286 9.71
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.216 0.123 -1.75
## EUR 1.112 0.121 9.17
##
## Correlation of Fixed Effects:
## (Intr)
## EUR -0.431
check_outliers(lmer_S_eur_ri)
## 2 outliers detected: cases 184, 185.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).
lmer_S_eur_ri %>% plot()
lmer_S_eur_full = lmer(
S ~ EUR + (1 + EUR | L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
lmer_S_eur_full %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ EUR + (1 + EUR | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 220
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.973 -0.424 -0.004 0.477 2.906
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 0.277 0.527
## EUR 0.154 0.392 -0.73
## Residual 91.980 9.591
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.250 0.156 -1.60
## EUR 1.258 0.179 7.05
##
## Correlation of Fixed Effects:
## (Intr)
## EUR -0.707
lmer_S_eur_full %>% standardize_parameters()
check_outliers(lmer_S_eur_full)
## 2 outliers detected: cases 184, 185.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).
lmer_S_eur_full %>% plot()
#model test, do we need random slopes?
anova(lmer_S_eur_ri, lmer_S_eur_full)
## refitting model(s) with ML (instead of REML)
ggpredict(
lmer_S_eur_full,
terms = c("EUR", "L1_group"),
type = "random"
) %>%
plot(
colors = large_palette
)
lmer_IQ_eur_ri = lmer(
CA ~ EUR + (1 | L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
lmer_IQ_eur_ri %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ EUR + (1 | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 1901
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.895 -0.482 0.019 0.593 3.999
##
## Random effects:
## Groups Name Variance Std.Dev.
## L1_group (Intercept) 25 5
## Residual 14815 122
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 78.31 1.51 51.72
## EUR 13.80 1.52 9.09
##
## Correlation of Fixed Effects:
## (Intr)
## EUR -0.440
lmer_IQ_eur_full = lmer(
CA ~ EUR + (1 + EUR| L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
lmer_IQ_eur_full %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ EUR + (1 + EUR | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 1883
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.160 -0.438 -0.011 0.581 3.535
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 48.3 6.95
## EUR 94.4 9.72 -0.71
## Residual 13446.3 115.96
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 76.53 2.11 36.35
## EUR 18.52 3.47 5.34
##
## Correlation of Fixed Effects:
## (Intr)
## EUR -0.751
lmer_IQ_eur_full %>% standardize_parameters()
check_outliers(lmer_IQ_eur_full)
## 3 outliers detected: cases 134, 184, 264.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).
#model test, do we need random slopes?
anova(lmer_IQ_eur_ri, lmer_IQ_eur_full)
## refitting model(s) with ML (instead of REML)
ggpredict(
lmer_IQ_eur_full,
terms = c("EUR", "L1_group"),
type = "random"
) %>%
plot(
colors = large_palette
)
#S ~ CA
lmer_S_CA_ri = lmer(
S ~ CA + (1 | L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
lmer_S_CA_ri %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA + (1 | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 25.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.197 -0.441 0.105 0.520 3.788
##
## Random effects:
## Groups Name Variance Std.Dev.
## L1_group (Intercept) 0.07 0.265
## Residual 51.56 7.180
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -4.83849 0.25329 -19.1
## CA 0.06057 0.00288 21.1
##
## Correlation of Fixed Effects:
## (Intr)
## CA -0.958
check_outliers(lmer_S_CA_ri)
## 3 outliers detected: cases 134, 184, 248.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).
lmer_S_CA_ri %>% plot()
lmer_S_CA_full = lmer(
S ~ CA + (1 + CA | L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 9.42875 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
lmer_S_CA_full %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA + (1 + CA | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 20
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.037 -0.431 0.071 0.498 4.181
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 5.43e-01 0.73669
## CA 7.19e-05 0.00848 -0.93
## Residual 4.98e+01 7.05472
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -4.8322 0.3399 -14.2
## CA 0.0606 0.0040 15.2
##
## Correlation of Fixed Effects:
## (Intr)
## CA -0.976
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 9.42875 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
lmer_S_CA_full %>% standardize_parameters()
check_outliers(lmer_S_CA_full)
## 4 outliers detected: cases 134, 184, 248, 316.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).
lmer_S_CA_full %>% plot()
#model test: do we need random slopes? (LRT with 2 extra parameters, as above)
anova(lmer_S_CA_ri, lmer_S_CA_full)
## refitting model(s) with ML (instead of REML)
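#the refit below follows the "Rescale variables?" hint by z-scoring CA.
#CA_z is constructed earlier in the document; a minimal sketch of such a
#transform (the exact construction used is an assumption) would be:
CA_z_sketch = (d_ols_models_data$CA - mean(d_ols_models_data$CA, na.rm = TRUE)) /
  sd(d_ols_models_data$CA, na.rm = TRUE)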
#z scored
lmer_S_CA_full_z = lmer(
S ~ CA_z + (1 + CA_z | L1_group),
data = d_ols_models_data,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
## boundary (singular) fit: see help('isSingular')
lmer_S_CA_full_z %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA_z + (1 + CA_z | L1_group)
## Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 37.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.236 -0.505 0.062 0.576 4.501
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 0.0000 0.000
## CA_z 0.0459 0.214 NaN
## Residual 55.1268 7.425
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.3800 0.0201 68.7
## CA_z 1.0227 0.0629 16.2
##
## Correlation of Fixed Effects:
## (Intr)
## CA_z 0.273
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
#non-European ancestry components (AMR, AFR, OTR; EUR omitted as the reference)
lmer_S_all = lmer(
S ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group),
data = d %>% filter(
!has_L1_subunits,
!is.na(AMR),
!Level == "2"
),
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
## boundary (singular) fit: see help('isSingular')
lmer_S_all %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group)
## Data: d %>% filter(!has_L1_subunits, !is.na(AMR), !Level == "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 191
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.071 -0.453 0.008 0.476 3.379
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 0.248 0.498
## AMR 0.436 0.660 -0.72
## AFR 0.103 0.322 0.37 0.38
## OTR 2.588 1.609 0.85 -0.98 -0.16
## Residual 75.622 8.696
## Number of obs: 375, groups: L1_group, 16
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.997 0.148 6.76
## AMR -1.617 0.232 -6.96
## AFR -1.138 0.165 -6.89
## OTR -1.206 0.668 -1.80
##
## Correlation of Fixed Effects:
## (Intr) AMR AFR
## AMR -0.774
## AFR -0.062 0.315
## OTR 0.544 -0.548 -0.096
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmer_S_all %>% standardize_parameters()
## boundary (singular) fit: see help('isSingular')
check_outliers(lmer_S_all)
## 2 outliers detected: cases 175, 200.
## - Based on the following method and threshold: cook (0.9).
## - For variable: (Whole model).
lmer_IQ_all = lmer(
CA ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group),
data = d %>% filter(
!has_L1_subunits,
!is.na(AMR),
!Level == "2"
),
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
## boundary (singular) fit: see help('isSingular')
lmer_IQ_all %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group)
## Data: d %>% filter(!has_L1_subunits, !is.na(AMR), !Level == "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 1837
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.526 -0.478 0.022 0.539 3.573
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 34.8 5.9
## AMR 120.1 11.0 -0.87
## AFR 166.2 12.9 -0.09 -0.37
## OTR 784.5 28.0 0.05 -0.50 0.99
## Residual 12192.6 110.4
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 95.53 2.00 47.79
## AMR -21.38 3.58 -5.98
## AFR -27.17 4.72 -5.75
## OTR -30.95 10.17 -3.04
##
## Correlation of Fixed Effects:
## (Intr) AMR AFR
## AMR -0.882
## AFR -0.270 -0.087
## OTR -0.102 -0.237 0.872
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmer_IQ_all %>% standardize_parameters()
## boundary (singular) fit: see help('isSingular')
check_outliers(lmer_IQ_all)
## 2 outliers detected: cases 134, 184.
## - Based on the following method and threshold: cook (0.9).
## - For variable: (Whole model).
#S ~ EUR + CA
lmer_S_EUR = lmer(
S ~ EUR + (1 + EUR | L1_group),
data = d %>% filter(
!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
!Level == "2"
),
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
lmer_S_EUR %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ EUR + (1 + EUR | L1_group)
## Data: d %>% filter(!has_L1_subunits, !is.na(EUR), !is.na(CA), !Level ==
## "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 220
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.973 -0.424 -0.004 0.477 2.906
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 0.277 0.527
## EUR 0.154 0.392 -0.73
## Residual 91.980 9.591
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.250 0.156 -1.60
## EUR 1.258 0.179 7.05
##
## Correlation of Fixed Effects:
## (Intr)
## EUR -0.707
lmer_S_EUR %>% standardize_parameters()
#same specification as lmer_S_CA_full_z above; the subsets coincide here, so
#the estimates are identical (refit for side-by-side comparison with the EUR models)
lmer_S_CA = lmer(
S ~ CA_z + (1 + CA_z | L1_group),
data = d %>% filter(
!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
!Level == "2"
),
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
## boundary (singular) fit: see help('isSingular')
lmer_S_CA %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA_z + (1 + CA_z | L1_group)
## Data: d %>% filter(!has_L1_subunits, !is.na(EUR), !is.na(CA), !Level ==
## "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 37.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.236 -0.505 0.062 0.576 4.501
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 0.0000 0.000
## CA_z 0.0459 0.214 NaN
## Residual 55.1268 7.425
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.3800 0.0201 68.7
## CA_z 1.0227 0.0629 16.2
##
## Correlation of Fixed Effects:
## (Intr)
## CA_z 0.273
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmer_S_CA %>% standardize_parameters()
lmer_S_CA_EUR = lmer(
S ~ CA_z + EUR + (1 + CA_z + EUR | L1_group),
data = d %>% filter(
!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
!Level == "2"
),
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)
## boundary (singular) fit: see help('isSingular')
lmer_S_CA_EUR %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA_z + EUR + (1 + CA_z + EUR | L1_group)
## Data: d %>% filter(!has_L1_subunits, !is.na(EUR), !is.na(CA), !Level ==
## "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: -13.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.764 -0.445 0.012 0.523 3.722
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 0.409 0.640
## CA_z 0.102 0.319 0.82
## EUR 0.168 0.410 -0.89 -0.46
## Residual 44.179 6.647
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.007 0.199 5.06
## CA_z 0.843 0.106 7.96
## EUR 0.415 0.147 2.82
##
## Correlation of Fixed Effects:
## (Intr) CA_z
## CA_z 0.836
## EUR -0.834 -0.461
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmer_S_CA_EUR %>% standardize_parameters()
## boundary (singular) fit: see help('isSingular')
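#difference-in-coefficients sketch (an interpretive aside, not in the
#original): EUR's total effect on S is ~1.26 (lmer_S_EUR) but only ~0.42
#once CA_z enters the model, so cognitive ability statistically accounts
#for roughly two thirds of the EUR-S association
1 - fixef(lmer_S_CA_EUR)["EUR"] / fixef(lmer_S_EUR)["EUR"]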
#subset to complete cases on all variables we need
d_complex = d %>%
select(
CA, S,
EUR, AMR, AFR, OTR,
lat, lon,
IS_LATIN, IS_NONSOV,
L1_group, has_L1_subunits, Level, SQRT_POP
) %>% na.omit() %>%
filter(
!has_L1_subunits,
!Level == "2"
) %>%
spatial_knn(
vars = c("S", "CA"),
k = 3
)
d_complex %>% spatial_lag_cors()
## CA S
## 0.893 0.856
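#spatial_knn() attaches kNN-based spatial lags (S_lag, CA_lag); a hedged
#sketch of the presumed construction using spdep (loaded above): k matches
#the call, the row-standardized weighting style is an assumption
knn_nb = spdep::knn2nb(spdep::knearneigh(cbind(d_complex$lon, d_complex$lat), k = 3, longlat = TRUE))
knn_lw = spdep::nb2listw(knn_nb, style = "W") #row-standardized weights
S_lag_sketch = spdep::lag.listw(knn_lw, d_complex$S) #mean S among the 3 nearest neighbours
cor(S_lag_sketch, d_complex$S_lag) #should be ~1 if the sketch matches the helper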
#full mixed model with geographic, political, and ancestry predictors
lmer_IQ_complex = lmer(
CA ~ abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR + OTR + (1 + abs(lat) + lon + AMR + AFR + OTR | L1_group),
data = d_complex,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)
## boundary (singular) fit: see help('isSingular')
lmer_IQ_complex %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR + OTR +
## (1 + abs(lat) + lon + AMR + AFR + OTR | L1_group)
## Data: d_complex
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+06))
##
## REML criterion at convergence: 1703
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.214 -0.582 -0.084 0.549 4.358
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 4.67e+01 6.8340
## abs(lat) 3.00e-02 0.1732 -0.75
## lon 4.15e-03 0.0644 0.21 0.49
## AMR 8.80e+01 9.3832 -0.93 0.94 0.17
## AFR 1.30e+02 11.4030 -0.44 0.93 0.79 0.74
## OTR 8.94e+02 29.8974 0.10 -0.11 -0.02 -0.11 -0.08
## Residual 7.95e+03 89.1782
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 92.1727 2.9301 31.46
## abs(lat) 0.0391 0.0545 0.72
## lon -0.0613 0.0263 -2.33
## IS_LATIN -10.6696 0.8483 -12.58
## IS_NONSOV 3.3027 0.7605 4.34
## AMR -13.6502 2.9591 -4.61
## AFR -20.6992 3.9296 -5.27
## OTR -8.2429 15.0550 -0.55
##
## Correlation of Fixed Effects:
## (Intr) abs(l) lon IS_LAT IS_NON AMR AFR
## abs(lat) -0.635
## lon 0.496 0.287
## IS_LATIN -0.495 0.219 -0.263
## IS_NONSOV -0.331 0.059 -0.028 0.334
## AMR -0.539 0.803 0.191 -0.180 -0.039
## AFR -0.370 0.817 0.504 0.059 0.090 0.627
## OTR 0.028 -0.033 0.027 0.058 0.020 -0.079 0.003
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmer_IQ_complex %>% standardize_parameters()
## boundary (singular) fit: see help('isSingular')
#ditto but without the country-group dummies (IS_LATIN, IS_NONSOV)
lmer_IQ_complex_noOtherNatGroup = lmer(
CA ~ abs(lat) + lon + AMR + AFR + OTR + (1 + abs(lat) + lon + AMR + AFR + OTR | L1_group),
data = d_complex,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)
## boundary (singular) fit: see help('isSingular')
lmer_IQ_complex_noOtherNatGroup %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ abs(lat) + lon + AMR + AFR + OTR + (1 + abs(lat) + lon +
## AMR + AFR + OTR | L1_group)
## Data: d_complex
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+06))
##
## REML criterion at convergence: 1801
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.693 -0.492 -0.052 0.576 3.727
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_group (Intercept) 2.35e+01 4.8453
## abs(lat) 1.11e-02 0.1054 0.20
## lon 5.49e-03 0.0741 0.09 0.99
## AMR 1.18e+02 10.8724 -0.95 0.12 0.22
## AFR 1.68e+02 12.9621 0.18 1.00 1.00 0.13
## OTR 2.82e+02 16.8030 0.19 1.00 1.00 0.12 1.00
## Residual 1.08e+04 103.7523
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 84.5923 2.0670 40.93
## abs(lat) 0.1192 0.0391 3.05
## lon -0.0680 0.0281 -2.42
## AMR -16.3517 3.5491 -4.61
## AFR -21.1920 4.5971 -4.61
## OTR -12.2737 6.3885 -1.92
##
## Correlation of Fixed Effects:
## (Intr) abs(l) lon AMR AFR
## abs(lat) -0.141
## lon 0.325 0.668
## AMR -0.709 0.280 0.273
## AFR -0.056 0.806 0.749 0.282
## OTR 0.088 0.717 0.765 0.221 0.813
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmer_IQ_complex_noOtherNatGroup %>% standardize_parameters()
## boundary (singular) fit: see help('isSingular')
#same predictors for S, additionally controlling for CA
lmer_S_complex = lmer(
S ~ CA + abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR + OTR + (1 | L1_group),
data = d_complex,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)
lmer_S_complex %>% summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA + abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR +
## OTR + (1 | L1_group)
## Data: d_complex
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+06))
##
## REML criterion at convergence: 22.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.006 -0.453 0.068 0.508 3.642
##
## Random effects:
## Groups Name Variance Std.Dev.
## L1_group (Intercept) 0.0479 0.219
## Residual 47.5940 6.899
## Number of obs: 334, groups: L1_group, 14
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -3.441173 0.417858 -8.24
## CA 0.046117 0.003869 11.92
## abs(lat) 0.001871 0.002072 0.90
## lon 0.001178 0.000991 1.19
## IS_LATIN -0.150208 0.095457 -1.57
## IS_NONSOV 0.318913 0.114546 2.78
## AMR -0.566102 0.133288 -4.25
## AFR -0.517106 0.128417 -4.03
## OTR -0.025614 0.243400 -0.11
##
## Correlation of Fixed Effects:
## (Intr) CA abs(l) lon IS_LAT IS_NON AMR AFR
## CA -0.880
## abs(lat) 0.008 -0.218
## lon 0.166 -0.019 0.159
## IS_LATIN -0.638 0.431 0.099 0.037
## IS_NONSOV -0.239 -0.107 0.069 -0.026 0.320
## AMR -0.420 0.334 0.316 0.118 0.052 -0.067
## AFR -0.578 0.452 0.260 -0.076 0.395 0.003 0.527
## OTR -0.253 0.120 0.271 0.249 0.452 0.162 0.208 0.249
performance::r2_nakagawa(lmer_S_complex)
## # R2 for Mixed Models
##
## Conditional R2: 0.008
## Marginal R2: 0.007
model_performance(lmer_S_complex, metrics = c("RMSE", "R2", "AIC"))
#null (intercept-only) model for comparison
m_null = lmer(
S ~ 1 + (1 | L1_group),
data = d_complex,
weights = SQRT_POP,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)
AIC(m_null, lmer_S_complex)
anova(m_null, lmer_S_complex)
## refitting model(s) with ML (instead of REML)
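#companion check (not shown in the original output): the null model's ICC,
#i.e. the share of S variance sitting at the L1_group level
performance::icc(m_null)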
#fixed-effects (within-L1_group) models via fixest
fe_S_complex = feols(
S ~ CA + abs(lat) + lon + AMR + AFR + OTR | L1_group,
data = d_complex,
weights = d_complex$SQRT_POP
)
fe_S_complex %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## CA 0.05199 0.00348 14.958 < 2.2e-16 ***
## abs(lat) 0.00146 0.00223 0.657 0.5119338
## lon 0.00145 0.00103 1.407 0.1603214
## AMR -0.47661 0.14321 -3.328 0.0009782 ***
## AFR -0.37438 0.12017 -3.115 0.0020062 **
## OTR 0.13450 0.22495 0.598 0.5503342
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.187356 Adj. R2: 0.919762
## Within R2: 0.57892
fe_CA_complex = feols(
CA ~ abs(lat) + lon + AMR + AFR + OTR | L1_group,
data = d_complex,
weights = d_complex$SQRT_POP
)
fe_CA_complex %>% summary()
## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## abs(lat) 0.1745 0.0348 5.02 8.7883e-07 ***
## lon 0.0247 0.0167 1.48 1.3925e-01
## AMR -12.8920 2.2049 -5.85 1.2509e-08 ***
## AFR -10.8765 1.8491 -5.88 1.0344e-08 ***
## OTR 5.8129 3.6318 1.60 1.1047e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 3.04194 Adj. R2: 0.872724
## Within R2: 0.293103
#with the kNN spatial lags of the outcomes as additional controls
fe_S_complex_lag = feols(
S ~ CA + abs(lat) + lon + AMR + AFR + OTR + S_lag | L1_group,
data = d_complex,
weights = d_complex$SQRT_POP
)
fe_S_complex_lag %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## CA 0.05242 0.00358 14.653 < 2.2e-16 ***
## abs(lat) 0.00211 0.00256 0.824 0.41050900
## lon 0.00156 0.00105 1.481 0.13959533
## AMR -0.48971 0.14561 -3.363 0.00086595 ***
## AFR -0.37338 0.12033 -3.103 0.00209039 **
## OTR 0.15803 0.22979 0.688 0.49213156
## S_lag -0.03147 0.06101 -0.516 0.60635409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.187276 Adj. R2: 0.919574
## Within R2: 0.579277
fe_CA_complex_lag = feols(
CA ~ abs(lat) + lon + AMR + AFR + OTR + CA_lag | L1_group,
data = d_complex,
weights = d_complex$SQRT_POP
)
fe_CA_complex_lag %>% summary()
## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## abs(lat) 0.0427 0.0361 1.183 2.3776e-01
## lon 0.0146 0.0153 0.952 3.4159e-01
## AMR -9.2150 2.0755 -4.440 1.2484e-05 ***
## AFR -9.5866 1.7032 -5.629 4.0281e-08 ***
## OTR 3.4273 3.3434 1.025 3.0610e-01
## CA_lag 0.5141 0.0659 7.799 9.3016e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 2.78421 Adj. R2: 0.893037
## Within R2: 0.407811
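#hedged follow-up: the strong CA_lag term (0.51) suggests remaining spatial
#structure; Moran's I on the FE-model residuals is one direct test, using
#kNN weights built as in the sketch above
fe_lw = spdep::nb2listw(spdep::knn2nb(spdep::knearneigh(cbind(d_complex$lon, d_complex$lat), k = 3, longlat = TRUE)), style = "W")
spdep::moran.test(resid(fe_CA_complex), fe_lw)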
#complete-case subset (no missing EUR or CA)
d_nomiss = d %>% filter(
!has_L1_subunits,
!is.na(EUR),
!is.na(CA),
!Level == "2"
)
feols_CA_eur = feols(
CA ~ EUR | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_CA_eur %>% summary()
## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## EUR 13.1 1.56 8.44 1.1276e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 3.2712 Adj. R2: 0.854661
## Within R2: 0.182533
feols_S_eur = feols(
S ~ EUR | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_S_eur %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## EUR 1.05 0.124 8.47 9.4105e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.260902 Adj. R2: 0.846843
## Within R2: 0.183447
feols_S_CA = feols(
S ~ CA | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_S_CA %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## CA 0.0594 0.00298 19.9 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.19289 Adj. R2: 0.916285
## Within R2: 0.553677
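#the within-estimator above is equivalent to weighted OLS on group-demeaned
#variables (Frisch-Waugh with weights); a minimal check of the CA
#coefficient (~0.0594), using only dplyr and stats
d_dm = d_nomiss %>%
  group_by(L1_group) %>%
  mutate(
    S_dm = S - weighted.mean(S, SQRT_POP),
    CA_dm = CA - weighted.mean(CA, SQRT_POP)
  ) %>%
  ungroup()
lm(S_dm ~ 0 + CA_dm, data = d_dm, weights = SQRT_POP) %>% coef()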
feols_S_CA_z = feols(
S ~ CA_z | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_S_CA_z %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## CA_z 0.891 0.0448 19.9 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.19289 Adj. R2: 0.916285
## Within R2: 0.553677
feols_S_CA_EUR = feols(
S ~ CA_z + EUR | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_S_CA_EUR %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## CA_z 0.822 0.0488 16.85 < 2.2e-16 ***
## EUR 0.331 0.0999 3.32 0.0010217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.189641 Adj. R2: 0.918827
## Within R2: 0.568587
#all non-EUR ancestry components (EUR is the omitted reference)
feols_CA_all = feols(
CA ~ AMR + AFR + OTR | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_CA_all %>% summary()
## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## AMR -17.620 2.07 -8.5051 7.2794e-16 ***
## AFR -14.028 1.76 -7.9543 3.2134e-14 ***
## OTR 0.217 3.52 0.0616 9.5090e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 3.1638 Adj. R2: 0.86319
## Within R2: 0.235328
feols_S_all = feols(
S ~ AMR + AFR + OTR | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_S_all %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## AMR -1.4441 0.165 -8.770 < 2.2e-16 ***
## AFR -1.1093 0.140 -7.913 4.2398e-14 ***
## OTR 0.0382 0.280 0.136 8.9172e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.251483 Adj. R2: 0.856804
## Within R2: 0.241342
feols_S_CA_all = feols(
S ~ CA + AMR + AFR + OTR | L1_group,
data = d_nomiss,
weights = d_nomiss$SQRT_POP
)
feols_S_CA_all %>% summary()
## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## CA 0.0528 0.00334 15.793 < 2.2e-16 ***
## AMR -0.5139 0.13665 -3.761 0.00020177 ***
## AFR -0.3687 0.11496 -3.207 0.00147870 **
## OTR 0.0267 0.20976 0.127 0.89879521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.188002 Adj. R2: 0.919719
## Within R2: 0.576012
#session and package versions for reproducibility
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] spdep_1.4-1 spatialreg_1.4-2
## [3] spData_2.3.4 geodata_0.6-6
## [5] terra_1.8-70 tmap_4.2
## [7] rnaturalearthhires_1.0.0.9000 countrycode_1.6.1
## [9] rnaturalearth_1.1.0 sf_1.0-21
## [11] metafor_4.8-0 numDeriv_2016.8-1.1
## [13] metadat_1.4-0 mediation_4.5.1
## [15] sandwich_3.1-1 mvtnorm_1.3-3
## [17] MASS_7.3-65 fixest_0.13.2
## [19] effectsize_1.0.1 ggeffects_2.3.1
## [21] ggfortify_0.4.19 performance_0.15.2
## [23] lme4_1.1-37 Matrix_1.7-4
## [25] ggrepel_0.9.6 googlesheets4_1.1.2
## [27] kirkegaard_2025-11-19 robustbase_0.99-6
## [29] psych_2.5.6 assertthat_0.2.1
## [31] weights_1.1.2 magrittr_2.0.4
## [33] lubridate_1.9.4 forcats_1.0.1
## [35] stringr_1.6.0 dplyr_1.1.4
## [37] purrr_1.2.0 readr_2.1.5
## [39] tidyr_1.3.1 tibble_3.3.0
## [41] ggplot2_4.0.1.9000 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.5.2 leaflegend_1.2.1 cellranger_1.1.0
## [4] datawizard_1.3.0 XML_3.99-0.19 rpart_4.1.24
## [7] lifecycle_1.0.4 Rdpack_2.6.4 vroom_1.6.6
## [10] lattice_0.22-7 insight_1.4.2 crosstalk_1.2.2
## [13] vcd_1.4-13 backports_1.5.0 Hmisc_5.2-4
## [16] sass_0.4.10 rmarkdown_2.30 jquerylib_0.1.4
## [19] yaml_2.3.10 sp_2.2-0 DBI_1.2.3
## [22] minqa_1.2.8 RColorBrewer_1.1-3 multcomp_1.4-28
## [25] abind_1.4-8 nnet_7.3-20 TH.data_1.1-4
## [28] rappdirs_0.3.3 dreamerr_1.5.0 gdata_3.0.1
## [31] units_1.0-0 codetools_0.2-20 tidyselect_1.2.1
## [34] shape_1.4.6.1 raster_3.6-32 farver_2.1.2
## [37] base64enc_0.1-3 googledrive_2.1.2 mathjaxr_2.0-0
## [40] jsonlite_2.0.0 cols4all_0.9 e1071_1.7-16
## [43] mitml_0.4-5 Formula_1.2-5 survival_3.8-3
## [46] iterators_1.0.14 emmeans_1.11.2-8 systemfonts_1.3.1
## [49] foreach_1.5.2 tools_4.5.2 ragg_1.5.0
## [52] Rcpp_1.1.0 glue_1.8.0 mnormt_2.1.1
## [55] gridExtra_2.3 pan_1.9 mgcv_1.9-3
## [58] ranger_0.17.0 laeken_0.5.3 xfun_0.53
## [61] withr_3.0.2 fastmap_1.2.0 boot_1.3-32
## [64] VIM_6.2.6 digest_0.6.39 timechange_0.3.0
## [67] R6_2.6.1 estimability_1.5.1 textshaping_1.0.4
## [70] wk_0.9.4 mice_3.18.0 microbenchmark_1.5.0
## [73] colorspace_2.1-2 LearnBayes_2.15.1 spacesXYZ_1.6-0
## [76] lpSolve_5.6.23 gtools_3.9.5 generics_0.1.4
## [79] data.table_1.17.8 class_7.3-23 httr_1.4.7
## [82] htmlwidgets_1.6.4 parameters_0.28.2 tmaptools_3.3
## [85] pkgconfig_2.0.3 gtable_0.3.6 lmtest_0.9-40
## [88] S7_0.2.1 htmltools_0.5.8.1 carData_3.0-5
## [91] scales_1.4.0 png_0.1-8 reformulas_0.4.1
## [94] knitr_1.50 rstudioapi_0.17.1 tzdb_0.5.0
## [97] curl_7.0.0 coda_0.19-4.1 checkmate_2.3.3
## [100] nlme_3.1-168 nloptr_2.2.1 proxy_0.4-27
## [103] cachem_1.1.0 zoo_1.8-14 KernSmooth_2.23-26
## [106] parallel_4.5.2 foreign_0.8-90 s2_1.1.9
## [109] leafsync_0.1.0 pillar_1.11.1 grid_4.5.2
## [112] stringmagic_1.2.0 logger_0.4.1 vctrs_0.6.5
## [115] car_3.1-3 jomo_2.7-6 xtable_1.8-4
## [118] cluster_2.1.8.1 archive_1.1.12 htmlTable_2.4.3
## [121] evaluate_1.0.5 cli_3.6.5 compiler_4.5.2
## [124] crayon_1.5.3 rlang_1.1.6.9000 maptiles_0.10.0
## [127] labeling_0.4.3 classInt_0.4-11 plyr_1.8.9
## [130] fs_1.6.6 stringi_1.8.7 deldir_2.0-4
## [133] stars_0.6-8 leaflet_2.2.3 glmnet_4.1-10
## [136] bayestestR_0.17.0 hms_1.1.3 bit64_4.6.0-1
## [139] leafem_0.2.5 haven_2.5.5 rbibutils_2.3
## [142] igraph_2.1.4 gargle_1.6.0 broom_1.0.11
## [145] bslib_0.9.0 lwgeom_0.2-14 bit_4.6.0
## [148] DEoptimR_1.1-4
#write data to file for reuse
d %>% write_rds("data/data_for_reuse.rds")
#OSF
#not run by default; flip the condition to TRUE to sync with OSF
if (FALSE) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all project files
#overwrite existing files (OSF keeps prior versions)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}