Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  googlesheets4,
  ggrepel,
  lme4, performance, ggfortify, ggeffects, effectsize, fixest, mediation, metafor, #models
  sf, rnaturalearth, countrycode, rnaturalearthhires, tmap, geodata, spatialreg, spdep, #spatial
)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'effectsize'
## 
## The following object is masked from 'package:kirkegaard':
## 
##     standardize
## 
## The following object is masked from 'package:psych':
## 
##     phi
## 
## 
## Attaching package: 'fixest'
## 
## The following object is masked from 'package:performance':
## 
##     r2
## 
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: mvtnorm
## 
## Attaching package: 'mvtnorm'
## 
## The following object is masked from 'package:effectsize':
## 
##     standardize
## 
## The following object is masked from 'package:kirkegaard':
## 
##     standardize
## 
## Loading required package: sandwich
## mediation: Causal Mediation Analysis
## Version: 4.5.1
## 
## 
## Attaching package: 'mediation'
## 
## The following object is masked from 'package:psych':
## 
##     mediate
## 
## Loading required package: metadat
## Loading required package: numDeriv
## 
## Loading the 'metafor' package (version 4.8-0). For an
## introduction to the package please type: help(metafor)
## 
## 
## Attaching package: 'metafor'
## 
## The following object is masked from 'package:fixest':
## 
##     se
## 
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
## Loading required package: terra
## terra 1.8.70
## 
## Attaching package: 'terra'
## 
## The following object is masked from 'package:MASS':
## 
##     area
## 
## The following object is masked from 'package:fixest':
## 
##     panel
## 
## The following object is masked from 'package:kirkegaard':
## 
##     rescale
## 
## The following objects are masked from 'package:psych':
## 
##     describe, distance, rescale
## 
## The following object is masked from 'package:assertthat':
## 
##     noNA
## 
## The following objects are masked from 'package:magrittr':
## 
##     extract, inset
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## 
## Attaching package: 'spData'
## 
## The following object is masked from 'package:ggeffects':
## 
##     coffee_data
## 
## 
## Attaching package: 'spdep'
## 
## The following objects are masked from 'package:spatialreg':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

#preferred functions
standardize = kirkegaard::standardize
select = dplyr::select

Functions

# Define a palette with 13+ colors (ColorBrewer-inspired, distinct for light/dark themes)
large_palette <- c(
  "#1F77B4", "#FF7F0E", "#2CA02C", "#D62728", "#9467BD",
  "#8C564B", "#E377C2", "#7F7F7F", "#BCBD22", "#17BECF",
  "#FFBB78", "#98DF8A", "#FF9896", "#C5B0D5", "#C49C94"
)

#fix a list
fix_list_col = function(x) {
  #if x is a list, return the first element
  if (is.list(x)) {
    return(x[[1]] %>% as.character())
  } else {
    return(x %>% as.character())
  }
}


#recale within country S to global scale
rescale_S_to_global = function(dd, L_level) {
  # browser()

  #level variable
  if (L_level == "1") {
    level_var = "has_L1_subunits"
  } else if (L_level == "2") {
    level_var = "has_L2_subunits"
  } else {
    stop("bad input")
  }
  
  #rescale values
  #first select appropriate subset
  dd %>% 
  select(
    L0_name, L1_name, L0_name, Level, has_L1_subunits, has_L2_subunits, POP, HDI_latest, ISO3,
    matches("^S_\\d$")) %>% 
  filter(
    (.data[[level_var]] & Level == "0") | Level == L_level
  ) %>% 
  plyr::ddply(c("L0_name"), function(dd) {
    #subset to sub units only
    dd_subs = dd[-1, ]
    message(dd_subs$L0_name[1])
    if (dd_subs$L0_name[1] == "Chile") {
      # browser()
    }
    
    #z score S variables
    dd_subs_S_z = dd_subs %>% 
      select(matches("^S_\\d$")) %>% 
      df_standardize(exclude_range_01 = F)
    
    #drop variables with too much missing data
    miss = miss_by_var(dd_subs_S_z, prop = T)
    
    #keep only variables with less than 30% missing data
    dd_subs_S_z2 = dd_subs_S_z %>% 
      select(-which(miss >= 0.3))
    
    #impute missing data if there are at least 2 variables
    if (ncol(dd_subs_S_z2) >= 3) {
      #check if any data is missing
      if (any(is.na(dd_subs_S_z2))) {
        dd_subs_S_z2_imp = dd_subs_S_z2 %>% miss_impute()
      } else {
        #if no missing data, use as is
        dd_subs_S_z2_imp = dd_subs_S_z2
      }
      
    } else {
      #if only 1-2 variables, use them as they are
      dd_subs_S_z2_imp = dd_subs_S_z2
    }
    
    #ensure all cors positive, otherwise we need to reverse the sign
    S_cors = cor(bind_cols(
      dd_subs %>% select(contains("HDI")),
      dd_subs_S_z2_imp
    ),
    use = "pairwise.complete.obs")
    
    #reverse variables that correlate negatively with HDI
    S_reversed = which(S_cors[1, ] < 0) - 1 #-1 because first column is HDI
    
    if (length(S_reversed) > 0) {
      dd_subs_S_z2_imp[, S_reversed] = -dd_subs_S_z2_imp[, S_reversed]
    }
    
    #compute mean z score
    dd_subs$S_mean_within = dd_subs_S_z2_imp %>% 
        rowMeans(na.rm = T) %>% 
        standardize()
    
    #scale to international S norms
    #do this by making the weighted mean equal the country mean
    #and the within country S SD equal to subnational HDI SD/international HDI SD
    #first we need to compute the subnational HDI SD relative to the global HDI SD
    hdi_sd = sd(dd_subs %>% pull(contains("HDI")), na.rm = T)
    hdi_sd_global = global_hdi_stats$sd[4] #2020 HDI SD
    hdi_sd_ratio = hdi_sd / hdi_sd_global
    
    #then calculate the weighted mean of the subnational S scores
    S_mean_weighted = weighted.mean(dd_subs$S_mean_within, dd_subs$POP, na.rm = T) %>% as.numeric() #this avoids a weird attribute it attaches dw_transformer
    
    #set weighted mean to 0
    dd_subs$S_mean_within = dd_subs$S_mean_within - S_mean_weighted

    #then convert to international norms
    dd_subs$S_global = dd_subs$S_mean_within * hdi_sd_ratio + dd[1, "S_1"]
    
    #check that the weighted mean is now equal to the country mean
    means_diff =  dd[1, "S_1"] - weighted.mean(dd_subs$S_global, dd_subs$POP, na.rm = T)
    if (is.na(means_diff)) {
      browser()
    }
    if (abs(means_diff) > 0.01) {
      stop("Weighted mean of subnational S scores does not match country mean.")
    }
    
    dd_subs
  })
}

#last value in a data frame, rowwise
get_last_value = function(df) {
  df %>%
    rowwise() %>%
    mutate(HDI_latest = {
      vals <- c_across(HDI_2005:HDI_2022_PUND)
      # get last non-NA value
      last_val <- rev(vals)[which(!is.na(rev(vals)))[1]]
      last_val
    }) %>%
    ungroup() %>% 
    pull(HDI_latest)
}


#summarizer for spatialreg models
summarize_spatial_models = function(model_list) {
    signif_stars <- function(p) {
        if (p < 0.001) "***"
        else if (p < 0.01) "**"
        else if (p < 0.05) "*"
        else ""
    }
    
    summarize_one <- function(model) {
        tidy_coef <- broom::tidy(model)
        tidy_coef <- tidy_coef[!duplicated(tidy_coef$term), ]
        tidy_coef$formatted <- sprintf("%0.2f (%0.3f%s)",
                                       tidy_coef$estimate,
                                       tidy_coef$std.error,
                                       sapply(tidy_coef$p.value, signif_stars))
        spatial_param_name <- if (!is.null(model$rho)) "rho" else if (!is.null(model$lambda)) "lambda" else NA_character_
        spatial_val <- if (spatial_param_name == "rho") model$rho else if (spatial_param_name == "lambda") model$lambda else NA_real_
        spatial_se <- if (spatial_param_name == "rho") model$rho.se else if (spatial_param_name == "lambda") model$lambda.se else NA_real_
        spatial_p <- 2 * (1 - pnorm(abs(spatial_val / spatial_se)))
        spatial_str <- sprintf("%0.2f (%0.3f%s)", spatial_val, spatial_se, signif_stars(spatial_p))
        
        tidy_coef <- tidy_coef[tidy_coef$term != spatial_param_name, ]
        
        pred_tbl <- tidy_coef %>%
            select(term, formatted) %>%
            bind_rows(tibble(term = "spatial", formatted = spatial_str))
        
        resid <- residuals(model)
        y <- model$y
        r_squared <- 1 - var(resid) / var(y)
        n <- length(y)
        p <- length(coef(model))
        adj_r_squared <- 1 - (1 - r_squared) * ((n - 1) / (n - p))
        
        stats_tbl <- tibble(
            term = c("R2 adj.", "N"),
            formatted = c(sprintf("%.3f", adj_r_squared), as.character(n))
        )
        
        bind_rows(pred_tbl, stats_tbl)
    }
    
    summaries <- lapply(model_list, summarize_one)
    
    all_terms <- unique(unlist(lapply(summaries, function(df) df$term)))
    
    result_mat <- sapply(summaries, function(df) {
        formatted_vec <- setNames(df$formatted, df$term)
        unname(formatted_vec[all_terms])
    })
    
    stats_terms <- c("R2 adj.", "N")
    term_rank <- ifelse(all_terms %in% stats_terms, 2, 1)
    ordered_terms <- all_terms[order(term_rank, all_terms)]
    
    df <- as.data.frame(result_mat[match(ordered_terms, all_terms), , drop = FALSE],
                        stringsAsFactors = FALSE)
    df$term <- ordered_terms
    df <- df[, c(ncol(df), 1:(ncol(df) - 1))]
    rownames(df) <- NULL
    
    as_tibble(df)
}


#mediation function
fit_mediation_by_group <- function(data, outcome, treatment, mediator, group_var,
                                   covariates_m = NULL, covariates_y = NULL, weight_var = NULL,
                                   sims = 1000) {
  groups <- unique(data[[group_var]])
  
  results_list <- lapply(groups, function(g) {
    d_sub <- data %>% dplyr::filter(.data[[group_var]] == g)
    
    # Build formulas more safely
    rhs_m <- c(treatment, covariates_m)
    rhs_y <- c(mediator, treatment, covariates_y)
    
    formula_m <- as.formula(paste(mediator, "~", paste(rhs_m, collapse = " + ")))
    formula_y <- as.formula(paste(outcome,  "~", paste(rhs_y, collapse = " + ")))
    
    # Fit mediator and outcome models (with optional weights)
    if (!is.null(weight_var)) {
      w <- d_sub[[weight_var]]
      model_m <- lm(formula_m, data = d_sub, weights = w)
      model_y <- lm(formula_y, data = d_sub, weights = w)
    } else {
      model_m <- lm(formula_m, data = d_sub)
      model_y <- lm(formula_y, data = d_sub)
    }
    
    # Perform mediation analysis
    med_fit <- mediate(model.m = model_m,
                       model.y = model_y,
                       treat   = treatment,
                       mediator = mediator,
                       sims     = sims)
    
    # Tidy summary of mediation results with group column
    med_sum <- summary(med_fit)
    
    #approx. SE
    prop_sims <- med_fit$n0.sims
    prop_sims <- prop_sims[is.finite(prop_sims)]
    
    tibble::tibble(
      group            = g,
      ACME             = med_sum$d0,
      ACME_p           = med_sum$d0.p,
      ADE              = med_sum$z0,
      ADE_p            = med_sum$z0.p,
      TotalEffect      = med_sum$tau.coef,
      TotalEffect_p    = med_sum$tau.p,
      PropMediated     = med_sum$n0,
      PropMediated_p   = med_sum$n0.p,
      PropMediated_se  = sd(pmin(pmax(prop_sims, -1), 1))
    )
  })
  
  dplyr::bind_rows(results_list)
}

Data

#use live version of local version
if (T) {
  #download live version
  gs4_deauth()
  d = read_sheet("https://docs.google.com/spreadsheets/d/1_YjHil-uOFy6mPor1tU0ZM-aXI0HxlD3SQWY45E95u8/edit?gid=0#gid=0", sheet = 2)
  d_NIQ_S = read_sheet("https://docs.google.com/spreadsheets/d/1cReoeIZLlbxOrN4_wnj52Q6UWFepXSJGi3Gj5m6ZAZg/edit?gid=528842851#gid=528842851", sheet = "Seb2024b")
  
  #save local version in case we need it
  d %>% write_rds("data/main_file.rds")
  d_NIQ_S %>% write_rds("data/d_NIQ_S.rds")
} else {
  d = read_rds("data/main_file.rds")
  d_NIQ_S = read_rds("data/d_NIQ_S.rds")
}

## ✔ Reading from "Summary Admix Data File".

## ✔ Range ''Tab 1: Summary data''.

## New names:
## ✔ Reading from "National IQ datasets".
## ✔ Range ''Seb2024b''.
## • `` -> `...63`
## • `` -> `...95`

#HDI data
d_hdi = read_csv("data/human-development-index/human-development-index.csv") %>% 
  rename(
    HDI = `Human Development Index`,
    Region = `World regions according to OWID`,
    ISO3 = Code
  )

## Rows: 6682 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Entity, Code, World regions according to OWID
## dbl (2): Year, Human Development Index
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Recode

#ISO for countries
d %<>% mutate(
  row_i = 1:n(), #for preserving the spreadsheet row order
  L0_ISO2 = countrycode::countrycode(L0_ISO3, origin = "iso3c", destination = "iso2c"),
  #combine region ISO codes if present
  ISO3 = L0_ISO3 + ifelse(!is.na(L1_ISO), "-" + L1_ISO, "") + ifelse(!is.na(L2_ISO), "-" + L2_ISO, ""),
  ISO2 = L0_ISO2 + ifelse(!is.na(L1_ISO), "-" + L1_ISO, "") + ifelse(!is.na(L2_ISO), "-" + L2_ISO, ""),
  Level = Level %>% map_chr(fix_list_col) %>% as.character() %>% str_remove(".000000"),
  
  #smallest unit name for plotting
  lowest_name = miss_fill(L2_name, L1_name, L0_name),
  
  #latest HDI value, better than average since time may be off
  HDI_latest = d %>% select(HDI_2005:HDI_2022_PUND) %>% get_last_value(),
  
  #CA on z scale
  CA = Summary_ACHQ,
  CA_z = (CA-100)/15
)

#manually override Puerto Rico ISO
d$ISO3[d$L0_name == "Puerto Rico"] = "PRI"
d$ISO2[d$L0_name == "Puerto Rico"] = "US-PR"

#assert no dup ISOs
assert_that(
  !any(duplicated(d$ISO3)),
  msg = "Duplicate ISO codes found in the dataset."
)

## [1] TRUE

#assert no dup ISO2 codes
assert_that(
  !any(duplicated(d$ISO2)),
  msg = "Duplicate ISO2 codes found in the dataset."
)

## [1] TRUE

#determine relationships among levels
#loop across rows and look up other cases to determine relationships
d = map_dfr(1:nrow(d), function(i) {
  #get row
  dd = d[i, ]
  
  #if it is level 0b, they never have subunits
  if (dd$Level == "0b") {
    y = dd %>% 
      mutate(
        L1_subunits = 0,  
        has_L1_subunits = FALSE,
        L2_subunits = 0,  
        has_L2_subunits = FALSE,
        L1_group = "Others",
        L2_group = "Others",
      ) 
    
    return(y)
  }
  
  #if level 2, it doesn't have subunits either
  if (dd$Level == "2") {
    y = dd %>% 
      mutate(
        L1_subunits = 0,  
        has_L1_subunits = FALSE,
        L2_subunits = 0,  
        has_L2_subunits = FALSE,
        L1_group = dd$L0_name,
        L2_group = dd$L1_name,
      )
    
    return(y)
  }
  
  #if level 0
  if (dd$Level == "0") {
    #filter to see if any L1 and L2 units exist
    dd_L1 = d %>% filter(L0_name == dd$L0_name, Level == "1")
    dd_L2 = d %>% filter(L0_name == dd$L0_name, Level == "2")
    
    y = dd %>% mutate(
      L1_subunits = nrow(dd_L1),  
      has_L1_subunits = L1_subunits > 0,
      L2_subunits = nrow(dd_L2),  
      has_L2_subunits = L2_subunits > 0,
      L1_group = if_else(has_L1_subunits, true = dd$L0_name, false = "Others"),
      L2_group = if_else(has_L2_subunits, true = dd$L0_name, false = "Others")
    )
    
    return(y)
  }
  
  #if level 1
    if (dd$Level == "1") {
    #filter to see if any L1 and L2 units exist
    dd_L2 = d %>% filter(L0_name == dd$L0_name, L1_name == dd$L1_name, Level == "2")
    
    y = dd %>% mutate(
      L1_subunits = 0,
      has_L1_subunits = F,
      L2_subunits = nrow(dd_L2),  
      has_L2_subunits = L2_subunits > 0,
      L1_group = dd$L0_name,
      L2_group = NA
    )
    
    return(y)
    }
  
  #any others is an error
  browser()
})

#check results
d %>% filter(has_L1_subunits) %>% select(L0_name, L1_subunits)

d %>% filter(has_L2_subunits) %>% select(L0_name, L1_name, L2_subunits)

#not Americans
d$Americans = !d$L0_name %in% c("France", "Denmark", "Netherlands", "United Kingdom")

#leave these out, they cause a lot of headaches
d_euros = d %>% 
  filter(
    !Americans
  )

#get rid of non-Americas units
d_orig = d
d %<>% filter(Americans)

#set subnational S to HDI scale
#first we need to compute global HDI mean/SD, then the mean/SD for countries in our sample
#then we need to group data for countries with L 1 units, then compute the SD among those units, and set their weighted mean to the country mean, and the S/S_subnational SD to be equal to the HDI/HDI_subnational SD.
global_hdi_stats = d_hdi %>% 
  select(-Region) %>% 
  filter(
    !is.na(ISO3),
    Year %in% c(2005, 2010, 2015, 2020)
    ) %>%
  #wide format
  pivot_wider(
    names_from = Year,
    values_from = HDI
  ) %>% 
  describe2()

#international S scale
#get international data
global_S_stats = d_NIQ_S$SDI %>% describe2()

American_S_stats = d %>% 
  filter(
    S_1_source == "Kirkegaard_Seb_Jensen"
  ) %>% 
  select(S_1) %>% 
  describe2()

#aggregate and rescale the data for L1 units
d_L1_rescaled = rescale_S_to_global(d, L_level = "1")

## Argentina

## Bolivia

## Brazil

## Canada

## Chile

## Colombia

## Costa Rica

## Cuba

## Ecuador

## Guatemala

## Mexico

## Panama

## Peru

## USA

## Venezuela

d_L2_rescaled = rescale_S_to_global(d, L_level = "2")

## Chile

# d %>% filter(Level == "0", has_L1_subunits) %>% GG_scatter("HDI_2020", "S_1", case_names = "Region_1")

#join these values back to main dataset
d$S_global = NULL
d = full_join(
  d,
  bind_rows(
    d_L1_rescaled,
    d_L2_rescaled
  ) %>% select(ISO3, S_global),
  by = "ISO3"
)

#reset order to regular
d %<>% arrange(row_i)

#combine S_1 and S_global
#use S_global when present, otherwise, use S_1
d %<>% mutate(
  S = ifelse(!is.na(S_global), S_global, S_1)
)

#normalize the ancestries to sum = 1
d %<>% mutate(across(ends_with("_fin"), ~ . / (Genetic_West_Eurasian_fin + Genetic_AMER_fin + Genetic_AFR_fin + Genetic_OTHER_fin)))

#confirm it is correct
d %>% select(Genetic_West_Eurasian_fin, Genetic_AMER_fin, Genetic_AFR_fin, Genetic_OTHER_fin) %>% rowSums() %>% table2()

#rename ancestries for ease of use
d %<>% mutate(
  EUR = Genetic_West_Eurasian_fin,
  AMR = Genetic_AMER_fin,
  AFR = Genetic_AFR_fin,
  OTR = Genetic_OTHER_fin
)

#check that the sum is 1
d %>% select(matches("_fin$")) %>% 
  rowSums() %>% 
  subtract(1) %>% 
  is_less_than(0.0001) %>% 
  all(na.rm = T) %>% 
  assert_that()

## [1] TRUE

#rename some variables for ease of use
# d %<>% mutate(
#   CA = Summary_ACHQ,
# )

#code to export recomputed S to spreadsheet via clipboard and also keeping the original order of rows
if (F) {
  left_join(
    d_orig %>% select(row_i, ISO3),
    d %>% select(row_i, S)
  ) %>% 
    arrange(row_i) %>% 
    write_clipboard()
}

#fill in S for any using without it which have HDI
S_pred_fit = lm(S ~ HDI_latest, data = d %>% filter(Level == "0"))
S_pred_fit %>% summary()

## 
## Call:
## lm(formula = S ~ HDI_latest, data = d %>% filter(Level == "0"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2893 -0.1258  0.0053  0.1020  0.4127 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4.627      0.278   -16.6   <2e-16 ***
## HDI_latest     6.233      0.364    17.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.165 on 32 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.901,  Adjusted R-squared:  0.898 
## F-statistic:  292 on 1 and 32 DF,  p-value: <2e-16

d$S_pred = predict(S_pred_fit, newdata = d %>% select(HDI_latest))
d %>% filter(Level == "0") %>% GG_scatter("S_pred", "S", case_names = "L0_name")

## `geom_smooth()` using formula = 'y ~ x'

d %>% filter(Level == "0b") %>% select(lowest_name, S_pred, HDI_latest)

d %>% filter(Level == "0b") %>% GG_scatter("S_pred", "HDI_latest", case_names = "L0_name")

## `geom_smooth()` using formula = 'y ~ x'

d %>% filter(Level == "1") %>% GG_scatter("S_pred", "S", color = "L1_group", case_names = "L0_name")

## `geom_smooth()` using formula = 'y ~ x'

d %>% filter(Level == "1", L0_name == "Guatemala") %>% GG_scatter("S", "HDI_latest", case_names = "lowest_name")

## `geom_smooth()` using formula = 'y ~ x'

#fill in missing S values
d %<>% mutate(
  S = ifelse(is.na(S), S_pred, S)
)

Spatial data

We must do this step after the above.

National-like units

L0 and L1b units.

#countries we have subnational data for
#since we 
nats_with_L1_ISO3 = d %>% filter(has_L1_subunits) %>% pull(ISO3)
nats_with_L1_ISO2 <- countrycode::countrycode(nats_with_L1_ISO3, origin = "iso3c", destination = "iso2c")

#spatial
sp_nat = ne_countries(type = "countries") %>% 
  mutate(
    ISO3 = adm0_a3,
    ISO2 = iso_a2
  )

#drop data we dont need
sp_nat %<>% 
  filter(
    ISO3 %in% d$ISO3
  ) %>% 
  select(name, ISO2, ISO3)

#make the data more valid
sp_nat %<>% 
  st_make_valid()

#a few countries are inexplicably ISO2 codes, so we need to fix them
#more are missing, but these ones are the important ones, though we don't need them for this study
#the remaining missings are: Somaliland and northern Cyprus
sp_nat$ISO2[sp_nat$ISO3 == "NOR"] = "NO"
sp_nat$ISO2[sp_nat$ISO3 == "FRA"] = "FR"
sp_nat$ISO2[sp_nat$ISO3 == "KOS"] = "XK"

# sp_extras = rnaturalearth::ne_download(scale = 10, type = "admin_0_scale_rank_minor_islands", returnclass = "sf", destdir = "data")
sp_extras = read_sf("data/ne_10m_admin_0_scale_rank_minor_islands/ne_10m_admin_0_scale_rank_minor_islands.shp") %>% 
  mutate(
    ISO3 = sr_su_a3,
    # ISO2 = countrycode::countrycode(sr_adm0_a3, origin = "iso3c", destination = "iso2c")
    name = sr_subunit
  ) %>% 
  #drop data we dont need
  select(
    name, ISO3, sr_adm0_a3, sr_su_a3
  )

#drop units without ISO codes
sp_extras %<>% filter(!is.na(ISO3))

#recode Antigua & Barbuda to ATG
sp_extras$ISO3[sp_extras$sr_adm0_a3 == "ATG"] = "ATG"
#and US virgin islands to our special code
sp_extras$ISO3[sp_extras$sr_adm0_a3 == "VIR"] = "USA-VI"
#Puerto Rico as well
sp_extras$ISO3[sp_extras$sr_adm0_a3 == "PRI"] = "USA-PR"
#Netherlands Caribbean = Bonaire, Sint Eustatius and Saba
sp_extras$ISO3[sp_extras$name == "Caribbean Netherlands"] = "BES"

#are we missing anything that isn't L1 or L2?
missing_mini_countries = d %>% 
  filter(Level %in% c("0", "0b"), Americans) %>%
  filter(!ISO3 %in% sp_nat$ISO3) %>% select(
    L0_name, ISO3
  ) %>% 
  mutate(
    #add ISO2 for easier joining later
    adm0_match = ISO3 %in% sp_extras$sr_adm0_a3,
    sr_su_match = ISO3 %in% sp_extras$sr_su_a3
  )

#add these from the extras file
#but first we need to filter and merge them
sp_extras_missing = sp_extras %>% 
  filter(
    #some use their own ISO codes, but others only have a sub-code, and we need to find them in the subunit ISO
    ISO3 %in% missing_mini_countries$ISO3
    ) %>% 
  select(ISO3, name, geometry) %>% 
  group_by(ISO3) %>%
  summarise(
    name = first(na.omit(name)), 
    ISO3 = first(ISO3),
    geometry = st_union(geometry)
  ) %>% 
  ungroup()

sp_extras_missing %>% tm_shape() + tm_polygons()

#do we have all of them?
nrow(sp_extras_missing)

## [1] 23

nrow(missing_mini_countries)

## [1] 23

#what's missing?
missing_mini_countries_check = missing_mini_countries %>% 
  filter(!ISO3 %in% c(sp_extras_missing$ISO3))

missing_mini_countries_check

#add them to main maps
sp_nat2 = bind_rows(
  sp_nat,
  sp_extras_missing
)

#validate and simplify geometries
sp_nat2 %<>% 
  #repair
  st_make_valid() %>% 
  st_simplify(preserveTopology = TRUE, dTolerance = 0.01) %>% 
  #add lat lon and controids
  mutate(centroid = st_centroid(geometry)) %>% 
  mutate(lon = st_coordinates(centroid)[,1], lat = st_coordinates(centroid)[,2])

#check all nations are there again
missing_mini_countries_check = d %>% 
  filter(Level %in% c("0", "0b")) %>%
  filter(!ISO3 %in% sp_nat2$ISO3) %>% 
  select(
    L0_name, ISO3
  )

assert_that(nrow(missing_mini_countries_check) == 0, 
            msg = "Some countries are missing from the spatial data.")

## [1] TRUE

#no dups
assert_that(!sp_nat2$ISO3 %>% anyDuplicated())

## [1] TRUE

#test map by plotting names
sp_nat2 %>% 
  ggplot(aes(x = lon, y = lat, label = name)) +
  geom_sf() +
  geom_text_repel(size = 3, 
                  max.overlaps = Inf, 
                  min.segment.length = 0, 
                  force = 1, 
                  force_pull = 1, 
                  point.padding = 0.5, 
                  segment.color = "black", 
                  segment.size = 0.5) +
  theme_minimal()

GG_save("figs/maps/nat names.png")

#join data to spatial
sp_nat3 = left_join(
  sp_nat2,
  d %>% select(ISO3, CA, S, ends_with("_fin")),
  by = c("ISO3" = "ISO3")
)

#which have missing data?
miss_covars = sp_nat3 %>% 
  filter(is.na(CA) | is.na(S)) %>% 
  select(ISO3, name, CA, S) %>% 
  arrange(ISO3)

#confirm with main file
d %>% select(L0_name, ISO3, CA, S, HDI_latest, ends_with("_fin")) %>% filter(ISO3 %in% miss_covars$ISO3)

Subnational units

L1 units.

#L1 units
sp_subdiv = read_sf("data/ne_10m_admin_1_states_provinces/ne_10m_admin_1_states_provinces.shp") %>% 
  mutate(ISO2 = iso_3166_2,
         L0_ISO3 = adm0_a3,
         L0_ISO2 = iso_a2,
         ISO3 = L0_ISO3 + (ISO2 %>% str_remove("^[^-]+"))
         )

sp_subdiv %>% select(name, ISO2, ISO3, L0_ISO2, L0_ISO3)

#there are some errors we need to fix
#first stop units without names
sp_subdiv %<>% filter(!is.na(name))

#then fix the issues
sp_subdiv$ISO3[sp_subdiv$name == "Bogota"] = "COL-DC"
sp_subdiv$ISO3[sp_subdiv$ISO3 == "MEX-DIF"] = "MEX-CMX"

#Panama data is out of date, so we download from another source
sp_panama <- gadm(country = "PAN", level = 1, path = "data") %>% st_as_sf()

#it has missing ISO codes, so we add them
sp_panama$ISO3 = c(
  "PAN-1",
  "PAN-4",
  "PAN-2",
  "PAN-3",
  "PAN-5",
  "PAN-EM",
  "PAN-6",
  "PAN-KY",
  "PAN-7",
  "PAN-NB",
  "PAN-8",
  "PAN-10",
  "PAN-9"
)

#Puerto Rico and US Virgin Islands
#but first merge their geometries
sp_USA_PR_VI = sp_extras %>% 
  filter(ISO3 %in% c("USA-PR", "USA-VI")) %>% 
  select(ISO3, name, geometry) %>% 
  group_by(ISO3) %>%
  summarise(
    name = first(na.omit(name)), 
    ISO3 = first(ISO3),
    geometry = st_union(geometry)
  ) %>% 
  ungroup()

#subnational divs
sp_subdiv2 <- bind_rows(
  sp_nat3 %>%
    #then remove those that have L1 units
    filter(!ISO3 %in% nats_with_L1_ISO3) %>% 
    #only keep the vars we need
    select(name, ISO3),
  #add most subunits, but exclude Panama
  sp_subdiv %>% filter(L0_ISO3 %in% nats_with_L1_ISO3) %>% filter(L0_ISO3 != "PAN"),
  #and then Panama
  sp_panama %>% rename(name = NAME_1) %>% select(name, ISO3),
  #USA territories
  sp_USA_PR_VI
)

sp_subdiv2$name[!st_is_valid(sp_subdiv2)]

## [1] "Goiás"

#make valid
sp_subdiv2 %<>% 
  st_make_valid()

#Lima Peru has 2 parts, we need to merge them
sp_subdiv2 = sp_subdiv2 %>% 
  group_by(ISO3) %>%
  summarise(
    name = first(na.omit(name)), 
    ISO3 = first(ISO3),
    geometry = st_union(geometry)
  ) %>% 
  ungroup()

#check for duplicates
sp_subdiv2$ISO3 %>% anyDuplicated() %>% not() %>% assert_that()

## [1] TRUE

#make valid and add lon lat
sp_subdiv2 %<>% 
  #repair
  st_make_valid() %>%  # Repair invalid geometries
  st_simplify(preserveTopology = TRUE, dTolerance = 0.01) %>%  # Simplify to remove duplicates
  #add lat lon and controids
  mutate(centroid = st_centroid(geometry)) %>% 
  mutate(lon = st_coordinates(centroid)[,1], lat = st_coordinates(centroid)[,2])

#check for missing units
missing_subunits = d %>% 
  select(L0_name, L1_name, ISO2, ISO3, Level) %>% 
  mutate(
    matches = ISO3 %in% sp_subdiv2$ISO3
  ) %>% 
  filter(
    !ISO3 %in% nats_with_L1_ISO3,
    !matches,
    Level != "2"
    )

#assert none
assert_that(nrow(missing_subunits) == 0)

## [1] TRUE

#test map by plotting names
sp_subdiv2 %>% 
  ggplot(aes(x = lon, y = lat, label = name)) +
  geom_sf() +
  geom_text(size = 1) +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal()

GG_save("figs/maps/subdiv names.png")


#join in the data
sp_subdiv3 = left_join(
  sp_subdiv2,
  d %>% select(ISO3, L0_ISO3, CA, S, 
    EUR, AMR, AFR, OTR),
  by = "ISO3"
)

#which have missing data?
miss_covars = sp_subdiv3 %>% 
  filter(is.na(CA) | is.na(S)) %>% 
  select(ISO3, name, CA, S) %>% 
  arrange(ISO3)

miss_covars

Water-access

library(lwgeom)
library(igraph)

# 0) Start from your sf polygons
# sp_subdiv3

# 2️⃣ Turn off s2 for topology ops
sf_use_s2(FALSE)

# 3️⃣ Reproject to a metric CRS
# (Web Mercator EPSG:3857 is fine for global work)
sp <- st_transform(sp_subdiv3, 3857)

# 4️⃣ Clean and snap to a small grid
sp <- sp |>
  st_make_valid() |>
  st_set_precision(1) |>
  st_snap_to_grid(size = 1)

# meters; tune 300–1500 if needed
conn_dist <- 750   

# adjacency allowing tiny gaps
nb_list <- st_is_within_distance(sp, sp, dist = conn_dist)
nb_list <- lapply(seq_along(nb_list), function(i) setdiff(nb_list[[i]], i))  # drop self

g <- graph_from_adj_list(nb_list, mode = "all") |> as_undirected("collapse")
sp$landmass <- components(g)$membership

# union per landmass on original (unbuffered) sp
landmass_union <- sp |>
  group_by(landmass) |>
  summarise(geometry = st_union(geometry), .groups = "drop") |>
  st_make_valid()

# keep holes (lakes are OK)
landmass_coast <- landmass_union |>
  mutate(coast = st_boundary(geometry)) |>
  select(landmass, coast)

# coastal proportion (same helper as before)
poly_boundary <- st_boundary(sp)
coast_by_poly <- landmass_coast$coast[ match(sp$landmass, landmass_coast$landmass) ]

len_intersection <- function(i) {
  seg <- st_intersection(st_geometry(poly_boundary)[i], st_geometry(coast_by_poly)[i])
  if (length(seg) == 0 || all(st_is_empty(seg))) return(0)
  as.numeric(sum(st_length(seg)))
}

coastal_lengths <- vapply(seq_len(nrow(sp)), len_intersection, numeric(1))
total_lengths   <- as.numeric(st_length(poly_boundary))
sp$coastal_proportion <- ifelse(total_lengths > 0, pmin(coastal_lengths/total_lengths, 1), NA_real_)

#centroids
sp_labels <- sp %>%
  st_centroid() %>%
  mutate(coords = st_coordinates(geometry)) %>%
  mutate(x = coords[,1], y = coords[,2])

#lat lon format
sp_plot <- st_transform(sp, 4326)
sp_labels <- sp_plot %>%
  st_point_on_surface() %>%
  mutate(coords = st_coordinates(geometry),
         x = coords[,1], y = coords[,2])

# 8) Plot
sp_plot %>% 
  ggplot() +
  geom_sf(aes(fill = coastal_proportion), color = "black", size = 0.2) +
  # geom_text(data = sp_labels, aes(x = x, y = y, label = round(coastal_proportion, 2))) +
  scale_fill_gradient(low = "white", high = "red", name = "Coastal proportion",
                      limits = c(0, 1), labels = scales::percent_format(accuracy = 1)) +
  theme_minimal() +
  labs(title = "Coastal Proportion by Unit") +
  theme(legend.position = "right") +
  coord_sf(
  xlim = c(-170, -10),   # longitudes (west to east)
  ylim = c(-60, 120),    # latitudes (south to north)
  expand = FALSE         # don’t add extra margins
)

GG_save("figs/maps/water access.png")

Back-join geographical information

From the map data, we derived lat lon, which we want for analysis.

#compile the data we need
d_latlon = bind_rows(
  sp_subdiv3 %>% select(ISO3, lat, lon) %>% st_drop_geometry(),
  sp_nat3 %>% select(ISO3, lat, lon) %>% filter(!ISO3 %in% sp_subdiv3$ISO3) %>% st_drop_geometry()
)

d = left_join(
  d,
  d_latlon,
  by = "ISO3"
)

#add spatial lags
d = spatial_knn(
  d,
  vars = c("S", "CA"),
  k = 3
)

d %>% spatial_lag_cors()

##     S    CA 
## 0.863 0.885

#join water %
# d = left_join(
#   d,
#   sp_plot %>% st_drop_geometry() %>% select(ISO3, coastal_proportion),
#   by = "ISO3"
# )

#no dups
d$ISO3 %>% anyDuplicated()

## [1] 0

Analysis

Maps

#Country maps
sp_nat3 %>% 
  ggplot(aes(fill = CA)) +
  scale_fill_gradient(
    low = "red", 
    high = "blue", 
    na.value = "grey80"
  ) +
  geom_sf() +
  geom_text_repel(aes(x = lon, y = lat, label = floor(CA)), 
                  size = 3, 
                  max.overlaps = Inf, 
                  min.segment.length = 0, 
                  force = 1, 
                  force_pull = 1, 
                  point.padding = 0.5, 
                  segment.color = "black", 
                  segment.size = 0.5) +
  theme_minimal()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).

GG_save("figs/maps/nat CA.png")

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).

#do the same for S
sp_nat3 %>% 
  ggplot(aes(fill = S)) +
  scale_fill_gradient(
    low = "red", 
    high = "blue", 
    na.value = "grey80"
  ) +
  geom_sf() +
  geom_text_repel(aes(x = lon, y = lat, label = round(S, 2)), 
                  size = 3, 
                  max.overlaps = Inf, 
                  min.segment.length = 0, 
                  force = 1, 
                  force_pull = 1, 
                  point.padding = 0.5, 
                  segment.color = "black", 
                  segment.size = 0.5) +
  theme_minimal()

GG_save("figs/maps/nat S.png")

#Subnational maps
sp_subdiv3 %>% 
  #since Cuba and Venezuela have missing data for L1, remove them and use the L0
  filter(!L0_ISO3 %in% c("CUB", "VEN")) %>% 
  bind_rows(
    sp_nat3 %>% 
      filter(ISO3 %in% c("CUB", "VEN"))
  ) %>% 
  ggplot(aes(fill = CA)) +
  scale_fill_gradient(
    low = "red", 
    high = "blue", 
    na.value = "grey80"
  ) +
  geom_sf() +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal()

GG_save("figs/maps/subdiv CA.png")

sp_subdiv3 %>% 
  ggplot(aes(fill = S)) +
  scale_fill_gradient(
    low = "red", 
    high = "blue", 
    na.value = "grey80"
  ) +
  geom_sf() +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal()

GG_save("figs/maps/subdiv S.png")

#euro
sp_subdiv3 %>% 
  ggplot(aes(fill = EUR)) +
  scale_fill_gradient(
    low = "white", 
    high = "blue", 
    na.value = "grey80",
    labels = scales::percent,
    limits = c(0, 1)
  ) +
  geom_sf() +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal() +
  labs(
    fill = "West Eurasian ancestry %"
  )

GG_save("figs/maps/subdiv EUR.png")

#african
sp_subdiv3 %>% 
  ggplot(aes(fill = AFR)) +
  scale_fill_gradient(
    low = "white", 
    high = "black", 
    na.value = "grey80",
    labels = scales::percent,
    limits = c(0, 1)
  ) +
  geom_sf() +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal() +
  labs(
    fill = "African ancestry %"
  )

GG_save("figs/maps/subdiv AFR.png")

#amerindian
sp_subdiv3 %>% 
  ggplot(aes(fill = AMR)) +
  scale_fill_gradient(
    low = "white", 
    high = "red", 
    na.value = "grey80",
    labels = scales::percent,
    limits = c(0, 1)
  ) +
  geom_sf() +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal() +
  labs(
    fill = "Amerindian ancestry %"
  )

GG_save("figs/maps/subdiv AMR.png")

#other
sp_subdiv3 %>% 
  ggplot(aes(fill = OTR)) +
  scale_fill_gradient(
    low = "white", 
    high = "green", 
    na.value = "grey80",
    labels = scales::percent,
    limits = c(0, 1)
  ) +
  geom_sf() +
  coord_sf(xlim = c(-170, -20), ylim = c(-60, 80)) +  # Adjusted range for Americas
  theme_minimal() +
  labs(
    fill = "Other ancestry %"
  )

GG_save("figs/maps/subdiv OTR.png")

Simple plots

d %>% 
  filter(
    Level != "2",
    !has_L1_subunits,
    ) %>% 
  GG_scatter("EUR", "S", case_names = "lowest_name", weights = "SQRT_POP", color = "L1_group") +
  scale_x_continuous(
    labels = scales::percent,
    limits = c(0, 1)
  ) +
 labs(
    x = "West Eurasian ancestry %",
    y = "Socioeconomic development",
    color = "Country group"
  )

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter subnat S~EUR.png")

## `geom_smooth()` using formula = 'y ~ x'

d %>% 
  filter(
    Level != "2",
    !has_L1_subunits,
    ) %>% 
  GG_scatter("EUR", "CA", case_names = "lowest_name", weights = "SQRT_POP", color = "L1_group") +
  labs(
    x = "West Eurasian ancestry %",
    y = "Cognitive ability",
    color = "Country group"
  ) +
  scale_x_continuous(
    labels = scales::percent,
    limits = c(0, 1)
  )

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter subnat CA~EUR.png")

## `geom_smooth()` using formula = 'y ~ x'

d %>% 
  filter(
    Level != "2",
    !has_L1_subunits,
    ) %>% 
  GG_scatter("CA", "S", case_names = "lowest_name", weights = "SQRT_POP", color = "L1_group") +
  labs(
    x = "Cognitive ability",
    y = "Socioeconomic development",
    color = "Country group"
  )

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter subnat S~CA.png")

## `geom_smooth()` using formula = 'y ~ x'

#L1 units split by country
#EUR x S
d %>% 
  filter(!has_L1_subunits,
         !is.na(EUR),
         Level != "2",
         ) %>%
  ggplot(aes(EUR, S, color = L1_group)) +
  geom_point(aes(size = SQRT_POP), show.legend = FALSE) +
  geom_smooth(method = "lm") +
  facet_wrap(~L1_group, scales = "free") +
  labs(
    x = "West Eurasian ancestry %",
    y = "Socioeconomic development",
    color = "Country group"
  ) +
  scale_x_continuous(
    labels = scales::percent,
    limits = c(0, 1)
  )

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter by country S~EUR.png")

## `geom_smooth()` using formula = 'y ~ x'

#EUR x CA
d %>% 
  filter(!has_L1_subunits,
         !is.na(EUR),
         !is.na(CA),
          Level != "2",
         ) %>%
  ggplot(aes(EUR, CA, color = L1_group)) +
  geom_point(aes(size = SQRT_POP), show.legend = F) +
  geom_smooth(method = "lm") +
  facet_wrap(~L1_group, scales = "free") +
  labs(
    x = "West Eurasian ancestry %",
    y = "Cognitive ability",
    color = "Country group"
  ) +
  scale_x_continuous(
    labels = scales::percent,
    limits = c(0, 1)
  )

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter by country CA~EUR.png")

## `geom_smooth()` using formula = 'y ~ x'

#CA x S
d %>% 
  filter(!has_L1_subunits,
         !is.na(EUR),
         !is.na(CA),
          Level != "2",
         ) %>%
  ggplot(aes(CA, S, color = L1_group)) +
  geom_point(aes(size = SQRT_POP), show.legend = F) +
  geom_smooth(method = "lm") +
  facet_wrap(~L1_group, scales = "free") +
  labs(
    x = "Cognitive ability",
    y = "Socioeconomic development",
    color = "Country group"
  )

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter by country S~CA.png")

## `geom_smooth()` using formula = 'y ~ x'

OLS regressions

#prep data subset
d_ols_models_data = d %>% 
  filter(!has_L1_subunits,
         !is.na(EUR),
         !is.na(CA),
          Level != "2",
         ) %>% 
  spatial_knn(
  vars = c("S", "CA"),
  k = 3
)

d_ols_models_data %>% spatial_lag_cors()

##     S    CA 
## 0.856 0.893

#fit models
#S
ols_models_S = list(
  #no country effects
  lm(S ~ CA, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ EUR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ CA + EUR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ AMR + AFR + OTR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  
  
  #fixed effects for countries
  lm(S ~ CA + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ EUR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ CA + EUR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)

#summarize models
ols_models_S %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))

#CA
#fit models
ols_models_CA = list(
  #no country effects
  lm(CA ~ EUR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(CA ~ AMR + AFR + OTR, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  
  #fixed effects for countries
  lm(CA ~ EUR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(CA ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)

#summarize models
ols_models_CA %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))

#mediation
ols_models_S_mediation <- mediation::mediate(ols_models_CA[[1]], ols_models_S[[3]], treat = "EUR", mediator = "CA", sims = 1000)
summary(ols_models_S_mediation)

## 
## Causal Mediation Analysis 
## 
## Quasi-Bayesian Confidence Intervals
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME              1.527        1.333        1.728  <2e-16 ***
## ADE               0.658        0.504        0.802  <2e-16 ***
## Total Effect      2.185        1.974        2.398  <2e-16 ***
## Prop. Mediated    0.698        0.636        0.765  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 334 
## 
## 
## Simulations: 1000

#add spatial lags
#S
ols_models_S_lag = list(
  #no country effects
  lm(S ~ CA + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ EUR + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ CA + EUR + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ AMR + AFR + OTR + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  
  
  #fixed effects for countries
  lm(S ~ CA + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ EUR + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ CA + EUR + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(S ~ AMR + AFR + OTR + L1_group + S_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)

#summarize models
ols_models_S_lag %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))

#CA
ols_models_CA_lag = list(
  #no country effects
  lm(CA ~ EUR + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(CA ~ AMR + AFR + OTR + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  
  #fixed effects for countries
  lm(CA ~ EUR + L1_group + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP),
  lm(CA ~ AMR + AFR + OTR + L1_group + CA_lag, data = d_ols_models_data, weights = d_ols_models_data$SQRT_POP)
)

#summarize models
ols_models_CA_lag %>% summarize_models() %>% filter(!str_detect(`Predictor/Model`, "L1_group"))

#mediation
ols_models_S_lag_mediation <- mediation::mediate(ols_models_CA_lag[[1]], ols_models_S_lag[[3]], treat = "EUR", mediator = "CA", sims = 1000)
summary(ols_models_S_lag_mediation)

## 
## Causal Mediation Analysis 
## 
## Quasi-Bayesian Confidence Intervals
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME              0.210        0.120        0.302  <2e-16 ***
## ADE               0.465        0.317        0.614  <2e-16 ***
## Total Effect      0.675        0.505        0.855  <2e-16 ***
## Prop. Mediated    0.312        0.192        0.441  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 334 
## 
## 
## Simulations: 1000

#mediation by country group
set.seed(1)
ols_models_S_L1 = fit_mediation_by_group(
  data = d_ols_models_data,
  outcome = "S",
  treatment = "EUR",
  mediator = "CA",
  group_var = "L1_group",
  weight_var = "SQRT_POP"
)

#meta-analysis
rma(
  yi = ols_models_S_L1$PropMediated,
  sei = ols_models_S_L1$PropMediated_se
)

## 
## Random-Effects Model (k = 14; tau^2 estimator: REML)
## 
## tau^2 (estimated amount of total heterogeneity): 0.0801 (SE = 0.0542)
## tau (square root of estimated tau^2 value):      0.2830
## I^2 (total heterogeneity / total variability):   70.89%
## H^2 (total variability / sampling variability):  3.43
## 
## Test for Heterogeneity:
## Q(df = 13) = 84.7379, p-val < .0001
## 
## Model Results:
## 
## estimate      se    zval    pval   ci.lb   ci.ub      
##   0.6271  0.1032  6.0787  <.0001  0.4249  0.8293  *** 
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#mediation by country group with lags
set.seed(1)
ols_models_S_lag_L1 = fit_mediation_by_group(
  data = d_ols_models_data,
  outcome = "S",
  treatment = "EUR",
  mediator = "CA",
  group_var = "L1_group",
  weight_var = "SQRT_POP",
  covariates_m = "CA_lag",
  covariates_y = "S_lag"
)

#meta-analysis
rma(
  yi = ols_models_S_lag_L1$PropMediated,
  sei = ols_models_S_lag_L1$PropMediated_se
)

## 
## Random-Effects Model (k = 14; tau^2 estimator: REML)
## 
## tau^2 (estimated amount of total heterogeneity): 0.0658 (SE = 0.0556)
## tau (square root of estimated tau^2 value):      0.2566
## I^2 (total heterogeneity / total variability):   52.47%
## H^2 (total variability / sampling variability):  2.10
## 
## Test for Heterogeneity:
## Q(df = 13) = 36.3729, p-val = 0.0005
## 
## Model Results:
## 
## estimate      se    zval    pval   ci.lb   ci.ub      
##   0.5598  0.1041  5.3794  <.0001  0.3558  0.7637  *** 
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Spatial regressions

#get distance matrix for spatial models
d_ols_models_dists = st_as_sf(d_ols_models_data, coords = c("lon", "lat"), crs = 4326) %>% st_distance()
quantile(d_ols_models_dists, probs = seq(0, 1, 0.1))

## Units: [m]
##       0%      10%      20%      30%      40%      50%      60%      70% 
##        0  1102412  1831422  2429527  3020888  3586622  4196866  4912233 
##      80%      90%     100% 
##  5944567  7420748 15127223

#neighbors
d_ols_models_nbs = knearneigh(d_ols_models_data %>% select(lat, lon), k=4)
d_ols_models_nbs_listw = d_ols_models_nbs %>% knn2nb() %>% nb2listw(style = "W")

#fit models
#S
sa_lag_models_S = list(
  #no country effects
  lagsarlm(S ~ CA, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(S ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(S ~ CA + EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(S ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  
  
  #fixed effects for countries
  lagsarlm(S ~ CA + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(S ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(S ~ CA + EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(S ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw)
)

#summarize
sa_lag_models_S %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))

#impacts (direct and indirect effects)
sa_lag_models_S %>% map(\(x) impacts(x, listw = d_ols_models_nbs_listw))

## [[1]]
## Impact measures (lag, exact):
##          Direct Indirect  Total
## CA dy/dx 0.0491   0.0268 0.0759
## 
## [[2]]
## Impact measures (lag, exact):
##           Direct Indirect Total
## EUR dy/dx   1.03     1.34  2.37
## 
## [[3]]
## Impact measures (lag, exact):
##           Direct Indirect Total
## CA dy/dx  0.0433   0.0187 0.062
## EUR dy/dx 0.4190   0.1807 0.600
## 
## [[4]]
## Impact measures (lag, exact):
##            Direct Indirect   Total
## AMR dy/dx -1.1814  -1.2345 -2.4159
## AFR dy/dx -0.9169  -0.9581 -1.8749
## OTR dy/dx -0.0464  -0.0485 -0.0949
## 
## [[5]]
## Impact measures (lag, exact):
##                           Direct  Indirect   Total
## CA dy/dx                  0.0549 -0.000560  0.0544
## L1_groupBolivia dy/dx    -0.7731  0.007879 -0.7652
## L1_groupBrazil dy/dx     -0.1683  0.001715 -0.1665
## L1_groupCanada dy/dx      0.1580 -0.001610  0.1563
## L1_groupChile dy/dx       0.0918 -0.000936  0.0909
## L1_groupColombia dy/dx   -0.3795  0.003868 -0.3756
## L1_groupCosta Rica dy/dx -0.1314  0.001339 -0.1301
## L1_groupEcuador dy/dx    -0.5314  0.005416 -0.5260
## L1_groupGuatemala dy/dx  -0.7640  0.007786 -0.7562
## L1_groupMexico dy/dx     -0.3569  0.003637 -0.3532
## L1_groupOthers dy/dx     -0.3085  0.003145 -0.3054
## L1_groupPanama dy/dx      0.0242 -0.000247  0.0240
## L1_groupPeru dy/dx       -0.3632  0.003702 -0.3595
## L1_groupUSA dy/dx         0.0414 -0.000422  0.0410
## 
## [[6]]
## Impact measures (lag, exact):
##                           Direct Indirect   Total
## EUR dy/dx                 0.9808  0.23689  1.2176
## L1_groupBolivia dy/dx    -0.5473 -0.13220 -0.6795
## L1_groupBrazil dy/dx     -0.2818 -0.06807 -0.3499
## L1_groupCanada dy/dx      0.5713  0.13798  0.7092
## L1_groupChile dy/dx       0.2595  0.06267  0.3221
## L1_groupColombia dy/dx   -0.2412 -0.05825 -0.2994
## L1_groupCosta Rica dy/dx  0.0190  0.00459  0.0236
## L1_groupEcuador dy/dx    -0.2228 -0.05382 -0.2766
## L1_groupGuatemala dy/dx  -0.4766 -0.11511 -0.5917
## L1_groupMexico dy/dx      0.0828  0.02000  0.1028
## L1_groupOthers dy/dx      0.1153  0.02784  0.1431
## L1_groupPanama dy/dx      0.0332  0.00802  0.0412
## L1_groupPeru dy/dx       -0.0509 -0.01230 -0.0632
## L1_groupUSA dy/dx         0.5407  0.13060  0.6713
## 
## [[7]]
## Impact measures (lag, exact):
##                           Direct  Indirect   Total
## CA dy/dx                  0.0527 -0.000910  0.0518
## EUR dy/dx                 0.1502 -0.002595  0.1476
## L1_groupBolivia dy/dx    -0.7389  0.012767 -0.7262
## L1_groupBrazil dy/dx     -0.1821  0.003147 -0.1790
## L1_groupCanada dy/dx      0.1755 -0.003032  0.1725
## L1_groupChile dy/dx       0.1087 -0.001878  0.1068
## L1_groupColombia dy/dx   -0.3602  0.006223 -0.3540
## L1_groupCosta Rica dy/dx -0.1233  0.002130 -0.1211
## L1_groupEcuador dy/dx    -0.4950  0.008552 -0.4865
## L1_groupGuatemala dy/dx  -0.7277  0.012572 -0.7151
## L1_groupMexico dy/dx     -0.3173  0.005482 -0.3118
## L1_groupOthers dy/dx     -0.2649  0.004577 -0.2603
## L1_groupPanama dy/dx      0.0607 -0.001049  0.0597
## L1_groupPeru dy/dx       -0.3103  0.005362 -0.3050
## L1_groupUSA dy/dx         0.0549 -0.000949  0.0540
## 
## [[8]]
## Impact measures (lag, exact):
##                            Direct  Indirect    Total
## AMR dy/dx                -1.47346 -0.275338 -1.74880
## AFR dy/dx                -0.75553 -0.141182 -0.89671
## OTR dy/dx                -0.59698 -0.111554 -0.70853
## L1_groupBolivia dy/dx    -0.38566 -0.072067 -0.45773
## L1_groupBrazil dy/dx     -0.41190 -0.076970 -0.48887
## L1_groupCanada dy/dx      0.47689  0.089114  0.56600
## L1_groupChile dy/dx       0.33275  0.062179  0.39493
## L1_groupColombia dy/dx   -0.22818 -0.042638 -0.27082
## L1_groupCosta Rica dy/dx  0.00473  0.000885  0.00562
## L1_groupEcuador dy/dx    -0.10085 -0.018846 -0.11970
## L1_groupGuatemala dy/dx  -0.31179 -0.058262 -0.37005
## L1_groupMexico dy/dx      0.19831  0.037058  0.23537
## L1_groupOthers dy/dx     -0.12112 -0.022634 -0.14376
## L1_groupPanama dy/dx      0.12139  0.022684  0.14408
## L1_groupPeru dy/dx        0.13992  0.026146  0.16606
## L1_groupUSA dy/dx         0.40895  0.076418  0.48537

#CA
sa_lag_models_CA = list(
  #no country effects
  lagsarlm(CA ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(CA ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  
  #fixed effects for countries
  lagsarlm(CA ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw),
  lagsarlm(CA ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw)
)

sa_lag_models_CA %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))

#spatial error model
#S
sa_error_models_S = list(
  #no country effects
  errorsarlm(S ~ CA, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(S ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(S ~ CA + EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(S ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  
  
  #fixed effects for countries
  errorsarlm(S ~ CA + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(S ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(S ~ CA + EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(S ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP)
)

sa_error_models_S %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))

#CA
sa_error_models_CA = list(
  #no country effects
  errorsarlm(CA ~ EUR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(CA ~ AMR + AFR + OTR, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  
  #fixed effects for countries
  errorsarlm(CA ~ EUR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP),
  errorsarlm(CA ~ AMR + AFR + OTR + L1_group, data = d_ols_models_data, listw = d_ols_models_nbs_listw, weights = d_ols_models_data$SQRT_POP)
)

sa_error_models_CA %>% summarize_spatial_models() %>% filter(!str_detect(term, "L1_group"))

Random effects regressions

Ancestry, CA, S

#we need z scale variables for random effects models
d_ols_models_data

#European only
lmer_S_eur_ri = lmer(
  S ~ EUR + (1 | L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

lmer_S_eur_ri %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ EUR + (1 | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 222
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -5.847 -0.421  0.014  0.496  2.880 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  L1_group (Intercept)  0.168   0.41    
##  Residual             94.286   9.71    
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   -0.216      0.123   -1.75
## EUR            1.112      0.121    9.17
## 
## Correlation of Fixed Effects:
##     (Intr)
## EUR -0.431

check_outliers(lmer_S_eur_ri)

## 2 outliers detected: cases 184, 185.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).

lmer_S_eur_ri %>% plot()

lmer_S_eur_full = lmer(
  S ~ EUR + (1 + EUR | L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

lmer_S_eur_full %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ EUR + (1 + EUR | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 220
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -5.973 -0.424 -0.004  0.477  2.906 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  L1_group (Intercept)  0.277   0.527         
##           EUR          0.154   0.392    -0.73
##  Residual             91.980   9.591         
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   -0.250      0.156   -1.60
## EUR            1.258      0.179    7.05
## 
## Correlation of Fixed Effects:
##     (Intr)
## EUR -0.707

lmer_S_eur_full %>% standardize_parameters()

check_outliers(lmer_S_eur_full)

## 2 outliers detected: cases 184, 185.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).

lmer_S_eur_full %>% plot()

#model test, do we need random slopes?
anova(lmer_S_eur_ri, lmer_S_eur_full)

## refitting model(s) with ML (instead of REML)

ggpredict(
  lmer_S_eur_full, 
  terms = c("EUR", "L1_group"),
  type = "random"
  ) %>% 
  plot(
    colors = large_palette
  )

lmer_IQ_eur_ri = lmer(
  CA ~ EUR + (1 | L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

lmer_IQ_eur_ri %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ EUR + (1 | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 1901
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -5.895 -0.482  0.019  0.593  3.999 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  L1_group (Intercept)    25      5     
##  Residual             14815    122     
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)    78.31       1.51   51.72
## EUR            13.80       1.52    9.09
## 
## Correlation of Fixed Effects:
##     (Intr)
## EUR -0.440

lmer_IQ_eur_full = lmer(
  CA ~ EUR + (1 + EUR| L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

lmer_IQ_eur_full %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ EUR + (1 + EUR | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 1883
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.160 -0.438 -0.011  0.581  3.535 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  L1_group (Intercept)    48.3    6.95        
##           EUR            94.4    9.72   -0.71
##  Residual             13446.3  115.96        
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)    76.53       2.11   36.35
## EUR            18.52       3.47    5.34
## 
## Correlation of Fixed Effects:
##     (Intr)
## EUR -0.751

lmer_IQ_eur_full %>% standardize_parameters()

check_outliers(lmer_IQ_eur_full)

## 3 outliers detected: cases 134, 184, 264.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).

#model test, do we need random slopes?
anova(lmer_IQ_eur_ri, lmer_IQ_eur_full)

## refitting model(s) with ML (instead of REML)

ggpredict(
  lmer_IQ_eur_full, 
  terms = c("EUR", "L1_group"),
  type = "random"
  ) %>% 
  plot(
    colors = large_palette
  )

#S ~ CA
lmer_S_CA_ri = lmer(
  S ~ CA + (1 | L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

lmer_S_CA_ri %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA + (1 | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 25.6
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -7.197 -0.441  0.105  0.520  3.788 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  L1_group (Intercept)  0.07    0.265   
##  Residual             51.56    7.180   
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -4.83849    0.25329   -19.1
## CA           0.06057    0.00288    21.1
## 
## Correlation of Fixed Effects:
##    (Intr)
## CA -0.958

check_outliers(lmer_S_CA_ri)

## 3 outliers detected: cases 134, 184, 248.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).

lmer_S_CA_ri %>% plot()

lmer_S_CA_full = lmer(
  S ~ CA + (1 + CA | L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 9.42875 (tol = 0.002, component 1)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

lmer_S_CA_full %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA + (1 + CA | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 20
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -7.037 -0.431  0.071  0.498  4.181 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  L1_group (Intercept) 5.43e-01 0.73669       
##           CA          7.19e-05 0.00848  -0.93
##  Residual             4.98e+01 7.05472       
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -4.8322     0.3399   -14.2
## CA            0.0606     0.0040    15.2
## 
## Correlation of Fixed Effects:
##    (Intr)
## CA -0.976
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 9.42875 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

lmer_S_CA_full %>% standardize_parameters()

check_outliers(lmer_S_CA_full)

## 4 outliers detected: cases 134, 184, 248, 316.
## - Based on the following method and threshold: cook (0.8).
## - For variable: (Whole model).

lmer_S_CA_full %>% plot()

#model test, do we need random slopes?
anova(lmer_S_CA_ri, lmer_S_CA_full)

## refitting model(s) with ML (instead of REML)

#z scored
lmer_S_CA_full_z = lmer(
  S ~ CA_z + (1 + CA_z | L1_group),
  data = d_ols_models_data,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

## boundary (singular) fit: see help('isSingular')

lmer_S_CA_full_z %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA_z + (1 + CA_z | L1_group)
##    Data: d_ols_models_data
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 37.3
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.236 -0.505  0.062  0.576  4.501 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  L1_group (Intercept)  0.0000  0.000        
##           CA_z         0.0459  0.214     NaN
##  Residual             55.1268  7.425        
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   1.3800     0.0201    68.7
## CA_z          1.0227     0.0629    16.2
## 
## Correlation of Fixed Effects:
##      (Intr)
## CA_z 0.273 
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

#non-European
lmer_S_all = lmer(
  S ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group),
  data = d %>% filter(
    !has_L1_subunits,
    !is.na(AMR),
    !Level == "2"
  ),
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

## boundary (singular) fit: see help('isSingular')

lmer_S_all %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group)
##    Data: d %>% filter(!has_L1_subunits, !is.na(AMR), !Level == "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 191
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.071 -0.453  0.008  0.476  3.379 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr             
##  L1_group (Intercept)  0.248   0.498                     
##           AMR          0.436   0.660    -0.72            
##           AFR          0.103   0.322     0.37  0.38      
##           OTR          2.588   1.609     0.85 -0.98 -0.16
##  Residual             75.622   8.696                     
## Number of obs: 375, groups:  L1_group, 16
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)    0.997      0.148    6.76
## AMR           -1.617      0.232   -6.96
## AFR           -1.138      0.165   -6.89
## OTR           -1.206      0.668   -1.80
## 
## Correlation of Fixed Effects:
##     (Intr) AMR    AFR   
## AMR -0.774              
## AFR -0.062  0.315       
## OTR  0.544 -0.548 -0.096
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

lmer_S_all %>% standardize_parameters()

## boundary (singular) fit: see help('isSingular')

check_outliers(lmer_S_all)

## 2 outliers detected: cases 175, 200.
## - Based on the following method and threshold: cook (0.9).
## - For variable: (Whole model).

lmer_IQ_all = lmer(
  CA ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group),
  data = d %>% filter(
    !has_L1_subunits,
    !is.na(AMR),
    !Level == "2"
  ),
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

## boundary (singular) fit: see help('isSingular')

lmer_IQ_all %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ AMR + AFR + OTR + (1 + AMR + AFR + OTR | L1_group)
##    Data: d %>% filter(!has_L1_subunits, !is.na(AMR), !Level == "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 1837
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -5.526 -0.478  0.022  0.539  3.573 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr             
##  L1_group (Intercept)    34.8    5.9                     
##           AMR           120.1   11.0    -0.87            
##           AFR           166.2   12.9    -0.09 -0.37      
##           OTR           784.5   28.0     0.05 -0.50  0.99
##  Residual             12192.6  110.4                     
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)    95.53       2.00   47.79
## AMR           -21.38       3.58   -5.98
## AFR           -27.17       4.72   -5.75
## OTR           -30.95      10.17   -3.04
## 
## Correlation of Fixed Effects:
##     (Intr) AMR    AFR   
## AMR -0.882              
## AFR -0.270 -0.087       
## OTR -0.102 -0.237  0.872
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

lmer_IQ_all %>% standardize_parameters()

## boundary (singular) fit: see help('isSingular')

check_outliers(lmer_IQ_all)

## 2 outliers detected: cases 134, 184.
## - Based on the following method and threshold: cook (0.9).
## - For variable: (Whole model).

#S ~ EUR + CA
lmer_S_EUR = lmer(
  S ~ EUR + (1 + EUR| L1_group),
  data = d %>% filter(
    !has_L1_subunits,
    !is.na(EUR),
    !is.na(CA),
    !Level == "2"
  ),
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

lmer_S_EUR %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ EUR + (1 + EUR | L1_group)
##    Data: d %>% filter(!has_L1_subunits, !is.na(EUR), !is.na(CA), !Level ==  
##     "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 220
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -5.973 -0.424 -0.004  0.477  2.906 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  L1_group (Intercept)  0.277   0.527         
##           EUR          0.154   0.392    -0.73
##  Residual             91.980   9.591         
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   -0.250      0.156   -1.60
## EUR            1.258      0.179    7.05
## 
## Correlation of Fixed Effects:
##     (Intr)
## EUR -0.707

lmer_S_EUR %>% standardize_parameters()

lmer_S_CA = lmer(
  S ~ CA_z + (1 + CA_z | L1_group),
  data = d %>% filter(
    !has_L1_subunits,
    !is.na(EUR),
    !is.na(CA),
    !Level == "2"
  ),
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

## boundary (singular) fit: see help('isSingular')

lmer_S_CA %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA_z + (1 + CA_z | L1_group)
##    Data: d %>% filter(!has_L1_subunits, !is.na(EUR), !is.na(CA), !Level ==  
##     "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 37.3
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.236 -0.505  0.062  0.576  4.501 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  L1_group (Intercept)  0.0000  0.000        
##           CA_z         0.0459  0.214     NaN
##  Residual             55.1268  7.425        
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   1.3800     0.0201    68.7
## CA_z          1.0227     0.0629    16.2
## 
## Correlation of Fixed Effects:
##      (Intr)
## CA_z 0.273 
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

lmer_S_CA %>% standardize_parameters()

lmer_S_CA_EUR = lmer(
  S ~ CA_z + EUR + (1 + CA_z + EUR| L1_group),
  data = d %>% filter(
    !has_L1_subunits,
    !is.na(EUR),
    !is.na(CA),
    !Level == "2"
  ),
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5))
)

## boundary (singular) fit: see help('isSingular')

lmer_S_CA_EUR %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA_z + EUR + (1 + CA_z + EUR | L1_group)
##    Data: d %>% filter(!has_L1_subunits, !is.na(EUR), !is.na(CA), !Level ==  
##     "2")
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: -13.4
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -5.764 -0.445  0.012  0.523  3.722 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  L1_group (Intercept)  0.409   0.640               
##           CA_z         0.102   0.319     0.82      
##           EUR          0.168   0.410    -0.89 -0.46
##  Residual             44.179   6.647               
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)    1.007      0.199    5.06
## CA_z           0.843      0.106    7.96
## EUR            0.415      0.147    2.82
## 
## Correlation of Fixed Effects:
##      (Intr) CA_z  
## CA_z  0.836       
## EUR  -0.834 -0.461
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

lmer_S_CA_EUR %>% standardize_parameters()

## boundary (singular) fit: see help('isSingular')

Complex

#subset to full data for all variables we need
d_complex = d %>% 
  select(
    CA, S,
    EUR, AMR, AFR, OTR, 
    lat, lon,
    IS_LATIN, IS_NONSOV,
    L1_group, has_L1_subunits, Level, SQRT_POP
  ) %>% na.omit() %>% 
  filter(
    !has_L1_subunits,
    !Level == "2"
  ) %>% 
  spatial_knn(
    vars = c("S", "CA"),
    k = 3
  )

d_complex %>% spatial_lag_cors()

##    CA     S 
## 0.893 0.856

#
lmer_IQ_complex = lmer(
  CA ~ abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR + OTR + (1 + abs(lat) + lon + AMR + AFR + OTR | L1_group),
  data = d_complex,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)

## boundary (singular) fit: see help('isSingular')

lmer_IQ_complex %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR + OTR +  
##     (1 + abs(lat) + lon + AMR + AFR + OTR | L1_group)
##    Data: d_complex
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+06))
## 
## REML criterion at convergence: 1703
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.214 -0.582 -0.084  0.549  4.358 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr                         
##  L1_group (Intercept) 4.67e+01  6.8340                               
##           abs(lat)    3.00e-02  0.1732  -0.75                        
##           lon         4.15e-03  0.0644   0.21  0.49                  
##           AMR         8.80e+01  9.3832  -0.93  0.94  0.17            
##           AFR         1.30e+02 11.4030  -0.44  0.93  0.79  0.74      
##           OTR         8.94e+02 29.8974   0.10 -0.11 -0.02 -0.11 -0.08
##  Residual             7.95e+03 89.1782                               
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  92.1727     2.9301   31.46
## abs(lat)      0.0391     0.0545    0.72
## lon          -0.0613     0.0263   -2.33
## IS_LATIN    -10.6696     0.8483  -12.58
## IS_NONSOV     3.3027     0.7605    4.34
## AMR         -13.6502     2.9591   -4.61
## AFR         -20.6992     3.9296   -5.27
## OTR          -8.2429    15.0550   -0.55
## 
## Correlation of Fixed Effects:
##           (Intr) abs(l) lon    IS_LAT IS_NON AMR    AFR   
## abs(lat)  -0.635                                          
## lon        0.496  0.287                                   
## IS_LATIN  -0.495  0.219 -0.263                            
## IS_NONSOV -0.331  0.059 -0.028  0.334                     
## AMR       -0.539  0.803  0.191 -0.180 -0.039              
## AFR       -0.370  0.817  0.504  0.059  0.090  0.627       
## OTR        0.028 -0.033  0.027  0.058  0.020 -0.079  0.003
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

lmer_IQ_complex %>% standardize_parameters()

## boundary (singular) fit: see help('isSingular')

#ditto but without Other group
lmer_IQ_complex_noOtherNatGroup = lmer(
  CA ~ abs(lat) + lon + AMR + AFR + OTR + (1 + abs(lat) + lon + AMR + AFR + OTR | L1_group),
  data = d_complex,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)

## boundary (singular) fit: see help('isSingular')

lmer_IQ_complex_noOtherNatGroup %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: CA ~ abs(lat) + lon + AMR + AFR + OTR + (1 + abs(lat) + lon +  
##     AMR + AFR + OTR | L1_group)
##    Data: d_complex
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+06))
## 
## REML criterion at convergence: 1801
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -4.693 -0.492 -0.052  0.576  3.727 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr                         
##  L1_group (Intercept) 2.35e+01   4.8453                              
##           abs(lat)    1.11e-02   0.1054  0.20                        
##           lon         5.49e-03   0.0741  0.09  0.99                  
##           AMR         1.18e+02  10.8724 -0.95  0.12  0.22            
##           AFR         1.68e+02  12.9621  0.18  1.00  1.00  0.13      
##           OTR         2.82e+02  16.8030  0.19  1.00  1.00  0.12  1.00
##  Residual             1.08e+04 103.7523                              
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  84.5923     2.0670   40.93
## abs(lat)      0.1192     0.0391    3.05
## lon          -0.0680     0.0281   -2.42
## AMR         -16.3517     3.5491   -4.61
## AFR         -21.1920     4.5971   -4.61
## OTR         -12.2737     6.3885   -1.92
## 
## Correlation of Fixed Effects:
##          (Intr) abs(l) lon    AMR    AFR   
## abs(lat) -0.141                            
## lon       0.325  0.668                     
## AMR      -0.709  0.280  0.273              
## AFR      -0.056  0.806  0.749  0.282       
## OTR       0.088  0.717  0.765  0.221  0.813
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

lmer_IQ_complex_noOtherNatGroup %>% standardize_parameters()

## boundary (singular) fit: see help('isSingular')

#S
lmer_S_complex = lmer(
  S ~ CA + abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR + OTR + (1  | L1_group),
  data = d_complex,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)

lmer_S_complex %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: S ~ CA + abs(lat) + lon + IS_LATIN + IS_NONSOV + AMR + AFR +  
##     OTR + (1 | L1_group)
##    Data: d_complex
## Weights: SQRT_POP
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+06))
## 
## REML criterion at convergence: 22.9
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.006 -0.453  0.068  0.508  3.642 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  L1_group (Intercept)  0.0479  0.219   
##  Residual             47.5940  6.899   
## Number of obs: 334, groups:  L1_group, 14
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) -3.441173   0.417858   -8.24
## CA           0.046117   0.003869   11.92
## abs(lat)     0.001871   0.002072    0.90
## lon          0.001178   0.000991    1.19
## IS_LATIN    -0.150208   0.095457   -1.57
## IS_NONSOV    0.318913   0.114546    2.78
## AMR         -0.566102   0.133288   -4.25
## AFR         -0.517106   0.128417   -4.03
## OTR         -0.025614   0.243400   -0.11
## 
## Correlation of Fixed Effects:
##           (Intr) CA     abs(l) lon    IS_LAT IS_NON AMR    AFR   
## CA        -0.880                                                 
## abs(lat)   0.008 -0.218                                          
## lon        0.166 -0.019  0.159                                   
## IS_LATIN  -0.638  0.431  0.099  0.037                            
## IS_NONSOV -0.239 -0.107  0.069 -0.026  0.320                     
## AMR       -0.420  0.334  0.316  0.118  0.052 -0.067              
## AFR       -0.578  0.452  0.260 -0.076  0.395  0.003  0.527       
## OTR       -0.253  0.120  0.271  0.249  0.452  0.162  0.208  0.249

performance::r2_nakagawa(lmer_S_complex)

## # R2 for Mixed Models
## 
##   Conditional R2: 0.008
##      Marginal R2: 0.007

model_performance(lmer_S_complex, metrics = c("RMSE", "R2", "AIC"))

m_null <- lmer(
  S ~ 1 + (1 | L1_group),
  data = d_complex,
  weights = SQRT_POP,
  control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6))
)
AIC(m_null, lmer_S_complex)

anova(m_null, lmer_S_complex)

## refitting model(s) with ML (instead of REML)

#fixed effects
fe_S_complex = feols(
  S ~ CA + abs(lat) + lon + AMR + AFR + OTR | L1_group,
  data = d_complex,
  weights = d_complex$SQRT_POP
)

fe_S_complex %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##          Estimate Std. Error t value  Pr(>|t|)    
## CA        0.05199    0.00348  14.958 < 2.2e-16 ***
## abs(lat)  0.00146    0.00223   0.657 0.5119338    
## lon       0.00145    0.00103   1.407 0.1603214    
## AMR      -0.47661    0.14321  -3.328 0.0009782 ***
## AFR      -0.37438    0.12017  -3.115 0.0020062 ** 
## OTR       0.13450    0.22495   0.598 0.5503342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.187356     Adj. R2: 0.919762
##                  Within R2: 0.57892

fe_CA_complex = feols(
  CA ~ abs(lat) + lon + AMR + AFR + OTR | L1_group,
  data = d_complex,
  weights = d_complex$SQRT_POP
)

fe_CA_complex %>% summary()

## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##          Estimate Std. Error t value   Pr(>|t|)    
## abs(lat)   0.1745     0.0348    5.02 8.7883e-07 ***
## lon        0.0247     0.0167    1.48 1.3925e-01    
## AMR      -12.8920     2.2049   -5.85 1.2509e-08 ***
## AFR      -10.8765     1.8491   -5.88 1.0344e-08 ***
## OTR        5.8129     3.6318    1.60 1.1047e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 3.04194     Adj. R2: 0.872724
##                 Within R2: 0.293103

#with spatial lags
fe_S_complex_lag = feols(
  S ~ CA + abs(lat) + lon + AMR + AFR + OTR + S_lag | L1_group,
  data = d_complex,
  weights = d_complex$SQRT_POP
)

fe_S_complex_lag %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##          Estimate Std. Error t value   Pr(>|t|)    
## CA        0.05242    0.00358  14.653  < 2.2e-16 ***
## abs(lat)  0.00211    0.00256   0.824 0.41050900    
## lon       0.00156    0.00105   1.481 0.13959533    
## AMR      -0.48971    0.14561  -3.363 0.00086595 ***
## AFR      -0.37338    0.12033  -3.103 0.00209039 ** 
## OTR       0.15803    0.22979   0.688 0.49213156    
## S_lag    -0.03147    0.06101  -0.516 0.60635409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.187276     Adj. R2: 0.919574
##                  Within R2: 0.579277

fe_CA_complex_lag = feols(
  CA ~ abs(lat) + lon + AMR + AFR + OTR + CA_lag | L1_group,
  data = d_complex,
  weights = d_complex$SQRT_POP
)

fe_CA_complex_lag %>% summary()

## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_complex$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##          Estimate Std. Error t value   Pr(>|t|)    
## abs(lat)   0.0427     0.0361   1.183 2.3776e-01    
## lon        0.0146     0.0153   0.952 3.4159e-01    
## AMR       -9.2150     2.0755  -4.440 1.2484e-05 ***
## AFR       -9.5866     1.7032  -5.629 4.0281e-08 ***
## OTR        3.4273     3.3434   1.025 3.0610e-01    
## CA_lag     0.5141     0.0659   7.799 9.3016e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 2.78421     Adj. R2: 0.893037
##                 Within R2: 0.407811

Fixed effects regressions

Ancestry, CA, S

#no miss subset
d_nomiss = d %>% filter(
  !has_L1_subunits,
  !is.na(EUR),
  !is.na(CA),
  !Level == "2"
)

feols_CA_eur = feols(
  CA ~ EUR | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_CA_eur %>% summary()

## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##     Estimate Std. Error t value   Pr(>|t|)    
## EUR     13.1       1.56    8.44 1.1276e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 3.2712     Adj. R2: 0.854661
##                Within R2: 0.182533

feols_S_eur = feols(
  S ~ EUR | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_S_eur %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##     Estimate Std. Error t value   Pr(>|t|)    
## EUR     1.05      0.124    8.47 9.4105e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.260902     Adj. R2: 0.846843
##                  Within R2: 0.183447

feols_S_CA = feols(
  S ~ CA | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_S_CA %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##    Estimate Std. Error t value  Pr(>|t|)    
## CA   0.0594    0.00298    19.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.19289     Adj. R2: 0.916285
##                 Within R2: 0.553677

feols_S_CA_z = feols(
  S ~ CA_z | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_S_CA_z %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##      Estimate Std. Error t value  Pr(>|t|)    
## CA_z    0.891     0.0448    19.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.19289     Adj. R2: 0.916285
##                 Within R2: 0.553677

feols_S_CA_EUR = feols(
  S ~ CA_z + EUR | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_S_CA_EUR %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##      Estimate Std. Error t value  Pr(>|t|)    
## CA_z    0.822     0.0488   16.85 < 2.2e-16 ***
## EUR     0.331     0.0999    3.32 0.0010217 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.189641     Adj. R2: 0.918827
##                  Within R2: 0.568587

#all ancestries
feols_CA_all = feols(
  CA ~ AMR + AFR + OTR | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_CA_all %>% summary()

## OLS estimation, Dep. Var.: CA
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##     Estimate Std. Error t value   Pr(>|t|)    
## AMR  -17.620       2.07 -8.5051 7.2794e-16 ***
## AFR  -14.028       1.76 -7.9543 3.2134e-14 ***
## OTR    0.217       3.52  0.0616 9.5090e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 3.1638     Adj. R2: 0.86319 
##                Within R2: 0.235328

feols_S_all = feols(
  S ~ AMR + AFR + OTR | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_S_all %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##     Estimate Std. Error t value   Pr(>|t|)    
## AMR  -1.4441      0.165  -8.770  < 2.2e-16 ***
## AFR  -1.1093      0.140  -7.913 4.2398e-14 ***
## OTR   0.0382      0.280   0.136 8.9172e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.251483     Adj. R2: 0.856804
##                  Within R2: 0.241342

feols_S_CA_all = feols(
  S ~ CA + AMR + AFR + OTR | L1_group,
  data = d_nomiss,
  weights = d_nomiss$SQRT_POP
)

feols_S_CA_all %>% summary()

## OLS estimation, Dep. Var.: S
## Observations: 334
## Weights: d_nomiss$SQRT_POP
## Fixed-effects: L1_group: 14
## Standard-errors: IID 
##     Estimate Std. Error t value   Pr(>|t|)    
## CA    0.0528    0.00334  15.793  < 2.2e-16 ***
## AMR  -0.5139    0.13665  -3.761 0.00020177 ***
## AFR  -0.3687    0.11496  -3.207 0.00147870 ** 
## OTR   0.0267    0.20976   0.127 0.89879521    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.188002     Adj. R2: 0.919719
##                  Within R2: 0.576012

Admixture in the Americas 2025

10 December, 2025

Init

Functions

Data

Recode

Spatial data

National-like units

Subnational units

Water-access

Back-join geographical information

Analysis

Maps

Simple plots

OLS regressions

Spatial regressions

Random effects regressions

Ancestry, CA, S

Complex

Fixed effects regressions

Ancestry, CA, S

Meta