Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  googlesheets4,
  patchwork,
  ggeffects
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

Functions

Data

#read NIQs
gs4_deauth()
d_niq = read_sheet("https://docs.google.com/spreadsheets/d/1cReoeIZLlbxOrN4_wnj52Q6UWFepXSJGi3Gj5m6ZAZg/edit?gid=528842851#gid=528842851", sheet = 3) %>% 
  mutate(
    ISO = pu_translate(Name)
  )
## ✔ Reading from "National IQ datasets".
## ✔ Range ''Seb2024b''.
## No exact match: Hong Kong SAR China
## No exact match: Macao SAR China
## No exact match: Palestinian Territories
## No exact match: Congo - Brazzaville
## No exact match: Congo - Kinshasa
## No exact match: São Tomé & Príncipe
## Best fuzzy match found: Hong Kong SAR China -> Hong Kong SAR, China with distance 1.00
## Best fuzzy match found: Macao SAR China -> Macao SAR, China with distance 1.00
## Best fuzzy match found: Palestinian Territories -> Palestinian territories with distance 1.00
## Best fuzzy match found: Congo - Brazzaville -> Congo (Brazzaville) with distance 3.00
## Best fuzzy match found: Congo - Kinshasa -> Congo (Zaire) with distance 9.00
## Best fuzzy match found: São Tomé & Príncipe -> São Tomé and Príncipe with distance 3.00
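After the fuzzy matching it may be worth counting how many names still failed to translate (such rows are dropped later via the `!is.na(ISO)` filter). A minimal check, not run here and assuming `pu_translate()` returns `NA` for names it cannot match:

#count names without a resolved ISO code
#sum(is.na(d_niq$ISO))
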
#SPI
d_spi = readxl::read_excel("data/2014-2019-SPI-Public.xlsx", sheet = 2) %>% 
  df_legalize_names() %>% 
  rename(ISO = Code)

#regional coding
d_regions = read_sheet("https://docs.google.com/spreadsheets/d/1ToJWNbwYY--w0_nh05slEv4ZCko4a-XZc1rAkymTxAA/edit?gid=0#gid=0")
## ✔ Reading from "Countries regional codings".
## ✔ Range 'Sheet1'.
#join
d = smart_join(
  d_niq,
  d_spi,
  id_col = "ISO"
) %>% 
  smart_join(
    d_regions %>% select(ISO = ISO3, Region1, Region2, Region3),
    id_col = "ISO"
  ) %>% 
  filter(!is.na(ISO), !is.na(NIQ)) %>% 
  arrange(ISO)
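
A small sanity check after the joins (a sketch, not run) is to confirm that no ISO code appears more than once, since a duplicate would indicate a join problem:

#number of duplicated ISO codes after the join (expected: 0)
#sum(duplicated(d$ISO))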

#African ISOs
African_ISOs = d %>% filter(Region2 == "Sub-Saharan Africa") %>% pull(ISO)

#SPI vars
spi_vars = d %>% select(Undernourishment_pct_of_pop:Percent_of_tertiary_students_enrolled_in_globally_ranked_universities) %>% names()

#imputed SPI data, without the IQs (to prevent data leakage)
d_spi_imp = d %>% select(!!spi_vars) %>% miss_impute()
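
Two quick invariants worth checking here (a sketch, not run, assuming `miss_impute()` keeps column names and row order unchanged): the imputed block should contain no IQ column, and it should line up row-wise with `d`.

#no IQ leakage into the imputation inputs, and same row count as d
#stopifnot(!"NIQ" %in% names(d_spi_imp))
#stopifnot(nrow(d_spi_imp) == nrow(d))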

Analysis

Simulation

#simulate variables that correlate with NIQ to varying, random degrees
set.seed(1)
for (i in 1:50) {
  d_niq[[str_glue("cor_{i}")]] = standardize(d_niq$NIQ) + rnorm(n = nrow(d_niq), sd = runif(1, min = 0.5, max = 4))
}
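
Because the signal is standardized and the added noise is independent with standard deviation s, the expected correlation with NIQ is 1/sqrt(1 + s^2). The implied range for the simulated variables (not run):

#expected correlation range for noise sd drawn from U(0.5, 4)
#1 / sqrt(1 + c(0.5, 4)^2)  #=> about 0.89 down to 0.24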

#distribution of correlations with NIQ
d_niq_cors = d_niq %>% select(NIQ, starts_with("cor_")) %>% wtd.cors() %>% .[-1, 1]

d_niq_cors %>% describe2()
d_niq_cors %>% GG_denhist()
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`

#pick 5 random variables and impute IQs below 75 based on the 75+ data
set.seed(1)
d1_imp_res = map_df(1:1000, function(sim) {
  # browser()
  #which vars
  pred_vars = str_glue("cor_{sample(1:50, 5)}")
  
  #formula
  form = str_glue("NIQ ~ {str_c(pred_vars, collapse = ' + ')}")
  
  #subset idx
  subset_idx = (d_niq$NIQ >= 75)
  
  #fit
  niq_lm = lm(form, data = d_niq[subset_idx, ])
  
  #impute
  niq_lm_pred = predict(niq_lm, newdata = d_niq[!subset_idx, ]) 
  
  tibble(
    sim = sim,
    mean_imputed = mean(niq_lm_pred),
    model_rsq = niq_lm %>% broom::glance() %>% .[["adj.r.squared"]]
  )
})

#true value
true_mean_below_75 = d_niq %>% filter(NIQ <= 75) %>% pull(NIQ) %>% mean()

#imputed IQ vs. model accuracy
p1 = d1_imp_res %>% 
  ggplot(aes(model_rsq, mean_imputed)) + 
  geom_point() + 
  geom_smooth(method = "lm", fullrange = T) +
  geom_hline(yintercept = true_mean_below_75, linetype = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Predict countries below 75 IQ",
    x = "Model R²",
    y = "Average predicted IQ"
  )

#predicted value given perfect model accuracy (R² = 1)
lm_pred_models = lm(mean_imputed ~ model_rsq, data = d1_imp_res)

predict(lm_pred_models, newdata = data.frame(model_rsq = 1))
##    1 
## 69.6
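
The point extrapolation above ignores its own uncertainty; one could also ask for a confidence interval at R² = 1 (a sketch, not run, using the standard `interval` argument of `predict.lm()`):

#confidence interval around the extrapolated mean at R² = 1
#predict(lm_pred_models, newdata = data.frame(model_rsq = 1), interval = "confidence")
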
#how many at or below 75?
(d_niq$NIQ <= 75) %>% sum()
## [1] 50
#how many at or above 90?
(d_niq$NIQ >= 90) %>% sum()
## [1] 50
#repeat for the other tail: predict countries above 90 IQ from those at or below 90
set.seed(1)
d1_imp_res2 = map_df(1:1000, function(sim) {
  # browser()
  #which vars
  pred_vars = str_glue("cor_{sample(1:50, 5)}")
  
  #formula
  form = str_glue("NIQ ~ {str_c(pred_vars, collapse = ' + ')}")
  
  #subset idx
  subset_idx = (d_niq$NIQ <= 90)
  
  #fit
  niq_lm = lm(form, data = d_niq[subset_idx, ])
  
  #impute
  niq_lm_pred = predict(niq_lm, newdata = d_niq[!subset_idx, ]) 
  
  tibble(
    sim = sim,
    mean_imputed = mean(niq_lm_pred),
    model_rsq = niq_lm %>% broom::glance() %>% .[["adj.r.squared"]]
  )
})

#true value
true_mean_above_90 = d_niq %>% filter(NIQ >= 90) %>% pull(NIQ) %>% mean()

#imputed IQ vs. model accuracy
p2 = d1_imp_res2 %>% 
  ggplot(aes(model_rsq, mean_imputed)) + 
  geom_point() + 
  geom_smooth(method = "lm", fullrange = T) +
  geom_hline(yintercept = true_mean_above_90, linetype = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Predict countries above 90 IQ",
    x = "Model R²",
    y = "Average predicted IQ"
  )

#predicted value given perfect model accuracy (R² = 1)
lm_pred_models2 = lm(mean_imputed ~ model_rsq, data = d1_imp_res2)

predict(lm_pred_models2, newdata = data.frame(model_rsq = 1))
##    1 
## 97.8
patchwork::wrap_plots(p1, p2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/model_preds.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Impute African IQs

#impute African IQs
set.seed(1)
d_pred_african = map_df(1:10000, function(sim) {
  # browser()
  
  #how many vars
  pred_var_count = sample(2:7, size = 1)
  
  #which vars
  pred_vars = sample(spi_vars, size = pred_var_count)
  
  #formula
  form = str_glue("NIQ ~ {str_c(pred_vars, collapse = ' + ')}")
  
  #make subset with imputed SPI and IQs
  d_sub = bind_cols(
    d_spi_imp,
    d %>% select(ISO, NIQ)
  )
  
  #subset idx
  subset_idx = !d_sub$ISO %in% African_ISOs
  
  #fit
  niq_lm = lm(form, data = d_sub[subset_idx, ])
  
  #impute
  niq_lm_pred = predict(niq_lm, newdata = d_sub[!subset_idx, ]) 
  
  tibble(
    sim = sim,
    mean_imputed = mean(niq_lm_pred, na.rm = T),
    pred_vars_n = length(pred_vars),
    pred_vars = list(pred_vars),
    imputed = list(tibble(ISO = African_ISOs, NIQ_pred = niq_lm_pred)),
    model_rsq = niq_lm %>% broom::glance() %>% .[["adj.r.squared"]]
  )
})

#true value
true_mean_SSA = d %>% filter(ISO %in% African_ISOs) %>% pull(NIQ) %>% mean()

#plot meta-regression
d_pred_african %>% 
  ggplot(aes(model_rsq, mean_imputed)) + 
  geom_point() + 
  geom_smooth(method = "lm", fullrange = T) +
  geom_hline(yintercept = true_mean_SSA, linetype = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Predict African NIQs",
    subtitle = "From random models with 2-7 variables from Social Progress Index database",
    x = "Model R²",
    y = "Average predicted IQ"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/pred_African.png")
## `geom_smooth()` using formula = 'y ~ x'
#investigate causes of very low predicted IQs
#we need the matrix of included variables
# d_pred_african_vars = matrix(F, nrow = nrow(d_pred_african), ncol = length(spi_vars))
# colnames(d_pred_african_vars) = spi_vars
# d_pred_african_vars %<>% as_tibble()
# 
# #fill in
# for (model in 1:nrow(d_pred_african)) {
#   this_model_preds = d_pred_african$pred_vars %>% unlist()
#   
#   for(v in this_model_preds) {
#     d_pred_african_vars[model, v] = T
#   }
# }

#the loop above is very slow; the vectorized replacement below was written with GPT-5.1
n_models <- nrow(d_pred_african)
p        <- length(spi_vars)

# start with all FALSE
d_pred_african_vars <- matrix(FALSE, nrow = n_models, ncol = p,
                              dimnames = list(NULL, spi_vars))

# list-column of predictors
pv <- d_pred_african$pred_vars

# row indices: model 1 repeated length(pv[[1]]) times, etc.
i <- rep(seq_len(n_models), lengths(pv))

# column indices: match variable names to spi_vars
j <- match(unlist(pv), spi_vars)

# set included cells to TRUE
d_pred_african_vars[cbind(i, j)] <- TRUE

# if you want a tibble:
d_pred_african_vars <- as_tibble(d_pred_african_vars)

#verify against the stored predictor counts
(d_pred_african$pred_vars_n == (d_pred_african_vars %>% rowSums())) %>% all()
## [1] TRUE
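
The row sums only verify the counts; a spot check (a sketch, not run) can also confirm that the flagged columns match the stored predictor names, which together with the count check pins down the exact set:

#spot check: the columns flagged TRUE for model 1 are exactly its predictors
#all(unlist(d_pred_african_vars[1, pv[[1]]]))
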
#change to factor coding
d_pred_african_vars = map_df(d_pred_african_vars, as.factor)

#join
d_pred_african = bind_cols(
  d_pred_african, 
  d_pred_african_vars
)

#figure out which models predict higher and lower
african_models = list(
  simplest = lm(mean_imputed ~ model_rsq, data = d_pred_african),
  pred_count = lm(mean_imputed ~ model_rsq + pred_vars_n, data = d_pred_african),
  full = lm(str_glue("mean_imputed ~ model_rsq + pred_vars_n + {str_c(spi_vars, collapse = ' + ')}"), data = d_pred_african),
  simplest_wo_bad_vars = lm(mean_imputed ~ model_rsq, data = d_pred_african %>% filter(Deaths_from_infectious_diseases_deaths_100_000_people == F))
)

ggpredict(african_models$simplest, terms = c("model_rsq"))
ggpredict(african_models$pred_count, terms = c("model_rsq"))
ggpredict(african_models$full, terms = c("model_rsq"))
## Warning in predict.lm(model, newdata = data_grid, type = "response", se.fit =
## se, : prediction from rank-deficient fit; attr(*, "non-estim") has doubtful
## cases
#the rank-deficiency warning for the full model is expected: pred_vars_n is the row sum of the inclusion indicators, so the terms are exactly collinear
ggpredict(african_models$simplest_wo_bad_vars, terms = c("model_rsq"))
african_models %>% 
  summarize_models() %>%
  filter(!str_detect(`Predictor/Model`, "TRUE"))
#imputed means by whether each variable is included
african_models_means_by_pred = map_df(spi_vars, function(v) {
  # browser()
  #models with this variable
  d2 = bind_cols(
    model = 1:nrow(d_pred_african_vars),
    d_pred_african_vars
  )
  d2 = d2 %>% filter(.data[[v]] == T)
  
  tibble(
    var = v,
    n_models = nrow(d2),
    mean_imputed = mean(d_pred_african[d2$model,] %>% pull(mean_imputed))
  )
})
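
To see which variables pull the imputations down (or up), this table can simply be sorted by the mean imputed IQ (a sketch, not run):

#rank variables by the mean imputed IQ of the models that include them
#african_models_means_by_pred %>% arrange(mean_imputed)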

#color models by inclusion of the main biasing variable (infectious disease deaths)
d_pred_african %>% 
  ggplot(aes(model_rsq, mean_imputed, color = Deaths_from_infectious_diseases_deaths_100_000_people)) + 
  geom_point() + 
  geom_smooth(method = "lm", fullrange = T) +
  geom_hline(yintercept = true_mean_SSA, linetype = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Predict African NIQs",
    subtitle = "From random models with 2-7 variables from Social Progress Index database",
    x = "Model R²",
    y = "Average predicted IQ"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/pred_African_infec_diseases.png")
## `geom_smooth()` using formula = 'y ~ x'
#among models that include the infectious disease variable, color by access to essential services
d_pred_african %>% 
  filter(Deaths_from_infectious_diseases_deaths_100_000_people == T) %>% 
  ggplot(aes(model_rsq, mean_imputed, color = Access_to_essential_services_0_none_100_full_coverage)) + 
  geom_point() + 
  geom_smooth(method = "lm", fullrange = T) +
  geom_hline(yintercept = true_mean_SSA, linetype = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Predict African NIQs",
    subtitle = "From random models with 2-7 variables from Social Progress Index database",
    x = "Model R²",
    y = "Average predicted IQ"
  )
## `geom_smooth()` using formula = 'y ~ x'

#code models by the combination of these two variables
d_pred_african %>% 
  mutate(
    var_combo = case_when(
      Deaths_from_infectious_diseases_deaths_100_000_people==T & Access_to_essential_services_0_none_100_full_coverage == T ~ "infec. dis. yes, ess. serv. yes",
      Deaths_from_infectious_diseases_deaths_100_000_people==T & Access_to_essential_services_0_none_100_full_coverage == F ~ "infec. dis. yes, ess. serv. no",
      Deaths_from_infectious_diseases_deaths_100_000_people==F & Access_to_essential_services_0_none_100_full_coverage == T ~ "infec. dis. no, ess. serv. yes",
      Deaths_from_infectious_diseases_deaths_100_000_people==F & Access_to_essential_services_0_none_100_full_coverage == F ~ "infec. dis. no, ess. serv. no",
    )
  ) %>% 
  ggplot(aes(model_rsq, mean_imputed, color = var_combo)) + 
  geom_point() + 
  geom_smooth(method = "lm", fullrange = T) +
  geom_hline(yintercept = true_mean_SSA, linetype = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Predict African NIQs",
    subtitle = "From random models with 2-7 variables from Social Progress Index database",
    x = "Model R²",
    y = "Average predicted IQ"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/pred_African_bad_vars.png")
## `geom_smooth()` using formula = 'y ~ x'

Meta

#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeffects_2.3.1       patchwork_1.3.2       googlesheets4_1.1.2  
##  [4] kirkegaard_2025-11-19 robustbase_0.99-6     psych_2.5.6          
##  [7] assertthat_0.2.1      weights_1.1.2         magrittr_2.0.4       
## [10] lubridate_1.9.4       forcats_1.0.1         stringr_1.5.2        
## [13] dplyr_1.1.4           purrr_1.1.0           readr_2.1.5          
## [16] tidyr_1.3.1           tibble_3.3.0          ggplot2_4.0.0        
## [19] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] Rdpack_2.6.4       mnormt_2.1.1       gridExtra_2.3     
##   [4] readxl_1.4.5       rlang_1.1.6        e1071_1.7-16      
##   [7] compiler_4.5.2     mgcv_1.9-3         gdata_3.0.1       
##  [10] systemfonts_1.3.1  vctrs_0.6.5        pkgconfig_2.0.3   
##  [13] shape_1.4.6.1      fastmap_1.2.0      backports_1.5.0   
##  [16] labeling_0.4.3     rmarkdown_2.30     tzdb_0.5.0        
##  [19] haven_2.5.5        nloptr_2.2.1       ragg_1.5.0        
##  [22] xfun_0.53          glmnet_4.1-10      jomo_2.7-6        
##  [25] cachem_1.1.0       jsonlite_2.0.0     pan_1.9           
##  [28] broom_1.0.10       parallel_4.5.2     cluster_2.1.8.1   
##  [31] R6_2.6.1           vcd_1.4-13         bslib_0.9.0       
##  [34] stringi_1.8.7      RColorBrewer_1.1-3 ranger_0.17.0     
##  [37] car_3.1-3          boot_1.3-32        rpart_4.1.24      
##  [40] lmtest_0.9-40      jquerylib_0.1.4    cellranger_1.1.0  
##  [43] Rcpp_1.1.0         iterators_1.0.14   knitr_1.50        
##  [46] zoo_1.8-14         base64enc_0.1-3    Matrix_1.7-4      
##  [49] splines_4.5.2      nnet_7.3-20        timechange_0.3.0  
##  [52] tidyselect_1.2.1   abind_1.4-8        rstudioapi_0.17.1 
##  [55] stringdist_0.9.15  yaml_2.3.10        codetools_0.2-20  
##  [58] curl_7.0.0         plyr_1.8.9         lattice_0.22-7    
##  [61] withr_3.0.2        S7_0.2.0           evaluate_1.0.5    
##  [64] foreign_0.8-90     survival_3.8-3     proxy_0.4-27      
##  [67] pillar_1.11.1      carData_3.0-5      mice_3.18.0       
##  [70] checkmate_2.3.3    VIM_6.2.6          foreach_1.5.2     
##  [73] reformulas_0.4.1   insight_1.4.2      generics_0.1.4    
##  [76] sp_2.2-0           hms_1.1.3          scales_1.4.0      
##  [79] laeken_0.5.3       minqa_1.2.8        gtools_3.9.5      
##  [82] class_7.3-23       glue_1.8.0         Hmisc_5.2-4       
##  [85] tools_4.5.2        data.table_1.17.8  lme4_1.1-37       
##  [88] fs_1.6.6           grid_4.5.2         rbibutils_2.3     
##  [91] datawizard_1.3.0   colorspace_2.1-2   nlme_3.1-168      
##  [94] htmlTable_2.4.3    googledrive_2.1.2  Formula_1.2-5     
##  [97] cli_3.6.5          textshaping_1.0.4  gargle_1.6.0      
## [100] gtable_0.3.6       DEoptimR_1.1-4     sass_0.4.10       
## [103] digest_0.6.37      htmlwidgets_1.6.4  farver_2.1.2      
## [106] htmltools_0.5.8.1  lifecycle_1.0.4    httr_1.4.7        
## [109] mitml_0.4-5        MASS_7.3-65
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}