Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  patchwork
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

#sample size
n_country = 1e5

Functions

IQ_to_z = function(x) (x-100)/15
z_to_IQ = function(x) (x*15)+100

#function to get unreliable measurement but keeping the scale
get_measured_values = function(x, reliability) {
  (IQ_to_z(x)*sqrt(reliability) + rnorm(n_country, sd = sqrt(1-reliability))) %>% z_to_IQ()
}

#est true score
estimate_true_score = function(x, x_mean, reliability) {
  (reliability*(x - x_mean) + x_mean) %>% unname()
}

Analysis

#countries
countries = tibble(
  name = c("Nigeria (70)", "India (80)", "Thailand (90)", "NW Euro (100)", "Hong Kong (110)"),
  IQ = c(70, 80, 90, 100, 110)
)

#named vector
countries_vec = countries$IQ %>% set_names(countries$name)

#simulate people from each
set.seed(1)
d = map_dfr(seq_along_rows(countries), \(i) {
  # browser()
  tibble(
    country = countries[i, ]$name,
    IQ_true = rnorm(n_country, mean = countries[i, ]$IQ, sd = 15),
    IQ_measured = get_measured_values(IQ_true, reliability = 0.80)
  )
})

#check is correct SD
describe2(
  d %>% select(IQ_true, IQ_measured),
  group = d$country
)
#naive selection with threshold
d1_res = map_dfr(seq(80, 130, by = 5), \(thres) {
  #summary stats after selection
  d %>% 
    group_by(country) %>% 
    filter(IQ_measured >= thres) %>% 
    summarise(
      mean_IQ_true = mean(IQ_true),
      mean_IQ_measured = mean(IQ_measured),
      n = n(),
      fraction_selected = n/n_country,
      threshold = thres
    )
})

d1_res %>% 
  ggplot(aes(threshold, mean_IQ_true, color = country)) +
  geom_point() +
  geom_text(aes(label = round(mean_IQ_true)), nudge_y = 1) +
  labs(
    title = "Mean IQ after selection by origin group IQ",
    subtitle = "Simulation results. Selection based on IQ test with 0.80 reliability.",
    x = "IQ threshold used",
    y = "Mean true IQ of selected persons",
    color = "Country (IQ)"
  )

GG_save("figs/post-selection means by threshold.png")

#instead vary the reliability and use constant threshold
d2_res = map_dfr(seq(0.5, 1, by = 0.05), \(rel) {
  #summary stats after selection
  d %>% 
    mutate(
      IQ_measured = get_measured_values(IQ_true, reliability = rel)
    ) %>% 
    group_by(country) %>% 
    filter(IQ_measured >= 115) %>% 
    summarise(
      mean_IQ_true = mean(IQ_true),
      mean_IQ_measured = mean(IQ_measured),
      n = n(),
      fraction_selected = n/n_country,
      reliability = rel
    )
})

#plot
d2_res %>% 
  ggplot(aes(reliability, mean_IQ_true, color = country)) +
  geom_point() +
  geom_text(aes(label = round(mean_IQ_true)), nudge_y = 1) +
  labs(
    title = "Mean IQ after selection by origin group IQ",
    subtitle = "Simulation results. Selection based on IQ test with varying reliability, and selecting persons >= 115 IQ.",
    x = "IQ measurement reliability",
    y = "Mean true IQ of selected persons",
    color = "Country (IQ)"
  )

GG_save("figs/post-selection means by reliability.png")

#estimated true scores
d$IQ_estimated_true = estimate_true_score(d$IQ_measured, x_mean = countries_vec[d$country], reliability = 0.8)

#correlations before and after
p1_measured_true = d %>% 
  ggplot(aes(IQ_measured, IQ_true, color = country)) +
  # geom_point(alpha = 0.01) + #very slow
  geom_smooth(method = "lm") +
  labs(
    y = "True IQ",
    x = "Measured IQ",
    title = "Regression models for predicting true IQ from measured IQ",
    subtitle = "Reliability = 0.80"
  ) +
  theme(
    plot.title = element_text(size = 10),
    plot.subtitle = element_text(size = 7)
  )

p2_est_true_true = d %>% 
  ggplot(aes(IQ_estimated_true, IQ_true, color = country)) +
  # geom_point(alpha = 0.01) + #very slow
  geom_smooth(method = "lm") +
  labs(
    y = "True IQ",
    x = "Estimated true IQ",
    title = "Regression models for predicting true IQ from measured IQ",
    subtitle = "Reliability = 0.80"
  ) +
  theme(
    plot.title = element_text(size = 10),
    plot.subtitle = element_text(size = 7)
  )

patchwork::wrap_plots(
  p1_measured_true,
  p2_est_true_true,
  nrow = 1
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/measured_vs_est_true_regs.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#table of corrections needed
d3_res = expand_grid(
  IQ_measured = seq(100, 145, by = 5),
  IQ_group = c(70, 80, 90, 100, 110),
  reliability = c(0.5, 0.6, 0.7, 0.8, 0.9, 1)
) %>% 
  mutate(
    IQ_estimated_true = estimate_true_score(IQ_measured, x_mean = IQ_group, reliability = reliability),
    IQ_correction = IQ_estimated_true-IQ_measured,
    country = mapvalues(IQ_group, from = countries$IQ, to = countries$name)
  )

d3_res %>% 
  ggplot(aes(reliability, IQ_correction, color = country)) +
  geom_point() +
  facet_wrap(~IQ_measured, labeller = "label_both") +
  labs(
    y = "IQ correction",
    x = "Reliability of test",
    title = "Amount of correction needed for unbiased selection",
    subtitle = "Simulation results. Varying reliability and measured IQ."
  )

GG_save("figs/correction_values.png")

Meta

#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.3.2       kirkegaard_2025-11-19 robustbase_0.99-6    
##  [4] psych_2.5.6           assertthat_0.2.1      weights_1.1.2        
##  [7] magrittr_2.0.4        lubridate_1.9.4       forcats_1.0.1        
## [10] stringr_1.5.2         dplyr_1.1.4           purrr_1.1.0          
## [13] readr_2.1.5           tidyr_1.3.1           tibble_3.3.0         
## [16] ggplot2_4.0.0         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   farver_2.1.2       S7_0.2.0           fastmap_1.2.0     
##  [5] digest_0.6.37      rpart_4.1.24       timechange_0.3.0   lifecycle_1.0.4   
##  [9] cluster_2.1.8.1    survival_3.8-3     gdata_3.0.1        compiler_4.5.2    
## [13] rlang_1.1.6        Hmisc_5.2-4        sass_0.4.10        tools_4.5.2       
## [17] yaml_2.3.10        data.table_1.17.8  knitr_1.50         labeling_0.4.3    
## [21] htmlwidgets_1.6.4  mnormt_2.1.1       plyr_1.8.9         RColorBrewer_1.1-3
## [25] withr_3.0.2        foreign_0.8-90     nnet_7.3-20        grid_4.5.2        
## [29] jomo_2.7-6         colorspace_2.1-2   mice_3.18.0        scales_1.4.0      
## [33] gtools_3.9.5       iterators_1.0.14   MASS_7.3-65        cli_3.6.5         
## [37] rmarkdown_2.30     ragg_1.5.0         reformulas_0.4.1   generics_0.1.4    
## [41] rstudioapi_0.17.1  tzdb_0.5.0         minqa_1.2.8        cachem_1.1.0      
## [45] splines_4.5.2      parallel_4.5.2     base64enc_0.1-3    vctrs_0.6.5       
## [49] boot_1.3-32        glmnet_4.1-10      Matrix_1.7-4       jsonlite_2.0.0    
## [53] hms_1.1.3          mitml_0.4-5        Formula_1.2-5      htmlTable_2.4.3   
## [57] systemfonts_1.3.1  foreach_1.5.2      jquerylib_0.1.4    glue_1.8.0        
## [61] DEoptimR_1.1-4     nloptr_2.2.1       pan_1.9            codetools_0.2-20  
## [65] stringi_1.8.7      gtable_0.3.6       shape_1.4.6.1      lme4_1.1-37       
## [69] pillar_1.11.1      htmltools_0.5.8.1  R6_2.6.1           textshaping_1.0.4 
## [73] Rdpack_2.6.4       evaluate_1.0.5     lattice_0.22-7     rbibutils_2.3     
## [77] backports_1.5.0    broom_1.0.10       bslib_0.9.0        Rcpp_1.1.0        
## [81] gridExtra_2.3      nlme_3.1-168       checkmate_2.3.3    mgcv_1.9-3        
## [85] xfun_0.53          pkgconfig_2.0.3
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}