library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
patchwork
)
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
#sample size
n_country = 1e5
IQ_to_z = function(x) (x-100)/15
z_to_IQ = function(x) (x*15)+100
#function to get unreliable measurement but keeping the scale
get_measured_values = function(x, reliability) {
(IQ_to_z(x)*sqrt(reliability) + rnorm(n_country, sd = sqrt(1-reliability))) %>% z_to_IQ()
}
#est true score
estimate_true_score = function(x, x_mean, reliability) {
(reliability*(x - x_mean) + x_mean) %>% unname()
}
#countries
countries = tibble(
name = c("Nigeria (70)", "India (80)", "Thailand (90)", "NW Euro (100)", "Hong Kong (110)"),
IQ = c(70, 80, 90, 100, 110)
)
#named vector
countries_vec = countries$IQ %>% set_names(countries$name)
#simulate people from each
set.seed(1)
d = map_dfr(seq_along_rows(countries), \(i) {
# browser()
tibble(
country = countries[i, ]$name,
IQ_true = rnorm(n_country, mean = countries[i, ]$IQ, sd = 15),
IQ_measured = get_measured_values(IQ_true, reliability = 0.80)
)
})
#check is correct SD
describe2(
d %>% select(IQ_true, IQ_measured),
group = d$country
)
#naive selection with threshold
d1_res = map_dfr(seq(80, 130, by = 5), \(thres) {
#summary stats after selection
d %>%
group_by(country) %>%
filter(IQ_measured >= thres) %>%
summarise(
mean_IQ_true = mean(IQ_true),
mean_IQ_measured = mean(IQ_measured),
n = n(),
fraction_selected = n/n_country,
threshold = thres
)
})
d1_res %>%
ggplot(aes(threshold, mean_IQ_true, color = country)) +
geom_point() +
geom_text(aes(label = round(mean_IQ_true)), nudge_y = 1) +
labs(
title = "Mean IQ after selection by origin group IQ",
subtitle = "Simulation results. Selection based on IQ test with 0.80 reliability.",
x = "IQ threshold used",
y = "Mean true IQ of selected persons",
color = "Country (IQ)"
)
GG_save("figs/post-selection means by threshold.png")
#instead vary the reliability and use constant threshold
d2_res = map_dfr(seq(0.5, 1, by = 0.05), \(rel) {
#summary stats after selection
d %>%
mutate(
IQ_measured = get_measured_values(IQ_true, reliability = rel)
) %>%
group_by(country) %>%
filter(IQ_measured >= 115) %>%
summarise(
mean_IQ_true = mean(IQ_true),
mean_IQ_measured = mean(IQ_measured),
n = n(),
fraction_selected = n/n_country,
reliability = rel
)
})
#plot
d2_res %>%
ggplot(aes(reliability, mean_IQ_true, color = country)) +
geom_point() +
geom_text(aes(label = round(mean_IQ_true)), nudge_y = 1) +
labs(
title = "Mean IQ after selection by origin group IQ",
subtitle = "Simulation results. Selection based on IQ test with varying reliability, and selecting persons >= 115 IQ.",
x = "IQ measurement reliability",
y = "Mean true IQ of selected persons",
color = "Country (IQ)"
)
GG_save("figs/post-selection means by reliability.png")
#estimated true scores
d$IQ_estimated_true = estimate_true_score(d$IQ_measured, x_mean = countries_vec[d$country], reliability = 0.8)
#correlations before and after
p1_measured_true = d %>%
ggplot(aes(IQ_measured, IQ_true, color = country)) +
# geom_point(alpha = 0.01) + #very slow
geom_smooth(method = "lm") +
labs(
y = "True IQ",
x = "Measured IQ",
title = "Regression models for predicting true IQ from measured IQ",
subtitle = "Reliability = 0.80"
) +
theme(
plot.title = element_text(size = 10),
plot.subtitle = element_text(size = 7)
)
p2_est_true_true = d %>%
ggplot(aes(IQ_estimated_true, IQ_true, color = country)) +
# geom_point(alpha = 0.01) + #very slow
geom_smooth(method = "lm") +
labs(
y = "True IQ",
x = "Estimated true IQ",
title = "Regression models for predicting true IQ from measured IQ",
subtitle = "Reliability = 0.80"
) +
theme(
plot.title = element_text(size = 10),
plot.subtitle = element_text(size = 7)
)
patchwork::wrap_plots(
p1_measured_true,
p2_est_true_true,
nrow = 1
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/measured_vs_est_true_regs.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#table of corrections needed
d3_res = expand_grid(
IQ_measured = seq(100, 145, by = 5),
IQ_group = c(70, 80, 90, 100, 110),
reliability = c(0.5, 0.6, 0.7, 0.8, 0.9, 1)
) %>%
mutate(
IQ_estimated_true = estimate_true_score(IQ_measured, x_mean = IQ_group, reliability = reliability),
IQ_correction = IQ_estimated_true-IQ_measured,
country = mapvalues(IQ_group, from = countries$IQ, to = countries$name)
)
d3_res %>%
ggplot(aes(reliability, IQ_correction, color = country)) +
geom_point() +
facet_wrap(~IQ_measured, labeller = "label_both") +
labs(
y = "IQ correction",
x = "Reliability of test",
title = "Amount of correction needed for unbiased selection",
subtitle = "Simulation results. Varying reliability and measured IQ."
)
GG_save("figs/correction_values.png")
#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] patchwork_1.3.2 kirkegaard_2025-11-19 robustbase_0.99-6
## [4] psych_2.5.6 assertthat_0.2.1 weights_1.1.2
## [7] magrittr_2.0.4 lubridate_1.9.4 forcats_1.0.1
## [10] stringr_1.5.2 dplyr_1.1.4 purrr_1.1.0
## [13] readr_2.1.5 tidyr_1.3.1 tibble_3.3.0
## [16] ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 farver_2.1.2 S7_0.2.0 fastmap_1.2.0
## [5] digest_0.6.37 rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.4
## [9] cluster_2.1.8.1 survival_3.8-3 gdata_3.0.1 compiler_4.5.2
## [13] rlang_1.1.6 Hmisc_5.2-4 sass_0.4.10 tools_4.5.2
## [17] yaml_2.3.10 data.table_1.17.8 knitr_1.50 labeling_0.4.3
## [21] htmlwidgets_1.6.4 mnormt_2.1.1 plyr_1.8.9 RColorBrewer_1.1-3
## [25] withr_3.0.2 foreign_0.8-90 nnet_7.3-20 grid_4.5.2
## [29] jomo_2.7-6 colorspace_2.1-2 mice_3.18.0 scales_1.4.0
## [33] gtools_3.9.5 iterators_1.0.14 MASS_7.3-65 cli_3.6.5
## [37] rmarkdown_2.30 ragg_1.5.0 reformulas_0.4.1 generics_0.1.4
## [41] rstudioapi_0.17.1 tzdb_0.5.0 minqa_1.2.8 cachem_1.1.0
## [45] splines_4.5.2 parallel_4.5.2 base64enc_0.1-3 vctrs_0.6.5
## [49] boot_1.3-32 glmnet_4.1-10 Matrix_1.7-4 jsonlite_2.0.0
## [53] hms_1.1.3 mitml_0.4-5 Formula_1.2-5 htmlTable_2.4.3
## [57] systemfonts_1.3.1 foreach_1.5.2 jquerylib_0.1.4 glue_1.8.0
## [61] DEoptimR_1.1-4 nloptr_2.2.1 pan_1.9 codetools_0.2-20
## [65] stringi_1.8.7 gtable_0.3.6 shape_1.4.6.1 lme4_1.1-37
## [69] pillar_1.11.1 htmltools_0.5.8.1 R6_2.6.1 textshaping_1.0.4
## [73] Rdpack_2.6.4 evaluate_1.0.5 lattice_0.22-7 rbibutils_2.3
## [77] backports_1.5.0 broom_1.0.10 bslib_0.9.0 Rcpp_1.1.0
## [81] gridExtra_2.3 nlme_3.1-168 checkmate_2.3.3 mgcv_1.9-3
## [85] xfun_0.53 pkgconfig_2.0.3
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")
#OSF
if (F) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}