library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
patchwork
)
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
#settings
#group means
group_means = c(0, -1)
#group names
group_names = c("blue", "green")
#group colors
group_colors = c("#377EB8", "#4DAF4A")
#sample size
n_group = 1e4
#sim data
sim_data = function(.obs_e_sd = obs_e_sd) {
tibble(
group = rep(group_names, each = n_group),
trait = c(rnorm(n_group, mean = group_means[1]),
rnorm(n_group, mean = group_means[2])),
trait_obs = trait + rnorm(n_group*2, sd = .obs_e_sd)
)
}
#apply affirmative action
apply_transforms = function(x, .rel_x) {
#group means
group_means_empirical = describe2(
x$trait_obs,
group = x$group
)
#corrected obs score
x %>% mutate(
trait_obs_adj = if_else(
condition = x$group == group_names[1],
true = group_means_empirical$mean[1] + .rel_x * (trait_obs - group_means_empirical$mean[1]),
false = group_means_empirical$mean[2] + .rel_x * (trait_obs - group_means_empirical$mean[2])
),
trait_obs_within = if_else(
condition = x$group == group_names[1],
true = x$trait_obs,
false = x$trait_obs - group_means_empirical$mean[2]
)
)
}
#apply pilot selection and calculate benefit
apply_pilot_selection = function(x, vars = c("trait", "trait_obs", "trait_obs_adj", "trait_obs_within"), ratio = 0.1) {
map_dfr(vars, function(var) {
selected = x %>%
arrange(-.data[[var]]) %>%
slice_head(prop = ratio)
selected_desc = describe2(selected["trait"], group = selected$group) %>% select(group, n, mean) %>% mutate(var = var, overall_mean_trait = mean(selected$trait), overall_sd_trait = sd(selected$trait))
})
}
#apply tenant selection
apply_tenant_selection = function(x, vars = c("trait", "trait_obs", "trait_obs_adj", "trait_obs_within"), ratio = 0.8) {
y = map(vars, function(var) {
# browser()
#assign an id
x %<>% mutate(id = row_number())
#select
selected = x %>%
arrange(-.data[[var]]) %>%
slice_head(prop = ratio)
#add in those who didnt get selected
x2 = bind_rows(
selected %>% mutate(rented = T),
x %>% filter(!id %in% selected$id) %>% mutate(rented = F)
)
#calculate utility
x2$tenant_utility = case_when(
!x2$bad_tenant & x2$rented ~ 20,
x2$bad_tenant & x2$rented ~ 50,
!x2$rented ~ -10
)
x2$landlord_utility = case_when(
!x2$bad_tenant & x2$rented ~ 20,
x2$bad_tenant & x2$rented ~ -100,
!x2$rented ~ 0
)
#split
blues = x2 %>% filter(group == "blue")
greens = x2 %>% filter(group == "green")
list(
group_rented = x2 %>%
group_by(group, rented) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
good_tenants = sum(!bad_tenant),
bad_tenants = sum(bad_tenant),
total_tenants = good_tenants + bad_tenants
) %>%
ungroup() %>%
mutate(var = var),
tenant = x2 %>%
group_by(bad_tenant) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
good_tenants = sum(!bad_tenant),
bad_tenants = sum(bad_tenant),
total_tenants = good_tenants + bad_tenants
) %>%
ungroup() %>%
mutate(var = var),
group_tenant = x2 %>%
group_by(group, bad_tenant) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
good_tenants = sum(!bad_tenant),
bad_tenants = sum(bad_tenant),
total_tenants = good_tenants + bad_tenants
) %>%
ungroup() %>%
mutate(var = var),
tenant_rented = x2 %>%
group_by(bad_tenant, rented) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
good_tenants = sum(!bad_tenant),
bad_tenants = sum(bad_tenant),
total_tenants = good_tenants + bad_tenants
) %>%
ungroup() %>%
mutate(var = var)
)
})
#restructure
list(
group_rented = map_dfr(y, ~.$group_rented),
tenant = map_dfr(y, ~.$tenant),
group_tenant = map_dfr(y, ~.$group_tenant),
tenant_rented = map_dfr(y, ~.$tenant_rented)
)
}
# simulate 2 groups, blues and greens, that differ in some trait by 1 SD
# also measure this trait but only imperfectly
# then we apply top down selection for a given ratio
# and observed the effects of using direct true trait value vs. observed trait value vs. corrected values
# we model the benefits and costs to the persons using a cost-benefit matrix
#observation error
obs_e_sd = 0.5
#reliability within group
rel_x = 1 - (obs_e_sd / 1)^2
set.seed(1)
d1 = sim_data()
#apply aa
d1 %<>% apply_transforms(.rel_x = rel_x)
## New names:
## • `` -> `...1`
p1a = GG_denhist(d1, var = "trait", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "True score")
p1b = GG_denhist(d1, var = "trait_obs", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Observed score")
p1c = GG_denhist(d1, var = "trait_obs_adj", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Adjusted observed score")
p1d = GG_denhist(d1, var = "trait_obs_within", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Within group-normed observed score")
patchwork::wrap_plots(p1a, p1b, p1c, p1d)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
GG_save("figs/dists.png")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
#gaps
cohen.d(
d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
group = d1$group
)
## Call: cohen.d(x = d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
## group = d1$group)
## Cohen d statistic of difference between two means
## lower effect upper
## trait -1.03 -1.0 -0.97
## trait_obs -0.93 -0.9 -0.87
## trait_obs_adj -1.23 -1.2 -1.17
## trait_obs_within -0.03 0.0 0.03
##
## Multivariate (Mahalanobis) distance between groups
## [1] 1.1
## r equivalent of difference between two means
## trait trait_obs trait_obs_adj trait_obs_within
## -0.45 -0.41 -0.51 0.00
describe2(
d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
group = d1$group
)
#selection effects
#top 10% selection with simple benefits
apply_pilot_selection(d1) %>% flextable::flextable()
group | n | mean | var | overall_mean_trait | overall_sd_trait |
|---|---|---|---|---|---|
blue | 1,739 | 1.490 | trait | 1.47 | 0.430 |
green | 261 | 1.352 | trait | 1.47 | 0.430 |
blue | 1,699 | 1.339 | trait_obs | 1.30 | 0.602 |
green | 301 | 1.070 | trait_obs | 1.30 | 0.602 |
blue | 1,827 | 1.302 | trait_obs_adj | 1.30 | 0.602 |
green | 173 | 1.245 | trait_obs_adj | 1.30 | 0.602 |
blue | 1,027 | 1.570 | trait_obs_within | 1.10 | 0.749 |
green | 973 | 0.605 | trait_obs_within | 1.10 | 0.749 |
#settings
#observation error
obs_e_sd = 0.7
#reliability within group
rel_x = 1 - (obs_e_sd / 1)^2
rel_x
## [1] 0.51
sqrt(rel_x)
## [1] 0.714
set.seed(1)
d2 = sim_data(obs_e_sd)
#bad tenants
d2$bad_tenant = d2$trait < -2
#apply transforms
d2 %<>% apply_transforms(.rel_x = rel_x)
## New names:
## • `` -> `...1`
table(
group = d2$group,
bad_tenant = d2$bad_tenant
)
## bad_tenant
## group FALSE TRUE
## blue 9741 259
## green 8428 1572
#utility matrix
sim2_utility = expand_grid(
tenant = c("good", "bad"),
rented = c("yes", "no")
) %>% mutate(
tenant_utility = c(20, -10, 50, -10),
landlord_utility = c(20, 0, -100, 0)
)
sim2_utility %>% flextable::flextable()
tenant | rented | tenant_utility | landlord_utility |
|---|---|---|---|
good | yes | 20 | 20 |
good | no | -10 | 0 |
bad | yes | 50 | -100 |
bad | no | -10 | 0 |
#apply selection
sim2_utility = d2 %>% apply_tenant_selection()
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
sim2_utility$group_rented %>% flextable::flextable()
group | rented | tenant_utility | landlord_utility | good_tenants | bad_tenants | total_tenants | var |
|---|---|---|---|---|---|---|---|
blue | false | -7,490 | 0 | 490 | 259 | 749 | trait |
blue | true | 185,020 | 185,020 | 9,251 | 0 | 9,251 | trait |
green | false | -32,510 | 0 | 1,679 | 1,572 | 3,251 | trait |
green | true | 134,980 | 134,980 | 6,749 | 0 | 6,749 | trait |
blue | false | -9,230 | 0 | 716 | 207 | 923 | trait_obs |
blue | true | 183,100 | 175,300 | 9,025 | 52 | 9,077 | trait_obs |
green | false | -30,770 | 0 | 1,742 | 1,335 | 3,077 | trait_obs |
green | true | 145,570 | 110,020 | 6,686 | 237 | 6,923 | trait_obs |
blue | false | -2,790 | 0 | 155 | 124 | 279 | trait_obs_adj |
blue | true | 198,470 | 178,220 | 9,586 | 135 | 9,721 | trait_obs_adj |
green | false | -37,210 | 0 | 2,293 | 1,428 | 3,721 | trait_obs_adj |
green | true | 129,900 | 108,300 | 6,135 | 144 | 6,279 | trait_obs_adj |
blue | false | -20,320 | 0 | 1,785 | 247 | 2,032 | trait_obs_within |
blue | true | 159,720 | 157,920 | 7,956 | 12 | 7,968 | trait_obs_within |
green | false | -19,680 | 0 | 881 | 1,087 | 1,968 | trait_obs_within |
green | true | 175,190 | 102,440 | 7,547 | 485 | 8,032 | trait_obs_within |
#by group and method
sim2_utility$group_rented %>%
group_by(group, var) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
total_utility = tenant_utility + landlord_utility
) %>%
flextable::flextable()
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
group | var | tenant_utility | landlord_utility | total_utility |
|---|---|---|---|---|
blue | trait | 177,530 | 185,020 | 362,550 |
blue | trait_obs | 173,870 | 175,300 | 349,170 |
blue | trait_obs_adj | 195,680 | 178,220 | 373,900 |
blue | trait_obs_within | 139,400 | 157,920 | 297,320 |
green | trait | 102,470 | 134,980 | 237,450 |
green | trait_obs | 114,800 | 110,020 | 224,820 |
green | trait_obs_adj | 92,690 | 108,300 | 200,990 |
green | trait_obs_within | 155,510 | 102,440 | 257,950 |
#by method
sim2_utility$group_rented %>%
group_by(var) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
total_utility = tenant_utility + landlord_utility
) %>%
flextable::flextable()
var | tenant_utility | landlord_utility | total_utility |
|---|---|---|---|
trait | 280,000 | 320,000 | 600,000 |
trait_obs | 288,670 | 285,320 | 573,990 |
trait_obs_adj | 288,370 | 286,520 | 574,890 |
trait_obs_within | 294,910 | 260,360 | 555,270 |
#by tenant and rent
sim2_utility$tenant_rented %>%
flextable::flextable()
bad_tenant | rented | tenant_utility | landlord_utility | good_tenants | bad_tenants | total_tenants | var |
|---|---|---|---|---|---|---|---|
false | false | -21,690 | 0 | 2,169 | 0 | 2,169 | trait |
false | true | 320,000 | 320,000 | 16,000 | 0 | 16,000 | trait |
true | false | -18,310 | 0 | 0 | 1,831 | 1,831 | trait |
false | false | -24,580 | 0 | 2,458 | 0 | 2,458 | trait_obs |
false | true | 314,220 | 314,220 | 15,711 | 0 | 15,711 | trait_obs |
true | false | -15,420 | 0 | 0 | 1,542 | 1,542 | trait_obs |
true | true | 14,450 | -28,900 | 0 | 289 | 289 | trait_obs |
false | false | -24,480 | 0 | 2,448 | 0 | 2,448 | trait_obs_adj |
false | true | 314,420 | 314,420 | 15,721 | 0 | 15,721 | trait_obs_adj |
true | false | -15,520 | 0 | 0 | 1,552 | 1,552 | trait_obs_adj |
true | true | 13,950 | -27,900 | 0 | 279 | 279 | trait_obs_adj |
false | false | -26,660 | 0 | 2,666 | 0 | 2,666 | trait_obs_within |
false | true | 310,060 | 310,060 | 15,503 | 0 | 15,503 | trait_obs_within |
true | false | -13,340 | 0 | 0 | 1,334 | 1,334 | trait_obs_within |
true | true | 24,850 | -49,700 | 0 | 497 | 497 | trait_obs_within |
#by group and tenant
sim2_utility$group_rented %>%
flextable::flextable()
group | rented | tenant_utility | landlord_utility | good_tenants | bad_tenants | total_tenants | var |
|---|---|---|---|---|---|---|---|
blue | false | -7,490 | 0 | 490 | 259 | 749 | trait |
blue | true | 185,020 | 185,020 | 9,251 | 0 | 9,251 | trait |
green | false | -32,510 | 0 | 1,679 | 1,572 | 3,251 | trait |
green | true | 134,980 | 134,980 | 6,749 | 0 | 6,749 | trait |
blue | false | -9,230 | 0 | 716 | 207 | 923 | trait_obs |
blue | true | 183,100 | 175,300 | 9,025 | 52 | 9,077 | trait_obs |
green | false | -30,770 | 0 | 1,742 | 1,335 | 3,077 | trait_obs |
green | true | 145,570 | 110,020 | 6,686 | 237 | 6,923 | trait_obs |
blue | false | -2,790 | 0 | 155 | 124 | 279 | trait_obs_adj |
blue | true | 198,470 | 178,220 | 9,586 | 135 | 9,721 | trait_obs_adj |
green | false | -37,210 | 0 | 2,293 | 1,428 | 3,721 | trait_obs_adj |
green | true | 129,900 | 108,300 | 6,135 | 144 | 6,279 | trait_obs_adj |
blue | false | -20,320 | 0 | 1,785 | 247 | 2,032 | trait_obs_within |
blue | true | 159,720 | 157,920 | 7,956 | 12 | 7,968 | trait_obs_within |
green | false | -19,680 | 0 | 881 | 1,087 | 1,968 | trait_obs_within |
green | true | 175,190 | 102,440 | 7,547 | 485 | 8,032 | trait_obs_within |
#negative utility for bad tenants
sim2_utility$tenant_rented %>%
mutate(
tenant_utility = if_else(bad_tenant, true = -tenant_utility, false = tenant_utility)
) %>%
group_by(var) %>%
summarise(
tenant_utility = sum(tenant_utility),
landlord_utility = sum(landlord_utility),
total_utility = tenant_utility + landlord_utility
)
#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] patchwork_1.3.2.9000 kirkegaard_2025-11-19 robustbase_0.99-6
## [4] psych_2.5.6 assertthat_0.2.1 weights_1.1.2
## [7] magrittr_2.0.4 lubridate_1.9.4 forcats_1.0.1
## [10] stringr_1.6.0 dplyr_1.1.4 purrr_1.2.0
## [13] readr_2.1.5 tidyr_1.3.1 tibble_3.3.0
## [16] ggplot2_4.0.1.9000 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] Rdpack_2.6.4 mnormt_2.1.1 gridExtra_2.3
## [4] rlang_1.1.6.9000 compiler_4.5.2 gdata_3.0.1
## [7] systemfonts_1.3.1 vctrs_0.6.5 pkgconfig_2.0.3
## [10] shape_1.4.6.1 fastmap_1.2.0 backports_1.5.0
## [13] labeling_0.4.3 rmarkdown_2.30 tzdb_0.5.0
## [16] nloptr_2.2.1 ragg_1.5.0 xfun_0.53
## [19] glmnet_4.1-10 jomo_2.7-6 cachem_1.1.0
## [22] jsonlite_2.0.0 uuid_1.2-1 pan_1.9
## [25] broom_1.0.11 parallel_4.5.2 cluster_2.1.8.1
## [28] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [31] RColorBrewer_1.1-3 boot_1.3-32 rpart_4.1.24
## [34] jquerylib_0.1.4 Rcpp_1.1.0 iterators_1.0.14
## [37] knitr_1.50 base64enc_0.1-3 Matrix_1.7-4
## [40] splines_4.5.2 nnet_7.3-20 timechange_0.3.0
## [43] tidyselect_1.2.1 rstudioapi_0.17.1 yaml_2.3.10
## [46] codetools_0.2-20 lattice_0.22-7 plyr_1.8.9
## [49] withr_3.0.2 S7_0.2.1 askpass_1.2.1
## [52] flextable_0.9.10 evaluate_1.0.5 foreign_0.8-90
## [55] survival_3.8-3 zip_2.3.3 xml2_1.4.0
## [58] pillar_1.11.1 mice_3.18.0 checkmate_2.3.3
## [61] foreach_1.5.2 reformulas_0.4.1 generics_0.1.4
## [64] hms_1.1.3 scales_1.4.0 minqa_1.2.8
## [67] gtools_3.9.5 glue_1.8.0 gdtools_0.4.4
## [70] Hmisc_5.2-4 tools_4.5.2 data.table_1.17.8
## [73] lme4_1.1-37 grid_4.5.2 rbibutils_2.3
## [76] colorspace_2.1-2 nlme_3.1-168 htmlTable_2.4.3
## [79] Formula_1.2-5 cli_3.6.5 textshaping_1.0.4
## [82] officer_0.7.0 fontBitstreamVera_0.1.1 gtable_0.3.6
## [85] DEoptimR_1.1-4 sass_0.4.10 digest_0.6.39
## [88] fontquiver_0.2.1 htmlwidgets_1.6.4 farver_2.1.2
## [91] htmltools_0.5.8.1 lifecycle_1.0.4 mitml_0.4-5
## [94] fontLiberation_0.1.0 openssl_2.3.4 MASS_7.3-65
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")
#OSF
if (F) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}