Init
Functions
Analysis
- Simulation 1 - Pilots
- Simulation 2 - Landlords and tenants
Meta

Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  patchwork
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

#settings
#group means
group_means = c(0, -1)
#group names
group_names = c("blue", "green")
#group colors
group_colors = c("#377EB8", "#4DAF4A")
#sample size
n_group = 1e4

Functions

#sim data
sim_data = function(.obs_e_sd = obs_e_sd) {
  tibble(
    group = rep(group_names, each = n_group),
    trait = c(rnorm(n_group, mean = group_means[1]),
              rnorm(n_group, mean = group_means[2])),
    trait_obs = trait + rnorm(n_group*2, sd = .obs_e_sd)
  )
}

#apply affirmative action
apply_transforms = function(x, .rel_x) {
  #group means
  group_means_empirical = describe2(
    x$trait_obs,
    group = x$group
  )
  
  #corrected obs score
  x %>% mutate(
    trait_obs_adj = if_else(
      condition = x$group == group_names[1],
      true = group_means_empirical$mean[1] + .rel_x * (trait_obs - group_means_empirical$mean[1]),
      false = group_means_empirical$mean[2] + .rel_x * (trait_obs - group_means_empirical$mean[2])
    ),
    trait_obs_within = if_else(
      condition = x$group == group_names[1],
      true = x$trait_obs,
      false = x$trait_obs - group_means_empirical$mean[2]
    )
  )
}

#apply pilot selection and calculate benefit
apply_pilot_selection = function(x, vars = c("trait", "trait_obs", "trait_obs_adj", "trait_obs_within"), ratio = 0.1) {
  map_dfr(vars, function(var) {
    selected = x %>% 
      arrange(-.data[[var]]) %>% 
      slice_head(prop = ratio)
    
    selected_desc = describe2(selected["trait"], group = selected$group) %>% select(group, n, mean) %>% mutate(var = var, overall_mean_trait = mean(selected$trait), overall_sd_trait = sd(selected$trait))
  })
}

#apply tenant selection
apply_tenant_selection = function(x, vars = c("trait", "trait_obs", "trait_obs_adj", "trait_obs_within"), ratio = 0.8) {
  y = map(vars, function(var) {
    # browser()
    #assign an id
    x %<>% mutate(id = row_number())
      
    #select
    selected = x %>% 
        arrange(-.data[[var]]) %>% 
        slice_head(prop = ratio)
      
    #add in those who didnt get selected
    x2 = bind_rows(
        selected %>% mutate(rented = T),
        x %>% filter(!id %in% selected$id) %>% mutate(rented = F)
    )
    
    #calculate utility
    x2$tenant_utility = case_when(
      !x2$bad_tenant & x2$rented ~ 20,
      x2$bad_tenant & x2$rented ~ 50,
      !x2$rented ~ -10
    )
    
    x2$landlord_utility = case_when(
      !x2$bad_tenant & x2$rented ~ 20,
      x2$bad_tenant & x2$rented ~ -100,
      !x2$rented ~ 0
    )
    
    #split
    blues = x2 %>% filter(group == "blue")
    greens = x2 %>% filter(group == "green")

    list(
      group_rented = x2 %>% 
        group_by(group, rented) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var),
      
      tenant = x2 %>% 
        group_by(bad_tenant) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var),
      
      group_tenant = x2 %>% 
        group_by(group, bad_tenant) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var),
      
      tenant_rented = x2 %>% 
        group_by(bad_tenant, rented) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var)
    )
  })

  #restructure
  list(
    group_rented = map_dfr(y, ~.$group_rented),
    tenant = map_dfr(y, ~.$tenant),
    group_tenant = map_dfr(y, ~.$group_tenant),
    tenant_rented = map_dfr(y, ~.$tenant_rented)
  )
}

Analysis

Simulation 1 - Pilots

# simulate 2 groups, blues and greens, that differ in some trait by 1 SD
# also measure this trait but only imperfectly
# then we apply top down selection for a given ratio
# and observed the effects of using direct true trait value vs. observed trait value vs. corrected values
# we model the benefits and costs to the persons using a cost-benefit matrix

#observation error
obs_e_sd = 0.5
#reliability within group
rel_x = 1 - (obs_e_sd / 1)^2

set.seed(1)
d1 = sim_data()

#apply aa
d1 %<>% apply_transforms(.rel_x = rel_x)

## New names:
## • `` -> `...1`

p1a = GG_denhist(d1, var = "trait", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "True score")
p1b = GG_denhist(d1, var = "trait_obs", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Observed score")
p1c = GG_denhist(d1, var = "trait_obs_adj", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Adjusted observed score")
p1d = GG_denhist(d1, var = "trait_obs_within", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Within group-normed observed score")

patchwork::wrap_plots(p1a, p1b, p1c, p1d)

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

GG_save("figs/dists.png")

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#gaps
cohen.d(
  d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
  group = d1$group
)

## Call: cohen.d(x = d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within), 
##     group = d1$group)
## Cohen d statistic of difference between two means
##                  lower effect upper
## trait            -1.03   -1.0 -0.97
## trait_obs        -0.93   -0.9 -0.87
## trait_obs_adj    -1.23   -1.2 -1.17
## trait_obs_within -0.03    0.0  0.03
## 
## Multivariate (Mahalanobis) distance between groups
## [1] 1.1
## r equivalent of difference between two means
##            trait        trait_obs    trait_obs_adj trait_obs_within 
##            -0.45            -0.41            -0.51             0.00

describe2(
  d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
  group = d1$group
)

#selection effects
#top 10% selection with simple benefits
apply_pilot_selection(d1) %>% flextable::flextable()

group	n	mean	var	overall_mean_trait	overall_sd_trait
blue	1,739	1.490	trait	1.47	0.430
green	261	1.352	trait	1.47	0.430
blue	1,699	1.339	trait_obs	1.30	0.602
green	301	1.070	trait_obs	1.30	0.602
blue	1,827	1.302	trait_obs_adj	1.30	0.602
green	173	1.245	trait_obs_adj	1.30	0.602
blue	1,027	1.570	trait_obs_within	1.10	0.749
green	973	0.605	trait_obs_within	1.10	0.749

Simulation 2 - Landlords and tenants

#settings
#observation error
obs_e_sd = 0.7
#reliability within group
rel_x = 1 - (obs_e_sd / 1)^2
rel_x

## [1] 0.51

sqrt(rel_x)

## [1] 0.714

set.seed(1)
d2 = sim_data(obs_e_sd)

#bad tenants
d2$bad_tenant = d2$trait < -2

#apply transforms
d2 %<>% apply_transforms(.rel_x = rel_x)

## New names:
## • `` -> `...1`

table(
  group = d2$group,
  bad_tenant = d2$bad_tenant
)

##        bad_tenant
## group   FALSE TRUE
##   blue   9741  259
##   green  8428 1572

#utility matrix
sim2_utility = expand_grid(
  tenant = c("good", "bad"),
  rented = c("yes", "no")
  ) %>% mutate(
    tenant_utility = c(20, -10, 50, -10),
    landlord_utility = c(20, 0, -100, 0)
  )

sim2_utility %>% flextable::flextable()

tenant	rented	tenant_utility	landlord_utility
good	yes	20	20
good	no	-10	0
bad	yes	50	-100
bad	no	-10	0

#apply selection
sim2_utility = d2 %>% apply_tenant_selection()

## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.

sim2_utility$group_rented %>% flextable::flextable()

group	rented	tenant_utility	landlord_utility	good_tenants	bad_tenants	total_tenants	var
blue	false	-7,490	0	490	259	749	trait
blue	true	185,020	185,020	9,251	0	9,251	trait
green	false	-32,510	0	1,679	1,572	3,251	trait
green	true	134,980	134,980	6,749	0	6,749	trait
blue	false	-9,230	0	716	207	923	trait_obs
blue	true	183,100	175,300	9,025	52	9,077	trait_obs
green	false	-30,770	0	1,742	1,335	3,077	trait_obs
green	true	145,570	110,020	6,686	237	6,923	trait_obs
blue	false	-2,790	0	155	124	279	trait_obs_adj
blue	true	198,470	178,220	9,586	135	9,721	trait_obs_adj
green	false	-37,210	0	2,293	1,428	3,721	trait_obs_adj
green	true	129,900	108,300	6,135	144	6,279	trait_obs_adj
blue	false	-20,320	0	1,785	247	2,032	trait_obs_within
blue	true	159,720	157,920	7,956	12	7,968	trait_obs_within
green	false	-19,680	0	881	1,087	1,968	trait_obs_within
green	true	175,190	102,440	7,547	485	8,032	trait_obs_within

#by group and method
sim2_utility$group_rented %>% 
  group_by(group, var) %>% 
  summarise(
    tenant_utility = sum(tenant_utility),
    landlord_utility = sum(landlord_utility),
    total_utility = tenant_utility + landlord_utility
  ) %>% 
  flextable::flextable()

## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.

group	var	tenant_utility	landlord_utility	total_utility
blue	trait	177,530	185,020	362,550
blue	trait_obs	173,870	175,300	349,170
blue	trait_obs_adj	195,680	178,220	373,900
blue	trait_obs_within	139,400	157,920	297,320
green	trait	102,470	134,980	237,450
green	trait_obs	114,800	110,020	224,820
green	trait_obs_adj	92,690	108,300	200,990
green	trait_obs_within	155,510	102,440	257,950

#by method
sim2_utility$group_rented %>% 
  group_by(var) %>% 
  summarise(
    tenant_utility = sum(tenant_utility),
    landlord_utility = sum(landlord_utility),
    total_utility = tenant_utility + landlord_utility
  ) %>% 
  flextable::flextable()

var	tenant_utility	landlord_utility	total_utility
trait	280,000	320,000	600,000
trait_obs	288,670	285,320	573,990
trait_obs_adj	288,370	286,520	574,890
trait_obs_within	294,910	260,360	555,270

#by tenant and rent
sim2_utility$tenant_rented %>% 
  flextable::flextable()

bad_tenant	rented	tenant_utility	landlord_utility	good_tenants	bad_tenants	total_tenants	var
false	false	-21,690	0	2,169	0	2,169	trait
false	true	320,000	320,000	16,000	0	16,000	trait
true	false	-18,310	0	0	1,831	1,831	trait
false	false	-24,580	0	2,458	0	2,458	trait_obs
false	true	314,220	314,220	15,711	0	15,711	trait_obs
true	false	-15,420	0	0	1,542	1,542	trait_obs
true	true	14,450	-28,900	0	289	289	trait_obs
false	false	-24,480	0	2,448	0	2,448	trait_obs_adj
false	true	314,420	314,420	15,721	0	15,721	trait_obs_adj
true	false	-15,520	0	0	1,552	1,552	trait_obs_adj
true	true	13,950	-27,900	0	279	279	trait_obs_adj
false	false	-26,660	0	2,666	0	2,666	trait_obs_within
false	true	310,060	310,060	15,503	0	15,503	trait_obs_within
true	false	-13,340	0	0	1,334	1,334	trait_obs_within
true	true	24,850	-49,700	0	497	497	trait_obs_within

#by group and tenant
sim2_utility$group_rented %>% 
  flextable::flextable()

group	rented	tenant_utility	landlord_utility	good_tenants	bad_tenants	total_tenants	var
blue	false	-7,490	0	490	259	749	trait
blue	true	185,020	185,020	9,251	0	9,251	trait
green	false	-32,510	0	1,679	1,572	3,251	trait
green	true	134,980	134,980	6,749	0	6,749	trait
blue	false	-9,230	0	716	207	923	trait_obs
blue	true	183,100	175,300	9,025	52	9,077	trait_obs
green	false	-30,770	0	1,742	1,335	3,077	trait_obs
green	true	145,570	110,020	6,686	237	6,923	trait_obs
blue	false	-2,790	0	155	124	279	trait_obs_adj
blue	true	198,470	178,220	9,586	135	9,721	trait_obs_adj
green	false	-37,210	0	2,293	1,428	3,721	trait_obs_adj
green	true	129,900	108,300	6,135	144	6,279	trait_obs_adj
blue	false	-20,320	0	1,785	247	2,032	trait_obs_within
blue	true	159,720	157,920	7,956	12	7,968	trait_obs_within
green	false	-19,680	0	881	1,087	1,968	trait_obs_within
green	true	175,190	102,440	7,547	485	8,032	trait_obs_within

#negative utility for bad tenants
sim2_utility$tenant_rented %>% 
  mutate(
    tenant_utility = if_else(bad_tenant, true = -tenant_utility, false = tenant_utility)
  ) %>% 
  group_by(var) %>% 
  summarise(
    tenant_utility = sum(tenant_utility),
    landlord_utility = sum(landlord_utility),
    total_utility = tenant_utility + landlord_utility
  )

Meta

#versions
write_sessioninfo()

## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.3.2.9000  kirkegaard_2025-11-19 robustbase_0.99-6    
##  [4] psych_2.5.6           assertthat_0.2.1      weights_1.1.2        
##  [7] magrittr_2.0.4        lubridate_1.9.4       forcats_1.0.1        
## [10] stringr_1.6.0         dplyr_1.1.4           purrr_1.2.0          
## [13] readr_2.1.5           tidyr_1.3.1           tibble_3.3.0         
## [16] ggplot2_4.0.1.9000    tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rdpack_2.6.4            mnormt_2.1.1            gridExtra_2.3          
##  [4] rlang_1.1.6.9000        compiler_4.5.2          gdata_3.0.1            
##  [7] systemfonts_1.3.1       vctrs_0.6.5             pkgconfig_2.0.3        
## [10] shape_1.4.6.1           fastmap_1.2.0           backports_1.5.0        
## [13] labeling_0.4.3          rmarkdown_2.30          tzdb_0.5.0             
## [16] nloptr_2.2.1            ragg_1.5.0              xfun_0.53              
## [19] glmnet_4.1-10           jomo_2.7-6              cachem_1.1.0           
## [22] jsonlite_2.0.0          uuid_1.2-1              pan_1.9                
## [25] broom_1.0.11            parallel_4.5.2          cluster_2.1.8.1        
## [28] R6_2.6.1                bslib_0.9.0             stringi_1.8.7          
## [31] RColorBrewer_1.1-3      boot_1.3-32             rpart_4.1.24           
## [34] jquerylib_0.1.4         Rcpp_1.1.0              iterators_1.0.14       
## [37] knitr_1.50              base64enc_0.1-3         Matrix_1.7-4           
## [40] splines_4.5.2           nnet_7.3-20             timechange_0.3.0       
## [43] tidyselect_1.2.1        rstudioapi_0.17.1       yaml_2.3.10            
## [46] codetools_0.2-20        lattice_0.22-7          plyr_1.8.9             
## [49] withr_3.0.2             S7_0.2.1                askpass_1.2.1          
## [52] flextable_0.9.10        evaluate_1.0.5          foreign_0.8-90         
## [55] survival_3.8-3          zip_2.3.3               xml2_1.4.0             
## [58] pillar_1.11.1           mice_3.18.0             checkmate_2.3.3        
## [61] foreach_1.5.2           reformulas_0.4.1        generics_0.1.4         
## [64] hms_1.1.3               scales_1.4.0            minqa_1.2.8            
## [67] gtools_3.9.5            glue_1.8.0              gdtools_0.4.4          
## [70] Hmisc_5.2-4             tools_4.5.2             data.table_1.17.8      
## [73] lme4_1.1-37             grid_4.5.2              rbibutils_2.3          
## [76] colorspace_2.1-2        nlme_3.1-168            htmlTable_2.4.3        
## [79] Formula_1.2-5           cli_3.6.5               textshaping_1.0.4      
## [82] officer_0.7.0           fontBitstreamVera_0.1.1 gtable_0.3.6           
## [85] DEoptimR_1.1-4          sass_0.4.10             digest_0.6.39          
## [88] fontquiver_0.2.1        htmlwidgets_1.6.4       farver_2.1.2           
## [91] htmltools_0.5.8.1       lifecycle_1.0.4         mitml_0.4-5            
## [94] fontLiberation_0.1.0    openssl_2.3.4           MASS_7.3-65

#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}

Whose benefit of the doubt?

10 January, 2026

Init

Functions

Analysis

Simulation 1 - Pilots

Simulation 2 - Landlords and tenants

Meta