Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  patchwork
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

#settings
#group means
group_means = c(0, -1)
#group names
group_names = c("blue", "green")
#group colors
group_colors = c("#377EB8", "#4DAF4A")
#sample size
n_group = 1e4

Functions

#sim data
sim_data = function(.obs_e_sd = obs_e_sd) {
  tibble(
    group = rep(group_names, each = n_group),
    trait = c(rnorm(n_group, mean = group_means[1]),
              rnorm(n_group, mean = group_means[2])),
    trait_obs = trait + rnorm(n_group*2, sd = .obs_e_sd)
  )
}

#apply affirmative action
apply_transforms = function(x, .rel_x) {
  #group means
  group_means_empirical = describe2(
    x$trait_obs,
    group = x$group
  )
  
  #corrected obs score
  x %>% mutate(
    trait_obs_adj = if_else(
      condition = x$group == group_names[1],
      true = group_means_empirical$mean[1] + .rel_x * (trait_obs - group_means_empirical$mean[1]),
      false = group_means_empirical$mean[2] + .rel_x * (trait_obs - group_means_empirical$mean[2])
    ),
    trait_obs_within = if_else(
      condition = x$group == group_names[1],
      true = x$trait_obs,
      false = x$trait_obs - group_means_empirical$mean[2]
    )
  )
}

#apply pilot selection and calculate benefit
apply_pilot_selection = function(x, vars = c("trait", "trait_obs", "trait_obs_adj", "trait_obs_within"), ratio = 0.1) {
  map_dfr(vars, function(var) {
    selected = x %>% 
      arrange(-.data[[var]]) %>% 
      slice_head(prop = ratio)
    
    selected_desc = describe2(selected["trait"], group = selected$group) %>% select(group, n, mean) %>% mutate(var = var, overall_mean_trait = mean(selected$trait), overall_sd_trait = sd(selected$trait))
  })
}

#apply tenant selection
apply_tenant_selection = function(x, vars = c("trait", "trait_obs", "trait_obs_adj", "trait_obs_within"), ratio = 0.8) {
  y = map(vars, function(var) {
    # browser()
    #assign an id
    x %<>% mutate(id = row_number())
      
    #select
    selected = x %>% 
        arrange(-.data[[var]]) %>% 
        slice_head(prop = ratio)
      
    #add in those who didnt get selected
    x2 = bind_rows(
        selected %>% mutate(rented = T),
        x %>% filter(!id %in% selected$id) %>% mutate(rented = F)
    )
    
    #calculate utility
    x2$tenant_utility = case_when(
      !x2$bad_tenant & x2$rented ~ 20,
      x2$bad_tenant & x2$rented ~ 50,
      !x2$rented ~ -10
    )
    
    x2$landlord_utility = case_when(
      !x2$bad_tenant & x2$rented ~ 20,
      x2$bad_tenant & x2$rented ~ -100,
      !x2$rented ~ 0
    )
    
    #split
    blues = x2 %>% filter(group == "blue")
    greens = x2 %>% filter(group == "green")

    list(
      group_rented = x2 %>% 
        group_by(group, rented) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var),
      
      tenant = x2 %>% 
        group_by(bad_tenant) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var),
      
      group_tenant = x2 %>% 
        group_by(group, bad_tenant) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var),
      
      tenant_rented = x2 %>% 
        group_by(bad_tenant, rented) %>% 
        summarise(
          tenant_utility = sum(tenant_utility),
          landlord_utility = sum(landlord_utility),
          good_tenants = sum(!bad_tenant),
          bad_tenants = sum(bad_tenant),
          total_tenants = good_tenants + bad_tenants
        ) %>% 
        ungroup() %>% 
        mutate(var = var)
    )
  })

  #restructure
  list(
    group_rented = map_dfr(y, ~.$group_rented),
    tenant = map_dfr(y, ~.$tenant),
    group_tenant = map_dfr(y, ~.$group_tenant),
    tenant_rented = map_dfr(y, ~.$tenant_rented)
  )
}

Analysis

Simulation 1 - Pilots

# simulate 2 groups, blues and greens, that differ in some trait by 1 SD
# also measure this trait but only imperfectly
# then we apply top down selection for a given ratio
# and observed the effects of using direct true trait value vs. observed trait value vs. corrected values
# we model the benefits and costs to the persons using a cost-benefit matrix

#observation error
obs_e_sd = 0.5
#reliability within group
rel_x = 1 - (obs_e_sd / 1)^2

set.seed(1)
d1 = sim_data()

#apply aa
d1 %<>% apply_transforms(.rel_x = rel_x)
## New names:
## • `` -> `...1`
p1a = GG_denhist(d1, var = "trait", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "True score")
p1b = GG_denhist(d1, var = "trait_obs", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Observed score")
p1c = GG_denhist(d1, var = "trait_obs_adj", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Adjusted observed score")
p1d = GG_denhist(d1, var = "trait_obs_within", group = "group", vline = F) + scale_fill_discrete(palette = group_colors) + labs(title = "Within group-normed observed score")

patchwork::wrap_plots(p1a, p1b, p1c, p1d)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

GG_save("figs/dists.png")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
#gaps
cohen.d(
  d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
  group = d1$group
)
## Call: cohen.d(x = d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within), 
##     group = d1$group)
## Cohen d statistic of difference between two means
##                  lower effect upper
## trait            -1.03   -1.0 -0.97
## trait_obs        -0.93   -0.9 -0.87
## trait_obs_adj    -1.23   -1.2 -1.17
## trait_obs_within -0.03    0.0  0.03
## 
## Multivariate (Mahalanobis) distance between groups
## [1] 1.1
## r equivalent of difference between two means
##            trait        trait_obs    trait_obs_adj trait_obs_within 
##            -0.45            -0.41            -0.51             0.00
describe2(
  d1 %>% select(trait, trait_obs, trait_obs_adj, trait_obs_within),
  group = d1$group
)
#selection effects
#top 10% selection with simple benefits
apply_pilot_selection(d1) %>% flextable::flextable()

group

n

mean

var

overall_mean_trait

overall_sd_trait

blue

1,739

1.490

trait

1.47

0.430

green

261

1.352

trait

1.47

0.430

blue

1,699

1.339

trait_obs

1.30

0.602

green

301

1.070

trait_obs

1.30

0.602

blue

1,827

1.302

trait_obs_adj

1.30

0.602

green

173

1.245

trait_obs_adj

1.30

0.602

blue

1,027

1.570

trait_obs_within

1.10

0.749

green

973

0.605

trait_obs_within

1.10

0.749

Simulation 2 - Landlords and tenants

#settings
#observation error
obs_e_sd = 0.7
#reliability within group
rel_x = 1 - (obs_e_sd / 1)^2
rel_x
## [1] 0.51
sqrt(rel_x)
## [1] 0.714
set.seed(1)
d2 = sim_data(obs_e_sd)

#bad tenants
d2$bad_tenant = d2$trait < -2

#apply transforms
d2 %<>% apply_transforms(.rel_x = rel_x)
## New names:
## • `` -> `...1`
table(
  group = d2$group,
  bad_tenant = d2$bad_tenant
)
##        bad_tenant
## group   FALSE TRUE
##   blue   9741  259
##   green  8428 1572
#utility matrix
sim2_utility = expand_grid(
  tenant = c("good", "bad"),
  rented = c("yes", "no")
  ) %>% mutate(
    tenant_utility = c(20, -10, 50, -10),
    landlord_utility = c(20, 0, -100, 0)
  )

sim2_utility %>% flextable::flextable()

tenant

rented

tenant_utility

landlord_utility

good

yes

20

20

good

no

-10

0

bad

yes

50

-100

bad

no

-10

0

#apply selection
sim2_utility = d2 %>% apply_tenant_selection()
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'bad_tenant'. You can override using the
## `.groups` argument.
sim2_utility$group_rented %>% flextable::flextable()

group

rented

tenant_utility

landlord_utility

good_tenants

bad_tenants

total_tenants

var

blue

false

-7,490

0

490

259

749

trait

blue

true

185,020

185,020

9,251

0

9,251

trait

green

false

-32,510

0

1,679

1,572

3,251

trait

green

true

134,980

134,980

6,749

0

6,749

trait

blue

false

-9,230

0

716

207

923

trait_obs

blue

true

183,100

175,300

9,025

52

9,077

trait_obs

green

false

-30,770

0

1,742

1,335

3,077

trait_obs

green

true

145,570

110,020

6,686

237

6,923

trait_obs

blue

false

-2,790

0

155

124

279

trait_obs_adj

blue

true

198,470

178,220

9,586

135

9,721

trait_obs_adj

green

false

-37,210

0

2,293

1,428

3,721

trait_obs_adj

green

true

129,900

108,300

6,135

144

6,279

trait_obs_adj

blue

false

-20,320

0

1,785

247

2,032

trait_obs_within

blue

true

159,720

157,920

7,956

12

7,968

trait_obs_within

green

false

-19,680

0

881

1,087

1,968

trait_obs_within

green

true

175,190

102,440

7,547

485

8,032

trait_obs_within

#by group and method
sim2_utility$group_rented %>% 
  group_by(group, var) %>% 
  summarise(
    tenant_utility = sum(tenant_utility),
    landlord_utility = sum(landlord_utility),
    total_utility = tenant_utility + landlord_utility
  ) %>% 
  flextable::flextable()
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.

group

var

tenant_utility

landlord_utility

total_utility

blue

trait

177,530

185,020

362,550

blue

trait_obs

173,870

175,300

349,170

blue

trait_obs_adj

195,680

178,220

373,900

blue

trait_obs_within

139,400

157,920

297,320

green

trait

102,470

134,980

237,450

green

trait_obs

114,800

110,020

224,820

green

trait_obs_adj

92,690

108,300

200,990

green

trait_obs_within

155,510

102,440

257,950

#by method
sim2_utility$group_rented %>% 
  group_by(var) %>% 
  summarise(
    tenant_utility = sum(tenant_utility),
    landlord_utility = sum(landlord_utility),
    total_utility = tenant_utility + landlord_utility
  ) %>% 
  flextable::flextable()

var

tenant_utility

landlord_utility

total_utility

trait

280,000

320,000

600,000

trait_obs

288,670

285,320

573,990

trait_obs_adj

288,370

286,520

574,890

trait_obs_within

294,910

260,360

555,270

#by tenant and rent
sim2_utility$tenant_rented %>% 
  flextable::flextable()

bad_tenant

rented

tenant_utility

landlord_utility

good_tenants

bad_tenants

total_tenants

var

false

false

-21,690

0

2,169

0

2,169

trait

false

true

320,000

320,000

16,000

0

16,000

trait

true

false

-18,310

0

0

1,831

1,831

trait

false

false

-24,580

0

2,458

0

2,458

trait_obs

false

true

314,220

314,220

15,711

0

15,711

trait_obs

true

false

-15,420

0

0

1,542

1,542

trait_obs

true

true

14,450

-28,900

0

289

289

trait_obs

false

false

-24,480

0

2,448

0

2,448

trait_obs_adj

false

true

314,420

314,420

15,721

0

15,721

trait_obs_adj

true

false

-15,520

0

0

1,552

1,552

trait_obs_adj

true

true

13,950

-27,900

0

279

279

trait_obs_adj

false

false

-26,660

0

2,666

0

2,666

trait_obs_within

false

true

310,060

310,060

15,503

0

15,503

trait_obs_within

true

false

-13,340

0

0

1,334

1,334

trait_obs_within

true

true

24,850

-49,700

0

497

497

trait_obs_within

#by group and tenant
sim2_utility$group_rented %>% 
  flextable::flextable()

group

rented

tenant_utility

landlord_utility

good_tenants

bad_tenants

total_tenants

var

blue

false

-7,490

0

490

259

749

trait

blue

true

185,020

185,020

9,251

0

9,251

trait

green

false

-32,510

0

1,679

1,572

3,251

trait

green

true

134,980

134,980

6,749

0

6,749

trait

blue

false

-9,230

0

716

207

923

trait_obs

blue

true

183,100

175,300

9,025

52

9,077

trait_obs

green

false

-30,770

0

1,742

1,335

3,077

trait_obs

green

true

145,570

110,020

6,686

237

6,923

trait_obs

blue

false

-2,790

0

155

124

279

trait_obs_adj

blue

true

198,470

178,220

9,586

135

9,721

trait_obs_adj

green

false

-37,210

0

2,293

1,428

3,721

trait_obs_adj

green

true

129,900

108,300

6,135

144

6,279

trait_obs_adj

blue

false

-20,320

0

1,785

247

2,032

trait_obs_within

blue

true

159,720

157,920

7,956

12

7,968

trait_obs_within

green

false

-19,680

0

881

1,087

1,968

trait_obs_within

green

true

175,190

102,440

7,547

485

8,032

trait_obs_within

#negative utility for bad tenants
sim2_utility$tenant_rented %>% 
  mutate(
    tenant_utility = if_else(bad_tenant, true = -tenant_utility, false = tenant_utility)
  ) %>% 
  group_by(var) %>% 
  summarise(
    tenant_utility = sum(tenant_utility),
    landlord_utility = sum(landlord_utility),
    total_utility = tenant_utility + landlord_utility
  )

Meta

#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.3.2.9000  kirkegaard_2025-11-19 robustbase_0.99-6    
##  [4] psych_2.5.6           assertthat_0.2.1      weights_1.1.2        
##  [7] magrittr_2.0.4        lubridate_1.9.4       forcats_1.0.1        
## [10] stringr_1.6.0         dplyr_1.1.4           purrr_1.2.0          
## [13] readr_2.1.5           tidyr_1.3.1           tibble_3.3.0         
## [16] ggplot2_4.0.1.9000    tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rdpack_2.6.4            mnormt_2.1.1            gridExtra_2.3          
##  [4] rlang_1.1.6.9000        compiler_4.5.2          gdata_3.0.1            
##  [7] systemfonts_1.3.1       vctrs_0.6.5             pkgconfig_2.0.3        
## [10] shape_1.4.6.1           fastmap_1.2.0           backports_1.5.0        
## [13] labeling_0.4.3          rmarkdown_2.30          tzdb_0.5.0             
## [16] nloptr_2.2.1            ragg_1.5.0              xfun_0.53              
## [19] glmnet_4.1-10           jomo_2.7-6              cachem_1.1.0           
## [22] jsonlite_2.0.0          uuid_1.2-1              pan_1.9                
## [25] broom_1.0.11            parallel_4.5.2          cluster_2.1.8.1        
## [28] R6_2.6.1                bslib_0.9.0             stringi_1.8.7          
## [31] RColorBrewer_1.1-3      boot_1.3-32             rpart_4.1.24           
## [34] jquerylib_0.1.4         Rcpp_1.1.0              iterators_1.0.14       
## [37] knitr_1.50              base64enc_0.1-3         Matrix_1.7-4           
## [40] splines_4.5.2           nnet_7.3-20             timechange_0.3.0       
## [43] tidyselect_1.2.1        rstudioapi_0.17.1       yaml_2.3.10            
## [46] codetools_0.2-20        lattice_0.22-7          plyr_1.8.9             
## [49] withr_3.0.2             S7_0.2.1                askpass_1.2.1          
## [52] flextable_0.9.10        evaluate_1.0.5          foreign_0.8-90         
## [55] survival_3.8-3          zip_2.3.3               xml2_1.4.0             
## [58] pillar_1.11.1           mice_3.18.0             checkmate_2.3.3        
## [61] foreach_1.5.2           reformulas_0.4.1        generics_0.1.4         
## [64] hms_1.1.3               scales_1.4.0            minqa_1.2.8            
## [67] gtools_3.9.5            glue_1.8.0              gdtools_0.4.4          
## [70] Hmisc_5.2-4             tools_4.5.2             data.table_1.17.8      
## [73] lme4_1.1-37             grid_4.5.2              rbibutils_2.3          
## [76] colorspace_2.1-2        nlme_3.1-168            htmlTable_2.4.3        
## [79] Formula_1.2-5           cli_3.6.5               textshaping_1.0.4      
## [82] officer_0.7.0           fontBitstreamVera_0.1.1 gtable_0.3.6           
## [85] DEoptimR_1.1-4          sass_0.4.10             digest_0.6.39          
## [88] fontquiver_0.2.1        htmlwidgets_1.6.4       farver_2.1.2           
## [91] htmltools_0.5.8.1       lifecycle_1.0.4         mitml_0.4-5            
## [94] fontLiberation_0.1.0    openssl_2.3.4           MASS_7.3-65
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}