Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  patchwork
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

Functions

#GPT pro
make_mating_pool <- function(n, seed = 1,
                            names = c("man", "woman")) {
  stopifnot(length(n) == 1, n > 1, is.finite(n))
  stopifnot(length(names) == 2)

  if (!is.null(seed)) set.seed(seed)

  men   <- as.numeric(scale(rnorm(n)))
  women <- as.numeric(scale(rnorm(n)))

  out <- data.frame(men, women)
  colnames(out) <- names
  out
}

#pairsing function
pair_assortative <- function(df, r,
                            man_col = names(df)[1],
                            woman_col = names(df)[2],
                            tol = 1e-3, maxit = 60,
                            seed = NULL) {
  stopifnot(is.data.frame(df))
  stopifnot(length(r) == 1, is.finite(r), r >= -1, r <= 1)
  if (!is.null(seed)) set.seed(seed)

  x <- df[[man_col]]
  y <- df[[woman_col]]
  n <- length(x)
  stopifnot(length(y) == n, n > 1)

  # Men order used to define who matches whom (rank order)
  ox <- order(x)

  corr_for_alpha <- function(alpha) {
    ry <- rank(y, ties.method = "random")
    rr <- rank(runif(n), ties.method = "random")
    score <- alpha * ry + (1 - alpha) * rr

    oy <- order(score)
    if (r < 0) oy <- rev(oy)

    cor(x[ox], y[oy])
  }

  target <- abs(r)
  c0 <- abs(corr_for_alpha(0))
  c1 <- abs(corr_for_alpha(1))

  if (target <= c0 + tol) {
    alpha_hat <- 0
  } else if (target >= c1 - tol) {
    alpha_hat <- 1
    warning(sprintf(
      "Requested |r|=%.3f exceeds/equals max achievable |r|=%.3f for this finite pool; using alpha=1.",
      target, c1
    ), call. = FALSE)
  } else {
    f <- function(a) abs(corr_for_alpha(a)) - target
    alpha_hat <- uniroot(f, interval = c(0, 1), tol = tol, maxiter = maxit)$root
  }

  # Final matching
  ry <- rank(y, ties.method = "random")
  rr <- rank(runif(n), ties.method = "random")
  score <- alpha_hat * ry + (1 - alpha_hat) * rr
  oy <- order(score)
  if (r < 0) oy <- rev(oy)

  # IMPORTANT: map paired women back to original men rows
  new_y <- y
  new_y[ox] <- y[oy]

  out <- df
  out[[woman_col]] <- new_y

  attr(out, "alpha") <- alpha_hat
  attr(out, "achieved_r") <- cor(out[[man_col]], out[[woman_col]])
  attr(out, "woman_order") <- oy
  attr(out, "men_order") <- ox
  out
}


#compute metrics
pair_stats <- function(df,
                       man_col = names(df)[1],
                       woman_col = names(df)[2]) {
  stopifnot(is.data.frame(df))
  x <- df[[man_col]]
  y <- df[[woman_col]]
  stopifnot(length(x) == length(y), length(x) > 1)

  data.frame(
    r = cor(x, y, use = "complete.obs"),
    mean_abs_diff = mean(abs(x - y), na.rm = TRUE)
  )
}

Analysis

Simulation 1

#small sample to illstrate the principle
d1 = make_mating_pool(n=50, seed = 2)

d1_random = pair_assortative(d1, r = 0)

d1_random %>% describe2()
#plot the pairsings
p1 = d1_random %>% 
  mutate(
    pair = 1:n()
  ) %>% 
  pivot_longer(-pair) %>% 
  ggplot(aes(name, value)) + 
  geom_point(aes(color = name)) +
  geom_line(aes(group = pair), alpha = 0.2, linetype = 2) +
  labs(
    color = "Sex",
    y = "Trait value (z)",
    x = "Sex",
    title = "Random mating",
    subtitle = "r = 0.0"
  )
  
#metrics
d1_random %>% pair_stats()
#match them up more realistically
d1_05 = pair_assortative(d1, r = 0.5)

p2 = d1_05 %>% 
  mutate(
    pair = 1:n()
  ) %>% 
  pivot_longer(-pair) %>% 
  ggplot(aes(name, value)) + 
  geom_point(aes(color = name)) +
  geom_line(aes(group = pair), alpha = 0.2, linetype = 2) +
  labs(
    color = "Sex",
    y = "Trait value (z)",
    x = "Sex",
    title = "Realistic assortative mating",
    subtitle = "r = 0.5"
  )

#metrics
d1_05 %>% pair_stats()
#pair as perfect as possible
d1_10 = pair_assortative(d1, r = 1)
## Warning: Requested |r|=1.000 exceeds/equals max achievable |r|=0.985 for this
## finite pool; using alpha=1.
p3 = d1_10 %>% 
  mutate(
    pair = 1:n()
  ) %>% 
  pivot_longer(-pair) %>% 
  ggplot(aes(name, value)) + 
  geom_point(aes(color = name)) +
  geom_line(aes(group = pair), alpha = 0.2, linetype = 2) +
  labs(
    color = "Sex",
    y = "Trait value (z)",
    x = "Sex",
    title = "Perfect assortative mating",
    subtitle = "r = 1.0"
  )

#metrics
d1_10 %>% pair_stats()
#combine plots
patchwork::wrap_plots(p1, p2, p3, nrow = 1, axes = "collect", guides = "collect")

GG_save("figs/pairings.png")

#larger sample size
d2 = make_mating_pool(n=1000, seed = 2)
d2_05 = pair_assortative(d2, r = 0.5)
d2_10 = pair_assortative(d2, r = 1)
## Warning: Requested |r|=1.000 exceeds/equals max achievable |r|=0.998 for this
## finite pool; using alpha=1.
d2_05 %>% pair_stats()
p5 = d2_05 %>% 
  mutate(
    max_z = pmax(man, woman),
    abs_d = abs(man - woman)
  ) %>% 
  GG_scatter("max_z", "abs_d") +
  labs(
    x = "Highest mate value for trait",
    y = "Trait value absolute difference"
  )

GG_save(plot = p5, "figs/abd_d_by_max_z.png")
## `geom_smooth()` using formula = 'y ~ x'
p4 = d2 %>% 
  mutate(
    max_z = pmax(man, woman),
    abs_d = abs(man - woman)
  ) %>% 
  GG_scatter("max_z", "abs_d") +
  labs(
    x = "Highest mate value for trait",
    y = "Trait value absolute difference"
  )

p6 = d2_10 %>% 
  mutate(
    max_z = pmax(man, woman),
    abs_d = abs(man - woman)
  ) %>% 
  GG_scatter("max_z", "abs_d") +
  labs(
    x = "Highest mate value for trait",
    y = "Trait value absolute difference"
  )

#combine plots
patchwork::wrap_plots(p4, p5, p6, nrow = 1, guides = "collect", axes = "collect")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/abd_d_by_max_z_3.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
d2_05 %>% 
  GG_scatter("man", "woman") +
  labs(
    title = "Pairs' trait values under assortative mating",
    x = "Man's value",
    y = "Woman's value"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/xz_am.png")
## `geom_smooth()` using formula = 'y ~ x'

Meta

#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.3.2.9000  kirkegaard_2025-11-19 robustbase_0.99-6    
##  [4] psych_2.5.6           assertthat_0.2.1      weights_1.1.2        
##  [7] magrittr_2.0.4        lubridate_1.9.4       forcats_1.0.1        
## [10] stringr_1.6.0         dplyr_1.1.4           purrr_1.2.0          
## [13] readr_2.1.5           tidyr_1.3.1           tibble_3.3.0         
## [16] ggplot2_4.0.1.9000    tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   farver_2.1.2       S7_0.2.1           fastmap_1.2.0     
##  [5] digest_0.6.39      rpart_4.1.24       timechange_0.3.0   lifecycle_1.0.4   
##  [9] cluster_2.1.8.1    survival_3.8-3     gdata_3.0.1        compiler_4.5.2    
## [13] rlang_1.1.6.9000   Hmisc_5.2-4        sass_0.4.10        tools_4.5.2       
## [17] yaml_2.3.10        data.table_1.17.8  knitr_1.50         labeling_0.4.3    
## [21] htmlwidgets_1.6.4  mnormt_2.1.1       RColorBrewer_1.1-3 withr_3.0.2       
## [25] foreign_0.8-90     nnet_7.3-20        grid_4.5.2         jomo_2.7-6        
## [29] colorspace_2.1-2   mice_3.18.0        scales_1.4.0       gtools_3.9.5      
## [33] iterators_1.0.14   MASS_7.3-65        cli_3.6.5          rmarkdown_2.30    
## [37] ragg_1.5.0         reformulas_0.4.1   generics_0.1.4     rstudioapi_0.17.1 
## [41] tzdb_0.5.0         minqa_1.2.8        cachem_1.1.0       splines_4.5.2     
## [45] parallel_4.5.2     base64enc_0.1-3    vctrs_0.6.5        boot_1.3-32       
## [49] glmnet_4.1-10      Matrix_1.7-4       jsonlite_2.0.0     hms_1.1.3         
## [53] mitml_0.4-5        Formula_1.2-5      htmlTable_2.4.3    systemfonts_1.3.1 
## [57] foreach_1.5.2      jquerylib_0.1.4    glue_1.8.0         DEoptimR_1.1-4    
## [61] nloptr_2.2.1       pan_1.9            codetools_0.2-20   stringi_1.8.7     
## [65] gtable_0.3.6       shape_1.4.6.1      lme4_1.1-37        pillar_1.11.1     
## [69] htmltools_0.5.8.1  R6_2.6.1           textshaping_1.0.4  Rdpack_2.6.4      
## [73] evaluate_1.0.5     lattice_0.22-7     rbibutils_2.3      backports_1.5.0   
## [77] broom_1.0.11       bslib_0.9.0        Rcpp_1.1.0         gridExtra_2.3     
## [81] nlme_3.1-168       checkmate_2.3.3    mgcv_1.9-3         xfun_0.53         
## [85] pkgconfig_2.0.3
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}