Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

pop_size = 1e4

Functions

calculate_RR = function(data, error = 0, use_true_group_size = F) {
  data$group_true = data$group
  if (error != 0) {
    #seed
    # set.seed(seed)
    
    #add random error
    error_idx = rbinom(nrow(data), 1, prob = error) %>% as.logical() %>% which()
    
    #change groups to a randomly chosen group at those idx
    data$group[error_idx] = sample(unique(data$group), size = length(error_idx), replace = T, prob = rep(1/length(unique(data$group)), length(unique(data$group))))
  }
  # browser()
  props = calculate_props(data, use_true_group_size = use_true_group_size)
  
  #relative rates to blue
  nonblue_groups = unique(props$group) %>% setdiff("Blue")
  
  map_dfr(nonblue_groups, \(gr) {
    tibble(
      group = gr,
      rate = props$estimate[props$group==gr],
      RR = rate / props$estimate[props$group=="Blue"],
      error = error
    )
  })
}

sim_data = function(pop_size, groups = c("Blue", "Red"), pop_fracs = rep(1/length(groups), length(groups)), rates) {
  d = tibble()
  
  for (i in seq_along(groups)) {
    d = bind_rows(
      d,
      tibble(
        group = groups[i],
        bad = rbinom(round(pop_size*pop_fracs[i]), size = 1, prob = rates[i])
      )
    )
  }
  
  d
}

#alternative prop test function
#we need this so we can fix population sizes
calculate_props = function(data, use_true_group_size = F) {
  # browser()
  #groups
  groups = data$group %>% unique()
  
  #count bads by group
  d = tibble()
  
  for (i in seq_along(groups)) {
    i_bad_count = data %>% filter(group == groups[i]) %>% pull(bad) %>% sum()
    
    #use true or not?
    if (use_true_group_size) {
      i_count = (data$group_true == groups[i]) %>% sum()
    } else {
      i_count = (data$group == groups[i]) %>% sum()
    }
    
    d = bind_rows(
      d,
      tibble(
        group = groups[i],
        estimate = i_bad_count/i_count
      )
    )
  }
  
  d
}

Analysis

Simulation 1: equal proportions, different rates

#across relative rates
set.seed(1)
d1_res = map_dfr(seq(0.05, 0.5, by = .05), \(red_rate) {
  sim_data(
    pop_size = pop_size,
    rates = c(0.1, red_rate)
  ) %>% 
    {
      map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e))
    } %>% 
    mutate(
      red_rate = red_rate,
      true_RR = red_rate/0.1
      )
})

d1_res %>% 
  # filter(true_RR == 5) %>% 
  ggplot(aes(error, RR, color = factor(true_RR), group = factor(true_RR))) +
  geom_line() +
  scale_y_continuous(breaks = seq(0, 6, by = 0.5)) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    x = "Classification error %",
    title = "Effect of classification error on relative rates",
    subtitle = "Red population fraction = 0.5.",
    color = "True RR"
  )

GG_save("figs/sim1.png")

Simulation 2: different proportions, different rates

#across relative rates
set.seed(1)
d2_res = map_dfr(seq(0.05, 0.5, by = .05), \(red_rate) {
  sim_data(
    pop_size = pop_size,
    rates = c(0.1, red_rate),
    pop_fracs = c(0.9, 0.1)
  ) %>% 
    {
      map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e))
    } %>% 
    mutate(
      red_rate = red_rate,
      true_RR = red_rate/0.1
      )
})

d2_res %>% 
  ggplot(aes(error, RR, color = factor(true_RR), group = factor(true_RR))) +
  geom_line() +
  scale_y_continuous(breaks = seq(0, 6, by = 0.5)) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    x = "Classification error %",
    title = "Effect of classification error on relative rates",
    subtitle = "Red population fraction = 0.1.",
    color = "True RR"
  )

GG_save("figs/sim2.png")

Simulation 3: different proportions, different rates, constant population denoms

#across relative rates
set.seed(1)
d3_res = map_dfr(seq(0.05, 0.5, by = .05), \(red_rate) {
  sim_data(
    pop_size = pop_size,
    rates = c(0.1, red_rate),
    pop_fracs = c(0.9, 0.1)
  ) %>% 
    {
      map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e, use_true_group_size = T))
    } %>% 
    mutate(
      true_rate = red_rate,
      true_RR = true_rate/0.1
      )
})

d3_res %>% 
  ggplot(aes(error, RR, color = factor(true_RR), group = factor(true_RR))) +
  geom_line() +
  scale_y_continuous(breaks = seq(0, 100, by = 0.5)) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    x = "Classification error %",
    title = "Effect of classification error on relative rates",
    subtitle = "Red population fraction = 0.1. Constant denominator.",
    color = "True RR"
  )

GG_save("figs/sim3.png")

#expectation in the limit
#total rate
0.10*0.05+0.90*0.1

## [1] 0.095

#classified to red
(0.10*0.05+0.90*0.1)/2

## [1] 0.0475

#red rate
(0.10*0.05+0.90*0.1)/2/0.1

## [1] 0.475

#blue red
(0.10*0.05+0.90*0.1)/2/0.9

## [1] 0.0528

#RR
((0.10*0.05+0.90*0.1)/2/0.1) / ((0.10*0.05+0.90*0.1)/2/0.9)

## [1] 9

#sim
d3_res %>% filter(true_rate == 0.05, error == 1)

Simulation 4: varying proportions, constant rates, varying errors

#across relative rates
set.seed(1)
d4_res = map_dfr(seq(0.05, 0.95, by = .05), \(red_frac) {
  sim_data(
    pop_size = pop_size,
    rates = c(0.1, 0.1),
    pop_fracs = c(1-red_frac, red_frac)
  ) %>% 
    {
      map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e, use_true_group_size = T))
    } %>% 
    mutate(
      red_frac = red_frac
    )
})

d4_res %>% 
  ggplot(aes(error, log10(RR), color = factor(red_frac), group = factor(red_frac))) +
  geom_line() +
  scale_y_continuous(labels = function(v) round(10^v, 2), breaks = seq(-2, 2, by = 0.25)) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    title = "Relative bad rates as a function of misclassification and population proportions",
    subtitle = "Constant denominator",
    y = "RR",
    x = "Classification error %",
    color = "Red frac."
  )

GG_save("figs/sim_4.png")

#limit calculation for 5% red
#red rate
(0.1/2) / 0.05

## [1] 1

#blue rate
(0.1/2) / 0.95

## [1] 0.0526

#RR
((0.1/2) / 0.05) / ((0.1/2) / 0.95)

## [1] 19

#sim
d4_res %>% filter(red_frac == 0.05, error == 1)

Meta

#versions
write_sessioninfo()

## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kirkegaard_2025-11-19 robustbase_0.99-6     psych_2.5.6          
##  [4] assertthat_0.2.1      weights_1.1.2         magrittr_2.0.4       
##  [7] lubridate_1.9.4       forcats_1.0.1         stringr_1.5.2        
## [10] dplyr_1.1.4           purrr_1.1.0           readr_2.1.5          
## [13] tidyr_1.3.1           tibble_3.3.0          ggplot2_4.0.0        
## [16] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   farver_2.1.2       S7_0.2.0           fastmap_1.2.0     
##  [5] digest_0.6.37      rpart_4.1.24       timechange_0.3.0   lifecycle_1.0.4   
##  [9] cluster_2.1.8.1    survival_3.8-3     gdata_3.0.1        compiler_4.5.2    
## [13] rlang_1.1.6        Hmisc_5.2-4        sass_0.4.10        tools_4.5.2       
## [17] yaml_2.3.10        data.table_1.17.8  knitr_1.50         labeling_0.4.3    
## [21] htmlwidgets_1.6.4  mnormt_2.1.1       RColorBrewer_1.1-3 withr_3.0.2       
## [25] foreign_0.8-90     nnet_7.3-20        grid_4.5.2         jomo_2.7-6        
## [29] colorspace_2.1-2   mice_3.18.0        scales_1.4.0       gtools_3.9.5      
## [33] iterators_1.0.14   MASS_7.3-65        cli_3.6.5          rmarkdown_2.30    
## [37] ragg_1.5.0         reformulas_0.4.1   generics_0.1.4     rstudioapi_0.17.1 
## [41] tzdb_0.5.0         minqa_1.2.8        cachem_1.1.0       splines_4.5.2     
## [45] parallel_4.5.2     base64enc_0.1-3    vctrs_0.6.5        boot_1.3-32       
## [49] glmnet_4.1-10      Matrix_1.7-4       jsonlite_2.0.0     hms_1.1.3         
## [53] mitml_0.4-5        Formula_1.2-5      htmlTable_2.4.3    systemfonts_1.3.1 
## [57] foreach_1.5.2      jquerylib_0.1.4    glue_1.8.0         DEoptimR_1.1-4    
## [61] nloptr_2.2.1       pan_1.9            codetools_0.2-20   stringi_1.8.7     
## [65] gtable_0.3.6       shape_1.4.6.1      lme4_1.1-37        pillar_1.11.1     
## [69] htmltools_0.5.8.1  R6_2.6.1           textshaping_1.0.4  Rdpack_2.6.4      
## [73] evaluate_1.0.5     lattice_0.22-7     rbibutils_2.3      backports_1.5.0   
## [77] broom_1.0.10       bslib_0.9.0        Rcpp_1.1.0         gridExtra_2.3     
## [81] nlme_3.1-168       checkmate_2.3.3    xfun_0.53          pkgconfig_2.0.3

#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}

Classification error and relative rates

20 November, 2025

Init

Functions

Analysis

Simulation 1: equal proportions, different rates

Simulation 2: different proportions, different rates

Simulation 3: different proportions, different rates, constant population denoms

Simulation 4: varying proportions, constant rates, varying errors

Meta