Init
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
)
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
pop_size = 1e4
Functions
calculate_RR = function(data, error = 0, use_true_group_size = F) {
data$group_true = data$group
if (error != 0) {
#seed
# set.seed(seed)
#add random error
error_idx = rbinom(nrow(data), 1, prob = error) %>% as.logical() %>% which()
#change groups to a randomly chosen group at those idx
data$group[error_idx] = sample(unique(data$group), size = length(error_idx), replace = T, prob = rep(1/length(unique(data$group)), length(unique(data$group))))
}
# browser()
props = calculate_props(data, use_true_group_size = use_true_group_size)
#relative rates to blue
nonblue_groups = unique(props$group) %>% setdiff("Blue")
map_dfr(nonblue_groups, \(gr) {
tibble(
group = gr,
rate = props$estimate[props$group==gr],
RR = rate / props$estimate[props$group=="Blue"],
error = error
)
})
}
sim_data = function(pop_size, groups = c("Blue", "Red"), pop_fracs = rep(1/length(groups), length(groups)), rates) {
d = tibble()
for (i in seq_along(groups)) {
d = bind_rows(
d,
tibble(
group = groups[i],
bad = rbinom(round(pop_size*pop_fracs[i]), size = 1, prob = rates[i])
)
)
}
d
}
#alternative prop test function
#we need this so we can fix population sizes
calculate_props = function(data, use_true_group_size = F) {
# browser()
#groups
groups = data$group %>% unique()
#count bads by group
d = tibble()
for (i in seq_along(groups)) {
i_bad_count = data %>% filter(group == groups[i]) %>% pull(bad) %>% sum()
#use true or not?
if (use_true_group_size) {
i_count = (data$group_true == groups[i]) %>% sum()
} else {
i_count = (data$group == groups[i]) %>% sum()
}
d = bind_rows(
d,
tibble(
group = groups[i],
estimate = i_bad_count/i_count
)
)
}
d
}
Analysis
Simulation 1: equal proportions, different rates
#across relative rates
set.seed(1)
d1_res = map_dfr(seq(0.05, 0.5, by = .05), \(red_rate) {
sim_data(
pop_size = pop_size,
rates = c(0.1, red_rate)
) %>%
{
map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e))
} %>%
mutate(
red_rate = red_rate,
true_RR = red_rate/0.1
)
})
d1_res %>%
# filter(true_RR == 5) %>%
ggplot(aes(error, RR, color = factor(true_RR), group = factor(true_RR))) +
geom_line() +
scale_y_continuous(breaks = seq(0, 6, by = 0.5)) +
scale_x_continuous(labels = scales::percent) +
labs(
x = "Classification error %",
title = "Effect of classification error on relative rates",
subtitle = "Red population fraction = 0.5.",
color = "True RR"
)

GG_save("figs/sim1.png")
Simulation 2: different proportions, different rates
#across relative rates
set.seed(1)
d2_res = map_dfr(seq(0.05, 0.5, by = .05), \(red_rate) {
sim_data(
pop_size = pop_size,
rates = c(0.1, red_rate),
pop_fracs = c(0.9, 0.1)
) %>%
{
map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e))
} %>%
mutate(
red_rate = red_rate,
true_RR = red_rate/0.1
)
})
d2_res %>%
ggplot(aes(error, RR, color = factor(true_RR), group = factor(true_RR))) +
geom_line() +
scale_y_continuous(breaks = seq(0, 6, by = 0.5)) +
scale_x_continuous(labels = scales::percent) +
labs(
x = "Classification error %",
title = "Effect of classification error on relative rates",
subtitle = "Red population fraction = 0.1.",
color = "True RR"
)

GG_save("figs/sim2.png")
Simulation 3: different proportions, different rates, constant
population denoms
#across relative rates
set.seed(1)
d3_res = map_dfr(seq(0.05, 0.5, by = .05), \(red_rate) {
sim_data(
pop_size = pop_size,
rates = c(0.1, red_rate),
pop_fracs = c(0.9, 0.1)
) %>%
{
map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e, use_true_group_size = T))
} %>%
mutate(
true_rate = red_rate,
true_RR = true_rate/0.1
)
})
d3_res %>%
ggplot(aes(error, RR, color = factor(true_RR), group = factor(true_RR))) +
geom_line() +
scale_y_continuous(breaks = seq(0, 100, by = 0.5)) +
scale_x_continuous(labels = scales::percent) +
labs(
x = "Classification error %",
title = "Effect of classification error on relative rates",
subtitle = "Red population fraction = 0.1. Constant denominator.",
color = "True RR"
)

GG_save("figs/sim3.png")
#expectation in the limit
#total rate
0.10*0.05+0.90*0.1
## [1] 0.095
#classified to red
(0.10*0.05+0.90*0.1)/2
## [1] 0.0475
#red rate
(0.10*0.05+0.90*0.1)/2/0.1
## [1] 0.475
#blue red
(0.10*0.05+0.90*0.1)/2/0.9
## [1] 0.0528
#RR
((0.10*0.05+0.90*0.1)/2/0.1) / ((0.10*0.05+0.90*0.1)/2/0.9)
## [1] 9
#sim
d3_res %>% filter(true_rate == 0.05, error == 1)
Simulation 4: varying proportions, constant rates, varying
errors
#across relative rates
set.seed(1)
d4_res = map_dfr(seq(0.05, 0.95, by = .05), \(red_frac) {
sim_data(
pop_size = pop_size,
rates = c(0.1, 0.1),
pop_fracs = c(1-red_frac, red_frac)
) %>%
{
map_dfr(seq(0, 1, by = 0.05), \(e) calculate_RR(., error = e, use_true_group_size = T))
} %>%
mutate(
red_frac = red_frac
)
})
d4_res %>%
ggplot(aes(error, log10(RR), color = factor(red_frac), group = factor(red_frac))) +
geom_line() +
scale_y_continuous(labels = function(v) round(10^v, 2), breaks = seq(-2, 2, by = 0.25)) +
scale_x_continuous(labels = scales::percent) +
labs(
title = "Relative bad rates as a function of misclassification and population proportions",
subtitle = "Constant denominator",
y = "RR",
x = "Classification error %",
color = "Red frac."
)

GG_save("figs/sim_4.png")
#limit calculation for 5% red
#red rate
(0.1/2) / 0.05
## [1] 1
#blue rate
(0.1/2) / 0.95
## [1] 0.0526
#RR
((0.1/2) / 0.05) / ((0.1/2) / 0.95)
## [1] 19
#sim
d4_res %>% filter(red_frac == 0.05, error == 1)