# Reproducibilnost i paketi
set.seed(123)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(purrr) # fallback
# Basic structure
N_person <- 6000
years <- 2010:2024
persons <- tibble(
person_id = sprintf("P%05d", 1:N_person),
sex = sample(c("M","F"), N_person, replace = TRUE),
birth_year = sample(1990:2008, N_person, replace = TRUE),
ethnicity_cat = sample(c("Majority","Minority_1","Minority_2","Missing"),
N_person, replace = TRUE, prob = c(0.65, 0.20, 0.12, 0.03)),
parents_education = factor(
sample(c("Low","Medium","High"), N_person, replace = TRUE,
prob = c(0.35, 0.45, 0.20)),
levels = c("Low","Medium","High"), ordered = TRUE
),
parents_income_quintile = sample(1:5, N_person, replace = TRUE),
adhd_flag = rbinom(N_person, 1, 0.20) # ~20% prevalence
)
# Min-Max for first diagnosis
first_dx_pool <- seq(as.Date("2010-01-01"), as.Date("2024-12-31"), by = "day")
# Index cases
cases_idx <- which(persons$adhd_flag == 1L)
n_cases <- length(cases_idx)
# Fill the column for cases only
persons$first_adhd_dx_date <- as.Date(NA)
persons$index_year_cases <- as.integer(NA)
if (n_cases > 0) {
dx_dates <- sample(first_dx_pool, n_cases, replace = TRUE)
persons$first_adhd_dx_date[cases_idx] <- dx_dates
persons$index_year_cases[cases_idx] <- lubridate::year(dx_dates)
}
# Create 'index_year' for ALL: cases get their year, control gets NA (za sada)
persons <- persons %>%
mutate(index_year = if_else(adhd_flag == 1L, index_year_cases, NA_integer_)) %>%
select(-index_year_cases)
# Sanity-check: all cases should have their year
stopifnot(sum(persons$adhd_flag == 1L & is.na(persons$index_year)) == 0)
band_width <- 5
band_breaks <- seq(min(persons$birth_year, na.rm = TRUE) - 1,
max(persons$birth_year, na.rm = TRUE) + 1,
by = band_width)
persons <- persons %>%
mutate(
birth_band = cut(birth_year, breaks = band_breaks, include.lowest = TRUE),
stratum = paste0(sex, "_", birth_band)
)
(Gde ima dovoljno kontrola ide 1:2, gde nema ide 1:1, gde nema ni jedne — biće fallback.)
# ---- STEP 5: Adaptive 1:K (K ≤ 2) greedy matching within strata ----
library(dplyr)
library(tidyr)
set.seed(42) # for reproducibility of random order within strata
K_max <- 2 # target up to 1:2 (adaptive per stratum)
# 5.1) Compute feasible K per stratum
K_by_stratum <- persons %>%
group_by(stratum) %>%
summarise(
cases = sum(adhd_flag == 1L),
controls = sum(adhd_flag == 0L),
K_stratum = ifelse(cases > 0, pmin(K_max, floor(controls / cases)), 0L),
.groups = "drop"
)
# 5.2) Prepare cases and controls
cases <- persons %>%
filter(adhd_flag == 1L) %>%
select(person_id, index_year, stratum) %>%
left_join(K_by_stratum, by = "stratum") %>%
filter(K_stratum > 0L) %>% # keep only strata where we can match
arrange(stratum, person_id) %>%
rename(case_id = person_id, case_index_year = index_year)
controls <- persons %>%
filter(adhd_flag == 0L) %>%
select(person_id, stratum) %>%
# randomize within stratum to avoid systematic bias by ID ordering
group_by(stratum) %>%
slice_sample(prop = 1) %>% # equivalent to arrange by random within strata
ungroup() %>%
rename(control_id = person_id)
# 5.3) Expand cases to K_stratum "slots" and create a within-stratum slot rank
# The slot_rank runs 1..(cases * K_stratum) inside each stratum.
cases_slots <- cases %>%
# replicate each case row K_stratum times (per stratum-specific K)
group_by(stratum) %>%
tidyr::uncount(weights = K_stratum, .remove = FALSE) %>%
# order slots deterministically by case_id then the replicate order
arrange(stratum, case_id, .by_group = TRUE) %>%
mutate(slot_rank = row_number()) %>%
ungroup()
# 5.4) Give each control a slot_rank within its stratum (unique controls only)
controls_slots <- controls %>%
group_by(stratum) %>%
mutate(slot_rank = row_number()) %>%
ungroup()
# 5.5) Greedy pairing by (stratum, slot_rank)
# This guarantees each control is used at most once, and each case slot gets at most one control.
matched <- inner_join(
cases_slots %>% select(stratum, slot_rank, case_id, case_index_year),
controls_slots %>% select(stratum, slot_rank, control_id),
by = c("stratum","slot_rank")
) %>%
select(case_id, control_id, stratum, case_index_year)
# 5.6) Assign pseudo-index to matched controls (cases already have index_year)
persons <- persons %>%
left_join(matched %>% select(control_id, case_index_year),
by = c("person_id" = "control_id")) %>%
mutate(index_year = if_else(adhd_flag == 0L & is.na(index_year),
case_index_year, index_year)) %>%
select(-case_index_year)
# 5.7) Quick counts of truly unmatched controls (after assignment)
unmatched_controls <- sum(is.na(persons$index_year) & persons$adhd_flag == 0L)
unmatched_controls
## [1] 2391
library(dplyr)
# A) How many controls per case (did we reach target K in each stratum?)
case_counts <- matched %>%
count(case_id, name = "n_controls") %>%
arrange(n_controls)
summary(case_counts$n_controls)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
cases_ltK <- case_counts %>% filter(n_controls < 2) # since K_max=2
nrow(cases_ltK); head(cases_ltK)
## [1] 0
## # A tibble: 0 × 2
## # ℹ 2 variables: case_id <chr>, n_controls <int>
# B) Did we recycle controls? (should be 1 in classical CC matching)
control_usage <- matched %>%
count(control_id, name = "times_used") %>%
arrange(desc(times_used))
summary(control_usage$times_used)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
control_usage %>% filter(times_used > 1) %>% head(20)
## # A tibble: 0 × 2
## # ℹ 2 variables: control_id <chr>, times_used <int>
# C) Achieved K per stratum (diagnostics vs. K_by_stratum)
achieved_by_stratum <- matched %>%
count(stratum, case_id, name = "n_controls") %>%
group_by(stratum) %>%
summarise(
cases_matched = n(), # number of cases with ≥1 control
mean_K = mean(n_controls),
min_K = min(n_controls),
max_K = max(n_controls),
.groups = "drop"
) %>%
left_join(K_by_stratum %>% select(stratum, K_stratum), by = "stratum")
achieved_by_stratum
## # A tibble: 8 × 6
## stratum cases_matched mean_K min_K max_K K_stratum
## <chr> <int> <dbl> <int> <int> <dbl>
## 1 F_(1994,1999] 169 2 2 2 2
## 2 F_(1999,2004] 150 2 2 2 2
## 3 F_(2004,2009] 152 2 2 2 2
## 4 F_[1989,1994] 152 2 2 2 2
## 5 M_(1994,1999] 134 2 2 2 2
## 6 M_(1999,2004] 164 2 2 2 2
## 7 M_(2004,2009] 119 2 2 2 2
## 8 M_[1989,1994] 163 2 2 2 2
7B) (PRIMER) Simulacija support_wide ako nemaš realni fajl
years <- 2010:2024 # koristi isti raspon kao u tvom projektu
support_wide <- tibble::tibble(person_id = persons$person_id)
for (yy in years) {
set.seed(100 + yy) # reproduktivnost po godini
# primer bazne verovatnoće (blagi trend kroz godine)
p_base <- plogis(qlogis(0.28) + 0.02 * (yy - median(years)))
support_wide[[paste0("support_", yy)]] <- rbinom(nrow(support_wide), 1, p_base)
}
# dodaj osobine iz persons (uključujući index_year)
support_wide <- support_wide %>%
dplyr::left_join(
persons %>%
dplyr::select(person_id, adhd_flag, index_year, sex, ethnicity_cat,
parents_education, parents_income_quintile, birth_year),
by = "person_id"
)
Cilj: iz support_wide (godine u kolonama) napravimo tabelu gde je jedan red = osoba × godina, sa ishodom support_any (0/1) i pomoćnim kolonama (post, rel_year, calendar_year, age_at_year).