Step 1 —

# Reproducibilnost i paketi
set.seed(123)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(purrr)   #  fallback
# Basic structure
N_person <- 6000
years    <- 2010:2024

Step 2 — Create Dataset

persons <- tibble(
  person_id = sprintf("P%05d", 1:N_person),
  sex = sample(c("M","F"), N_person, replace = TRUE),
  birth_year = sample(1990:2008, N_person, replace = TRUE),
  ethnicity_cat = sample(c("Majority","Minority_1","Minority_2","Missing"),
                         N_person, replace = TRUE, prob = c(0.65, 0.20, 0.12, 0.03)),
  parents_education = factor(
    sample(c("Low","Medium","High"), N_person, replace = TRUE,
           prob = c(0.35, 0.45, 0.20)),
    levels = c("Low","Medium","High"), ordered = TRUE
  ),
  parents_income_quintile = sample(1:5, N_person, replace = TRUE),
  adhd_flag = rbinom(N_person, 1, 0.20)  # ~20% prevalence
)

Korak 3 — Data of first diagnosis for cases

# Min-Max for first diagnosis
first_dx_pool <- seq(as.Date("2010-01-01"), as.Date("2024-12-31"), by = "day")

# Index cases
cases_idx <- which(persons$adhd_flag == 1L)
n_cases   <- length(cases_idx)

# Fill the column for cases only
persons$first_adhd_dx_date <- as.Date(NA)
persons$index_year_cases   <- as.integer(NA)
if (n_cases > 0) {
  dx_dates <- sample(first_dx_pool, n_cases, replace = TRUE)
  persons$first_adhd_dx_date[cases_idx] <- dx_dates
  persons$index_year_cases[cases_idx]   <- lubridate::year(dx_dates)
}

# Create 'index_year' for ALL: cases get their year, control gets NA (za sada)
persons <- persons %>%
  mutate(index_year = if_else(adhd_flag == 1L, index_year_cases, NA_integer_)) %>%
  select(-index_year_cases)

# Sanity-check: all cases should have their year
stopifnot(sum(persons$adhd_flag == 1L & is.na(persons$index_year)) == 0)

Step 4 — Defin stratum for matchin (sex + 5-year interval)

band_width <- 5
band_breaks <- seq(min(persons$birth_year, na.rm = TRUE) - 1,
                   max(persons$birth_year, na.rm = TRUE) + 1,
                   by = band_width)

persons <- persons %>%
  mutate(
    birth_band = cut(birth_year, breaks = band_breaks, include.lowest = TRUE),
    stratum    = paste0(sex, "_", birth_band)
  )

Korak 5 — Adaptivni 1:K (K ≤ 2) greedy matching unutar stratuma

(Gde ima dovoljno kontrola ide 1:2, gde nema ide 1:1, gde nema ni jedne — biće fallback.)

# ---- STEP 5: Adaptive 1:K (K ≤ 2) greedy matching within strata ----
library(dplyr)
library(tidyr)

set.seed(42)  # for reproducibility of random order within strata

K_max <- 2  # target up to 1:2 (adaptive per stratum)

# 5.1) Compute feasible K per stratum
K_by_stratum <- persons %>%
  group_by(stratum) %>%
  summarise(
    cases     = sum(adhd_flag == 1L),
    controls  = sum(adhd_flag == 0L),
    K_stratum = ifelse(cases > 0, pmin(K_max, floor(controls / cases)), 0L),
    .groups = "drop"
  )

# 5.2) Prepare cases and controls
cases <- persons %>%
  filter(adhd_flag == 1L) %>%
  select(person_id, index_year, stratum) %>%
  left_join(K_by_stratum, by = "stratum") %>%
  filter(K_stratum > 0L) %>%                      # keep only strata where we can match
  arrange(stratum, person_id) %>%
  rename(case_id = person_id, case_index_year = index_year)

controls <- persons %>%
  filter(adhd_flag == 0L) %>%
  select(person_id, stratum) %>%
  # randomize within stratum to avoid systematic bias by ID ordering
  group_by(stratum) %>%
  slice_sample(prop = 1) %>%                      # equivalent to arrange by random within strata
  ungroup() %>%
  rename(control_id = person_id)

# 5.3) Expand cases to K_stratum "slots" and create a within-stratum slot rank
# The slot_rank runs 1..(cases * K_stratum) inside each stratum.
cases_slots <- cases %>%
  # replicate each case row K_stratum times (per stratum-specific K)
  group_by(stratum) %>%
  tidyr::uncount(weights = K_stratum, .remove = FALSE) %>%
  # order slots deterministically by case_id then the replicate order
  arrange(stratum, case_id, .by_group = TRUE) %>%
  mutate(slot_rank = row_number()) %>%
  ungroup()

# 5.4) Give each control a slot_rank within its stratum (unique controls only)
controls_slots <- controls %>%
  group_by(stratum) %>%
  mutate(slot_rank = row_number()) %>%
  ungroup()

# 5.5) Greedy pairing by (stratum, slot_rank)
# This guarantees each control is used at most once, and each case slot gets at most one control.
matched <- inner_join(
  cases_slots %>% select(stratum, slot_rank, case_id, case_index_year),
  controls_slots %>% select(stratum, slot_rank, control_id),
  by = c("stratum","slot_rank")
) %>%
  select(case_id, control_id, stratum, case_index_year)

# 5.6) Assign pseudo-index to matched controls (cases already have index_year)
persons <- persons %>%
  left_join(matched %>% select(control_id, case_index_year),
            by = c("person_id" = "control_id")) %>%
  mutate(index_year = if_else(adhd_flag == 0L & is.na(index_year),
                              case_index_year, index_year)) %>%
  select(-case_index_year)

# 5.7) Quick counts of truly unmatched controls (after assignment)
unmatched_controls <- sum(is.na(persons$index_year) & persons$adhd_flag == 0L)
unmatched_controls
## [1] 2391

Matching check

library(dplyr)

# A) How many controls per case (did we reach target K in each stratum?)
case_counts <- matched %>%
  count(case_id, name = "n_controls") %>%
  arrange(n_controls)

summary(case_counts$n_controls)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2
cases_ltK <- case_counts %>% filter(n_controls < 2)   # since K_max=2
nrow(cases_ltK); head(cases_ltK)
## [1] 0
## # A tibble: 0 × 2
## # ℹ 2 variables: case_id <chr>, n_controls <int>
# B) Did we recycle controls? (should be 1 in classical CC matching)
control_usage <- matched %>%
  count(control_id, name = "times_used") %>%
  arrange(desc(times_used))

summary(control_usage$times_used)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
control_usage %>% filter(times_used > 1) %>% head(20)
## # A tibble: 0 × 2
## # ℹ 2 variables: control_id <chr>, times_used <int>
# C) Achieved K per stratum (diagnostics vs. K_by_stratum)
achieved_by_stratum <- matched %>%
  count(stratum, case_id, name = "n_controls") %>%
  group_by(stratum) %>%
  summarise(
    cases_matched = n(),                         # number of cases with ≥1 control
    mean_K        = mean(n_controls),
    min_K         = min(n_controls),
    max_K         = max(n_controls),
    .groups = "drop"
  ) %>%
  left_join(K_by_stratum %>% select(stratum, K_stratum), by = "stratum")

achieved_by_stratum
## # A tibble: 8 × 6
##   stratum       cases_matched mean_K min_K max_K K_stratum
##   <chr>                 <int>  <dbl> <int> <int>     <dbl>
## 1 F_(1994,1999]           169      2     2     2         2
## 2 F_(1999,2004]           150      2     2     2         2
## 3 F_(2004,2009]           152      2     2     2         2
## 4 F_[1989,1994]           152      2     2     2         2
## 5 M_(1994,1999]           134      2     2     2         2
## 6 M_(1999,2004]           164      2     2     2         2
## 7 M_(2004,2009]           119      2     2     2         2
## 8 M_[1989,1994]           163      2     2     2         2

Korak 7 — Priprema “support_wide” (0/1 po kalendarskoj godini)

7B) (PRIMER) Simulacija support_wide ako nemaš realni fajl

years <- 2010:2024  # koristi isti raspon kao u tvom projektu

support_wide <- tibble::tibble(person_id = persons$person_id)
for (yy in years) {
  set.seed(100 + yy)  # reproduktivnost po godini
  # primer bazne verovatnoće (blagi trend kroz godine)
  p_base <- plogis(qlogis(0.28) + 0.02 * (yy - median(years)))
  support_wide[[paste0("support_", yy)]] <- rbinom(nrow(support_wide), 1, p_base)
}

# dodaj osobine iz persons (uključujući index_year)
support_wide <- support_wide %>%
  dplyr::left_join(
    persons %>%
      dplyr::select(person_id, adhd_flag, index_year, sex, ethnicity_cat,
                    parents_education, parents_income_quintile, birth_year),
    by = "person_id"
  )

Korak 8 — Napravi person-year panel (LONG) i izvedene kolone

Cilj: iz support_wide (godine u kolonama) napravimo tabelu gde je jedan red = osoba × godina, sa ishodom support_any (0/1) i pomoćnim kolonama (post, rel_year, calendar_year, age_at_year).