Real-World Study
set.seed(20251006)
# setwd("C:\\Users\\hed2\\Downloads\\rws")
# Load packages
pkgs <- c("data.table", "dplyr", "tableone", "MatchIt", "cobalt",
"survey", "survival", "survminer", "ggplot2", "broom", "sandwich", "lmtest")
# for (p in pkgs) if(!requireNamespace(p, quietly = TRUE)) install.packages(p)
lapply(pkgs, library, character.only = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## cobalt (Version 4.6.1, Build Date: 2025-08-20)
##
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
##
## lalonde
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:data.table':
##
## yearmon, yearqtr
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## [[1]]
## [1] "data.table" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
##
## [[2]]
## [1] "dplyr" "data.table" "stats" "graphics" "grDevices"
## [6] "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "tableone" "dplyr" "data.table" "stats" "graphics"
## [6] "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "MatchIt" "tableone" "dplyr" "data.table" "stats"
## [6] "graphics" "grDevices" "utils" "datasets" "methods"
## [11] "base"
##
## [[5]]
## [1] "cobalt" "MatchIt" "tableone" "dplyr" "data.table"
## [6] "stats" "graphics" "grDevices" "utils" "datasets"
## [11] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "cobalt"
## [6] "MatchIt" "tableone" "dplyr" "data.table" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[7]]
## [1] "survey" "survival" "Matrix" "grid" "cobalt"
## [6] "MatchIt" "tableone" "dplyr" "data.table" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[8]]
## [1] "survminer" "ggpubr" "ggplot2" "survey" "survival"
## [6] "Matrix" "grid" "cobalt" "MatchIt" "tableone"
## [11] "dplyr" "data.table" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "survminer" "ggpubr" "ggplot2" "survey" "survival"
## [6] "Matrix" "grid" "cobalt" "MatchIt" "tableone"
## [11] "dplyr" "data.table" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "broom" "survminer" "ggpubr" "ggplot2" "survey"
## [6] "survival" "Matrix" "grid" "cobalt" "MatchIt"
## [11] "tableone" "dplyr" "data.table" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "sandwich" "broom" "survminer" "ggpubr" "ggplot2"
## [6] "survey" "survival" "Matrix" "grid" "cobalt"
## [11] "MatchIt" "tableone" "dplyr" "data.table" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[12]]
## [1] "lmtest" "zoo" "sandwich" "broom" "survminer"
## [6] "ggpubr" "ggplot2" "survey" "survival" "Matrix"
## [11] "grid" "cobalt" "MatchIt" "tableone" "dplyr"
## [16] "data.table" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
# ------------------------------------------------------
# 1. Simulate Real-World Dataset
# ------------------------------------------------------
n <- 1000
dat <- data.frame(
patient_id = 1:n,
exposure_group = sample(c(0, 1), n, replace = TRUE, prob = c(0.4, 0.6)),
age_at_index = round(rnorm(n, 60, 10)),
sex = sample(c("Male", "Female"), n, replace = TRUE),
baseline_hba1c = round(rnorm(n, 7.5, 1.2), 1),
comorbidity_score = pmax(0, round(rnorm(n, 2, 1))),
region = sample(c("North", "South", "East", "West"), n, replace = TRUE)
)
# Simulate follow-up time and event
# treatment A Slightly reduce risk
dat <- dat %>%
mutate(
lambda = 0.03 + 0.002 * (age_at_index - 60) + 0.01 * comorbidity_score + ifelse(exposure_group=="A", -0.01, 0),
follow_up_time = rexp(n, rate = lambda),
event_flag = rbinom(n, 1, 1 - exp(-lambda * follow_up_time))
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `follow_up_time = rexp(n, rate = lambda)`.
## Caused by warning in `rexp()`:
## ! NAs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# fwrite(dat, "analysis_dataset.csv")
# ------------------------------------------------------
# 2. Baseline Summary
# ------------------------------------------------------
vars <- c("age_at_index", "sex", "baseline_hba1c", "comorbidity_score", "region")
catVars <- c("sex", "region")
table1 <- CreateTableOne(vars = vars, strata = "exposure_group", data = dat, factorVars = catVars)
print(table1, smd = TRUE)
## Stratified by exposure_group
## 0 1 p test SMD
## n 414 586
## age_at_index (mean (SD)) 60.75 (9.70) 59.97 (9.45) 0.202 0.082
## sex = Male (%) 210 (50.7) 292 (49.8) 0.830 0.018
## baseline_hba1c (mean (SD)) 7.54 (1.19) 7.46 (1.21) 0.311 0.065
## comorbidity_score (mean (SD)) 2.00 (1.02) 1.99 (1.04) 0.955 0.004
## region (%) 0.641 0.083
## East 109 (26.3) 164 (28.0)
## North 102 (24.6) 128 (21.8)
## South 107 (25.8) 145 (24.7)
## West 96 (23.2) 149 (25.4)
# ------------------------------------------------------
# 3. Propensity Score Modeling
# ------------------------------------------------------
ps_formula <- as.formula("exposure_group ~ age_at_index + sex + baseline_hba1c + comorbidity_score + region")
m.out <- matchit(ps_formula, data = dat, method = "nearest", ratio = 1, caliper = 0.2)
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
summary(m.out)
##
## Call:
## matchit(formula = ps_formula, data = dat, method = "nearest",
## caliper = 0.2, ratio = 1)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.5879 0.5833 0.1345 1.0722
## age_at_index 59.9710 60.7536 -0.0828 0.9496
## sexFemale 0.5017 0.4928 0.0179 .
## sexMale 0.4983 0.5072 -0.0179 .
## baseline_hba1c 7.4642 7.5423 -0.0647 1.0292
## comorbidity_score 1.9915 1.9952 -0.0036 1.0392
## regionEast 0.2799 0.2633 0.0369 .
## regionNorth 0.2184 0.2464 -0.0676 .
## regionSouth 0.2474 0.2585 -0.0255 .
## regionWest 0.2543 0.2319 0.0514 .
## eCDF Mean eCDF Max
## distance 0.0465 0.1059
## age_at_index 0.0145 0.0575
## sexFemale 0.0090 0.0090
## sexMale 0.0090 0.0090
## baseline_hba1c 0.0131 0.0378
## comorbidity_score 0.0037 0.0061
## regionEast 0.0166 0.0166
## regionNorth 0.0279 0.0279
## regionSouth 0.0110 0.0110
## regionWest 0.0224 0.0224
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.5888 0.5830 0.1733 1.0027
## age_at_index 59.6869 60.8301 -0.1209 0.9514
## sexFemale 0.4951 0.4951 0.0000 .
## sexMale 0.5049 0.5049 0.0000 .
## baseline_hba1c 7.4413 7.5524 -0.0922 1.0562
## comorbidity_score 2.0194 1.9927 0.0257 1.0673
## regionEast 0.2791 0.2646 0.0324 .
## regionNorth 0.2209 0.2476 -0.0646 .
## regionSouth 0.2427 0.2597 -0.0394 .
## regionWest 0.2573 0.2282 0.0669 .
## eCDF Mean eCDF Max Std. Pair Dist.
## distance 0.0521 0.1117 0.1739
## age_at_index 0.0205 0.0850 0.9431
## sexFemale 0.0000 0.0000 0.5146
## sexMale 0.0000 0.0000 0.5146
## baseline_hba1c 0.0177 0.0558 1.0165
## comorbidity_score 0.0053 0.0121 1.1177
## regionEast 0.0146 0.0146 0.7569
## regionNorth 0.0267 0.0267 0.6168
## regionSouth 0.0170 0.0170 0.7593
## regionWest 0.0291 0.0291 0.7246
##
## Sample Sizes:
## Control Treated
## All 414 586
## Matched 412 412
## Unmatched 2 174
## Discarded 0 0
# Extract matched dataset
matched <- match.data(m.out)
cat("Matched sample size:", nrow(matched), "\n")
## Matched sample size: 824
# Balance Visualization
love.plot(m.out, stats = "mean.diffs", threshold = 0.1, var.order = "unadjusted")
## Warning: Standardized mean differences and raw mean differences are present in
## the same plot. Use the `stars` argument to distinguish between them and
## appropriately label the x-axis. See `?love.plot` for details.

# ------------------------------------------------------
# 4. Cox Model on Matched Data
# ------------------------------------------------------
cox_mod <- coxph(Surv(follow_up_time, event_flag) ~ exposure_group + age_at_index + sex + comorbidity_score, data = matched)
summary(cox_mod)
## Call:
## coxph(formula = Surv(follow_up_time, event_flag) ~ exposure_group +
## age_at_index + sex + comorbidity_score, data = matched)
##
## n= 821, number of events= 415
## (3 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## exposure_group -0.059470 0.942264 0.100120 -0.594 0.553
## age_at_index 0.058694 1.060451 0.005097 11.516 < 2e-16 ***
## sexMale -0.049999 0.951230 0.099369 -0.503 0.615
## comorbidity_score 0.361313 1.435213 0.047134 7.666 1.78e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## exposure_group 0.9423 1.0613 0.7744 1.147
## age_at_index 1.0605 0.9430 1.0499 1.071
## sexMale 0.9512 1.0513 0.7829 1.156
## comorbidity_score 1.4352 0.6968 1.3086 1.574
##
## Concordance= 0.682 (se = 0.015 )
## Likelihood ratio test= 175.9 on 4 df, p=<2e-16
## Wald test = 183.6 on 4 df, p=<2e-16
## Score (logrank) test = 188.4 on 4 df, p=<2e-16
# Kaplan-Meier Plot
fit_km <- survfit(Surv(follow_up_time, event_flag) ~ exposure_group, data = matched)
# summary(fit_km)
ggsurvplot(fit_km, data = matched, pval = T, risk.table = F,
title = "Kaplan-Meier Curve by Treatment Group")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# ------------------------------------------------------
# 5. Sensitivity Analysis: IPTW Inverse Probability Treatment Weighting
# ------------------------------------------------------
ps_model <- glm(exposure_group ~ age_at_index + sex + baseline_hba1c + comorbidity_score + region,
data = dat, family = binomial)
dat$ps <- predict(ps_model, type = "response")
dat <- dat %>% mutate(iptw = ifelse(exposure_group == "A", 1/ps, 1/(1-ps))) #Inverse Probability Treatment Weighting
# Cut off extreme weights
q <- quantile(dat$iptw, probs = c(0.01, 0.99)) #weight between q1 and q2
dat$iptw_trunc <- pmin(pmax(dat$iptw, q[1]), q[2])
# weight Cox
cox_ipw <- coxph(Surv(follow_up_time, event_flag) ~ exposure_group + age_at_index + sex + comorbidity_score,
data = dat, weights = iptw_trunc)
summary(cox_ipw)
## Call:
## coxph(formula = Surv(follow_up_time, event_flag) ~ exposure_group +
## age_at_index + sex + comorbidity_score, data = dat, weights = iptw_trunc)
##
## n= 996, number of events= 507
## (4 observations deleted due to missingness)
##
## coef exp(coef) se(coef) robust se z Pr(>|z|)
## exposure_group -0.068847 0.933470 0.057989 0.092981 -0.740 0.459
## age_at_index 0.061072 1.062975 0.002985 0.005519 11.067 <2e-16 ***
## sexMale -0.037441 0.963251 0.057501 0.090894 -0.412 0.680
## comorbidity_score 0.370673 1.448709 0.027509 0.044524 8.325 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## exposure_group 0.9335 1.0713 0.7780 1.120
## age_at_index 1.0630 0.9408 1.0515 1.075
## sexMale 0.9633 1.0382 0.8061 1.151
## comorbidity_score 1.4487 0.6903 1.3276 1.581
##
## Concordance= 0.689 (se = 0.014 )
## Likelihood ratio test= 539.1 on 4 df, p=<2e-16
## Wald test = 185.3 on 4 df, p=<2e-16
## Score (logrank) test = 577.2 on 4 df, p=<2e-16, Robust = 191.8 p=<2e-16
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).