Real-World Study

set.seed(20251006)
# setwd("C:\\Users\\hed2\\Downloads\\rws")

# Load packages
pkgs <- c("data.table", "dplyr", "tableone", "MatchIt", "cobalt",
          "survey", "survival", "survminer", "ggplot2", "broom", "sandwich", "lmtest")
# for (p in pkgs) if(!requireNamespace(p, quietly = TRUE)) install.packages(p)
lapply(pkgs, library, character.only = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##  cobalt (Version 4.6.1, Build Date: 2025-08-20)
## 
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
## 
##     lalonde
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:data.table':
## 
##     yearmon, yearqtr
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## [[1]]
## [1] "data.table" "stats"      "graphics"   "grDevices"  "utils"     
## [6] "datasets"   "methods"    "base"      
## 
## [[2]]
## [1] "dplyr"      "data.table" "stats"      "graphics"   "grDevices" 
## [6] "utils"      "datasets"   "methods"    "base"      
## 
## [[3]]
##  [1] "tableone"   "dplyr"      "data.table" "stats"      "graphics"  
##  [6] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[4]]
##  [1] "MatchIt"    "tableone"   "dplyr"      "data.table" "stats"     
##  [6] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [11] "base"      
## 
## [[5]]
##  [1] "cobalt"     "MatchIt"    "tableone"   "dplyr"      "data.table"
##  [6] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [11] "methods"    "base"      
## 
## [[6]]
##  [1] "survey"     "survival"   "Matrix"     "grid"       "cobalt"    
##  [6] "MatchIt"    "tableone"   "dplyr"      "data.table" "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## [[7]]
##  [1] "survey"     "survival"   "Matrix"     "grid"       "cobalt"    
##  [6] "MatchIt"    "tableone"   "dplyr"      "data.table" "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## [[8]]
##  [1] "survminer"  "ggpubr"     "ggplot2"    "survey"     "survival"  
##  [6] "Matrix"     "grid"       "cobalt"     "MatchIt"    "tableone"  
## [11] "dplyr"      "data.table" "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[9]]
##  [1] "survminer"  "ggpubr"     "ggplot2"    "survey"     "survival"  
##  [6] "Matrix"     "grid"       "cobalt"     "MatchIt"    "tableone"  
## [11] "dplyr"      "data.table" "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[10]]
##  [1] "broom"      "survminer"  "ggpubr"     "ggplot2"    "survey"    
##  [6] "survival"   "Matrix"     "grid"       "cobalt"     "MatchIt"   
## [11] "tableone"   "dplyr"      "data.table" "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[11]]
##  [1] "sandwich"   "broom"      "survminer"  "ggpubr"     "ggplot2"   
##  [6] "survey"     "survival"   "Matrix"     "grid"       "cobalt"    
## [11] "MatchIt"    "tableone"   "dplyr"      "data.table" "stats"     
## [16] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [21] "base"      
## 
## [[12]]
##  [1] "lmtest"     "zoo"        "sandwich"   "broom"      "survminer" 
##  [6] "ggpubr"     "ggplot2"    "survey"     "survival"   "Matrix"    
## [11] "grid"       "cobalt"     "MatchIt"    "tableone"   "dplyr"     
## [16] "data.table" "stats"      "graphics"   "grDevices"  "utils"     
## [21] "datasets"   "methods"    "base"
# ------------------------------------------------------
# 1. Simulate Real-World Dataset
# ------------------------------------------------------
n <- 1000
dat <- data.frame(
  patient_id = 1:n,
  exposure_group = sample(c(0, 1), n, replace = TRUE, prob = c(0.4, 0.6)),
  age_at_index = round(rnorm(n, 60, 10)),
  sex = sample(c("Male", "Female"), n, replace = TRUE),
  baseline_hba1c = round(rnorm(n, 7.5, 1.2), 1),
  comorbidity_score = pmax(0, round(rnorm(n, 2, 1))),
  region = sample(c("North", "South", "East", "West"), n, replace = TRUE)
)

# Simulate follow-up time and event
# treatment A Slightly reduce risk
dat <- dat %>%
  mutate(
    lambda = 0.03 + 0.002 * (age_at_index - 60) + 0.01 * comorbidity_score + ifelse(exposure_group=="A", -0.01, 0),
    follow_up_time = rexp(n, rate = lambda),
    event_flag = rbinom(n, 1, 1 - exp(-lambda * follow_up_time))
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `follow_up_time = rexp(n, rate = lambda)`.
## Caused by warning in `rexp()`:
## ! NAs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# fwrite(dat, "analysis_dataset.csv")



# ------------------------------------------------------
# 2. Baseline Summary
# ------------------------------------------------------
vars <- c("age_at_index", "sex", "baseline_hba1c", "comorbidity_score", "region")
catVars <- c("sex", "region")
table1 <- CreateTableOne(vars = vars, strata = "exposure_group", data = dat, factorVars = catVars)
print(table1, smd = TRUE)
##                                Stratified by exposure_group
##                                 0             1             p      test SMD   
##   n                               414           586                           
##   age_at_index (mean (SD))      60.75 (9.70)  59.97 (9.45)   0.202       0.082
##   sex = Male (%)                  210 (50.7)    292 (49.8)   0.830       0.018
##   baseline_hba1c (mean (SD))     7.54 (1.19)   7.46 (1.21)   0.311       0.065
##   comorbidity_score (mean (SD))  2.00 (1.02)   1.99 (1.04)   0.955       0.004
##   region (%)                                                 0.641       0.083
##      East                         109 (26.3)    164 (28.0)                    
##      North                        102 (24.6)    128 (21.8)                    
##      South                        107 (25.8)    145 (24.7)                    
##      West                          96 (23.2)    149 (25.4)
# ------------------------------------------------------
# 3. Propensity Score Modeling
# ------------------------------------------------------
ps_formula <- as.formula("exposure_group ~ age_at_index + sex + baseline_hba1c + comorbidity_score + region")
m.out <- matchit(ps_formula, data = dat, method = "nearest", ratio = 1, caliper = 0.2)
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
summary(m.out)
## 
## Call:
## matchit(formula = ps_formula, data = dat, method = "nearest", 
##     caliper = 0.2, ratio = 1)
## 
## Summary of Balance for All Data:
##                   Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                 0.5879        0.5833          0.1345     1.0722
## age_at_index            59.9710       60.7536         -0.0828     0.9496
## sexFemale                0.5017        0.4928          0.0179          .
## sexMale                  0.4983        0.5072         -0.0179          .
## baseline_hba1c           7.4642        7.5423         -0.0647     1.0292
## comorbidity_score        1.9915        1.9952         -0.0036     1.0392
## regionEast               0.2799        0.2633          0.0369          .
## regionNorth              0.2184        0.2464         -0.0676          .
## regionSouth              0.2474        0.2585         -0.0255          .
## regionWest               0.2543        0.2319          0.0514          .
##                   eCDF Mean eCDF Max
## distance             0.0465   0.1059
## age_at_index         0.0145   0.0575
## sexFemale            0.0090   0.0090
## sexMale              0.0090   0.0090
## baseline_hba1c       0.0131   0.0378
## comorbidity_score    0.0037   0.0061
## regionEast           0.0166   0.0166
## regionNorth          0.0279   0.0279
## regionSouth          0.0110   0.0110
## regionWest           0.0224   0.0224
## 
## Summary of Balance for Matched Data:
##                   Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                 0.5888        0.5830          0.1733     1.0027
## age_at_index            59.6869       60.8301         -0.1209     0.9514
## sexFemale                0.4951        0.4951          0.0000          .
## sexMale                  0.5049        0.5049          0.0000          .
## baseline_hba1c           7.4413        7.5524         -0.0922     1.0562
## comorbidity_score        2.0194        1.9927          0.0257     1.0673
## regionEast               0.2791        0.2646          0.0324          .
## regionNorth              0.2209        0.2476         -0.0646          .
## regionSouth              0.2427        0.2597         -0.0394          .
## regionWest               0.2573        0.2282          0.0669          .
##                   eCDF Mean eCDF Max Std. Pair Dist.
## distance             0.0521   0.1117          0.1739
## age_at_index         0.0205   0.0850          0.9431
## sexFemale            0.0000   0.0000          0.5146
## sexMale              0.0000   0.0000          0.5146
## baseline_hba1c       0.0177   0.0558          1.0165
## comorbidity_score    0.0053   0.0121          1.1177
## regionEast           0.0146   0.0146          0.7569
## regionNorth          0.0267   0.0267          0.6168
## regionSouth          0.0170   0.0170          0.7593
## regionWest           0.0291   0.0291          0.7246
## 
## Sample Sizes:
##           Control Treated
## All           414     586
## Matched       412     412
## Unmatched       2     174
## Discarded       0       0
# Extract matched dataset
matched <- match.data(m.out)
cat("Matched sample size:", nrow(matched), "\n")
## Matched sample size: 824
# Balance Visualization
love.plot(m.out, stats = "mean.diffs", threshold = 0.1, var.order = "unadjusted")
## Warning: Standardized mean differences and raw mean differences are present in
## the same plot. Use the `stars` argument to distinguish between them and
## appropriately label the x-axis. See `?love.plot` for details.

# ------------------------------------------------------
# 4. Cox Model on Matched Data
# ------------------------------------------------------
cox_mod <- coxph(Surv(follow_up_time, event_flag) ~ exposure_group + age_at_index + sex + comorbidity_score, data = matched)
summary(cox_mod)
## Call:
## coxph(formula = Surv(follow_up_time, event_flag) ~ exposure_group + 
##     age_at_index + sex + comorbidity_score, data = matched)
## 
##   n= 821, number of events= 415 
##    (3 observations deleted due to missingness)
## 
##                        coef exp(coef)  se(coef)      z Pr(>|z|)    
## exposure_group    -0.059470  0.942264  0.100120 -0.594    0.553    
## age_at_index       0.058694  1.060451  0.005097 11.516  < 2e-16 ***
## sexMale           -0.049999  0.951230  0.099369 -0.503    0.615    
## comorbidity_score  0.361313  1.435213  0.047134  7.666 1.78e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## exposure_group       0.9423     1.0613    0.7744     1.147
## age_at_index         1.0605     0.9430    1.0499     1.071
## sexMale              0.9512     1.0513    0.7829     1.156
## comorbidity_score    1.4352     0.6968    1.3086     1.574
## 
## Concordance= 0.682  (se = 0.015 )
## Likelihood ratio test= 175.9  on 4 df,   p=<2e-16
## Wald test            = 183.6  on 4 df,   p=<2e-16
## Score (logrank) test = 188.4  on 4 df,   p=<2e-16
# Kaplan-Meier Plot
fit_km <- survfit(Surv(follow_up_time, event_flag) ~ exposure_group, data = matched)
# summary(fit_km)
ggsurvplot(fit_km, data = matched, pval = T, risk.table = F, 
           title = "Kaplan-Meier Curve by Treatment Group")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# ------------------------------------------------------
# 5. Sensitivity Analysis: IPTW Inverse Probability Treatment Weighting
# ------------------------------------------------------
ps_model <- glm(exposure_group ~ age_at_index + sex + baseline_hba1c + comorbidity_score + region,
                data = dat, family = binomial)
dat$ps <- predict(ps_model, type = "response")
dat <- dat %>% mutate(iptw = ifelse(exposure_group == "A", 1/ps, 1/(1-ps)))   #Inverse Probability Treatment Weighting

# Cut off extreme weights
q <- quantile(dat$iptw, probs = c(0.01, 0.99))   #weight between q1 and q2
dat$iptw_trunc <- pmin(pmax(dat$iptw, q[1]), q[2])

# weight Cox
cox_ipw <- coxph(Surv(follow_up_time, event_flag) ~ exposure_group + age_at_index + sex + comorbidity_score,
                 data = dat,         weights = iptw_trunc)      
summary(cox_ipw)
## Call:
## coxph(formula = Surv(follow_up_time, event_flag) ~ exposure_group + 
##     age_at_index + sex + comorbidity_score, data = dat, weights = iptw_trunc)
## 
##   n= 996, number of events= 507 
##    (4 observations deleted due to missingness)
## 
##                        coef exp(coef)  se(coef) robust se      z Pr(>|z|)    
## exposure_group    -0.068847  0.933470  0.057989  0.092981 -0.740    0.459    
## age_at_index       0.061072  1.062975  0.002985  0.005519 11.067   <2e-16 ***
## sexMale           -0.037441  0.963251  0.057501  0.090894 -0.412    0.680    
## comorbidity_score  0.370673  1.448709  0.027509  0.044524  8.325   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## exposure_group       0.9335     1.0713    0.7780     1.120
## age_at_index         1.0630     0.9408    1.0515     1.075
## sexMale              0.9633     1.0382    0.8061     1.151
## comorbidity_score    1.4487     0.6903    1.3276     1.581
## 
## Concordance= 0.689  (se = 0.014 )
## Likelihood ratio test= 539.1  on 4 df,   p=<2e-16
## Wald test            = 185.3  on 4 df,   p=<2e-16
## Score (logrank) test = 577.2  on 4 df,   p=<2e-16,   Robust = 191.8  p=<2e-16
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).