UMIT course, PRACTICAL 5: Two-Stage Adjustment

by Ian White and Nicholas Latimer

adaptation for R by Paul Schneider, updated by Ian 8/3/2024

# Function to install and load all the packages used in the course
install_n_load <- function(packages) {
  for (package in packages) {
    if (eval(parse(text = paste("require(", package, ")"))) == 0) {
      install.packages(package)
    }
    eval(parse(text = paste("require(", package, ")")))
    
  }
}
required_packages = c("haven",
                      "psych",
                      "survival",
                      "flexsurv",
                      "ggplot2",
                      "cowplot",
                      "survminer",
                      "geepack",
                      "Hmisc",
                      "eha",
                      "rpsftm",
                      "rms")
# # clear work space from previous results
rm(list = setdiff(ls(), c("install_n_load", "required_packages")))
# # Load required packages
install_n_load(required_packages)
options(scipen = 99999)
(a) Select confounders. Which covariates might be prognostic?

Note: part (b) retains relevant covariates, but think about it here.

(b) Create the dataset for Stage 1 of the analysis.
# # read unadjusted data
novCEA = read.csv("novCEA_IPCWready.csv")
# # alternatively, use:
# novCEA = read.csv(file.choose())

# step I
novCEA = novCEA[-which(novCEA$trtrand ==1 | 
                         novCEA$progind == 0 | 
                         novCEA$time < novCEA$progtime
                       ),]
novCEA$disconexp = NULL

# step II  Create variables for 'treatment group' analysis
novCEA$ontrtxo = with(novCEA, ifelse(time>=xotime & !is.na(xotime),1,0 ))

# step III Create a variable representing time since disease progression
novCEA$deathtime = novCEA$deathtime - novCEA$progtime
novCEA$time = novCEA$time - novCEA$progtime

# step IV Ensure you have data for each patient characteristic measured at the time of progression
novCEA$HRQLprog = NA
uniq.subj = unique(novCEA$id)
for(subj in uniq.subj){
  temp.df = novCEA[novCEA$id == subj,]
  temp.df$HRQLprog = temp.df$HRQLtdc[order(temp.df$time)][1]
  novCEA[novCEA$id == subj,] = temp.df
}

# step V Format dataset correctly for stage 1 of the analysis
# THIS STEP CORRECTED BY IAN 24FEB2022

# novCEA$deathindtd = novCEA$finalobs = 0
# 
# uniq.subj = unique(novCEA$id)
# for(subj in uniq.subj){
#   temp.df = novCEA[novCEA$id == subj,] 
#   novCEA = novCEA[-which(novCEA$id == subj),] 
#   temp.df = temp.df[order(temp.df$time),]
#   last.obs.dup = temp.df[length(temp.df$time),]
#   last.obs.dup$finalobs = 1
#   last.obs.dup$time = last.obs.dup$deathtime
#   temp.df = temp.df[temp.df$time != last.obs.dup$time,]
#   if(last.obs.dup$deathind){last.obs.dup$deathindtd = 1}
#   novCEA = rbind(novCEA,temp.df,last.obs.dup)
# }
# 
# 
# # Stata produces start and end times for survival data automatically
# # In R, we need to do this manually:
# novCEA$st = ifelse(novCEA$time==0,0,1)
# novCEA$d_ = novCEA$deathindtd # censored or event
# novCEA$t_0 = novCEA$t_ = NA # t_0 start time; t_ stop time of interval
# 
# uniq.subj = unique(novCEA$id)
# for(s in 1:length(uniq.subj)){
# 
#   temp.df = novCEA[novCEA$id == uniq.subj[s],]
#   temp.df = temp.df[order(temp.df$time),]
#   
#   for(o in 2:length(temp.df$time)){
#     temp.df$t_0[o] = temp.df$time[o-1]
#     temp.df$t_[o] = temp.df$time[o]
#   }
#   novCEA[novCEA$id == uniq.subj[s],] = temp.df
# }

novCEA$t_0 = novCEA$time
novCEA$t_ = pmin(novCEA$time+21, novCEA$deathtime)
novCEA$d_ = novCEA$deathind & novCEA$t_==novCEA$deathtime
# END OF CORRECTIONS 24FEB2022

novCEA.2s = novCEA 
write.csv(novCEA.2s,file="NovCEA_stage1_R.csv",row.names = F)
# Or, simply use the NovCEA_2stage dataset, 
# which is the dataset created by practical4reformat.do:
# novCEA.2s = as.data.frame(read_dta("NovCEA_2stage_stage1.dta"))



(c) Estimate the effect of switching, using a Weibull model.

Retain theta and model fit statistics.

Note: theta = exp(weib.model$coefficients[“ontrtxo”]). Theta estimates can differ between R and Stata.

# it's easier to remove obs we don't need for the weibull model:
# novCEA.2s = novCEA.2s[novCEA.2s$st==1,]

# fit survival object
surv.2s = Surv(time = novCEA.2s$t_0,
               time2 = novCEA.2s$t_,
               event = novCEA.2s$d_)

# fit weibull aft model
weib.model = aftreg(surv.2s ~ ontrtxo + prognosisb + progtime + CEAb + CEAprog + HRQLb+ HRQLprog, 
                    data = novCEA.2s, 
                    dist="weibull", 
                    param = "lifeExp")
# IDEALLY WOULD ADD id = id OPTION BUT IT REALLY MAKES COEFFICIENTS DISAGREE WITH STATA 

# Stata and R provide completely different log Likelihood values (and AIC values respectively) for thw model:
# Loglik:
weib.model$loglik[2]
## [1] -555.1136
# AIC
2* length(weib.model$coefficients) - 2*(weib.model$loglik[2])
## [1] 1128.227
# The LR test statistic, however, is the same:
lrstat <- -2 * (weib.model$loglik[1] - weib.model$loglik[2])
round(lrstat,2)
## [1] 57.89
# Coefficient estimates differ between R and Stata differ at the 4th digit
# log(scale) = _cons (in Stata)
# log(shape) = /ln_p (in Stata)
weib.model                              
## Call:
## aftreg(formula = surv.2s ~ ontrtxo + prognosisb + progtime + 
##     CEAb + CEAprog + HRQLb + HRQLprog, data = novCEA.2s, dist = "weibull", 
##     param = "lifeExp")
## 
## Covariate          W.mean      Coef Life-Expn  se(Coef)    Wald p
## ontrtxo             0.659     0.558     1.746     0.206     0.007 
## prognosisb          0.334    -0.301     0.740     0.357     0.400 
## progtime          118.451     0.004     1.004     0.002     0.053 
## CEAb               19.917    -0.041     0.960     0.065     0.531 
## CEAprog            27.148     0.028     1.028     0.033     0.400 
## HRQLb               0.438     0.192     1.212     0.541     0.722 
## HRQLprog            0.456    -0.748     0.473     0.562     0.184 
## 
## Baseline parameters:
## log(scale)                    4.609               1.140     0.000 
## log(shape)                    0.635               0.077     0.000 
## Baseline life expectancy:  89.1 
## 
## Events                    101 
## Total time at risk         13152 
## Max. log. likelihood      -555.11 
## LR test statistic         57.9 
## Degrees of freedom        7 
## Overall p-value           0.000000000397085
# Coefficients + 95%CI
round(cbind(coef=weib.model$coefficients, confint(weib.model)),4)
##               coef   2.5 % 97.5 %
## ontrtxo     0.5576  0.1540 0.9611
## prognosisb -0.3008 -1.0009 0.3992
## progtime    0.0043 -0.0001 0.0087
## CEAb       -0.0405 -0.1674 0.0863
## CEAprog     0.0280 -0.0372 0.0931
## HRQLb       0.1923 -0.8675 1.2522
## HRQLprog   -0.7477 -1.8501 0.3546
## log(scale)  4.6091  2.3743 6.8440
## log(shape)  0.6349  0.4830 0.7868
# retain theta
theta = exp(weib.model$coefficients[c("ontrtxo")] ) 
theta
##  ontrtxo 
## 1.746404



(d) Use theta (b) to create the counterfactual dataset.

First estimate shrunken survival times in switchers.

Go back to the original dataset:

novCEA = read.csv("novCEA_IPCWready.csv")

uniq.id = unique(novCEA$id)
for(subj in uniq.id){ # loop to retrieve max values for each subject
  temp = novCEA[novCEA$id == subj,]
  temp.max = temp[1,]
  temp.max[1,] = as.numeric(apply(temp,2,function(x){max(x,na.rm=T)}))
  novCEA = novCEA[novCEA$id != subj,]
  novCEA = rbind(novCEA,temp.max)
}


novCEA$xotime[is.na(novCEA$xotime)] = 0
novCEA$tpxo = novCEA$deathtime - novCEA$xotime
novCEA$txot = novCEA$xotime
novCEA$timecf = with(novCEA,ifelse( trtrand ==0 & xotime>0,
                      (xotime + (tpxo* (1/theta))),
                      deathtime))



(e) Estimate the hazard ratio in the counterfactual dataset. Plot the Kaplan-Meier.
# cox reg HR
surv.adj = Surv(novCEA$timecf,novCEA$deathind)
cox.adj = coxph(surv.adj ~ novCEA$trtrand)
cox.adj
## Call:
## coxph(formula = surv.adj ~ novCEA$trtrand)
## 
##                   coef exp(coef) se(coef)      z             p
## novCEA$trtrand -0.7931    0.4525   0.1322 -5.997 0.00000000201
## 
## Likelihood ratio test=33.87  on 1 df, p=0.000000005889
## n= 300, number of events= 257
cox.adj.alt = coxreg(surv.adj ~ as.factor(novCEA$trtrand),method="breslow")
cox.adj.alt # with time at risk
## Call:
## coxreg(formula = surv.adj ~ as.factor(novCEA$trtrand), method = "breslow")
## 
## Covariate             Mean       Coef     Rel.Risk   S.E.    Wald p
## as.factor(novCEA$trtrand) 
##                0      0.269     0         1 (reference)
##                1      0.731    -0.793     0.453     0.132     0.000 
## 
## Events                    257 
## Total time at risk         78676 
## Max. log. likelihood      -1276.2 
## LR test statistic         33.84 
## Degrees of freedom        1 
## Overall p-value           0.00000000597813
# Hazard Ratios
data.frame(OR =exp(cox.adj$coefficients),
           CI95_lower = exp(confint(cox.adj))[,1], 
           CI95_upper = exp(confint(cox.adj))[,2])
##                       OR CI95_lower CI95_upper
## novCEA$trtrand 0.4524581  0.3491565  0.5863226
# Kaplan meier curve
survfit.adj = survfit(surv.adj ~ novCEA$trtrand)
survfit.adj
## Call: survfit(formula = surv.adj ~ novCEA$trtrand)
## 
##                    n events median 0.95LCL 0.95UCL
## novCEA$trtrand=0 108    103    180     157     205
## novCEA$trtrand=1 192    154    280     258     316
ggsurvplot(survfit.adj, data = novCEA, risk.table=T)



(f) Incorporate recensoring. Use ‘admin’ as the ‘endstudy’ time for each patient.
novCEA$trecens = novCEA$timecf
novCEA$trecens[novCEA$trtrand==0] = novCEA$admin[novCEA$trtrand==0] * (1/theta)
novCEA$deathind[novCEA$trtrand==0 & novCEA$timecf > novCEA$trecens] = 0
novCEA$timecf[novCEA$timecf > novCEA$trecens & novCEA$trtrand == 0] = novCEA$trecens[novCEA$timecf > novCEA$trecens & novCEA$trtrand == 0]



(g) Then estimate the hazard ratio in the counterfactual, recensored dataset. Plot the Kaplan-Meier.
Surv.recen = Surv(novCEA$timecf,novCEA$deathind)
cox.recen = coxph(Surv.recen ~novCEA$trtrand)
exp(cox.recen$coefficients)
## novCEA$trtrand 
##      0.4506357
survfit.recen = survfit(Surv.recen ~novCEA$trtrand)
ggsurvplot(survfit.recen,novCEA,risk.table = T)



(g+) Bootstrap the confidence interval.
# Inefficient and very slow code to bootstrap 95%CI... 
# CAVE: May take a while to compute!
boot_2stage = function(file1 = "NovCEA_stage1_R.csv", 
                       file2 = "novCEA_IPCWready.csv",
                       iterations=1000){
  
  df.1 = read.csv(file1)
  df.2 = read.csv(file2)
  
  est = rep(NA,times=iterations)
  
  novCEA = df.2
  novCEA.2s = df.1
  uniq.id = unique(novCEA$id)
  n.id = length(uniq.id)
  boot.est = rep(NA,times=iterations)
  
  # sample individuals
  for(i in 1:iterations){
    # cat("\r iteration",i,"   of  ",iterations)
    sample.ids = sample(x=uniq.id,size =n.id,replace = T)
    rows1 = 0
    rows2 = 0
    for(j in 1:length(sample.ids)){
      
      rows1 = c(rows1,which(novCEA$id == sample.ids[j]))
      rows2 = c(rows2,which(novCEA.2s$id == sample.ids[j]))
    }
    rows1 = rows1[-1]
    rows2 = rows2[-1]
    novCEA.b = novCEA[rows1,]
    novCEA.2s.b = novCEA.2s[rows2,]
    
    
    # fit survival object
    surv.2s = Surv(time = novCEA.2s.b$'t_0',
                   time2 = novCEA.2s.b$`t_`,
                   event = novCEA.2s.b$`d_`)
    
    # fit weibull aft model
    weib.model = aftreg(surv.2s ~ ontrtxo + prognosisb + progtime + CEAb + CEAprog + HRQLb+ HRQLprog, 
                        data = novCEA.2s.b, 
                        dist="weibull", 
                        param = "lifeExp")
    
    theta = exp(weib.model$coefficients[c("ontrtxo")] ) 
    
    
    #######################
    uniq.ids.b = unique(novCEA.b$id)
    for(subj in uniq.ids.b){ # loop to retrieve max values for each subject
      temp = novCEA.b[novCEA.b$id == subj,]
      temp.max = temp[1,]
      temp.max[1,] = as.numeric(apply(temp,2,function(x){max(x,na.rm=T)}))
      novCEA.b = novCEA.b[novCEA.b$id != subj,]
      novCEA.b = rbind(novCEA.b,temp.max)
    }
    
    
    novCEA.b$xotime[is.na(novCEA.b$xotime)] = 0
    novCEA.b$tpxo = novCEA.b$deathtime - novCEA.b$xotime
    novCEA.b$txot = novCEA.b$xotime
    novCEA.b$timecf = with(novCEA.b,ifelse( trtrand ==0 & xotime>0,
                                            (xotime + (tpxo* (1/theta))),
                                            deathtime))
    
    novCEA.b$trecens = novCEA.b$timecf
    novCEA.b$trecens[novCEA.b$trtrand==0] = novCEA.b$admin[novCEA.b$trtrand==0] * (1/theta)
    novCEA.b$deathind[novCEA.b$trtrand==0 & novCEA.b$timecf > novCEA.b$trecens] = 0
    novCEA.b$timecf[novCEA.b$timecf > novCEA.b$trecens & novCEA.b$trtrand == 0] = novCEA.b$trecens[novCEA.b$timecf > novCEA.b$trecens & novCEA.b$trtrand == 0]
    
    Surv.recen = Surv(novCEA.b$timecf,novCEA.b$deathind)
    cox.recen = coxph(Surv.recen ~novCEA.b$trtrand)
    est[i] =  exp(cox.recen$coefficients)
    
  }
  return(est)
}


bs.est = boot_2stage(iterations = 20) # More iterations: more accurate but slow!
data.frame(est=mean(bs.est),
           lower95CI=quantile(bs.est,probs = c(0.05)),
           upper95CI=quantile(bs.est,probs = c(.95)))
##          est lower95CI upper95CI
## 5% 0.4823828 0.4076459 0.5748167
(h) If you have time, repeat steps (b)-(f) using alternative AFT models.



(i) What do you notice? How might you try to justify use of the two-stage method?
switcher = unique(novCEA$id[novCEA$xoind==1])
switch.df = novCEA[novCEA$id %in% switcher,]

switch.df$xotime = (switch.df$xotime - switch.df$progtime) +1
survfit.prog = survfit(Surv(xotime,xoind) ~ trtrand, data = switch.df)
ggsurvplot(survfit.prog,data= switch.df, risk.table = T, xlim =c(0,50),ylim=c(-.1,1))