by Ian White and Nicholas Latimer
adaptation for R by Paul Schneider, updated by Ian 8/3/2024
# Function to install and load all the packages used in the course
install_n_load <- function(packages) {
for (package in packages) {
if (eval(parse(text = paste("require(", package, ")"))) == 0) {
install.packages(package)
}
eval(parse(text = paste("require(", package, ")")))
}
}
required_packages = c("haven",
"psych",
"survival",
"flexsurv",
"ggplot2",
"cowplot",
"survminer",
"geepack",
"Hmisc",
"eha",
"rpsftm",
"rms")
# # clear work space from previous results
rm(list = setdiff(ls(), c("install_n_load", "required_packages")))
# # Load required packages
install_n_load(required_packages)
options(scipen = 99999)
Note: part (b) retains relevant covariates, but think about it here.
# # read unadjusted data
novCEA = read.csv("novCEA_IPCWready.csv")
# # alternatively, use:
# novCEA = read.csv(file.choose())
# step I
novCEA = novCEA[-which(novCEA$trtrand ==1 |
novCEA$progind == 0 |
novCEA$time < novCEA$progtime
),]
novCEA$disconexp = NULL
# step II Create variables for 'treatment group' analysis
novCEA$ontrtxo = with(novCEA, ifelse(time>=xotime & !is.na(xotime),1,0 ))
# step III Create a variable representing time since disease progression
novCEA$deathtime = novCEA$deathtime - novCEA$progtime
novCEA$time = novCEA$time - novCEA$progtime
# step IV Ensure you have data for each patient characteristic measured at the time of progression
novCEA$HRQLprog = NA
uniq.subj = unique(novCEA$id)
for(subj in uniq.subj){
temp.df = novCEA[novCEA$id == subj,]
temp.df$HRQLprog = temp.df$HRQLtdc[order(temp.df$time)][1]
novCEA[novCEA$id == subj,] = temp.df
}
# step V Format dataset correctly for stage 1 of the analysis
# THIS STEP CORRECTED BY IAN 24FEB2022
# novCEA$deathindtd = novCEA$finalobs = 0
#
# uniq.subj = unique(novCEA$id)
# for(subj in uniq.subj){
# temp.df = novCEA[novCEA$id == subj,]
# novCEA = novCEA[-which(novCEA$id == subj),]
# temp.df = temp.df[order(temp.df$time),]
# last.obs.dup = temp.df[length(temp.df$time),]
# last.obs.dup$finalobs = 1
# last.obs.dup$time = last.obs.dup$deathtime
# temp.df = temp.df[temp.df$time != last.obs.dup$time,]
# if(last.obs.dup$deathind){last.obs.dup$deathindtd = 1}
# novCEA = rbind(novCEA,temp.df,last.obs.dup)
# }
#
#
# # Stata produces start and end times for survival data automatically
# # In R, we need to do this manually:
# novCEA$st = ifelse(novCEA$time==0,0,1)
# novCEA$d_ = novCEA$deathindtd # censored or event
# novCEA$t_0 = novCEA$t_ = NA # t_0 start time; t_ stop time of interval
#
# uniq.subj = unique(novCEA$id)
# for(s in 1:length(uniq.subj)){
#
# temp.df = novCEA[novCEA$id == uniq.subj[s],]
# temp.df = temp.df[order(temp.df$time),]
#
# for(o in 2:length(temp.df$time)){
# temp.df$t_0[o] = temp.df$time[o-1]
# temp.df$t_[o] = temp.df$time[o]
# }
# novCEA[novCEA$id == uniq.subj[s],] = temp.df
# }
novCEA$t_0 = novCEA$time
novCEA$t_ = pmin(novCEA$time+21, novCEA$deathtime)
novCEA$d_ = novCEA$deathind & novCEA$t_==novCEA$deathtime
# END OF CORRECTIONS 24FEB2022
novCEA.2s = novCEA
write.csv(novCEA.2s,file="NovCEA_stage1_R.csv",row.names = F)
# Or, simply use the NovCEA_2stage dataset,
# which is the dataset created by practical4reformat.do:
# novCEA.2s = as.data.frame(read_dta("NovCEA_2stage_stage1.dta"))
Retain theta and model fit statistics.
Note: theta = exp(weib.model$coefficients[“ontrtxo”]). Theta estimates can differ between R and Stata.
# it's easier to remove obs we don't need for the weibull model:
# novCEA.2s = novCEA.2s[novCEA.2s$st==1,]
# fit survival object
surv.2s = Surv(time = novCEA.2s$t_0,
time2 = novCEA.2s$t_,
event = novCEA.2s$d_)
# fit weibull aft model
weib.model = aftreg(surv.2s ~ ontrtxo + prognosisb + progtime + CEAb + CEAprog + HRQLb+ HRQLprog,
data = novCEA.2s,
dist="weibull",
param = "lifeExp")
# IDEALLY WOULD ADD id = id OPTION BUT IT REALLY MAKES COEFFICIENTS DISAGREE WITH STATA
# Stata and R provide completely different log Likelihood values (and AIC values respectively) for thw model:
# Loglik:
weib.model$loglik[2]
## [1] -555.1136
# AIC
2* length(weib.model$coefficients) - 2*(weib.model$loglik[2])
## [1] 1128.227
# The LR test statistic, however, is the same:
lrstat <- -2 * (weib.model$loglik[1] - weib.model$loglik[2])
round(lrstat,2)
## [1] 57.89
# Coefficient estimates differ between R and Stata differ at the 4th digit
# log(scale) = _cons (in Stata)
# log(shape) = /ln_p (in Stata)
weib.model
## Call:
## aftreg(formula = surv.2s ~ ontrtxo + prognosisb + progtime +
## CEAb + CEAprog + HRQLb + HRQLprog, data = novCEA.2s, dist = "weibull",
## param = "lifeExp")
##
## Covariate W.mean Coef Life-Expn se(Coef) Wald p
## ontrtxo 0.659 0.558 1.746 0.206 0.007
## prognosisb 0.334 -0.301 0.740 0.357 0.400
## progtime 118.451 0.004 1.004 0.002 0.053
## CEAb 19.917 -0.041 0.960 0.065 0.531
## CEAprog 27.148 0.028 1.028 0.033 0.400
## HRQLb 0.438 0.192 1.212 0.541 0.722
## HRQLprog 0.456 -0.748 0.473 0.562 0.184
##
## Baseline parameters:
## log(scale) 4.609 1.140 0.000
## log(shape) 0.635 0.077 0.000
## Baseline life expectancy: 89.1
##
## Events 101
## Total time at risk 13152
## Max. log. likelihood -555.11
## LR test statistic 57.9
## Degrees of freedom 7
## Overall p-value 0.000000000397085
# Coefficients + 95%CI
round(cbind(coef=weib.model$coefficients, confint(weib.model)),4)
## coef 2.5 % 97.5 %
## ontrtxo 0.5576 0.1540 0.9611
## prognosisb -0.3008 -1.0009 0.3992
## progtime 0.0043 -0.0001 0.0087
## CEAb -0.0405 -0.1674 0.0863
## CEAprog 0.0280 -0.0372 0.0931
## HRQLb 0.1923 -0.8675 1.2522
## HRQLprog -0.7477 -1.8501 0.3546
## log(scale) 4.6091 2.3743 6.8440
## log(shape) 0.6349 0.4830 0.7868
# retain theta
theta = exp(weib.model$coefficients[c("ontrtxo")] )
theta
## ontrtxo
## 1.746404
First estimate shrunken survival times in switchers.
Go back to the original dataset:
novCEA = read.csv("novCEA_IPCWready.csv")
uniq.id = unique(novCEA$id)
for(subj in uniq.id){ # loop to retrieve max values for each subject
temp = novCEA[novCEA$id == subj,]
temp.max = temp[1,]
temp.max[1,] = as.numeric(apply(temp,2,function(x){max(x,na.rm=T)}))
novCEA = novCEA[novCEA$id != subj,]
novCEA = rbind(novCEA,temp.max)
}
novCEA$xotime[is.na(novCEA$xotime)] = 0
novCEA$tpxo = novCEA$deathtime - novCEA$xotime
novCEA$txot = novCEA$xotime
novCEA$timecf = with(novCEA,ifelse( trtrand ==0 & xotime>0,
(xotime + (tpxo* (1/theta))),
deathtime))
# cox reg HR
surv.adj = Surv(novCEA$timecf,novCEA$deathind)
cox.adj = coxph(surv.adj ~ novCEA$trtrand)
cox.adj
## Call:
## coxph(formula = surv.adj ~ novCEA$trtrand)
##
## coef exp(coef) se(coef) z p
## novCEA$trtrand -0.7931 0.4525 0.1322 -5.997 0.00000000201
##
## Likelihood ratio test=33.87 on 1 df, p=0.000000005889
## n= 300, number of events= 257
cox.adj.alt = coxreg(surv.adj ~ as.factor(novCEA$trtrand),method="breslow")
cox.adj.alt # with time at risk
## Call:
## coxreg(formula = surv.adj ~ as.factor(novCEA$trtrand), method = "breslow")
##
## Covariate Mean Coef Rel.Risk S.E. Wald p
## as.factor(novCEA$trtrand)
## 0 0.269 0 1 (reference)
## 1 0.731 -0.793 0.453 0.132 0.000
##
## Events 257
## Total time at risk 78676
## Max. log. likelihood -1276.2
## LR test statistic 33.84
## Degrees of freedom 1
## Overall p-value 0.00000000597813
# Hazard Ratios
data.frame(OR =exp(cox.adj$coefficients),
CI95_lower = exp(confint(cox.adj))[,1],
CI95_upper = exp(confint(cox.adj))[,2])
## OR CI95_lower CI95_upper
## novCEA$trtrand 0.4524581 0.3491565 0.5863226
# Kaplan meier curve
survfit.adj = survfit(surv.adj ~ novCEA$trtrand)
survfit.adj
## Call: survfit(formula = surv.adj ~ novCEA$trtrand)
##
## n events median 0.95LCL 0.95UCL
## novCEA$trtrand=0 108 103 180 157 205
## novCEA$trtrand=1 192 154 280 258 316
ggsurvplot(survfit.adj, data = novCEA, risk.table=T)
novCEA$trecens = novCEA$timecf
novCEA$trecens[novCEA$trtrand==0] = novCEA$admin[novCEA$trtrand==0] * (1/theta)
novCEA$deathind[novCEA$trtrand==0 & novCEA$timecf > novCEA$trecens] = 0
novCEA$timecf[novCEA$timecf > novCEA$trecens & novCEA$trtrand == 0] = novCEA$trecens[novCEA$timecf > novCEA$trecens & novCEA$trtrand == 0]
Surv.recen = Surv(novCEA$timecf,novCEA$deathind)
cox.recen = coxph(Surv.recen ~novCEA$trtrand)
exp(cox.recen$coefficients)
## novCEA$trtrand
## 0.4506357
survfit.recen = survfit(Surv.recen ~novCEA$trtrand)
ggsurvplot(survfit.recen,novCEA,risk.table = T)
# Inefficient and very slow code to bootstrap 95%CI...
# CAVE: May take a while to compute!
boot_2stage = function(file1 = "NovCEA_stage1_R.csv",
file2 = "novCEA_IPCWready.csv",
iterations=1000){
df.1 = read.csv(file1)
df.2 = read.csv(file2)
est = rep(NA,times=iterations)
novCEA = df.2
novCEA.2s = df.1
uniq.id = unique(novCEA$id)
n.id = length(uniq.id)
boot.est = rep(NA,times=iterations)
# sample individuals
for(i in 1:iterations){
# cat("\r iteration",i," of ",iterations)
sample.ids = sample(x=uniq.id,size =n.id,replace = T)
rows1 = 0
rows2 = 0
for(j in 1:length(sample.ids)){
rows1 = c(rows1,which(novCEA$id == sample.ids[j]))
rows2 = c(rows2,which(novCEA.2s$id == sample.ids[j]))
}
rows1 = rows1[-1]
rows2 = rows2[-1]
novCEA.b = novCEA[rows1,]
novCEA.2s.b = novCEA.2s[rows2,]
# fit survival object
surv.2s = Surv(time = novCEA.2s.b$'t_0',
time2 = novCEA.2s.b$`t_`,
event = novCEA.2s.b$`d_`)
# fit weibull aft model
weib.model = aftreg(surv.2s ~ ontrtxo + prognosisb + progtime + CEAb + CEAprog + HRQLb+ HRQLprog,
data = novCEA.2s.b,
dist="weibull",
param = "lifeExp")
theta = exp(weib.model$coefficients[c("ontrtxo")] )
#######################
uniq.ids.b = unique(novCEA.b$id)
for(subj in uniq.ids.b){ # loop to retrieve max values for each subject
temp = novCEA.b[novCEA.b$id == subj,]
temp.max = temp[1,]
temp.max[1,] = as.numeric(apply(temp,2,function(x){max(x,na.rm=T)}))
novCEA.b = novCEA.b[novCEA.b$id != subj,]
novCEA.b = rbind(novCEA.b,temp.max)
}
novCEA.b$xotime[is.na(novCEA.b$xotime)] = 0
novCEA.b$tpxo = novCEA.b$deathtime - novCEA.b$xotime
novCEA.b$txot = novCEA.b$xotime
novCEA.b$timecf = with(novCEA.b,ifelse( trtrand ==0 & xotime>0,
(xotime + (tpxo* (1/theta))),
deathtime))
novCEA.b$trecens = novCEA.b$timecf
novCEA.b$trecens[novCEA.b$trtrand==0] = novCEA.b$admin[novCEA.b$trtrand==0] * (1/theta)
novCEA.b$deathind[novCEA.b$trtrand==0 & novCEA.b$timecf > novCEA.b$trecens] = 0
novCEA.b$timecf[novCEA.b$timecf > novCEA.b$trecens & novCEA.b$trtrand == 0] = novCEA.b$trecens[novCEA.b$timecf > novCEA.b$trecens & novCEA.b$trtrand == 0]
Surv.recen = Surv(novCEA.b$timecf,novCEA.b$deathind)
cox.recen = coxph(Surv.recen ~novCEA.b$trtrand)
est[i] = exp(cox.recen$coefficients)
}
return(est)
}
bs.est = boot_2stage(iterations = 20) # More iterations: more accurate but slow!
data.frame(est=mean(bs.est),
lower95CI=quantile(bs.est,probs = c(0.05)),
upper95CI=quantile(bs.est,probs = c(.95)))
## est lower95CI upper95CI
## 5% 0.4823828 0.4076459 0.5748167
switcher = unique(novCEA$id[novCEA$xoind==1])
switch.df = novCEA[novCEA$id %in% switcher,]
switch.df$xotime = (switch.df$xotime - switch.df$progtime) +1
survfit.prog = survfit(Surv(xotime,xoind) ~ trtrand, data = switch.df)
ggsurvplot(survfit.prog,data= switch.df, risk.table = T, xlim =c(0,50),ylim=c(-.1,1))