UMIT course, PRACTICAL 4: IPCW

by Ian White and Nicholas Latimer
adaptation for R by Paul Schneider
Thanks to Jingyi Xuan for resolving discrepancies with Stata. Updated by IW, 07feb2024.

R users may also want to check out the ipcwswitch package



# Function to install and load all the packages used in the course
install_n_load <- function(packages){
  for(package in packages){
    if(!require(package, character.only = TRUE)){
      install.packages(package)
      library(package, character.only = TRUE)
    }
  }
}
  required_packages = c("haven",
                        "psych",
                        "survival",
                        "flexsurv",
                        "ggplot2",
                        "cowplot",
                        "survminer",
                        "geepack",
                        "Hmisc",
                        "eha",
                        "rpsftm",
                        "rms")

  options(repos = "https://cran.ma.imperial.ac.uk/") ## without this, Knit fails 

Q1. IPCW step 1: Censor observations and re-format the data

Reformat the data
  # # clear work space from previous results
  rm(list = setdiff(ls(),c("install_n_load","required_packages")))
  # # load required packages
  install_n_load(required_packages)
  
  # # read data
  novCEA.baseline = as.data.frame(read_dta("NovCEAevents.dta"))
  novCEA.tdcs = as.data.frame(read_dta("NovCEAtdcs.dta"))
  novCEA = merge(novCEA.baseline,novCEA.tdcs,by="id")

  # # get rid of 43 obs apparently beyond the end of follow-up
  # # (none of these have events)
  novCEA = novCEA[novCEA$time < novCEA$admin,] 
  # # get rid of 7 obs on the day of death
  # # (these mess up later code)
  novCEA = novCEA[novCEA$time != novCEA$deathtime,]

  # # create dummies for being the first and last observation per person
  novCEA$lastobs = novCEA$firstobs = 0

  for(subj in unique(novCEA$id)){
    temp.subset = novCEA[novCEA$id == subj,]
    temp.subset$firstobs[which(temp.subset$time == min(temp.subset$time))] = 1
    temp.subset$lastobs[which(temp.subset$time == max(temp.subset$time))] = 1
    novCEA[novCEA$id == subj,] = temp.subset
  }

  # # create time-dependent outcome for death in this interval
  novCEA$deathtdo = ifelse(novCEA$lastobs & novCEA$deathind,1,0)

  # # create time-dependent variable for switching in this interval
  switchInInterval = with(novCEA,(xotime>=time) & (xotime < time +21 ) & (xoind))
  novCEA$xotdo = ifelse(switchInInterval,1,0)
  NAvalue = with(novCEA, (xotime<time) & (xoind))
  novCEA$xotdo = ifelse(NAvalue,NA,novCEA$xotdo)

  # # create time-dependent covariate for progression status
  # # 4 failures occur between time 0 and 21 
  # # & are implicitly treated as occurring at time 0
  # # all other failures occur at multiples of 21 days
  failureAtStart = with(novCEA, (progtime<time+21) & (progind==1))
  novCEA$progtdc = ifelse(failureAtStart,1,0)

  # # create time-dependent variables for recent progression and for progression in this interval
  novCEA$recentprogtdc = with(novCEA,
                           (progtime<time+21) & (progtime>=time-42) & (progind==1)) 
  novCEA$nowprogtdc = with(novCEA,
                           (progtime<time+21) & (progtime>=time) & (progind==1)) 
  
  # # save dataframe
  write.csv(novCEA,"NovCEA_IPCWready.csv",row.names = F)



Check that your dataset is correct by examining the data and by tabulating your new variables by arm.
(a) The data set contains one record per participant per time interval
# View(novCEA)
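# # Optional extra check (a sketch, not part of the original practical):
# # each id should contribute at most one row per time point
# anyDuplicated(novCEA[, c("id", "time")])  # expect 0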
(b) Time-dependent outcomes have names ending in tdo:
  with(novCEA, table(deathtdo,trtrand))
##         trtrand
## deathtdo    0    1
##        0 1132 2658
##        1  103  154
  with(novCEA, table(xotdo,trtrand))
##      trtrand
## xotdo    0    1
##     0  798 2812
##     1   49    0
(c) Time-dependent covariates have names ending in tdc:
  with(novCEA, table(progtdc,trtrand))
##        trtrand
## progtdc    0    1
##       0  557 1909
##       1  678  903
  with(novCEA, table(recentprogtdc,trtrand))
##              trtrand
## recentprogtdc    0    1
##         FALSE  949 2364
##         TRUE   286  448
  with(novCEA, table(nowprogtdc,trtrand))
##           trtrand
## nowprogtdc    0    1
##      FALSE 1130 2649
##      TRUE   105  163



Let’s see how many observations we have of each sort.

  with(novCEA, table(deathtdo, xotdo, trtrand, useNA="ifany"))
## , , trtrand = 0
## 
##         xotdo
## deathtdo    0    1 <NA>
##        0  742   48  342
##        1   56    1   46
## 
## , , trtrand = 1
## 
##         xotdo
## deathtdo    0    1 <NA>
##        0 2658    0    0
##        1  154    0    0



(d) Repeat the ITT and PP (censored) analyses in this restructured dataset.
  # # check we get exactly the same ITT results as in the previous practical using the reformatted data
  novCEA.itt =  novCEA
  novCEA.itt$tin = novCEA$time
  novCEA.itt$tout = pmin((novCEA.itt$time + 21), novCEA.itt$deathtime)
  
  novSurv.itt = Surv(time=novCEA.itt$tin, # now with interval start time (time1)
                     time2=novCEA.itt$tout, # time2
                     event = novCEA.itt$deathtdo)
  
  coxitt = coxph(novSurv.itt ~ trtrand, data = novCEA.itt, ties=("breslow"))
  coxitt
## Call:
## coxph(formula = novSurv.itt ~ trtrand, data = novCEA.itt, ties = ("breslow"))
## 
##            coef exp(coef) se(coef)      z        p
## trtrand -0.5221    0.5933   0.1286 -4.059 4.92e-05
## 
## Likelihood ratio test=15.8  on 1 df, p=7.044e-05
## n= 4047, number of events= 257
  exp(confint(coxitt))
##             2.5 %    97.5 %
## trtrand 0.4610846 0.7633627
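  # # Optional cross-check (a sketch, assuming deathtime/deathind hold each patient's
  # # overall survival time and event indicator): collapsing back to one row per
  # # patient should reproduce the ITT hazard ratio above
  # coxph(Surv(deathtime, deathind) ~ trtrand,
  #       data = novCEA.itt[novCEA.itt$lastobs == 1,], ties = "breslow")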
  # # check we get exactly the same PP results as in the previous practical using the reformatted data
  novCEA.pp =  novCEA[novCEA$xotdo==0,]
  novCEA.pp$tin = novCEA.pp$time
  novCEA.pp$tout = pmin((novCEA.pp$time + 21), novCEA.pp$deathtime)
  
  novSurv.pp = Surv(time=novCEA.pp$tin, # now with interval start time (time1)
                    time2=novCEA.pp$tout, # time2
                    event = novCEA.pp$deathtdo)
  
  coxpp = coxph(novSurv.pp ~ trtrand, data = novCEA.pp,ties=("breslow"))
  coxpp
## Call:
## coxph(formula = novSurv.pp ~ trtrand, data = novCEA.pp, ties = ("breslow"))
## 
##            coef exp(coef) se(coef)      z       p
## trtrand -0.6026    0.5474   0.1659 -3.633 0.00028
## 
## Likelihood ratio test=12.2  on 1 df, p=0.0004789
## n= 3610, number of events= 210 
##    (388 observations deleted due to missingness)
  exp(confint(coxpp))
##             2.5 %    97.5 %
## trtrand 0.3954296 0.7576677
        

Q2. IPCW step 2: Model the probability of being censored over time

(a) Use logistic regression to predict switching given baseline covariates (model S1)

(We will use linear and quadratic terms in time; a better approach using splines is given later.)

  fitBLswitch_lin = glm(xotdo ~ time, family =  binomial(link = 'logit'), data = novCEA[novCEA$trtrand == 0,])
  summary(fitBLswitch_lin)
## 
## Call:
## glm(formula = xotdo ~ time, family = binomial(link = "logit"), 
##     data = novCEA[novCEA$trtrand == 0, ])
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.142647   0.217139 -14.473   <2e-16 ***
## time         0.003117   0.001233   2.528   0.0115 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 374.40  on 846  degrees of freedom
## Residual deviance: 368.79  on 845  degrees of freedom
##   (388 observations deleted due to missingness)
## AIC: 372.79
## 
## Number of Fisher Scoring iterations: 5
  # # odds ratios
  data.frame(OR =exp(fitBLswitch_lin$coefficients),
             CI95_lower = exp(confint(fitBLswitch_lin))[,1], 
             CI95_upper = exp(confint(fitBLswitch_lin))[,2])
##                     OR CI95_lower CI95_upper
## (Intercept) 0.04316837 0.02771498 0.06505688
## time        1.00312226 1.00056958 1.00545542
  logLik(fitBLswitch_lin)
## 'log Lik.' -184.3937 (df=2)
  AIC(fitBLswitch_lin)
## [1] 372.7874
  BIC(fitBLswitch_lin)
## [1] 382.2708
  # plot(fitBLswitch_lin)

  # the linear model fits poorly, so add a quadratic term in time
  novCEA$time2 <- novCEA$time^2
  fitBLswitch = glm(xotdo ~ time + time2, family =  binomial(link = 'logit'), data = novCEA[novCEA$trtrand == 0,])
  summary(fitBLswitch)
## 
## Call:
## glm(formula = xotdo ~ time + time2, family = binomial(link = "logit"), 
##     data = novCEA[novCEA$trtrand == 0, ])
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.998e+00  7.329e-01  -8.183 2.77e-16 ***
## time         5.193e-02  1.084e-02   4.791 1.66e-06 ***
## time2       -1.540e-04  3.777e-05  -4.076 4.58e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 374.40  on 846  degrees of freedom
## Residual deviance: 323.58  on 844  degrees of freedom
##   (388 observations deleted due to missingness)
## AIC: 329.58
## 
## Number of Fisher Scoring iterations: 8
  # # odds ratios
  data.frame(OR =exp(fitBLswitch$coefficients),
             CI95_lower = exp(confint(fitBLswitch))[,1], 
             CI95_upper = exp(confint(fitBLswitch))[,2])
##                      OR   CI95_lower  CI95_upper
## (Intercept) 0.002484837 0.0004912639 0.008887927
## time        1.053306287 1.0334692053 1.078624063
## time2       0.999846055 0.9997623390 0.999911077
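  # # The quadratic term can also be specified without creating time2 by hand;
  # # a sketch using raw polynomial terms, which fits the same model:
  # fitBLswitch_poly = glm(xotdo ~ poly(time, 2, raw = TRUE),
  #                        family = binomial(link = 'logit'),
  #                        data = novCEA[novCEA$trtrand == 0,])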



(b) Assess the fit of the model, for comparison with alternatives that may be run later
  logLik(fitBLswitch)
## 'log Lik.' -161.7875 (df=3)
  AIC(fitBLswitch)
## [1] 329.575
  BIC(fitBLswitch)
## [1] 343.8001
  plot(fitBLswitch)

  # here's a way to do the Hosmer-Lemeshow test
  install.packages("glmtoolbox")
  library(glmtoolbox)
## Warning: package 'glmtoolbox' was built under R version 4.3.2
## 
## Attaching package: 'glmtoolbox'
## The following object is masked from 'package:geepack':
## 
##     QIC
  hltest(fitBLswitch_lin)
## 
##    The Hosmer-Lemeshow goodness-of-fit test
## 
##  Group Size Observed Expected
##      1  108        0 4.469254
##      2  106        0 4.670186
##      3  104        1 4.877533
##      4   97        1 4.841657
##      5   92       14 4.886268
##      6   73        8 4.124637
##      7  108        9 6.680540
##      8   79       12 5.635539
##      9   80        4 8.814386
## 
##          Statistic =  49.35495 
## degrees of freedom =  7 
##            p-value =  1.9331e-08
  hltest(fitBLswitch)
## 
##    The Hosmer-Lemeshow goodness-of-fit test
## 
##  Group Size Observed   Expected
##      1  135        0  0.2711346
##      2  110        0  0.7380215
##      3  111        1  1.7667569
##      4  104        1  3.4516483
##      5  101       15  6.0004932
##      6   84        9  7.7132139
##      7   92        7 11.5471781
##      8  110       16 17.5115534
## 
##          Statistic =  19.94313 
## degrees of freedom =  6 
##            p-value =  0.0028347
  # quadratic model is better but could still be improved 
  # we'll use it  
        
(c) Estimate the probability of switching for each patient-observation included in the regression.
  novCEA$pxo1 = predict(fitBLswitch,newdata =novCEA,  type="response")
  summary(novCEA$pxo1)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.006862 0.034332 0.060453 0.130327 0.165495



(d) Use logistic regression to predict switching given baseline and time-updated covariates (model S12)
fitBLTDswitch = glm(xotdo ~ prognosisb + CEAb + HRQLb + nowprogtdc + CEAtdc + HRQLtdc + time + time2, 
                    family =  binomial(link = 'logit'), 
                    data = novCEA[novCEA$trtrand == 0 & novCEA$recentprogtdc == 1,])
  
  summary(fitBLTDswitch)
## 
## Call:
## glm(formula = xotdo ~ prognosisb + CEAb + HRQLb + nowprogtdc + 
##     CEAtdc + HRQLtdc + time + time2, family = binomial(link = "logit"), 
##     data = novCEA[novCEA$trtrand == 0 & novCEA$recentprogtdc == 
##         1, ])
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -6.7847760  5.5036508  -1.233  0.21766   
## prognosisb     -2.7384749  2.8305719  -0.967  0.33331   
## CEAb            0.1388221  0.3785942   0.367  0.71386   
## HRQLb           0.8469678  2.2817367   0.371  0.71049   
## nowprogtdcTRUE  1.6508039  0.5373881   3.072  0.00213 **
## CEAtdc         -0.0900447  0.2717116  -0.331  0.74034   
## HRQLtdc        -0.1055115  2.6130755  -0.040  0.96779   
## time            0.0620480  0.0303804   2.042  0.04112 * 
## time2          -0.0001136  0.0001101  -1.032  0.30210   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 223.83  on 201  degrees of freedom
## Residual deviance: 108.06  on 193  degrees of freedom
##   (84 observations deleted due to missingness)
## AIC: 126.06
## 
## Number of Fisher Scoring iterations: 6
  data.frame(OR =exp(fitBLTDswitch$coefficients),
             CI95_lower = exp(confint(fitBLTDswitch))[,1], 
             CI95_upper = exp(confint(fitBLTDswitch))[,2])
##                         OR   CI95_lower CI95_upper
## (Intercept)    0.001130861 1.554144e-08  45.461215
## prognosisb     0.064668895 2.110449e-04  13.199732
## CEAb           1.148919701 5.490330e-01   2.451344
## HRQLb          2.332563388 2.561079e-02 217.361999
## nowprogtdcTRUE 5.211167519 1.872867e+00  15.700423
## CEAtdc         0.913890305 5.397361e-01   1.561309
## HRQLtdc        0.899864111 4.908600e-03 149.945607
## time           1.064013403 1.006283e+00   1.131241
## time2          0.999886440 9.996772e-01   1.000093



(e) Assess model fit
  logLik(fitBLTDswitch)
## 'log Lik.' -54.03214 (df=9)
  AIC(fitBLTDswitch)
## [1] 126.0643
  BIC(fitBLTDswitch)
## [1] 155.8387
  hltest(fitBLTDswitch)
## 
##    The Hosmer-Lemeshow goodness-of-fit test
## 
##  Group Size Observed    Expected
##      1   20        0  0.07463176
##      2   20        1  0.15648071
##      3   20        0  0.27618149
##      4   20        0  0.37600229
##      5   20        1  0.64665168
##      6   20        2  1.55539425
##      7   20        0  3.92791273
##      8   20        9  8.18273190
##      9   20       19 13.53247547
##     10   20       15 18.31513584
##     11    2        2  1.95640188
## 
##          Statistic =  24.68309 
## degrees of freedom =  9 
##            p-value =  0.0033425



(f) Estimate the probability of switching for each patient-observation included in the regression.
  novCEA$pxo2 =  predict(fitBLTDswitch, newdata = novCEA,type="response")

  # # set prob of switching to zero where recentprogtdc==0
  novCEA$pxo2[novCEA$trtrand==0 & novCEA$recentprogtdc ==0] = 0  

  
# # explore fitted probabilities
ggplot(novCEA) +
  geom_point(aes(x= time, y= pxo1, col = "given baseline covariates")) +
  geom_point(aes(x= time, y= pxo2,col="given baseline & time-updated covariates")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ylab("Probability of switch") +
  xlab("Time") +
  guides(colour = guide_legend(title = ""))



Q3. IPCW step 3: For each individual at each time, compute the inverse probability of remaining uncensored

(a) Estimate the probabilities of remaining ‘un-switched’ and hence the weights
  novCEA$denom = novCEA$num = NA
  for(subj in unique(novCEA$id)){
      temp.subset = novCEA[novCEA$id == subj,]
      temp.subset = temp.subset[order(temp.subset$time),]
      temp.subset$num[1] = 1 - temp.subset$pxo1[1]
      temp.subset$denom[1] = 1 - temp.subset$pxo2[1]
      len = length(temp.subset$num)
      if(len>1){
        for(i in 2:len){
          temp.subset$num[i] = temp.subset$num[i-1] * (1-temp.subset$pxo1[i])
          temp.subset$denom[i] = temp.subset$denom[i-1] * (1-temp.subset$pxo2[i])
        }
      }
      novCEA[novCEA$id == subj,] = temp.subset
  }
  
  isControl = novCEA$trtrand==0
  novCEA$weight[isControl]  =  1 / novCEA$denom[isControl]
  novCEA$sweight[isControl] =  novCEA$num[isControl] / novCEA$denom[isControl]
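  # # The loop above forms, for each patient, cumulative products over intervals up to t:
  # # num = prod(1 - pxo1) and denom = prod(1 - pxo2), so weight = 1/denom and
  # # sweight = num/denom. A vectorised equivalent (a sketch, assuming rows are
  # # sorted by time within id):
  # novCEA = novCEA[order(novCEA$id, novCEA$time),]
  # novCEA$num   = ave(1 - novCEA$pxo1, novCEA$id, FUN = cumprod)
  # novCEA$denom = ave(1 - novCEA$pxo2, novCEA$id, FUN = cumprod)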



(b) Summarise the weights, and inspect the data to check that you have computed them correctly.
  # # Unstablised Weights: 
  summary(novCEA$weight[novCEA$xotdo==0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   1.000   1.135   1.021  11.525    3200
  # # Stabilised Weights:
  summary(novCEA$sweight[novCEA$xotdo==0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.283   0.694   0.941   0.885   0.991   6.753    3200
  plot_grid(
    ggplot(novCEA[novCEA$xotdo==0,]) +
      geom_histogram(aes(weight)) +
      theme_minimal(),
    ggplot(novCEA[novCEA$xotdo==0,]) +
      geom_histogram(aes(sweight)) +
      theme_minimal()
  )



(c) Set the weights to 1 in the treatment arm.
  novCEA$weight[!isControl] = 1
  novCEA$sweight[!isControl] = 1



Q4. IPCW step 4: Use these weights in a weighted analysis of the outcome model

Estimate the KM graph and IPCW hazard ratio using Cox regression.



data preparation
##### data preparation 
novCEA.KM =  novCEA[novCEA$xotdo==0,]
  novCEA.KM$tin = novCEA.KM$time
  novCEA.KM$tout = pmin((novCEA.KM$time + 21), novCEA.KM$deathtime)
  
  novSurv.KM = Surv(time=novCEA.KM$tin, # now with interval start time (time1)
                    time2=novCEA.KM$tout, # time2
                    event = novCEA.KM$deathtdo)
No weights
# # NOWEIGHTS (for comparison)
  novSurv.KM.fit.NOweights = survfit(novSurv.KM ~ trtrand, data = novCEA.KM)
  
  ggsurvplot(novSurv.KM.fit.NOweights,risk.table = T) +
    ggtitle("UNWEIGHTED Results")



Unstabilised weights
# # unstabilised weights 
  novSurv.KM.fit.weighted = survfit(novSurv.KM ~ trtrand, data = novCEA.KM, weights = weight)
  # # plot
  ggsurvplot(novSurv.KM.fit.weighted,risk.table = T) +
    ggtitle("Results with unstabilised weights")

    # cox regression 
  cox.wu = coxph(novSurv.KM ~ trtrand, data = novCEA.KM, weights = weight, ties = ("breslow"), cluster=id)
  cox.wu
## Call:
## coxph(formula = novSurv.KM ~ trtrand, data = novCEA.KM, weights = weight, 
##     ties = ("breslow"), cluster = id)
## 
##            coef exp(coef) se(coef) robust se      z        p
## trtrand -0.6995    0.4968   0.1519    0.1731 -4.042 5.31e-05
## 
## Likelihood ratio test=19.78  on 1 df, p=8.706e-06
## n= 3610, number of events= 210 
##    (388 observations deleted due to missingness)
  exp(confint(cox.wu))
##             2.5 %    97.5 %
## trtrand 0.3539187 0.6974928
# I'm told that the correct way to get sandwich variances is not the above but
#   install.packages(c("sandwich","lmtest"))
#   library(sandwich)
#   library(lmtest)
#   coeftest(cox.wu, vcov = sandwich, cluster = ~id)
# This agrees somewhat better with the Stata results
# Thanks to Amy Chang for this



Stabilised weights
  # # STABILISED WEIGHTS 
  novSurv.KM.fit.weighted2 = survfit(novSurv.KM ~ trtrand, data = novCEA.KM, weights = sweight)
  # # plot
  ggsurvplot(novSurv.KM.fit.weighted2,risk.table = T) +
    ggtitle("Results with stabilised weights")

  # cox regression 
  cox.ws = coxph(novSurv.KM ~ trtrand, data = novCEA.KM, weights = sweight,ties = ("breslow"), cluster=id)
  cox.ws
## Call:
## coxph(formula = novSurv.KM ~ trtrand, data = novCEA.KM, weights = sweight, 
##     ties = ("breslow"), cluster = id)
## 
##            coef exp(coef) se(coef) robust se      z        p
## trtrand -0.7041    0.4945   0.1843    0.1794 -3.926 8.65e-05
## 
## Likelihood ratio test=13.55  on 1 df, p=0.0002329
## n= 3610, number of events= 210 
##    (388 observations deleted due to missingness)
  exp(confint(cox.ws))
##             2.5 %   97.5 %
## trtrand 0.3479575 0.702884
  # Cox models could optionally adjust for covariates 
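  # # A sketch of such an adjusted analysis, using baseline covariates already in the
  # # dataset (prognosisb, CEAb, HRQLb); output not shown
  # cox.ws.adj = coxph(novSurv.KM ~ trtrand + prognosisb + CEAb + HRQLb,
  #                    data = novCEA.KM, weights = sweight, ties = "breslow", cluster = id)
  # cox.ws.adj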

Note that the cluster-robust variance estimator (and hence the CI) differs slightly from those in the Stata analysis.

IPCW+SPLINES: Doing it all again but better - with splines

# remove previously computed vars
  novCEA$pxo2 = novCEA$pxo1 = novCEA$weight = novCEA$sweight =  NULL 



IPCW step 2 with splines: Model the probability of being censored over time

2(a) Use logistic regression to predict switching given baseline covariates (model S1)

First create splines for a time-dependent intercept. Generate 5 knots based upon the event time distribution (you can try other numbers of knots).

  knots.R = rcspline.eval(novCEA$time[novCEA$xotdo==1], knots.only=T, pc=T, nk=6)
  knots.R
## [1]  84 105 147 168 210
  # Stata creates slightly different knots; we will use those
  knots.Stata = c(42,84,126,168,273)

  fitBLswitch.sp = glm(xotdo ~ rcs(time,knots.Stata), family = binomial(link="logit"), data = novCEA[novCEA$trtrand==0,])
  summary(fitBLswitch.sp)
## 
## Call:
## glm(formula = xotdo ~ rcs(time, knots.Stata), family = binomial(link = "logit"), 
##     data = novCEA[novCEA$trtrand == 0, ])
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -11.06698    2.59159  -4.270 1.95e-05 ***
## rcs(time, knots.Stata)time      0.12374    0.03651   3.390  0.00070 ***
## rcs(time, knots.Stata)time'    -0.91563    0.28757  -3.184  0.00145 ** 
## rcs(time, knots.Stata)time''    2.31678    0.74610   3.105  0.00190 ** 
## rcs(time, knots.Stata)time'''  -2.08256    0.71362  -2.918  0.00352 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 374.40  on 846  degrees of freedom
## Residual deviance: 310.93  on 842  degrees of freedom
##   (388 observations deleted due to missingness)
## AIC: 320.93
## 
## Number of Fisher Scoring iterations: 9
  # # odds ratios
  data.frame(OR = exp(fitBLswitch.sp$coefficients),
             CI95_lower = exp(confint(fitBLswitch.sp))[,1], 
             CI95_upper = exp(confint(fitBLswitch.sp))[,2])
##                                         OR   CI95_lower   CI95_upper
## (Intercept)                   1.561970e-05 3.220756e-08  0.000903275
## rcs(time, knots.Stata)time    1.131721e+00 1.067002e+00  1.232348027
## rcs(time, knots.Stata)time'   4.002642e-01 2.119126e-01  0.658349286
## rcs(time, knots.Stata)time''  1.014291e+01 2.694118e+00 50.877125515
## rcs(time, knots.Stata)time''' 1.246110e-01 2.781889e-02  0.460305065
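  # # To see what the spline implies (a sketch; output not shown), plot the fitted
  # # switching probability over a grid of visit times
  # plotdat = data.frame(time = seq(0, max(novCEA$time), by = 21))
  # plotdat$pfit = predict(fitBLswitch.sp, newdata = plotdat, type = "response")
  # ggplot(plotdat, aes(time, pfit)) + geom_line() + theme_minimal() +
  #   xlab("Time") + ylab("Fitted probability of switching")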



2(b) Assess the fit of the model, for comparison with alternatives that may be run later
  logLik(fitBLswitch.sp)
## 'log Lik.' -155.4667 (df=5)
  AIC(fitBLswitch.sp)
## [1] 320.9334
  BIC(fitBLswitch.sp)
## [1] 344.6419
  plot(fitBLswitch.sp)
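  # # The Hosmer-Lemeshow check used for the polynomial models could be repeated here
  # # (a sketch; output not shown)
  # hltest(fitBLswitch.sp)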



2(c) Estimate the probability of switching for each patient-observation included in the regression.
  novCEA$pxo1 = predict(fitBLswitch.sp, newdata = novCEA, type="response")



2(d) Use logistic regression to predict switching given baseline and time-updated covariates (model S12)
  fitBLTDswitch.sp = glm(xotdo ~ prognosisb + CEAb + HRQLb + nowprogtdc + CEAtdc + HRQLtdc + rcs(time, knots.Stata), family =  binomial(link = 'logit'), data = novCEA[novCEA$trtrand == 0 & novCEA$recentprogtdc > 0,])
  summary(fitBLTDswitch.sp)
## 
## Call:
## glm(formula = xotdo ~ prognosisb + CEAb + HRQLb + nowprogtdc + 
##     CEAtdc + HRQLtdc + rcs(time, knots.Stata), family = binomial(link = "logit"), 
##     data = novCEA[novCEA$trtrand == 0 & novCEA$recentprogtdc > 
##         0, ])
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                   -12.7357     6.7545  -1.886  0.05936 . 
## prognosisb                     -1.9575     2.9281  -0.669  0.50381   
## CEAb                            0.2772     0.3994   0.694  0.48767   
## HRQLb                           1.0984     2.3283   0.472  0.63710   
## nowprogtdcTRUE                  1.7569     0.5410   3.247  0.00117 **
## CEAtdc                         -0.1756     0.2811  -0.625  0.53219   
## HRQLtdc                        -0.2904     2.6503  -0.110  0.91275   
## rcs(time, knots.Stata)time      0.1330     0.0574   2.317  0.02053 * 
## rcs(time, knots.Stata)time'    -0.8992     0.4701  -1.913  0.05579 . 
## rcs(time, knots.Stata)time''    2.5201     1.3051   1.931  0.05348 . 
## rcs(time, knots.Stata)time'''  -2.7147     1.4359  -1.891  0.05867 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 223.83  on 201  degrees of freedom
## Residual deviance: 103.82  on 191  degrees of freedom
##   (84 observations deleted due to missingness)
## AIC: 125.82
## 
## Number of Fisher Scoring iterations: 7
  # # odds ratios
  data.frame(OR =exp(fitBLTDswitch.sp$coefficients),
             CI95_lower = exp(confint(fitBLTDswitch.sp))[,1], 
             CI95_upper = exp(confint(fitBLTDswitch.sp))[,2])
##                                         OR   CI95_lower  CI95_upper
## (Intercept)                   2.944021e-06 1.963261e-12   0.8180748
## prognosisb                    1.412173e-01 4.039642e-04  42.8982655
## CEAb                          1.319398e+00 6.103808e-01   2.9712651
## HRQLb                         2.999312e+00 3.032446e-02 310.9014078
## nowprogtdcTRUE                5.794460e+00 2.070176e+00  17.6058457
## CEAtdc                        8.389534e-01 4.773519e-01   1.4520040
## HRQLtdc                       7.479650e-01 3.733854e-03 133.7213596
## rcs(time, knots.Stata)time    1.142216e+00 1.039646e+00   1.3011130
## rcs(time, knots.Stata)time'   4.069098e-01 1.456317e-01   0.9199604
## rcs(time, knots.Stata)time''  1.243016e+01 1.183884e+00 202.3698960
## rcs(time, knots.Stata)time''' 6.622355e-02 3.425509e-03   1.0351274



2(e) Assess model fit
  logLik(fitBLTDswitch.sp)
## 'log Lik.' -51.90815 (df=11)
  AIC(fitBLTDswitch.sp)
## [1] 125.8163
  BIC(fitBLTDswitch.sp)
## [1] 162.2073
  plot(fitBLTDswitch.sp)

2(f) Estimate the probability of switching for each patient-observation included in the regression.
  novCEA$pxo2 = predict(fitBLTDswitch.sp, newdata = novCEA, type="response")



# # set prob of switching to zero where recentprogtdc==0
  novCEA$pxo2[novCEA$trtrand==0 & novCEA$recentprogtdc == 0] = 0



IPCW step 3 with splines: For each individual at each time, compute the inverse probability of remaining uncensored

3(a) Estimate the probabilities of remaining ‘un-switched’ and hence the weights
  novCEA$denom = novCEA$num = NA
  for(subj in unique(novCEA$id)){
    temp.subset = novCEA[novCEA$id == subj,]
    temp.subset = temp.subset[order(temp.subset$time),]
    temp.subset$num[1] = 1 - temp.subset$pxo1[1]
    temp.subset$denom[1] = 1 - temp.subset$pxo2[1]
    len = length(temp.subset$num)
    if(len>1){
      for(i in 2:len){
        temp.subset$num[i] = temp.subset$num[i-1] * (1-temp.subset$pxo1[i])
        temp.subset$denom[i] = temp.subset$denom[i-1] * (1-temp.subset$pxo2[i])
      }
    }
    novCEA[novCEA$id == subj,] = temp.subset
  }
  
  isControl = novCEA$trtrand==0
  novCEA$weight[isControl]  = 1 / novCEA$denom[isControl]
  novCEA$sweight[isControl] = novCEA$num[isControl] / novCEA$denom[isControl]



3(b) Summarise the weights, and inspect the data to check that you have computed them correctly.
# # Unstablised Weights:
summary(novCEA$weight[novCEA$xotdo==0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   1.000   1.114   1.019   7.287    3200
# # Stabilised Weights: 
summary(novCEA$sweight[novCEA$xotdo==0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.274   0.702   0.966   0.871   1.000   4.486    3200
plot_grid(
  ggplot(novCEA[novCEA$xotdo==0,]) +
    geom_histogram(aes(weight)) +
    theme_minimal(),
  ggplot(novCEA[novCEA$xotdo==0,]) +
    geom_histogram(aes(sweight)) +
    theme_minimal()
)



3(c) Set the weights to 1 in the treatment arm.
novCEA$weight[!isControl] = 1
novCEA$sweight[!isControl] = 1



IPCW step 4 with splines

data preparation

novCEA.KM =  novCEA[novCEA$xotdo==0,]
  novCEA.KM$tin = novCEA.KM$time
  novCEA.KM$tout = pmin((novCEA.KM$time + 21), novCEA.KM$deathtime)
  
  novSurv.KM = Surv(time=novCEA.KM$tin, # now with interval start time (time1)
                    time2=novCEA.KM$tout, # time2
                    event = novCEA.KM$deathtdo)



No weights
   # # # NOWEIGHTS - as comparison (code not run)
  # novSurv.KM.fit.NOweights = survfit(novSurv.KM ~ trtrand, data = novCEA.KM)
  # 
  # ggsurvplot(novSurv.KM.fit.NOweights,risk.table = T) +
  #   ggtitle("UNWEIGHTED Results")



Unstabilised weights
  # # UNSTABILISED WEIGHTS 
  novSurv.KM.fit.weighted = survfit(novSurv.KM ~ trtrand, data = novCEA.KM, weights = weight)
  # # plot
  ggsurvplot(novSurv.KM.fit.weighted,risk.table = T) +
    ggtitle("Results with unstabilised weights")

  # cox regression 
  cox2.wu = coxph(novSurv.KM ~ trtrand, data = novCEA.KM, weights = weight, ties = ("breslow"), cluster=id)
  cox2.wu
## Call:
## coxph(formula = novSurv.KM ~ trtrand, data = novCEA.KM, weights = weight, 
##     ties = ("breslow"), cluster = id)
## 
##            coef exp(coef) se(coef) robust se      z        p
## trtrand -0.7146    0.4894   0.1511    0.1699 -4.207 2.59e-05
## 
## Likelihood ratio test=20.76  on 1 df, p=5.214e-06
## n= 3610, number of events= 210 
##    (388 observations deleted due to missingness)
  exp(confint(cox2.wu))
##             2.5 %    97.5 %
## trtrand 0.3508025 0.6827413



Stabilised weights
  # # STABILISED WEIGHTS 
  novSurv.KM.fit.weighted2 = survfit(novSurv.KM ~ trtrand, data = novCEA.KM, weights = sweight)
  # # plot
  ggsurvplot(novSurv.KM.fit.weighted2,risk.table = T) +
    ggtitle("Results with stabilised weights")

  # cox regression 
  cox2.ws = coxph(novSurv.KM ~ trtrand, data = novCEA.KM, weights = sweight, ties = ("breslow"), cluster=id)
  cox2.ws
## Call:
## coxph(formula = novSurv.KM ~ trtrand, data = novCEA.KM, weights = sweight, 
##     ties = ("breslow"), cluster = id)
## 
##            coef exp(coef) se(coef) robust se      z        p
## trtrand -0.7249    0.4844   0.1844    0.1764 -4.109 3.97e-05
## 
## Likelihood ratio test=14.24  on 1 df, p=0.0001609
## n= 3610, number of events= 210 
##    (388 observations deleted due to missingness)
  exp(confint(cox2.ws))
##             2.5 %    97.5 %
## trtrand 0.3427733 0.6844227
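
  # # To round off, collect the hazard ratios estimated in this practical
  # # (a sketch; values copied from the outputs above)
  # data.frame(analysis = c("ITT", "PP (censored)",
  #                         "IPCW, unstabilised weights", "IPCW, stabilised weights",
  #                         "IPCW+splines, unstabilised weights", "IPCW+splines, stabilised weights"),
  #            HR = c(0.593, 0.547, 0.497, 0.494, 0.489, 0.484))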