\(~\)

Generation of relative survival data

\(~\)

1. Setting the true excess hazard model

The true excess hazard function is described by the Generalized Weibull regression model:

\[\lambda_{E}(t|x) = \frac{k \rho t^{k-1} }{1 + \frac{(\rho t)^k }{\alpha}} \cdot \exp{(x^\top{}\beta)} \]

# Baseline hazard function 
  haz_GenWeibull <- function(t, rho, a, k, log.p = FALSE){
  log.h <- log(k) + log(rho) + (k-1)*log(t) - log(1 + ((rho*t)^k)/a)
  ifelse(log.p, return(log.h), return(exp(log.h)))
  }  

Parameters are chosen to mimic the scenarios of cancer population studies.

# ================================================ #
#  Colorectal cancer behavior (Nathalie's thesis)  #
# ================================================ #
  # Baseline hazard parameters
    r <- 0.5
    a <- 0.2
    k <- 2
       
  # Regression coefficients
    betaAge  <- log(1.03)
    betaSexe <- log(0.8)
    beta_X   <- log(2)
    
  # Plotting the baseline hazard  
    haz_values <- Vectorize(function(t) haz_GenWeibull(t,r,a,k ))
    curve(haz_values, 0, 10, ylim = c(0,1), col = "red", lwd = 2, n = 1000, xlab = "Follow-up time",
          ylab = "Hazard function", cex.axis = 1, cex.lab = 1)

2. Generation of RS survival times

\(~\)

\(\bullet\) Code and parameters values: Giorgi.

\(\bullet\) Simulation of survival times from general population/ outputs (cause-specific, hypothetical world): Nathalie.

\(\bullet\) Life table: \(\texttt{survexp.us}\).

\(\bullet\) Initially, I generated 1 replicate of the data set and a fixed proporton of censored observations.

\(~\)

library("survival")

simuRSdata <- function(NbInd, sim.covs = list(age.prop = c(0.25,0.35,0.4), sex.prop = 0.5), cens.control = list(censor.dist = "unif", par.unif = c(0,10), par.exp = 2,CensAdmin=5)){
  
  # ================================================== #
  #  CREATING THE DATA FRAME FOR THE SIMULATED DATA    #
  # ================================================== #
    MyData           <- array(0, dim=c(NbInd, 14))
    dimnames(MyData) <- list(seq(1,NbInd ), 
                             c("id","age", "ageCentre","ageCat", "sex", "X","obsTime", "status",
                               "tauxatt", "cause_spec", "PopTime","SpecTime", "hyp_time", "cause_hypo_world"))
  
  # =========================================================================== #
  #  SETTING THE TRUE EXCESS HAZARD MODEL                                       #
  # =========================================================================== #
  
    # GENERALIZED WEIBULL DISTRIBUTION: Parameters are chosen to mimic population studies on colorectal cancer 
      r <- 0.5
      a <- 0.2
      k <- 2
      
      betaAge1 <- log(2)
      betaAge2 <- log(3)
      betaSexe <- log(0.8)
      beta_X   <- log(2)
    
   # =========================================== #
   #  GENERATING RELATIVE SURVIVAL DATA SETS     #
   # =========================================== #
     
        # ================================= #
        #  1. Generating covariate values   #
        # ================================= #
        
           # ------------- Age ----------------------------------------------------------------------------- #  
             pAge0     <- sim.covs$age.prop[1] 
             pAge1     <- sim.covs$age.prop[2] 
             pAge2     <- sim.covs$age.prop[3] 
             
             age       <- c(runif(NbInd*pAge0, min = 30, max = 65), 
                            runif(NbInd*pAge1, min = 65, max = 75), 
                            runif(NbInd*pAge2, min = 75, max = 85)) 
             
             age1 <- ifelse((age>=65 & age<75), 1, 0)
             age2 <- ifelse(age>=75, 1, 0)
             
             age_cat   <- c(rep("[30,65]",NbInd*pAge0), 
                            rep("[65, 75]",NbInd*pAge1), 
                            rep("[75, 85]",NbInd*pAge2)) 
             ageMoyen  <- mean(age)
             ageCentre <- age - ageMoyen  
            
           # ----------- Sex ------------------------------------------------------------------------------- #  
             pFem <- 0.5 # Proportion of females in each file
             uSex <- runif(NbInd)
             sex  <- (uSex <= pFem)*1 + (uSex > pFem)*0   # Categorization: Male = 0, Female = 1.
          
        
           # ----------- Dummy variable X ------------------------------------------------------------------ #
             pX <- 0.5 # Proportion of X = 0 in each file
             uX <- runif(NbInd)
             X  <- (uX <= pX)*1 + (uX > pX)*0    
          
        # ========================================================================================= #    
        #  2. Generating survival times for general population (tpsGene)                            #
        # ========================================================================================= #      
          tpsGene <- rep(0, NbInd)
          sexNom  <- ifelse(sex == 0, "male", "female")
          f1 <- function(i){
            uAtt <- runif(1)
            TauxAtt <- 1-exp(-365.24*survexp.us[trunc(age[i]) + 1 , sexNom[i], "2004"])
            if(uAtt <= TauxAtt){
              tpsG <- runif(1)
              tpsGene[i] <- tpsG
            }
            else{
              while(uAtt > TauxAtt){
                tpsGene[i] <- tpsGene[i] + 1
                uAtt <- runif(1)
                if(trunc(age[i]) + 1 + tpsGene[i] < 100)
                  TauxAtt <- 1-exp(-365.24*survexp.us[trunc(age[i]) + tpsGene[i] + 1 , sexNom[i], "2004"])
                else
                  TauxAtt <- 1-exp(-365.24*survexp.us[104, sexNom[i], "2004"])
              }
              tpsG <- runif(1)
              tpsGene[i] <- tpsGene[i] + tpsG
            }
          }
        
          tpsGene <- sapply(1:NbInd,f1)
        
      # ============================================================================= #
      #  3. Generating survival times from the excess hazard model (tpsSpe)           #
      # ============================================================================= #
        
        # Non-censored specific times(tpsSpe): Uses the inverse of the FDA of the Generalized 
        # Weibull distribution  
          Ui        <- runif(NbInd)  
          exp.betaz <- exp(betaSexe*sex + betaAge1*age1 + betaAge2*age2  + beta_X*X)
          tpsSpe    <- (1/r*(a*((1/(1-Ui))^(1/(a*exp.betaz))-1))^(1/k))
        
      # ============================================================================= #
      #  4. Generating right-censored observations                                    #
      # ============================================================================= #  
        
        # ------------- Simulating completely random dropout times tpsCens --------------------------------- #
          # ============== #    
          #  UNIFORM law   #  
          # ============== #  
            tpsCens    <- runif(NbInd, min = cens.control$par.unif[1], max = cens.control$par.unif[2])
            CensAdmin  <- cens.control$CensAdmin
           
     
        #  ------------ Generating censored times in different survival settings --------------------------- #
          # =============================================================== #    
          #  RELATIVE SURVIVAL SETTING:                                     #  
          #  Simulates the situation where we dont know the cause of death  #
          # =============================================================== #    
              tpsSurv <- pmin(tpsGene, tpsSpe)
            
             # Random dropout at 'tpsCens':
               temps   <- pmin(tpsCens, tpsSurv)  
               statut  <- rep(1, NbInd)
               statut  <- ifelse(temps == tpsCens, 0, 1)
            
             # Administrative censoring at 'CensAdmin':
               statut[temps > CensAdmin]  <-  0
               temps[temps > CensAdmin]   <- CensAdmin
    
          # =============================================================== #    
          #  CAUSE-SPECIFIC SETTING:                                        #  
          #  Simulates the situation where we know the cause of death       #
          # =============================================================== #  
               cause <- rep(0, NbInd)
            
             # Identifying the cause of the event:
               cause <- ifelse(tpsSurv == tpsSpe, 1, 0)  
            
             # Introduce the censor indicator    
               ind        <- which(cause == 1)
               cause[ind] <- ifelse(temps[ind] == tpsCens[ind], 0, 1)
               cause[temps > CensAdmin] <-  0
            
          # ================================================================== #    
          #  HYPOTHETICAL WORLD:                                               #  
          #  Simulates the hypothetical situation where people only experience #
          #  the event of interest.                                            #
          # ================================================================== #  
              temps2     <- pmin(tpsCens, tpsSpe)
              cause2     <- rep(1, NbInd)  
              cause2     <- ifelse(temps2 == tpsCens, 0, 1)  
              cause2[temps2 > CensAdmin]  <-  0
              temps2[temps2 > CensAdmin]  <- CensAdmin

        # ----------- Controlling the censoring proportion (not accounting the administrative censoring) ------ #
            nCens     <- 0  
            nDC       <- 0  
            
            nCens <- nCens + sum(temps < CensAdmin & statut == 0)  
            nDC <- nDC + sum(statut == 1)
            
            # Taux attendu de mortalite a partir des temps generes
            ageDC   <- trunc(age) + trunc(temps)
            ageDC   <-(ageDC >= 100)*(99) + (ageDC < 100)*ageDC
            tauxatt <- (sex == 0)*(1-exp(-365.24*survexp.us[ageDC + 1 + 4, "male", "2004"])) + 
                       (sex == 1)*(1-exp(-365.24*survexp.us[ageDC + 1 + 4, "female", "2004"]))
            
  
      # ============================================================================= #
      #  5. SAVING THE OUTPUTS IN TH DATA FRAME                                       #
      # ============================================================================= #  
        for (i in (1:NbInd)){
          MyData[i,] <- c(id=i,  age=age[i], ageCentre=ageCentre[i], ageCat = age_cat[i],
                          sex = sex[i], X = X[i], obsTime=temps[i], status=statut[i], tauxatt=tauxatt[i], 
                          cause_spec=cause[i],PopTime = tpsGene[i], SpecTime = tpsSpe[i], 
                          hyp_time=temps2[i], cause_hypo_world=cause2[i])}
        
        return(list(simuData = as.data.frame(MyData), 
                    prop.cens = 100*nCens/NbInd, 
                    prop.event = 100*nDC/NbInd, 
                    age.prop = sim.covs$age.prop, 
                    sex.prop = sim.covs$sex.prop, 
                    prop.spec.event = sum(tpsSpe < tpsGene)/NbInd ))}  

\(~\)

Generating different scenarios for the simulation study

\(~\)

We want to check:

\(\bullet\) The bias of using

\[\widehat{BS}(t) = \frac{1}{n} \sum_{i=1}^{n}{\left[ \left(\mathbb{I}(T_i> t) - \widehat{\pi}_{E,n}(t|x_i)\right)^2 \right]}, \qquad t \geq 0, \] to estimate \[BS_{RS}(t) = \mathbb{E}\left[\left(\mathbb{I}(T_{E} > t)- \hat{\pi}_{E}(t|X)\right)^2\right], \qquad t \geq 0.\]

\(~\)

\(\bullet\) The performance of the proposed approach

\[\widehat{BS}(t) = \frac{1}{n} \sum_{i=1}^{n}{\left[ \left(\mathbb{I}(T_i> t)- {\color{purple}S_{P}(t| x_D )} \cdot ~ \hat{\pi}_{E}(t|x )\right)^2 \right]}\cdot \omega(t,\hat{G}_n,x_i), \qquad \omega(t,\hat{G}_n,x_i)= \frac{\mathbb{I}_{(T_i\leq t)\delta_i}}{\widehat{G}_n(T_{i^{-}}|x_i)} ~+~ \frac{\mathbb{I}_{(T_i > t)}}{\hat{G}_n(t|x_i)},\]
to capture a misspecification in the model for different ranges of \(S_P(t|x_D)\).

\(~\)

Scenario I
simRSdata <- simuRSdata(1000, 
                        sim.covs=list(age.prop = c(0.025,0.025,0.95)), 
                        cens.control = list(par.unif = c(0,10)))
  par(mfrow=c(1,3))
  barplot(table(simRSdata$simuData$ageCat), xlab = "Age categories")
  hist(as.numeric(simRSdata$simuData$age), xlab="Age", main="")
  plot(as.numeric(simRSdata$simuData$hyp_time),as.numeric(simRSdata$simuData$PopTime), ylab="Population times",xlab="Specific times",  xlim=c(0,15),ylim=c(0,70), main="")

\(~\)

\(\bullet\) OBS. Third plot corresponds to the points \((T_{Ei},T_{Pi}), ~i = 1, \ldots, n.\)

\(~\)

Scenario II
simRSdata2 <- simuRSdata(1000, 
                         sim.covs=list(age.prop = c(0.8,0.1,0.1)), 
                         cens.control = list(par.unif = c(0,10)))
  par(mfrow=c(1,3))
  barplot(table(simRSdata2$simuData$ageCat), xlab = "Age categories")
  hist(as.numeric(simRSdata2$simuData$age), xlab="Age", main="")
  plot(as.numeric(simRSdata2$simuData$hyp_time),as.numeric(simRSdata2$simuData$PopTime), ylab="Population times",xlab="Specific times",     xlim=c(0,15),ylim=c(0,70), main="")

\(~\)

B.4 Generating censored data