\(~\)
\(~\)
The true excess hazard function is described by the Generalized Weibull regression model:
\[\lambda_{E}(t|x) = \frac{k \rho t^{k-1} }{1 + \frac{(\rho t)^k }{\alpha}} \cdot \exp{(x^\top{}\beta)} \]
# Baseline hazard function
haz_GenWeibull <- function(t, rho, a, k, log.p = FALSE){
log.h <- log(k) + log(rho) + (k-1)*log(t) - log(1 + ((rho*t)^k)/a)
ifelse(log.p, return(log.h), return(exp(log.h)))
}
Parameters are chosen to mimic the scenarios of cancer population studies.
# ================================================ #
# Colorectal cancer behavior (Nathalie's thesis) #
# ================================================ #
# Baseline hazard parameters
r <- 0.5
a <- 0.2
k <- 2
# Regression coefficients
betaAge <- log(1.03)
betaSexe <- log(0.8)
beta_X <- log(2)
# Plotting the baseline hazard
haz_values <- Vectorize(function(t) haz_GenWeibull(t,r,a,k ))
curve(haz_values, 0, 10, ylim = c(0,1), col = "red", lwd = 2, n = 1000, xlab = "Follow-up time",
ylab = "Hazard function", cex.axis = 1, cex.lab = 1)
\(~\)
\(\bullet\) Code and parameters values: Giorgi.
\(\bullet\) Simulation of survival times from general population/ outputs (cause-specific, hypothetical world): Nathalie.
\(\bullet\) Life table: \(\texttt{survexp.us}\).
\(\bullet\) Initially, I generated 1 replicate of the data set and a fixed proporton of censored observations.
\(~\)
library("survival")
simuRSdata <- function(NbInd, sim.covs = list(age.prop = c(0.25,0.35,0.4), sex.prop = 0.5), cens.control = list(censor.dist = "unif", par.unif = c(0,10), par.exp = 2,CensAdmin=5)){
# ================================================== #
# CREATING THE DATA FRAME FOR THE SIMULATED DATA #
# ================================================== #
MyData <- array(0, dim=c(NbInd, 14))
dimnames(MyData) <- list(seq(1,NbInd ),
c("id","age", "ageCentre","ageCat", "sex", "X","obsTime", "status",
"tauxatt", "cause_spec", "PopTime","SpecTime", "hyp_time", "cause_hypo_world"))
# =========================================================================== #
# SETTING THE TRUE EXCESS HAZARD MODEL #
# =========================================================================== #
# GENERALIZED WEIBULL DISTRIBUTION: Parameters are chosen to mimic population studies on colorectal cancer
r <- 0.5
a <- 0.2
k <- 2
betaAge1 <- log(2)
betaAge2 <- log(3)
betaSexe <- log(0.8)
beta_X <- log(2)
# =========================================== #
# GENERATING RELATIVE SURVIVAL DATA SETS #
# =========================================== #
# ================================= #
# 1. Generating covariate values #
# ================================= #
# ------------- Age ----------------------------------------------------------------------------- #
pAge0 <- sim.covs$age.prop[1]
pAge1 <- sim.covs$age.prop[2]
pAge2 <- sim.covs$age.prop[3]
age <- c(runif(NbInd*pAge0, min = 30, max = 65),
runif(NbInd*pAge1, min = 65, max = 75),
runif(NbInd*pAge2, min = 75, max = 85))
age1 <- ifelse((age>=65 & age<75), 1, 0)
age2 <- ifelse(age>=75, 1, 0)
age_cat <- c(rep("[30,65]",NbInd*pAge0),
rep("[65, 75]",NbInd*pAge1),
rep("[75, 85]",NbInd*pAge2))
ageMoyen <- mean(age)
ageCentre <- age - ageMoyen
# ----------- Sex ------------------------------------------------------------------------------- #
pFem <- 0.5 # Proportion of females in each file
uSex <- runif(NbInd)
sex <- (uSex <= pFem)*1 + (uSex > pFem)*0 # Categorization: Male = 0, Female = 1.
# ----------- Dummy variable X ------------------------------------------------------------------ #
pX <- 0.5 # Proportion of X = 0 in each file
uX <- runif(NbInd)
X <- (uX <= pX)*1 + (uX > pX)*0
# ========================================================================================= #
# 2. Generating survival times for general population (tpsGene) #
# ========================================================================================= #
tpsGene <- rep(0, NbInd)
sexNom <- ifelse(sex == 0, "male", "female")
f1 <- function(i){
uAtt <- runif(1)
TauxAtt <- 1-exp(-365.24*survexp.us[trunc(age[i]) + 1 , sexNom[i], "2004"])
if(uAtt <= TauxAtt){
tpsG <- runif(1)
tpsGene[i] <- tpsG
}
else{
while(uAtt > TauxAtt){
tpsGene[i] <- tpsGene[i] + 1
uAtt <- runif(1)
if(trunc(age[i]) + 1 + tpsGene[i] < 100)
TauxAtt <- 1-exp(-365.24*survexp.us[trunc(age[i]) + tpsGene[i] + 1 , sexNom[i], "2004"])
else
TauxAtt <- 1-exp(-365.24*survexp.us[104, sexNom[i], "2004"])
}
tpsG <- runif(1)
tpsGene[i] <- tpsGene[i] + tpsG
}
}
tpsGene <- sapply(1:NbInd,f1)
# ============================================================================= #
# 3. Generating survival times from the excess hazard model (tpsSpe) #
# ============================================================================= #
# Non-censored specific times(tpsSpe): Uses the inverse of the FDA of the Generalized
# Weibull distribution
Ui <- runif(NbInd)
exp.betaz <- exp(betaSexe*sex + betaAge1*age1 + betaAge2*age2 + beta_X*X)
tpsSpe <- (1/r*(a*((1/(1-Ui))^(1/(a*exp.betaz))-1))^(1/k))
# ============================================================================= #
# 4. Generating right-censored observations #
# ============================================================================= #
# ------------- Simulating completely random dropout times tpsCens --------------------------------- #
# ============== #
# UNIFORM law #
# ============== #
tpsCens <- runif(NbInd, min = cens.control$par.unif[1], max = cens.control$par.unif[2])
CensAdmin <- cens.control$CensAdmin
# ------------ Generating censored times in different survival settings --------------------------- #
# =============================================================== #
# RELATIVE SURVIVAL SETTING: #
# Simulates the situation where we dont know the cause of death #
# =============================================================== #
tpsSurv <- pmin(tpsGene, tpsSpe)
# Random dropout at 'tpsCens':
temps <- pmin(tpsCens, tpsSurv)
statut <- rep(1, NbInd)
statut <- ifelse(temps == tpsCens, 0, 1)
# Administrative censoring at 'CensAdmin':
statut[temps > CensAdmin] <- 0
temps[temps > CensAdmin] <- CensAdmin
# =============================================================== #
# CAUSE-SPECIFIC SETTING: #
# Simulates the situation where we know the cause of death #
# =============================================================== #
cause <- rep(0, NbInd)
# Identifying the cause of the event:
cause <- ifelse(tpsSurv == tpsSpe, 1, 0)
# Introduce the censor indicator
ind <- which(cause == 1)
cause[ind] <- ifelse(temps[ind] == tpsCens[ind], 0, 1)
cause[temps > CensAdmin] <- 0
# ================================================================== #
# HYPOTHETICAL WORLD: #
# Simulates the hypothetical situation where people only experience #
# the event of interest. #
# ================================================================== #
temps2 <- pmin(tpsCens, tpsSpe)
cause2 <- rep(1, NbInd)
cause2 <- ifelse(temps2 == tpsCens, 0, 1)
cause2[temps2 > CensAdmin] <- 0
temps2[temps2 > CensAdmin] <- CensAdmin
# ----------- Controlling the censoring proportion (not accounting the administrative censoring) ------ #
nCens <- 0
nDC <- 0
nCens <- nCens + sum(temps < CensAdmin & statut == 0)
nDC <- nDC + sum(statut == 1)
# Taux attendu de mortalite a partir des temps generes
ageDC <- trunc(age) + trunc(temps)
ageDC <-(ageDC >= 100)*(99) + (ageDC < 100)*ageDC
tauxatt <- (sex == 0)*(1-exp(-365.24*survexp.us[ageDC + 1 + 4, "male", "2004"])) +
(sex == 1)*(1-exp(-365.24*survexp.us[ageDC + 1 + 4, "female", "2004"]))
# ============================================================================= #
# 5. SAVING THE OUTPUTS IN TH DATA FRAME #
# ============================================================================= #
for (i in (1:NbInd)){
MyData[i,] <- c(id=i, age=age[i], ageCentre=ageCentre[i], ageCat = age_cat[i],
sex = sex[i], X = X[i], obsTime=temps[i], status=statut[i], tauxatt=tauxatt[i],
cause_spec=cause[i],PopTime = tpsGene[i], SpecTime = tpsSpe[i],
hyp_time=temps2[i], cause_hypo_world=cause2[i])}
return(list(simuData = as.data.frame(MyData),
prop.cens = 100*nCens/NbInd,
prop.event = 100*nDC/NbInd,
age.prop = sim.covs$age.prop,
sex.prop = sim.covs$sex.prop,
prop.spec.event = sum(tpsSpe < tpsGene)/NbInd ))}
\(~\)
\(~\)
We want to check:
\(\bullet\) The bias of using
\[\widehat{BS}(t) = \frac{1}{n} \sum_{i=1}^{n}{\left[ \left(\mathbb{I}(T_i> t) - \widehat{\pi}_{E,n}(t|x_i)\right)^2 \right]}, \qquad t \geq 0, \] to estimate \[BS_{RS}(t) = \mathbb{E}\left[\left(\mathbb{I}(T_{E} > t)- \hat{\pi}_{E}(t|X)\right)^2\right], \qquad t \geq 0.\]
\(~\)
\(\bullet\) The performance of the proposed approach
\[\widehat{BS}(t) = \frac{1}{n} \sum_{i=1}^{n}{\left[ \left(\mathbb{I}(T_i> t)- {\color{purple}S_{P}(t| x_D )} \cdot ~ \hat{\pi}_{E}(t|x )\right)^2 \right]}\cdot \omega(t,\hat{G}_n,x_i), \qquad \omega(t,\hat{G}_n,x_i)= \frac{\mathbb{I}_{(T_i\leq t)\delta_i}}{\widehat{G}_n(T_{i^{-}}|x_i)} ~+~ \frac{\mathbb{I}_{(T_i > t)}}{\hat{G}_n(t|x_i)},\]
to capture a misspecification in the model for different ranges of \(S_P(t|x_D)\).
\(~\)
simRSdata <- simuRSdata(1000,
sim.covs=list(age.prop = c(0.025,0.025,0.95)),
cens.control = list(par.unif = c(0,10)))
par(mfrow=c(1,3))
barplot(table(simRSdata$simuData$ageCat), xlab = "Age categories")
hist(as.numeric(simRSdata$simuData$age), xlab="Age", main="")
plot(as.numeric(simRSdata$simuData$hyp_time),as.numeric(simRSdata$simuData$PopTime), ylab="Population times",xlab="Specific times", xlim=c(0,15),ylim=c(0,70), main="")
\(~\)
\(\bullet\) OBS. Third plot corresponds to the points \((T_{Ei},T_{Pi}), ~i = 1, \ldots, n.\)
\(~\)
simRSdata2 <- simuRSdata(1000,
sim.covs=list(age.prop = c(0.8,0.1,0.1)),
cens.control = list(par.unif = c(0,10)))
par(mfrow=c(1,3))
barplot(table(simRSdata2$simuData$ageCat), xlab = "Age categories")
hist(as.numeric(simRSdata2$simuData$age), xlab="Age", main="")
plot(as.numeric(simRSdata2$simuData$hyp_time),as.numeric(simRSdata2$simuData$PopTime), ylab="Population times",xlab="Specific times", xlim=c(0,15),ylim=c(0,70), main="")
\(~\)