Code for the Survival Analysis Paper

R Markdown

Code for section 3.1:

# First thing first, we define the packages are going to be used in this survival analysis.
rm(list=ls())
library(survival)
library(flexsurv)
library(eha)

## 
## Attaching package: 'eha'

## The following objects are masked from 'package:flexsurv':
## 
##     dgompertz, dllogis, hgompertz, Hgompertz, hllogis, Hllogis, hlnorm,
##     Hlnorm, hweibull, Hweibull, pgompertz, pllogis, qgompertz, qllogis,
##     rgompertz, rllogis

# Then we show our selected data set.
data(kidney)
head(kidney)

##   id time status age sex disease frail
## 1  1    8      1  28   1   Other   2.3
## 2  1   16      1  28   1   Other   2.3
## 3  2   23      1  48   2      GN   1.9
## 4  2   13      0  48   2      GN   1.9
## 5  3   22      1  32   1   Other   1.2
## 6  3   28      1  32   1   Other   1.2

# Now we fit survival models with no covariates, this corresponds to the models in section 3.1 of the paper.
fit.weibull <- flexsurvreg(formula = Surv(time, status) ~ 1, data = kidney, dist = "weibull")
fit.ggama <- flexsurvreg(formula = Surv(time, status) ~ 1, data = kidney, dist = "gengamma")
fit.lnorm <- flexsurvreg(formula = Surv(time, status) ~ 1, data = kidney, dist = "lognormal")

# Showing the graphs of them, the graphs are included in the paper.
plot.new()
plot.window(xlim = c(0,600), ylim = c(0,1))
axis(1, cex.axis = 1.5); axis(2, cex.axis = 1.5); box(); title(ylab="Survival", xlab="Time", cex.lab = 1.5)
lines(fit.weibull, col="red", lwd.ci=0, lty.ci=0)
lines(fit.lnorm, col="blue", lwd.ci=0, lty.ci=0)
lines(fit.ggama, col="green", lwd.ci=0, lty.ci=0)
legend("topright", legend = c("weibull", "lnorm", "gengamma"),
       lty = 1, col = c("red","blue","green"))

plot.new()
plot.window(xlim = c(0,600), ylim = c(0,0.015))
axis(1, cex.axis = 1.5); axis(2, cex.axis = 1.5); box(); title(ylab="Hazard", xlab="Time", cex.lab = 1.5)
lines(fit.weibull, col="red", lwd.ci=0, lty.ci=0, type = "hazard")
lines(fit.lnorm, col="blue", lwd.ci=0, lty.ci=0, type = "hazard")
lines(fit.ggama, col="green", lwd.ci=0, lty.ci=0, type = "hazard")
legend("topright", legend = c("weibull", "lnorm", "gengamma"),
       lty = 1, col = c("red","blue","green"))

# We compare their AICs
AIC(fit.weibull)

## [1] 685.8749

AIC(fit.lnorm)

## [1] 684.0963

AIC(fit.ggama)

## [1] 685.5893

# This corresponds to figure 10 in the paper.
plot(fit.weibull, col="red", lwd.ci=0, lty.ci=0, ylab="Survival", xlab="Time", cex.lab = 1.5,cex.axis = 1.5)
lines(fit.lnorm, col="blue", lwd.ci=0, lty.ci=0)
lines(fit.ggama, col="green", lwd.ci=0, lty.ci=0)
legend("topright", legend = c("weibull", "lnorm", "gengamma", "KM"),
       lty = 1, col = c("red","blue","green","black"))

Code for section 3.2 and 3.3:

# Now we turn to full regression models.

# Following models are used in section 3.2.
# We start with the log-normal distribution with two hazard structures.

fit.aft <- aftreg(formula = Surv(time, status) ~ age + sex, data = kidney,
                  dist = "lognormal")

fit.ph <- phreg(formula = Surv(time, status) ~ age + sex , data = kidney,
                dist = "lognormal")


# Then we define the how to calculate AICs for different hazard structure as this is not inlcuded in the packages.
AIC.aft <- -2*(fit.aft$loglik[2]) + 2*length(fit.aft$coefficients)
AIC.ph <- -2*(fit.ph$loglik[2]) + 2*length(fit.ph$coefficients)

# Again, showing their AICs and plots, AICs are used to do model selection in section 3.3.
AIC.aft

## [1] 671.9596

AIC.ph

## [1] 679.9417

plot(fit.aft)

plot(fit.ph)

# Next we do the samething using the Weibull distribution.
# note here there are models with three covariates and this is explained in section 3.3.

fit.aftwbn <- aftreg(formula = Surv(time, status) ~ age + sex + factor(disease), data = kidney,
                  dist = "weibull")
fit.aftwb <- aftreg(formula = Surv(time, status) ~ age + sex, data = kidney,
                    dist = "weibull")

fit.phwbn <- phreg(formula = Surv(time, status) ~ age + sex + factor(disease), data = kidney,
                dist = "weibull")
fit.phwb <- phreg(formula = Surv(time, status) ~ age + sex, data = kidney,
                  dist = "weibull")



AIC.aftwb <- -2*(fit.aftwb$loglik[2]) + 2*length(fit.aftwb$coefficients)
AIC.aftwbn <- -2*(fit.aftwbn$loglik[2]) + 2*length(fit.aftwbn$coefficients)
AIC.phwb <- -2*(fit.phwb$loglik[2]) + 2*length(fit.phwb$coefficients)
AIC.phwbn <- -2*(fit.phwbn$loglik[2]) + 2*length(fit.phwbn$coefficients)

AIC.aftwb

## [1] 681.1083

AIC.aftwbn

## [1] 674.7269

AIC.phwb

## [1] 681.1083

AIC.phwbn

## [1] 674.7269

plot(fit.aftwb)

plot(fit.phwb)

Code for reparameterization to be used for prediction:

# Because the prediction function defined and used in section 4 uses parametrisation of mu,
# while the parametrisations of models in previous regression section uses median,
# we need to calculate the parameter mu of the model ourselves.
# This section of codes is not explained in the paper as it is pure coding, 
# but it is crucial step through the development of this paper.

# Lognormal Hazard, as at this point we have selected it as our model.
hlnorm <- function(t,mu,sigma, log = FALSE){
  lpdf0 <- dlnorm(t,mu,sigma, log = T)
  ls0 <- plnorm(t,mu,sigma, lower.tail = FALSE, log.p = T)
  val <- lpdf0 - ls0
  if(log) return(val) else return(exp(val))
}

# Lognormal Cumulative hazard
chlnorm <- function(t,mu,sigma){
  H0 <- -plnorm(t,mu,sigma, lower.tail = FALSE, log.p = TRUE)
  return(H0)
}

x <- as.matrix(cbind(kidney$age,kidney$sex))
p <- ncol(x)
status <- as.logical(kidney$status)
times <- as.vector(kidney$time)
times.obs <- times[status]


log.lik <- function(par){
  ae0 <- par[1]; be0 <- exp(par[2]);   beta <- par[3:(2+p)];
  exp.x.beta <- as.vector(exp(x%*%beta))
  exp.x.beta.obs <- exp.x.beta[status]
  haz0 <-  hlnorm(times.obs*exp.x.beta.obs,ae0,be0)*exp.x.beta.obs
  val <- - sum( log(haz0)) + sum(chlnorm(times*exp.x.beta,ae0,be0)) 
  return(val)
}

OPT <- optim(c(0,0,0,0),log.lik,control=list(maxit=1000))

OPT

## $par
## [1]  2.067646568  0.169612686  0.005256262 -1.375966481
## 
## $value
## [1] 331.9798
## 
## $counts
## function gradient 
##      257       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Code for section 4, prediction:

# After finishing calculating the parameter, we can start our predictions, this corresponds to section 4.
# We start with the distribution.

chlnorm <- function(t, mu, sigma){-plnorm(t, mu, sigma, lower.tail = FALSE, log.p = TRUE)}


# t : time
# ti : censored survival time
# beta : regression parameters
# theta: parameters of the baseline hazard

# We import the estimated parameters of our regression model.
x <- as.matrix(cbind(kidney$age,kidney$sex))
beta <- c(0.005256262,-1.375966481)
aft.theta <- c(2.067646568, exp(0.169594952))

#We define the function of prediction using the formula in the paper.
pred.surv <- function(t,i,beta,theta){
  x.beta <- as.numeric(x[i,]%*%beta)
  ti <- kidney$time[i]
  out <- ((exp(-chlnorm(ti*exp(x.beta), theta[1],theta[2]))) - exp(-chlnorm(t*exp(x.beta), theta[1],theta[2])))/(exp(-chlnorm(ti*exp(x.beta), theta[1],theta[2])))
  return(1 - out)
}


# Next we sub in the three patients we selected and show their remaining life predictions.
G.fit1 <- Vectorize(function(t) {pred.surv(t, 9, beta, aft.theta)})
G.fit2 <- Vectorize(function(t) {pred.surv(t, 53, beta, aft.theta)})
G.fit3 <- Vectorize(function(t) {pred.surv(t, 17, beta, aft.theta)})


curve(G.fit1,29,100,ylim=c(0,1),lwd=2,xlab="Time",ylab="Probability",main="Remaining Life estimation of patient id 5",
      sub="10-year-old man", col.lab="blue", col.sub="black")

curve(G.fit2,131,400,ylim=c(0,1),lwd=2,xlab="Time",ylab="Probability",main="Remaining Life estimation of patient id 27",
      sub="10-year-old woman", col.lab="blue", col.sub="black")

curve(G.fit3,52,225,ylim=c(0,1),lwd=2,xlab="Time",ylab="Probability",main="Remaining Life estimation of patient id 9",
      sub="69-year-old woman", col.lab="blue", col.sub="black")

# Here we show an example of calculating the median remaining life of a patient, 
# others can be obtained by simply amending the numbers.
G.fit2m <- Vectorize(function(x) G.fit2(x)-0.5)
uniroot(G.fit2m, c(200,300))

## $root
## [1] 281.5537
## 
## $f.root
## [1] -2.231892e-10
## 
## $iter
## [1] 5
## 
## $init.it
## [1] NA
## 
## $estim.prec
## [1] 6.103516e-05

Code for the Survival Analysis Paper

David Cai

3/21/2021

R Markdown