See the BVSSurv GitHub repository for more information.
################################################################################
# Required packages
################################################################################
rm(list=ls())
library(mombf)
## Loading required package: mvtnorm
## Loading required package: ncvreg
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(survival)
library(xtable)
library(ggplot2)
library(BhGLM)
library(BVSNLP)
## Bayesian Variable Selection using Non-Local priors for survival and logistic regression data.
## Loading Version 1.1.8
################################################################################
# Data preparation
################################################################################
dat = read.table("ftbrs_survdata.txt",header=TRUE)
# Response: censored log recurrence time (Timerecurrence/12, i.e. rescaled to years);
# the "normal" family reported by mombf below corresponds to an AFT-type model on log-time
y= Surv(log(dat$Timerecurrence/12), event=dat$recurrence)
# Covariates: drop the first three columns, treat stage as a factor, standardise the rest
X= dat[,-1:-3]; X$stage= factor(X$stage)
X[,-1] <- apply(X[,-1],2,scale)
df <- data.frame(y = y, as.data.frame(X), time = dat$Timerecurrence/12, status = dat$recurrence)
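# Quick sanity checks on the prepared data (a small sketch, not in the original script):
# dimensions of the covariate matrix and the number of observed recurrences
dim(X)
table(dat$recurrence)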
# Visualise the Kaplan-Meier estimator (of survival)
km <- survfit(Surv(time, status) ~ 1, data = df)
plot(km, xlab = "Years", ylab = "Survival", main = "Kaplan-Meier estimator",
cex.axis = 1.5, cex.lab = 1.5)
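# A numerical companion to the plot (sketch): printing the survfit object reports the
# number of events and the median survival time, and summary() evaluates the
# Kaplan-Meier estimate at chosen time points (here 1, 3 and 5 years)
km
summary(km, times = c(1, 3, 5))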
####################################################################################################
## DATA ANALYSIS FOR A MODEL WITH TGFB AND STAGE ONLY
## TGFB is believed to play an important role in cancer tumour evolution
####################################################################################################
# Run Bayesian variable selection
ms0= modelSelection(y ~ X$stage + X$tgfb,
priorCoef= momprior(taustd = 1),
priorDelta= modelbbprior(1,1))
## Enumerating models...
## Computing posterior probabilities........ Done.
pp0= postProb(ms0)
pp0
## modelid family pp
## 8 1,2,3,4 normal 6.902354e-01
## 6 1,4 normal 2.111825e-01
## 7 1,2,3 normal 7.452261e-02
## 5 1 normal 2.405950e-02
## 4 2,3,4 normal 1.073239e-09
## 3 2,3 normal 1.673414e-11
## 1 normal 9.084076e-34
## 2 4 normal 2.924971e-34
ms0$margpp #TGFB gets 0.9014 inclusion probability
## (Intercept) X$stage2 X$stage3 X$tgfb
## 1.0000000 0.7647580 0.7647580 0.9014179
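# Beyond inclusion probabilities, Bayesian model averaging estimates of the coefficients
# can be extracted (sketch; assumes the coef() method for modelSelection fits provided by
# recent versions of mombf)
coef(ms0)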
################################################################################
# Bayesian Variable Selection (BVS) for TGFB and other variables
################################################################################
# Regression formula with linear effects
f1y <- formula(paste('y ~ ',paste(colnames(X), collapse='+'),sep=''))
#-------------------------------------------------------------------------------
# BVS: Prior 1 - Zellner's prior
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoef <- zellnerprior(tau=nrow(X))
# Prior on the model space
priorDelta <- modelbbprior(1,1)
# Selection step
ms1 <- modelSelection(f1y,data=df,
priorCoef=priorCoef,priorDelta=priorDelta,
enumerate=FALSE, method='Laplace', niter=10000,
center = TRUE, scale = TRUE)
## Zellner prior not implemented, using group Zellner prior instead
## Greedy searching posterior mode... Done.
## Zellner prior not implemented, using group Zellner prior instead
## Running Gibbs sampler........... Done.
# Calculating model posterior probabilities
pp1 <- postProb(ms1)
# Top models
head(pp1)
## modelid family pp
## 1 1,2,3,45 normal 0.09038226
## 2 1,2,3,15,59 normal 0.04403847
## 3 1,45 normal 0.03171598
## 4 1,15,59 normal 0.02624099
## 5 1,2,3,15,144 normal 0.01880220
## 6 1,2,3,15 normal 0.01844998
# Marginal inclusion probabilities
mp1 <- ms1$margpp
# Visualising the marginal inclusion probabilities > 0.5
ind1 <- which(mp1>0.5)
df1<-data.frame(names=names(mp1[ind1]),mp = as.vector(round(mp1[ind1],digits = 3)))
df1
## names mp
## 1 (Intercept) 1.000
## 2 stage2 0.759
## 3 stage3 0.759
## 4 X208394_x_at 0.648
ggplot(df1,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (Laplace)
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoefm <- momprior(taustd=1) # unit information p-mom (or tau = 0.192)
# Prior on the model space
priorDelta <- modelbbprior(1,1)
priorGroup= groupzellnerprior(tau=nrow(X))
# Selection step
ms2 <- modelSelection(f1y,data=df,
priorCoef=priorCoefm,priorDelta=priorDelta,
priorGroup = priorGroup,
enumerate=FALSE, method='Laplace', niter=10000,
center = TRUE, scale = TRUE)
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
# Calculating model posterior probabilities
pp2 <- postProb(ms2)
# Top models
head(pp2)
## modelid family pp
## 1 1,2,3,45 normal 0.12628218
## 2 1,45 normal 0.11230895
## 3 1,15,59 normal 0.07326369
## 4 1,2,3,15,59 normal 0.04689552
## 5 1,2,3,14,15,59 normal 0.02363060
## 6 1,15,31,59 normal 0.02236104
# Marginal inclusion probabilities
mp2 <- ms2$margpp
# Visualising the marginal inclusion probabilities > 0.5
ind2 <- which(mp2>0.5)
df2<-data.frame(names=names(mp2[ind2]),mp = as.vector(round(mp2[ind2],digits = 3)))
df2
## names mp
## 1 (Intercept) 1.000
## 2 stage2 0.526
## 3 stage3 0.526
## 4 X208394_x_at 0.522
ggplot(df2,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
# What conclusions can we draw from this?
# The MOM prior results indicate that stage helps explain recurrence time. Does this
# suggest an additional strategy for model selection? One option is stratification by
# stage (see the sketch below).
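# A minimal illustration of the stratification idea (sketch, not part of the BVS workflow
# above): a Cox model for TGFB with a separate baseline hazard per stage via strata(),
# using the survival package
coxph(Surv(time, status) ~ tgfb + strata(stage), data = df)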
#-------------------------------------------------------------------------------
# BVS: Prior 3 - MOM prior (ALA)
#-------------------------------------------------------------------------------
priorGroup= groupzellnerprior(tau=nrow(X))
# Selection step
ms3 <- modelSelection(f1y,data=df,
priorCoef=priorCoefm,priorDelta=priorDelta,
priorGroup = priorGroup,
enumerate=FALSE, method='ALA', niter=10000,
center = TRUE, scale = TRUE)
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
# Calculating model posterior probabilities
pp3 <- postProb(ms3)
# Top models
head(pp3)
## modelid family pp
## 1 1,2,3,45 normal 0.12582393
## 2 1,45 normal 0.11190140
## 3 1,15,59 normal 0.07299783
## 4 1,2,3,15,59 normal 0.04672535
## 5 1,2,3,14,15,59 normal 0.02354485
## 6 1,15,31,59 normal 0.02227989
# Marginal inclusion probabilities
mp3 <- ms3$margpp
# Visualising the marginal inclusion probabilities > 0.5
ind3 <- which(mp3>0.5)
df3<-data.frame(names=names(mp3[ind3]),mp = as.vector(round(mp3[ind3],digits = 3)))
df3
## names mp
## 1 (Intercept) 1.000
## 2 stage2 0.529
## 3 stage3 0.529
## 4 X208394_x_at 0.549
ggplot(df3,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#####################################################################
# Cox piMOM priors via BVSNLP (sensitivity analysis)
#####################################################################
# Survival object on the untransformed time scale (years)
y2 <- Surv(time = df$time, event = df$status)
X2 <- X
bvsfit <- bvs(X = data.frame(X2), resp = as.matrix(cbind(df$time,df$status)),
family = "survival", nlptype = "piMOM", mod_prior = "unif",
niter = 100)
# Models with highest probability
head(bvsfit$max_models)
## [[1]]
## [,1]
## [1,] 2
## [2,] 44
##
## [[2]]
## [,1]
## [1,] 44
##
## [[3]]
## [,1]
## [1,] 2
## [2,] 10
## [3,] 44
##
## [[4]]
## [,1]
## [1,] 2
## [2,] 13
## [3,] 14
## [4,] 58
##
## [[5]]
## [,1]
## [1,] 2
## [2,] 9
## [3,] 13
## [4,] 14
## [5,] 58
##
## [[6]]
## [,1]
## [1,] 10
## [2,] 44
# Posterior probabilities of the top models (normalised on the log scale for numerical stability)
w <- exp(bvsfit$max_prob_vec - max(bvsfit$max_prob_vec))
head(w / sum(w))
## [1] 0.07109050 0.05136030 0.04515483 0.03397149 0.03271321 0.03267788
# Highest posterior model
bvsfit$HPM
## [,1]
## [1,] 2
## [2,] 44
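# Translating the HPM column indices into variable names (sketch): this assumes bvs()
# kept the covariates in the same order as names(mp1)[-1], which is also used for the
# inclusion-probability plot below; factor handling inside BVSNLP could shift the indexing
names(mp1)[-1][bvsfit$HPM]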
# Estimated coefficients
bvsfit$beta_hat
## [1] 0.9552322 0.6020811
# Number of Visited Models
bvsfit$num_vis_models
## [1] 3908
# Visualising the ten largest marginal inclusion probabilities
mp <- as.vector(round(bvsfit$inc_probs, digits = 3))
dfi <- data.frame(names1 = names(mp1)[-1], mp = mp)
index = tail(order(dfi$mp),n=10)
dfint <- dfi[index,]
ggplot(dfint,aes(x = reorder(names1, -mp), y = mp,label = mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#####################################################################
# Spike-and-Slab LASSO (sensitivity analysis)
#####################################################################
# Requires a different design matrix, since bmlasso() does not accept factors
X3= dat[,-1:-3];
X3 <- apply(X3,2,scale)
# Optionally, stage could instead be entered as dummy indicators (not used here):
#stageM <- matrix(0, ncol = 3, nrow = dim(X3)[1])
#for(i in 1:dim(X3)[1]) stageM[i,df$stage[i]] = 1
#X3 <- as.matrix(cbind(stageM[,2:3],X3))
# Selection step
spsl <- bmlasso(x = X3, y = y2, family = "cox")
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Posterior modes (only option in this package, no MCMC)
head(sort(spsl$coef,decreasing = TRUE))
## X226497_s_at stage tgfb X232478_at X205533_s_at X232090_at
## 0.6626776 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# Posterior modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1,
main = "Spike-and-Slab LASSO", lty = 2)
# Exponentiated posterior modes (OR = TRUE; for a Cox model these are hazard ratios)
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1,
main = "Spike-and-Slab LASSO", lty = 2, OR = TRUE)
#####################################################################
# Cox-LASSO (sensitivity analysis)
#####################################################################
# Selection step
cv.fit = try(cv.glmnet(x = X3, y = y2, family="cox",
maxit=1000, nfolds=10, alpha=1), silent=TRUE)
fit = try(glmnet(x = X3, y=y2, family = "cox", maxit=1e5, alpha=1), silent=TRUE)
# active variables (lambda.min)
b.coxlasso = as.double(coef(fit, s=cv.fit$lambda.min))
which(b.coxlasso!=0)
## [1] 1 8 9 13 14 23 29 35 43 76 142
colnames(X)[which(b.coxlasso!=0)]
## [1] "stage" "X202241_at" "X201830_s_at" "X208394_x_at" "X225803_at"
## [6] "X229125_at" "X202668_at" "X239364_at" "X226497_s_at" "X212143_s_at"
## [11] "X205428_s_at"
# active variables (lambda.1se)
b2.coxlasso = as.double(coef(fit, s=cv.fit$lambda.1se))
which(b2.coxlasso!=0)
## integer(0)
colnames(X)[which(b2.coxlasso!=0)]
## character(0)
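# The lambda.1se rule selects the empty model here. Plotting the cross-validation curve
# (sketch) shows how the partial-likelihood deviance varies with lambda
plot(cv.fit)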
# Refitting a Cox proportional hazards model on the Cox-LASSO selected variables (survival package)
fc <- formula(paste('y2 ~ ',paste(colnames(X3)[which(b.coxlasso!=0)], collapse='+'),sep=''))
fit2 <- coxph(fc, data = df)
summary(fit2)
## Call:
## coxph(formula = fc, data = df)
##
## n= 260, number of events= 50
##
## coef exp(coef) se(coef) z Pr(>|z|)
## stage2 0.81254 2.25362 0.78248 1.038 0.29908
## stage3 1.45827 4.29850 0.75390 1.934 0.05308 .
## X202241_at 0.21548 1.24046 0.14642 1.472 0.14111
## X201830_s_at 0.42024 1.52232 0.18514 2.270 0.02322 *
## X208394_x_at 0.50513 1.65720 0.20294 2.489 0.01281 *
## X225803_at 0.12884 1.13751 0.21047 0.612 0.54044
## X229125_at 0.01933 1.01952 0.16403 0.118 0.90618
## X202668_at 0.45199 1.57144 0.19818 2.281 0.02257 *
## X239364_at -0.45984 0.63138 0.15177 -3.030 0.00245 **
## X226497_s_at 0.38874 1.47512 0.21418 1.815 0.06953 .
## X212143_s_at 0.13972 1.14995 0.17687 0.790 0.42954
## X205428_s_at 0.20173 1.22352 0.16031 1.258 0.20824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## stage2 2.2536 0.4437 0.4862 10.4454
## stage3 4.2985 0.2326 0.9808 18.8380
## X202241_at 1.2405 0.8062 0.9310 1.6528
## X201830_s_at 1.5223 0.6569 1.0591 2.1883
## X208394_x_at 1.6572 0.6034 1.1134 2.4667
## X225803_at 1.1375 0.8791 0.7530 1.7183
## X229125_at 1.0195 0.9809 0.7392 1.4061
## X202668_at 1.5714 0.6364 1.0656 2.3174
## X239364_at 0.6314 1.5838 0.4689 0.8501
## X226497_s_at 1.4751 0.6779 0.9694 2.2446
## X212143_s_at 1.1500 0.8696 0.8131 1.6264
## X205428_s_at 1.2235 0.8173 0.8936 1.6752
##
## Concordance= 0.844 (se = 0.026 )
## Likelihood ratio test= 79.24 on 12 df, p=6e-12
## Wald test = 70.3 on 12 df, p=3e-10
## Score (logrank) test = 87.52 on 12 df, p=1e-13
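# A standard follow-up check (sketch): test the proportional hazards assumption of the
# refitted Cox model with cox.zph() from the survival package
cox.zph(fit2)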