Example 3: colon cancer data set

For more information, see the BVSSurv GitHub repository.

Data preparation

################################################################################
# Required packages
################################################################################
rm(list=ls())
library(mombf)
## Loading required package: mvtnorm
## Loading required package: ncvreg
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(survival)
library(xtable)
library(ggplot2)
library(BhGLM)
library(BVSNLP)
## Bayesian Variable Selection using Non-Local priors for survival and logistic regression data.
## Loading Version 1.1.8
################################################################################
# Data preparation
################################################################################
# Read the colon cancer data
dat = read.table("ftbrs_survdata.txt", header=TRUE)
# Survival response: log recurrence time (in years) with the recurrence indicator
y = Surv(log(dat$Timerecurrence/12), event=dat$recurrence)
# Covariates: tumour stage (as a factor) plus gene expression measurements (standardised)
X = dat[,-1:-3]; X$stage = factor(X$stage)
X[,-1] <- apply(X[,-1], 2, scale)
# Data frame with the response, covariates, and untransformed time/status for later use
df <- data.frame(y = y, as.data.frame(X), time = dat$Timerecurrence/12, status = dat$recurrence)
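# A quick sanity check on the prepared data (optional sketch): sample size and
# number of recurrences (the Cox model summary at the end reports n = 260, 50 events)
dim(X)
table(df$status)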


# Visualise the Kaplan-Meier estimator (of survival)
km <- survfit(Surv(time, status) ~ 1, data = df)
plot(km, xlab = "Years", ylab = "Survival", main = "Kaplan-Meier estimator",
     cex.axis = 1.5, cex.lab = 1.5)
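# Since tumour stage turns out to be a relevant variable below, a stratified
# Kaplan-Meier plot can also be informative (a minimal sketch)
km.stage <- survfit(Surv(time, status) ~ stage, data = df)
plot(km.stage, col = 1:3, xlab = "Years", ylab = "Survival",
     main = "Kaplan-Meier estimator by stage", cex.axis = 1.5, cex.lab = 1.5)
legend("bottomleft", legend = paste("Stage", levels(df$stage)), col = 1:3, lty = 1)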

Bayesian Variable Selection

Data analysis for the model with only TGFB and STAGE

####################################################################################################
##  DATA ANALYSIS FOR MODEL ONLY WITH TGFB AND STAGE
## TGFB is believed to be an important variable in cancer tumour evolution
####################################################################################################

#Run Bayesian variable selection

ms0= modelSelection(y ~ X$stage + X$tgfb,  
                    priorCoef= momprior(taustd = 1), 
                    priorDelta= modelbbprior(1,1))
## Enumerating models...
## Computing posterior probabilities........ Done.
pp0= postProb(ms0)
pp0
##   modelid family           pp
## 8 1,2,3,4 normal 6.902354e-01
## 6     1,4 normal 2.111825e-01
## 7   1,2,3 normal 7.452261e-02
## 5       1 normal 2.405950e-02
## 4   2,3,4 normal 1.073239e-09
## 3     2,3 normal 1.673414e-11
## 1         normal 9.084076e-34
## 2       4 normal 2.924971e-34
ms0$margpp #TGFB gets 0.9014 inclusion probability
## (Intercept)    X$stage2    X$stage3      X$tgfb 
##   1.0000000   0.7647580   0.7647580   0.9014179
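# Beyond inclusion probabilities, Bayesian model averaging (BMA) estimates of the
# coefficients can be obtained from the fitted object (a sketch, assuming the
# installed mombf version provides a coef() method for modelSelection fits)
coef(ms0)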

Full model

################################################################################
# Bayesian Variable Selection (BVS) for TGFB and other variables
################################################################################

# Regression formula with linear effects
f1y <-  formula(paste('y ~ ',paste(colnames(X), collapse='+'),sep=''))

#-------------------------------------------------------------------------------
# BVS: Prior 1 - Zellner's prior
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoef <- zellnerprior(tau=nrow(X))
# Prior on the model space
priorDelta <- modelbbprior(1,1)
# Selection step
ms1 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoef,priorDelta=priorDelta,
                      enumerate=FALSE, method='Laplace', niter=10000, 
                      center = TRUE, scale = TRUE)
## Zellner prior not implemented, using group Zellner prior instead
## Greedy searching posterior mode... Done.
## Zellner prior not implemented, using group Zellner prior instead
## Running Gibbs sampler........... Done.
# Calculating model posterior probabilities
pp1 <- postProb(ms1)
# Top models
head(pp1)
##        modelid family         pp
## 1     1,2,3,45 normal 0.09038226
## 2  1,2,3,15,59 normal 0.04403847
## 3         1,45 normal 0.03171598
## 4      1,15,59 normal 0.02624099
## 5 1,2,3,15,144 normal 0.01880220
## 6     1,2,3,15 normal 0.01844998
# Marginal inclusion probabilities
mp1 <- ms1$margpp

# Visualising the marginal inclusion probabilities > 0.5
ind1 <- which(mp1>0.5)
df1<-data.frame(names=names(mp1[ind1]),mp = as.vector(round(mp1[ind1],digits = 3)))
df1
##          names    mp
## 1  (Intercept) 1.000
## 2       stage2 0.759
## 3       stage3 0.759
## 4 X208394_x_at 0.648
ggplot(df1,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (Laplace)
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoefm <- momprior(taustd=1) # unit information p-mom (or tau = 0.192)
# Prior on the model space
priorDelta <- modelbbprior(1,1)

priorGroup= groupzellnerprior(tau=nrow(X))
# Selection step
ms2 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoefm,priorDelta=priorDelta,
                      priorGroup = priorGroup,
                      enumerate=FALSE, method='Laplace', niter=10000, 
                      center = TRUE, scale = TRUE)
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
# Calculating model posterior probabilities 
pp2 <- postProb(ms2)
# Top models
head(pp2)
##          modelid family         pp
## 1       1,2,3,45 normal 0.12628218
## 2           1,45 normal 0.11230895
## 3        1,15,59 normal 0.07326369
## 4    1,2,3,15,59 normal 0.04689552
## 5 1,2,3,14,15,59 normal 0.02363060
## 6     1,15,31,59 normal 0.02236104
# Marginal inclusion probabilities
mp2 <- ms2$margpp


# Visualising the marginal inclusion probabilities > 0.5
ind2 <- which(mp2>0.5)
df2<-data.frame(names=names(mp2[ind2]),mp = as.vector(round(mp2[ind2],digits = 3)))
df2
##          names    mp
## 1  (Intercept) 1.000
## 2       stage2 0.526
## 3       stage3 0.526
## 4 X208394_x_at 0.522
ggplot(df2,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

# What can we conclude from this?
# The MOM prior indicates that stage explains recurrence time. Does this suggest any
# additional strategy for model selection? --- Stratification by stage (see the sketch below).
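# A sketch of the stratification idea: model TGFB with a separate baseline hazard
# per stage, instead of selecting stage itself (illustrative, not part of the BVS fit)
coxph(Surv(time, status) ~ tgfb + strata(stage), data = df)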


#-------------------------------------------------------------------------------
# BVS: Prior 3 - MOM prior (ALA)
#-------------------------------------------------------------------------------

priorGroup= groupzellnerprior(tau=nrow(X))
# Selection step
ms3 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoefm,priorDelta=priorDelta,
                      priorGroup = priorGroup,
                      enumerate=FALSE, method='ALA', niter=10000, 
                      center = TRUE, scale = TRUE)
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
# Calculating model posterior probabilities 
pp3 <- postProb(ms3)
# Top models
head(pp3)
##          modelid family         pp
## 1       1,2,3,45 normal 0.12582393
## 2           1,45 normal 0.11190140
## 3        1,15,59 normal 0.07299783
## 4    1,2,3,15,59 normal 0.04672535
## 5 1,2,3,14,15,59 normal 0.02354485
## 6     1,15,31,59 normal 0.02227989
# Marginal inclusion probabilities
mp3 <- ms3$margpp


# Visualising the marginal inclusion probabilities > 0.5
ind3 <- which(mp3>0.5)
df3<-data.frame(names=names(mp3[ind3]),mp = as.vector(round(mp3[ind3],digits = 3)))
df3
##          names    mp
## 1  (Intercept) 1.000
## 2       stage2 0.529
## 3       stage3 0.529
## 4 X208394_x_at 0.549
ggplot(df3,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

Sensitivity Analysis: Cox PiMOM priors

#####################################################################
# Cox PiMOM priors (sensitivity analysis)
#####################################################################

# Survival object on the original (untransformed) time scale
y2 <- Surv(time = df$time, event = df$status)

# Covariates (stage kept as a factor)
X2 <- X

# BVS under a Cox model with piMOM priors (BVSNLP)
bvsfit <- bvs(X = data.frame(X2), resp = as.matrix(cbind(df$time, df$status)),
              family = "survival", nlptype = "piMOM", mod_prior = "unif",
              niter = 100)

# Models with highest probability
head(bvsfit$max_models)
## [[1]]
##      [,1]
## [1,]    2
## [2,]   44
## 
## [[2]]
##      [,1]
## [1,]   44
## 
## [[3]]
##      [,1]
## [1,]    2
## [2,]   10
## [3,]   44
## 
## [[4]]
##      [,1]
## [1,]    2
## [2,]   13
## [3,]   14
## [4,]   58
## 
## [[5]]
##      [,1]
## [1,]    2
## [2,]    9
## [3,]   13
## [4,]   14
## [5,]   58
## 
## [[6]]
##      [,1]
## [1,]   10
## [2,]   44
# Posterior probabilities of the top models (normalised over the visited models)
probs = exp(bvsfit$max_prob_vec - max(bvsfit$max_prob_vec))
head(probs/sum(probs))
## [1] 0.07109050 0.05136030 0.04515483 0.03397149 0.03271321 0.03267788
# Highest posterior model
bvsfit$HPM
##      [,1]
## [1,]    2
## [2,]   44
# Estimated coefficients
bvsfit$beta_hat
## [1] 0.9552322 0.6020811
# Number of Visited Models
bvsfit$num_vis_models
## [1] 3908
# Visualising the 10 largest marginal inclusion probabilities
# (variable names borrowed from the mombf fit, assuming the same column ordering)
dfi <- data.frame(names1 = names(mp1)[-1],
                  mp = as.vector(round(bvsfit$inc_probs, digits = 3)))

index = tail(order(dfi$mp), n=10)
dfint <- dfi[index,]


ggplot(dfint,aes(x =  reorder(names1, -mp), y = mp,label = mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

Sensitivity Analysis: Cox Spike-and-Slab LASSO

#####################################################################
# Spike-and-Slab LASSO (sensitivity analysis)
#####################################################################

# bmlasso() requires a numeric design matrix (factors are not admitted),
# so stage is kept as a numeric column and standardised together with the genes
X3 = dat[,-1:-3]
X3 <- apply(X3, 2, scale)

# Optional: encode stage as indicator (dummy) variables instead
#stageM <- matrix(0, ncol = 3, nrow =  dim(X3)[1])
#for(i in 1:dim(X3)[1]) stageM[i,df$stage[i]] = 1

# Adding stage indicators
#X3 <- as.matrix(cbind(stageM[,2:3],X3))

# Selection step
spsl <- bmlasso(x = X3, y =  y2, family = "cox")
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Posterior modes (only option in this package, no MCMC)
head(sort(spsl$coef,decreasing = TRUE))
## X226497_s_at        stage         tgfb   X232478_at X205533_s_at   X232090_at 
##    0.6626776    0.0000000    0.0000000    0.0000000    0.0000000    0.0000000
# Posterior modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1, 
        main = "Spike-and-Slab LASSO", lty = 2) 

# Posterior Odds-Ratios for the modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1, 
        main = "Spike-and-Slab LASSO", lty = 2, OR = TRUE) 

Sensitivity Analysis: Cox-LASSO

#####################################################################
# Cox-LASSO (sensitivity analysis)
#####################################################################

# Selection step
cv.fit = try(cv.glmnet(x =  X3, y = y2, family="cox", 
                       maxit=1000, nfolds=10, alpha=1), silent=TRUE)
fit = try(glmnet(x = X3, y=y2, family = "cox", maxit=1e5, alpha=1), silent=TRUE)

# active variables (lambda.min)
b.coxlasso = as.double(coef(fit, s=cv.fit$lambda.min))
which(b.coxlasso!=0)
##  [1]   1   8   9  13  14  23  29  35  43  76 142
colnames(X)[which(b.coxlasso!=0)]
##  [1] "stage"        "X202241_at"   "X201830_s_at" "X208394_x_at" "X225803_at"  
##  [6] "X229125_at"   "X202668_at"   "X239364_at"   "X226497_s_at" "X212143_s_at"
## [11] "X205428_s_at"
# active variables (lambda.1se)
b2.coxlasso = as.double(coef(fit, s=cv.fit$lambda.1se))
which(b2.coxlasso!=0)
## integer(0)
colnames(X)[which(b2.coxlasso!=0)]
## character(0)
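# Visualising the cross-validation curve shows where lambda.min and lambda.1se fall
# (here lambda.1se selects the empty model, as seen above)
if (!inherits(cv.fit, "try-error")) plot(cv.fit)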
# Fitting a Cox proportional hazards model (survival package) with the variables selected at lambda.min
fc <- formula(paste('y2 ~ ',paste(colnames(X3)[which(b.coxlasso!=0)], collapse='+'),sep=''))
fit2 <- coxph(fc, data = df)
summary(fit2)
## Call:
## coxph(formula = fc, data = df)
## 
##   n= 260, number of events= 50 
## 
##                  coef exp(coef) se(coef)      z Pr(>|z|)   
## stage2        0.81254   2.25362  0.78248  1.038  0.29908   
## stage3        1.45827   4.29850  0.75390  1.934  0.05308 . 
## X202241_at    0.21548   1.24046  0.14642  1.472  0.14111   
## X201830_s_at  0.42024   1.52232  0.18514  2.270  0.02322 * 
## X208394_x_at  0.50513   1.65720  0.20294  2.489  0.01281 * 
## X225803_at    0.12884   1.13751  0.21047  0.612  0.54044   
## X229125_at    0.01933   1.01952  0.16403  0.118  0.90618   
## X202668_at    0.45199   1.57144  0.19818  2.281  0.02257 * 
## X239364_at   -0.45984   0.63138  0.15177 -3.030  0.00245 **
## X226497_s_at  0.38874   1.47512  0.21418  1.815  0.06953 . 
## X212143_s_at  0.13972   1.14995  0.17687  0.790  0.42954   
## X205428_s_at  0.20173   1.22352  0.16031  1.258  0.20824   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## stage2          2.2536     0.4437    0.4862   10.4454
## stage3          4.2985     0.2326    0.9808   18.8380
## X202241_at      1.2405     0.8062    0.9310    1.6528
## X201830_s_at    1.5223     0.6569    1.0591    2.1883
## X208394_x_at    1.6572     0.6034    1.1134    2.4667
## X225803_at      1.1375     0.8791    0.7530    1.7183
## X229125_at      1.0195     0.9809    0.7392    1.4061
## X202668_at      1.5714     0.6364    1.0656    2.3174
## X239364_at      0.6314     1.5838    0.4689    0.8501
## X226497_s_at    1.4751     0.6779    0.9694    2.2446
## X212143_s_at    1.1500     0.8696    0.8131    1.6264
## X205428_s_at    1.2235     0.8173    0.8936    1.6752
## 
## Concordance= 0.844  (se = 0.026 )
## Likelihood ratio test= 79.24  on 12 df,   p=6e-12
## Wald test            = 70.3  on 12 df,   p=3e-10
## Score (logrank) test = 87.52  on 12 df,   p=1e-13
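# Cross-method comparison (a sketch): the gene with margpp > 0.5 in the Bayesian
# analyses (X208394_x_at) is also retained by Cox-LASSO at lambda.min
sel.bvs <- names(mp1)[mp1 > 0.5]
sel.lasso <- colnames(X3)[which(b.coxlasso != 0)]
intersect(sel.bvs, sel.lasso)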