Example 1: Simulated data set

For more information, see the BVSSurv GitHub repository.

Data preparation

#===============================================================================
#===============================================================================
# Toy Example: Simulated data from an accelerated failure time model
#===============================================================================
#===============================================================================


################################################################################
# Required packages
################################################################################
rm(list=ls())
#library(devtools)
#install_github("davidrusi/mombf")
library(mombf)
## Loading required package: mvtnorm
## Loading required package: ncvreg
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
#install_github("nyiuab/BhGLM")
library(BhGLM)
library(mvtnorm)
library(survival)
library(ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(BVSNLP)
## Bayesian Variable Selection using Non-Local priors for survival and logistic regression data.
## Loading Version 1.1.8
################################################################################
# Data simulation
################################################################################

# Sample size (compare the results for n = 250 vs. n = 1000)
ns <- 250

# Simulation seed
set.seed(123)

# Number of active variables
pa <- 3

# Number of spurious variables
ps <- 7

# Design matrix 
mu0 <- rep(0,pa+ps) # mean
Sigma0 <- diag(pa+ps) # Correlation matrix
Sigma0[lower.tri(Sigma0)] <- 0.5
Sigma0[upper.tri(Sigma0)] <- 0.5
X <- rmvnorm(n = ns, mean = mu0, sigma = Sigma0)
colnames(X) <- paste("Var",1:(pa+ps), sep = "")

# Scale all columns of X
X <- as.matrix(apply(X, 2, scale))

# Residuals
sd0 <- 0.5 # standard deviation
eps <- rnorm(n = ns, mean = 0, sd = sd0)

# True values of the regression coefficients
beta0 <- -0.25
betas <- c(0.25,0.5,1,rep(0,ps)) 

# Log-survival times from the AFT model
y0 <- as.vector(beta0 + X%*%betas + eps)

# Times to event (exponentiate the log-times)
os <- exp(y0)

# Censoring times (administrative censoring at t = 1)
cens <- rep(1, ns)

# Observed times (minimum of the event and censoring times)
times <- ifelse(os <= cens, os, cens)

# Vital status indicators (1 = event observed, 0 = censored)
status <- ifelse(os <= cens, 1, 0)

# Survival object on the log-time scale (AFT formulation)
ys <- Surv(log(times), event = status)

# Data frame with all the data
df <- data.frame(times = times, ys = ys, status = status, X )

# Visualise the Kaplan-Meier estimator of the survival function
km <- survfit(Surv(times, status) ~ 1, data = df)
plot(km, xlab = "Years", ylab = "Survival", main = "Kaplan-Meier estimator",
     cex.axis = 1.5, cex.lab = 1.5)
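
Before moving to variable selection, it is worth checking how much censoring the administrative cut-off at t = 1 induces. The quick tabulation below is a small check on the objects defined above, not part of the original script.

# Proportion of observed events (status = 1) versus censored observations (status = 0)
table(df$status)
mean(df$status)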

Bayesian Variable Selection

################################################################################
# Bayesian Variable Selection (BVS)
################################################################################

# Regression formula with linear effects
paste(colnames(X), collapse='+')
## [1] "Var1+Var2+Var3+Var4+Var5+Var6+Var7+Var8+Var9+Var10"
f1y <- as.formula(ys ~Var1+Var2+Var3+Var4+Var5+Var6+Var7+Var8+Var9+Var10)

#-------------------------------------------------------------------------------
# BVS: Prior 1 - Zellner's prior
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoef= zellnerprior(tau=nrow(X))
# Prior on the model space
priorDelta= modelbbprior(1,1)
# Selection step
ms1 <- modelSelection(f1y,data=df,
                       priorCoef=priorCoef,priorDelta=priorDelta,
                       enumerate=TRUE, method='Laplace', 
                       center = FALSE, scale = FALSE)
## Enumerating models...
## Zellner prior not implemented, using group Zellner prior instead
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities
pp1 <- postProb(ms1)
# Top models
head(pp1)
##         modelid family         pp
## 1921    1,2,3,4 normal 0.42507955
## 1937  1,2,3,4,7 normal 0.11834865
## 1922 1,2,3,4,11 normal 0.03582752
## 1953  1,2,3,4,6 normal 0.03562511
## 1409      1,3,4 normal 0.03421926
## 1425    1,3,4,7 normal 0.03409237
# Marginal inclusion probabilities
mp1 <- ms1$margpp

# Visualising the marginal inclusion probabilities
df1<-data.frame(names=names(mp1),mp = as.vector(round(mp1,digits = 3)))
df1
##          names    mp
## 1  (Intercept) 1.000
## 2         Var1 0.865
## 3         Var2 1.000
## 4         Var3 1.000
## 5         Var4 0.082
## 6         Var5 0.096
## 7         Var6 0.271
## 8         Var7 0.081
## 9         Var8 0.068
## 10        Var9 0.075
## 11       Var10 0.104
ggplot(df1,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (Laplace)
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoefm =  momprior(taustd=1)  # momprior(tau=0.192) # p-mom
# Prior on the model space
priorDelta= modelbbprior(1,1)
# Selection step
ms2 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoefm,priorDelta=priorDelta,
                      enumerate=TRUE, method='Laplace', 
                      center = FALSE, scale = FALSE)
## Enumerating models...
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities 
pp2 <- postProb(ms2)
# Top models
head(pp2)
##        modelid family          pp
## 1921   1,2,3,4 normal 0.700780836
## 1409     1,3,4 normal 0.220420572
## 1425   1,3,4,7 normal 0.037751438
## 1937 1,2,3,4,7 normal 0.014932359
## 1410  1,3,4,11 normal 0.007532301
## 1441   1,3,4,6 normal 0.003550262
# Marginal inclusion probabilities
mp2 <- ms2$margpp


# Visualising the marginal inclusion probabilities
df2<-data.frame(names=names(mp2),mp = as.vector(round(mp2,digits = 3)))
df2
##          names    mp
## 1  (Intercept) 1.000
## 2         Var1 0.723
## 3         Var2 1.000
## 4         Var3 1.000
## 5         Var4 0.004
## 6         Var5 0.006
## 7         Var6 0.054
## 8         Var7 0.004
## 9         Var8 0.001
## 10        Var9 0.001
## 11       Var10 0.010
ggplot(df2,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (ALA: approximate Laplace approximation)
#-------------------------------------------------------------------------------

# Selection step (same priors as above, using the faster ALA method)
ms3 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoefm,priorDelta=priorDelta,
                      enumerate=TRUE, method='ALA', 
                      center = FALSE, scale = FALSE)
## Enumerating models...
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities 
pp3 <- postProb(ms3)
# Top models
head(pp3)
##         modelid family         pp
## 1921    1,2,3,4 normal 0.21715743
## 1409      1,3,4 normal 0.12992689
## 1425    1,3,4,7 normal 0.05527505
## 1937  1,2,3,4,7 normal 0.05169672
## 1410   1,3,4,11 normal 0.02800886
## 1922 1,2,3,4,11 normal 0.02710773
# Marginal inclusion probabilities
mp3 <- ms3$margpp


# Visualising the marginal inclusion probabilities
df3<-data.frame(names=names(mp3),mp = as.vector(round(mp3,digits = 3)))
df3
##          names    mp
## 1  (Intercept) 1.000
## 2         Var1 0.583
## 3         Var2 0.993
## 4         Var3 1.000
## 5         Var4 0.134
## 6         Var5 0.146
## 7         Var6 0.266
## 8         Var7 0.137
## 9         Var8 0.114
## 10        Var9 0.122
## 11       Var10 0.161
ggplot(df3,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

Sensitivity Analysis: Cox PiMOM priors

#####################################################################
# Cox PiMOM priors (sensitivity analysis)
#####################################################################

# Survival object on the original time scale (used by the Cox-based methods below)
y2 <- Surv(time = df$times, event = df$status)

# Design matrix for the Cox-based methods
X2 <- X

# Selection step: Cox model with piMOM non-local priors and a uniform prior
# on the model space
bvsfit <- bvs(X = data.frame(X2), resp = as.matrix(cbind(times, status)),
              family = "survival", nlptype = "piMOM", mod_prior = "unif",
              niter = 100)

# Models with highest probability
head(bvsfit$max_models)
## [[1]]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## 
## [[2]]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    5
## 
## [[3]]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    7
## 
## [[4]]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    9
## 
## [[5]]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]   10
## 
## [[6]]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    5
## [5,]    7
# Posterior probabilities of the top models, normalised over the visited models
head(exp(bvsfit$max_prob_vec)/sum(exp(bvsfit$max_prob_vec)))
## [1] 0.956198300 0.013031162 0.012094560 0.006006906 0.003879441 0.002649115
# Highest posterior model
bvsfit$HPM
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
# Estimated coefficients
bvsfit$beta_hat
## [1] -0.5818266 -1.0836512 -2.2386521
# Number of visited models
bvsfit$num_vis_models
## [1] 217
# Visualising the marginal inclusion probabilities
dfi<-data.frame(names=names(mp1)[-1],mp = as.vector(round(bvsfit$inc_probs,digits = 3)))
dfi
##    names    mp
## 1   Var1 1.000
## 2   Var2 1.000
## 3   Var3 1.000
## 4   Var4 0.002
## 5   Var5 0.017
## 6   Var6 0.001
## 7   Var7 0.016
## 8   Var8 0.000
## 9   Var9 0.007
## 10 Var10 0.005
ggplot(dfi,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

Sensitivity Analysis: Cox Spike-and-Slab LASSO

#####################################################################
# Spike-and-Slab LASSO (sensitivity analysis)
#####################################################################

# Selection step: spike-and-slab LASSO under the Cox proportional hazards model
spsl <- bmlasso(x = X2, y = y2, family = "cox")

# Posterior modes (only option in this package, no MCMC)
spsl$coef
##       Var1       Var2       Var3       Var4       Var5       Var6       Var7 
## -0.5654226 -1.0700101 -2.2003060  0.0000000  0.0000000  0.0000000  0.0000000 
##       Var8       Var9      Var10 
##  0.0000000  0.0000000  0.0000000
# Plot of the posterior modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1, 
        main = "Spike-and-Slab LASSO", lty = 2) 

# Exponentiated posterior modes (hazard ratios under the Cox model)
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1, 
        main = "Spike-and-Slab LASSO", lty = 2, OR = TRUE) 

Sensitivity Analysis: Cox-LASSO

#####################################################################
# Cox-LASSO (sensitivity analysis)
#####################################################################


# Fit the Cox-LASSO path and choose the penalty by 10-fold cross-validation
cv.fit = try(cv.glmnet(x = X2, y = y2, family = "cox",
                       maxit = 10000, nfolds = 10, alpha = 1), silent = TRUE)
fit = try(glmnet(x = X2, y = y2, family = "cox", maxit = 10000, alpha = 1), silent = TRUE)

# Active variables (lambda.min)
b.coxlasso = as.double(coef(fit, s=cv.fit$lambda.min))
which(b.coxlasso!=0)
## [1]  1  2  3  7  8 10
colnames(X)[which(b.coxlasso!=0)]
## [1] "Var1"  "Var2"  "Var3"  "Var7"  "Var8"  "Var10"
# Active variables (lambda.1se)
b2.coxlasso = as.double(coef(fit, s=cv.fit$lambda.1se))
which(b2.coxlasso!=0)
## [1] 1 2 3 7
colnames(X)[which(b2.coxlasso!=0)]
## [1] "Var1" "Var2" "Var3" "Var7"