For more information, see the BVSSurv GitHub repository.
#===============================================================================
#===============================================================================
# Toy Example: Simulated data from an accelerated failure time model
#===============================================================================
#===============================================================================
################################################################################
# Required packages
################################################################################
rm(list=ls())
#library(devtools)
#install_github("davidrusi/mombf")
library(mombf)
## Loading required package: mvtnorm
## Loading required package: ncvreg
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
#install_github("nyiuab/BhGLM")
library(BhGLM)
library(mvtnorm)
library(survival)
library(ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(BVSNLP)
## Bayesian Variable Selection using Non-Local priors for survival and logistic regression data.
## Loading Version 1.1.8
################################################################################
# Data simulation
################################################################################
# Sample size (check n = 250 vs. n = 1000)
ns <- 250
# Simulation seed
set.seed(123)
# Number of active variables
pa <- 3
# Number of spurious variables
ps <- 7
# Design matrix
mu0 <- rep(0, pa+ps) # Mean vector
Sigma0 <- diag(pa+ps) # Correlation matrix (common correlation 0.5, set below)
Sigma0[lower.tri(Sigma0)] <- 0.5
Sigma0[upper.tri(Sigma0)] <- 0.5
X <- rmvnorm(n = ns, mean = mu0, sigma = Sigma0)
colnames(X) <- paste("Var",1:(pa+ps), sep = "")
# Scale all columns of X
X <- as.matrix(apply(X, 2, scale))
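# Optional quick check (uses only the simulated X above): the pairwise
# correlations after scaling should be roughly 0.5 off the diagonal,
# up to sampling noise
round(cor(X)[1:3, 1:3], 2)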
# Residuals
sd0 <- 0.5 # standard deviation
eps <- rnorm(n = ns, mean = 0, sd = sd0)
# True values of the regression coefficients
beta0 <- -0.25
betas <- c(0.25,0.5,1,rep(0,ps))
# Response variable on the log-time (AFT) scale
y0 <- as.vector(beta0 + X%*%betas + eps)
# Times to event
os <- exp(y0)
# Censoring times (administrative censoring at t = 1)
cens <- rep(1, ns)
# Observed times
times <- ifelse(os <= cens, os , cens)
# Vital status indicators (1 = event observed, 0 = censored)
status <- ifelse(os <= cens, 1, 0)
# Survival object on the log-time scale
ys <- Surv(log(times), event = status)
# Data frame with all the data
df <- data.frame(times = times, ys = ys, status = status, X )
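# Optional sanity check on the simulated data: the censoring proportion implied
# by the cut-off at t = 1 (the exact value depends on the seed)
mean(df$status == 0)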
# Visualise the Kaplan-Meier estimator (of survival)
km <- survfit(Surv(times, status) ~ 1, data = df)
plot(km, xlab = "Years", ylab = "Survival", main = "Kaplan-Meier estimator",
cex.axis = 1.5, cex.lab = 1.5)
################################################################################
# Bayesian Variable Selection (BVS)
################################################################################
# Regression formula with linear effects
paste(colnames(X), collapse='+')
## [1] "Var1+Var2+Var3+Var4+Var5+Var6+Var7+Var8+Var9+Var10"
f1y <- as.formula(ys ~ Var1+Var2+Var3+Var4+Var5+Var6+Var7+Var8+Var9+Var10)
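# The same formula can also be built programmatically from the column names,
# which avoids typing errors when the number of covariates changes:
# f1y <- as.formula(paste("ys ~", paste(colnames(X), collapse = "+")))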
#-------------------------------------------------------------------------------
# BVS: Prior 1 - Zellner's prior
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoef= zellnerprior(tau=nrow(X))
# Prior on the model space: Beta-Binomial(1,1)
priorDelta= modelbbprior(1,1)
# Selection step
ms1 <- modelSelection(f1y,data=df,
priorCoef=priorCoef,priorDelta=priorDelta,
enumerate=TRUE, method='Laplace',
center = FALSE, scale = FALSE)
## Enumerating models...
## Zellner prior not implemented, using group Zellner prior instead
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities
pp1 <- postProb(ms1)
# Top models
head(pp1)
## modelid family pp
## 1921 1,2,3,4 normal 0.42507955
## 1937 1,2,3,4,7 normal 0.11834865
## 1922 1,2,3,4,11 normal 0.03582752
## 1953 1,2,3,4,6 normal 0.03562511
## 1409 1,3,4 normal 0.03421926
## 1425 1,3,4,7 normal 0.03409237
# Marginal inclusion probabilities
mp1 <- ms1$margpp
# Visualising the marginal inclusion probabilities
df1<-data.frame(names=names(mp1),mp = as.vector(round(mp1,digits = 3)))
df1
## names mp
## 1 (Intercept) 1.000
## 2 Var1 0.865
## 3 Var2 1.000
## 4 Var3 1.000
## 5 Var4 0.082
## 6 Var5 0.096
## 7 Var6 0.271
## 8 Var7 0.081
## 9 Var8 0.068
## 10 Var9 0.075
## 11 Var10 0.104
ggplot(df1,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (Laplace)
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoefm = momprior(taustd=1) # momprior(tau=0.192) # p-mom
# Prior on the model space: Beta-Binomial(1,1) (unchanged)
priorDelta= modelbbprior(1,1)
# Selection step
ms2 <- modelSelection(f1y,data=df,
priorCoef=priorCoefm,priorDelta=priorDelta,
enumerate=TRUE, method='Laplace',
center = FALSE, scale = FALSE)
## Enumerating models...
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities
pp2 <- postProb(ms2)
# Top models
head(pp2)
## modelid family pp
## 1921 1,2,3,4 normal 0.700780836
## 1409 1,3,4 normal 0.220420572
## 1425 1,3,4,7 normal 0.037751438
## 1937 1,2,3,4,7 normal 0.014932359
## 1410 1,3,4,11 normal 0.007532301
## 1441 1,3,4,6 normal 0.003550262
# Marginal inclusion probabilities
mp2 <- ms2$margpp
# Visualising the marginal inclusion probabilities
df2<-data.frame(names=names(mp2),mp = as.vector(round(mp2,digits = 3)))
df2
## names mp
## 1 (Intercept) 1.000
## 2 Var1 0.723
## 3 Var2 1.000
## 4 Var3 1.000
## 5 Var4 0.004
## 6 Var5 0.006
## 7 Var6 0.054
## 8 Var7 0.004
## 9 Var8 0.001
## 10 Var9 0.001
## 11 Var10 0.010
ggplot(df2,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (ALA approximation instead of Laplace)
#-------------------------------------------------------------------------------
# Selection step
ms3 <- modelSelection(f1y,data=df,
priorCoef=priorCoefm,priorDelta=priorDelta,
enumerate=TRUE, method='ALA',
center = FALSE, scale = FALSE)
## Enumerating models...
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities
pp3 <- postProb(ms3)
# Top models
head(pp3)
## modelid family pp
## 1921 1,2,3,4 normal 0.21715743
## 1409 1,3,4 normal 0.12992689
## 1425 1,3,4,7 normal 0.05527505
## 1937 1,2,3,4,7 normal 0.05169672
## 1410 1,3,4,11 normal 0.02800886
## 1922 1,2,3,4,11 normal 0.02710773
# Marginal inclusion probabilities
mp3 <- ms3$margpp
# Visualising the marginal inclusion probabilities
df3<-data.frame(names=names(mp3),mp = as.vector(round(mp3,digits = 3)))
df3
## names mp
## 1 (Intercept) 1.000
## 2 Var1 0.583
## 3 Var2 0.993
## 4 Var3 1.000
## 5 Var4 0.134
## 6 Var5 0.146
## 7 Var6 0.266
## 8 Var7 0.137
## 9 Var8 0.114
## 10 Var9 0.122
## 11 Var10 0.161
ggplot(df3,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#####################################################################
# Cox PiMOM priors (sensitivity analysis)
#####################################################################
# Survival object on the original time scale (for the Cox-based methods below)
y2 <- Surv(time = df$times, event = df$status)
# Design matrix (unchanged)
X2 <- X
bvsfit <- bvs(X = data.frame(X2), resp = as.matrix(cbind(times,status)),
family = "survival", nlptype = "piMOM", mod_prior = "unif",
niter = 100)
# Models with highest probability
head(bvsfit$max_models)
## [[1]]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
##
## [[2]]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 5
##
## [[3]]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 7
##
## [[4]]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 9
##
## [[5]]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 10
##
## [[6]]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 5
## [5,] 7
# Posterior probabilities of the top models, normalised over the visited models
head(exp(bvsfit$max_prob_vec)/sum(exp(bvsfit$max_prob_vec)))
## [1] 0.956198300 0.013031162 0.012094560 0.006006906 0.003879441 0.002649115
# Highest posterior model
bvsfit$HPM
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
# Estimated coefficients
bvsfit$beta_hat
## [1] -0.5818266 -1.0836512 -2.2386521
# Number of visited models
bvsfit$num_vis_models
## [1] 217
# Visualising the marginal inclusion probabilities
dfi<-data.frame(names=names(mp1)[-1],mp = as.vector(round(bvsfit$inc_probs,digits = 3)))
dfi
## names mp
## 1 Var1 1.000
## 2 Var2 1.000
## 3 Var3 1.000
## 4 Var4 0.002
## 5 Var5 0.017
## 6 Var6 0.001
## 7 Var7 0.016
## 8 Var8 0.000
## 9 Var9 0.007
## 10 Var10 0.005
ggplot(dfi,aes(x = reorder(names, -mp), y = mp,label=mp))+
geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
xlab("") + ylab("Marginal inclusion probability")
#####################################################################
# Spike-and-Slab LASSO (sensitivity analysis)
#####################################################################
# Selection step
spsl <- bmlasso(x = X2, y = y2, family = "cox")
# Posterior modes (only option in this package, no MCMC)
spsl$coef
## Var1 Var2 Var3 Var4 Var5 Var6 Var7
## -0.5654226 -1.0700101 -2.2003060 0.0000000 0.0000000 0.0000000 0.0000000
## Var8 Var9 Var10
## 0.0000000 0.0000000 0.0000000
# Plot the posterior modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1,
main = "Spike-and-Slab LASSO", lty = 2)
# Exponentiated posterior modes (OR = TRUE; hazard-ratio scale for the Cox model)
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1,
main = "Spike-and-Slab LASSO", lty = 2, OR = TRUE)
#####################################################################
# Cox-LASSO (sensitivity analysis)
#####################################################################
# Selection step
cv.fit = try(cv.glmnet(x = X2, y = y2, family="cox",
maxit=10000, nfolds=10, alpha=1), silent=TRUE)
fit = try(glmnet(x = X2, y=y2, family = "cox", maxit=10000, alpha=1), silent=TRUE)
# Active variables (lambda.min)
b.coxlasso = as.double(coef(fit, s=cv.fit$lambda.min))
which(b.coxlasso!=0)
## [1] 1 2 3 7 8 10
colnames(X)[which(b.coxlasso!=0)]
## [1] "Var1" "Var2" "Var3" "Var7" "Var8" "Var10"
# Active variables (lambda.1se)
b2.coxlasso = as.double(coef(fit, s=cv.fit$lambda.1se))
which(b2.coxlasso!=0)
## [1] 1 2 3 7
colnames(X)[which(b2.coxlasso!=0)]
## [1] "Var1" "Var2" "Var3" "Var7"