Example 2: flchain data set

See this GitHub repository for more information: BVSSurv.

Data preparation

################################################################################
# Required packages
################################################################################
rm(list=ls())
#library(devtools)
#install_github("davidrusi/mombf")
library(mombf)
## Loading required package: mvtnorm
## Loading required package: ncvreg
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(mvtnorm)
library(survival)
library(ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# if (!require("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
# BiocManager::install("Biobase")
library(Biobase)
## Warning: package 'Biobase' was built under R version 4.2.1
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.2.1
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(BhGLM)
library(BVSNLP)
## Bayesian Variable Selection using Non-Local priors for survival and logistic regression data.
## Loading Version 1.1.8
library(eha)

################################################################################
# Data preparation
################################################################################

# flchain data set
data(flchain)
head(flchain)
##   age sex sample.yr kappa lambda flc.grp creatinine mgus futime death
## 1  97   F      1997  5.70  4.860      10        1.7    0     85     1
## 2  92   F      2000  0.87  0.683       1        0.9    0   1281     1
## 3  94   F      1997  4.36  3.850      10        1.4    0     69     1
## 4  92   F      1996  2.42  2.220       9        1.0    0    115     1
## 5  93   F      1996  1.32  1.690       6        1.1    0   1039     1
## 6  90   F      1997  2.01  1.860       9        1.0    0   1355     1
##       chapter
## 1 Circulatory
## 2   Neoplasms
## 3 Circulatory
## 4 Circulatory
## 5 Circulatory
## 6      Mental
# Variables of interest
colnames(flchain)[c(2,4,5,7,8,9,10)]
## [1] "sex"        "kappa"      "lambda"     "creatinine" "mgus"      
## [6] "futime"     "death"
# Complete cases of the variables of interest
data.c <- flchain[,c(2,4,5,7,8,9,10)]
data.c <- data.c[complete.cases(data.c),]
dim(data.c)
## [1] 6524    7
head(data.c)
##   sex kappa lambda creatinine mgus futime death
## 1   F  5.70  4.860        1.7    0     85     1
## 2   F  0.87  0.683        0.9    0   1281     1
## 3   F  4.36  3.850        1.4    0     69     1
## 4   F  2.42  2.220        1.0    0    115     1
## 5   F  1.32  1.690        1.1    0   1039     1
## 6   F  2.01  1.860        1.0    0   1355     1
# Assigning half-day survival to the zero-survivors
data.c$futime <- ifelse(data.c$futime==0,0.5,data.c$futime)
ys <- Surv(log(data.c$futime/365.25), data.c$death)
data.c$kappa <- scale(data.c$kappa) # scaled kappa
data.c$creatinine <- scale(data.c$creatinine) # scaled creatinine 
data.c$lambda <- scale(data.c$lambda) # scaled lambda
data.c$mgus <- scale(data.c$mgus) # scaled mgus 
data.c$sex <- as.numeric(data.c$sex)-1 # sex 0-1
data.c$sex <- scale(data.c$sex) # scaled sex 
X <- data.c[,-c(6,7)];  # Design matrix
dim(X)
## [1] 6524    5
# Data frame with all the data
df <- data.c

# Visualise the Kaplan-Meier estimator (of survival)
km <- survfit(Surv(futime/365.25, death) ~ 1, data = df)
plot(km, xlab = "Years", ylab = "Survival", main = "Kaplan-Meier estimator",
     cex.axis = 1.5, cex.lab = 1.5)

Bayesian Variable Selection

################################################################################
# Bayesian Variable Selection (BVS)
################################################################################

# Regression formula with linear effects
paste(colnames(X), collapse='+')
## [1] "sex+kappa+lambda+creatinine+mgus"
f1y <- as.formula(ys ~sex+kappa+lambda+creatinine+mgus)

#-------------------------------------------------------------------------------
# BVS: Prior 1 - Zellner's prior
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoef= zellnerprior(tau=nrow(X))
# Prior on the model space
priorDelta= modelbbprior(1,1)
# Selection step
ms1 <- modelSelection(f1y,data=df,
                       priorCoef=priorCoef,priorDelta=priorDelta,
                       enumerate=TRUE, method='Laplace', 
                       center = FALSE, scale = FALSE)
## Enumerating models...
## Zellner prior not implemented, using group Zellner prior instead
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities
pp1 <- postProb(ms1)
# Top models
head(pp1)
##    modelid family          pp
## 45   1,3,4 normal 0.703838711
## 41     1,3 normal 0.154613263
## 47 1,3,4,5 normal 0.047558767
## 61 1,2,3,4 normal 0.038608525
## 46 1,3,4,6 normal 0.030595652
## 42   1,3,6 normal 0.005164849
# Marginal inclusion probabilities
mp1 <- ms1$margpp

# Visualising the marginal inclusion probabilities
df1<-data.frame(names=names(mp1),mp = as.vector(round(mp1,digits = 3)))
df1
##         names    mp
## 1 (Intercept) 1.000
## 2         sex 0.051
## 3       kappa 1.000
## 4      lambda 0.831
## 5  creatinine 0.059
## 6        mgus 0.044
ggplot(df1,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (Laplace)
#-------------------------------------------------------------------------------
# Priors for the coefficients
priorCoefm= momprior(tau = 0.192) # p-mom
# Prior on the model space
priorDelta= modelbbprior(1,1)
# Selection step
ms2 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoefm,priorDelta=priorDelta,
                      enumerate=TRUE, method='Laplace', 
                      center = FALSE, scale = FALSE)
## Enumerating models...
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities 
pp2 <- postProb(ms2)
# Top models
head(pp2)
##    modelid family           pp
## 41     1,3 normal 8.006134e-01
## 45   1,3,4 normal 1.990169e-01
## 42   1,3,6 normal 2.933692e-04
## 46 1,3,4,6 normal 6.838268e-05
## 43   1,3,5 normal 3.651998e-06
## 57   1,2,3 normal 2.312149e-06
# Marginal inclusion probabilities
mp2 <- ms2$margpp


# Visualising the marginal inclusion probabilities
df2<-data.frame(names=names(mp2),mp = as.vector(round(mp2,digits = 3)))
df2
##         names    mp
## 1 (Intercept) 1.000
## 2         sex 0.000
## 3       kappa 1.000
## 4      lambda 0.199
## 5  creatinine 0.000
## 6        mgus 0.000
ggplot(df2,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

#-------------------------------------------------------------------------------
# BVS: Prior 2 - MOM prior (ALA)
#-------------------------------------------------------------------------------

# Selection step
ms3 <- modelSelection(f1y,data=df,
                      priorCoef=priorCoefm,priorDelta=priorDelta,
                      enumerate=TRUE, method='ALA', 
                      center = FALSE, scale = FALSE)
## Enumerating models...
## Computing posterior probabilities........... Done.
# Calculating model posterior probabilities 
pp3 <- postProb(ms3)
# Top models
head(pp3)
##    modelid family         pp
## 41     1,3 normal 0.53303230
## 45   1,3,4 normal 0.34788272
## 61 1,2,3,4 normal 0.02301631
## 57   1,2,3 normal 0.01976501
## 47 1,3,4,5 normal 0.01913925
## 46 1,3,4,6 normal 0.01678421
# Marginal inclusion probabilities
mp3 <- ms3$margpp


# Visualising the marginal inclusion probabilities
df3<-data.frame(names=names(mp3),mp = as.vector(round(mp3,digits = 3)))
df3
##         names    mp
## 1 (Intercept) 1.000
## 2         sex 0.050
## 3       kappa 1.000
## 4      lambda 0.413
## 5  creatinine 0.040
## 6        mgus 0.039
ggplot(df3,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

# AFT model using a lognormal baseline distribution
fit.aft <- aftreg(formula = Surv(futime, death) ~ kappa + lambda, data=df,
                  dist = "lognormal")

summary(fit.aft)
## Covariate            W.mean      Coef Time-Accn  se(Coef)    LR p
## kappa                -0.133     0.524     1.689     0.049   0.0000 
## lambda               -0.118     0.160     1.174     0.048   0.0009 
## 
## Events                    1962 
## Total time at risk        23796306 
## Max. log. likelihood      -20238 
## LR test statistic         610.70 
## Degrees of freedom        2 
## Overall p-value           0

Sensitivity Analysis: Cox PiMOM priors

#####################################################################
# Cox PiMOM priors (sensitivity analysis)
#####################################################################

# Survival object with times
y2 <- Surv(time = data.c$futime/365.25, data.c$death)

X2 <- X

bvsfit <- bvs(X = data.frame(X2), resp = as.matrix(cbind(data.c$futime/365.2,data.c$death)), 
              family  = "survival", nlptype = "piMOM", mod_prior = "unif",
              niter = 100)

# Models with highest probability
head(bvsfit$max_models)
## [[1]]
##      [,1]
## [1,]    2
## 
## [[2]]
##      [,1]
## [1,]    2
## [2,]    4
## 
## [[3]]
##      [,1]
## [1,]    2
## [2,]    3
## [3,]    4
## 
## [[4]]
##      [,1]
## [1,]    2
## [2,]    5
## 
## [[5]]
##      [,1]
## [1,]    2
## [2,]    3
## 
## [[6]]
##      [,1]
## [1,]    3
# Highest probabilities
head(exp(bvsfit$max_prob_vec - max(bvsfit$max_prob_vec))/sum(exp(bvsfit$max_prob_vec - max(bvsfit$max_prob_vec) )))
## [1] 9.264891e-01 6.983535e-02 3.666899e-03 6.322414e-06 1.393818e-06
## [6] 6.025645e-07
# Highest posterior model
bvsfit$HPM
##      [,1]
## [1,]    2
# Estimated coefficients
bvsfit$beta_hat
## [1] 0.2326395
# Number of Visited Models
bvsfit$num_vis_models
## [1] 28
# Visualising the marginal inclusion probabilities
dfi<-data.frame(names=names(mp1)[-1],mp = as.vector(round(bvsfit$inc_probs,digits = 3)))
dfi
##        names    mp
## 1        sex 0.000
## 2      kappa 1.000
## 3     lambda 0.004
## 4 creatinine 0.074
## 5       mgus 0.000
ggplot(dfi,aes(x =  reorder(names, -mp), y = mp,label=mp))+
  geom_bar(stat="identity")+geom_text(size=2.5,hjust=0)+coord_flip()+
  xlab("") + ylab("Marginal inclusion probability")

Sensitivity Analysis: Cox Spike-and-Slab LASSO

#####################################################################
# Spike-and-Slab LASSO (sensitivity analysis)
#####################################################################

# Selection step
spsl <- bmlasso(x = X2, y =  y2, family = "cox")
## Warning: from glmnet C++ code (error code -30001); Numerical error at 1th
## lambda value; solutions for larger values of lambda returned
## Warning in getcoef(fit, nvars, nx, vnames): an empty model has been returned;
## probably a convergence issue
# Posterior modes (only option in this package, no MCMC)
spsl$coef
##         sex       kappa      lambda  creatinine        mgus 
##  0.00000000  0.21704395  0.10736486 -0.08341914 -0.03383896
# Posterior modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1, 
        main = "Spike-and-Slab LASSO", lty = 2) 

# Posterior Odds-Ratios for the modes
plot.bh(coefs = spsl$coef, threshold = spsl$df, gap = 1, 
        main = "Spike-and-Slab LASSO", lty = 2, OR = TRUE) 

Sensitivity Analysis: Cox-LASSO

#####################################################################
# Cox-LASSO (sensitivity analysis)
#####################################################################

# Survival object with times
y2 <- Surv(time = df$futime/365.25, event=df$death)

X2 <- as.matrix(X)

# Selection step
cv.fit = try(cv.glmnet(x =  X2, y = y2, family="cox", 
                       maxit=10000, nfolds=10, alpha=1), silent=TRUE)
fit = try(glmnet(x = X2, y=y2, family = "cox", maxit=10000, alpha=1), silent=TRUE)

# active variables (lambda.min)
b.coxlasso = as.double(coef(fit, s=cv.fit$lambda.min))
which(b.coxlasso!=0)
## [1] 2
colnames(X)[which(b.coxlasso!=0)]
## [1] "kappa"
# active variables (lambda.1se)
b2.coxlasso = as.double(coef(fit, s=cv.fit$lambda.1se))
which(b2.coxlasso!=0)
## [1] 2
colnames(X)[which(b2.coxlasso!=0)]
## [1] "kappa"
# Fitting a CoxPH model using the survival R package
fit2 <- coxph(Surv(futime, death) ~ kappa + lambda, data=df)
summary(fit2)
## Call:
## coxph(formula = Surv(futime, death) ~ kappa + lambda, data = df)
## 
##   n= 6524, number of events= 1962 
## 
##           coef exp(coef) se(coef)     z Pr(>|z|)    
## kappa  0.14348   1.15428  0.01930 7.433 1.06e-13 ***
## lambda 0.11388   1.12062  0.02151 5.295 1.19e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## kappa      1.154     0.8663     1.111     1.199
## lambda     1.121     0.8924     1.074     1.169
## 
## Concordance= 0.675  (se = 0.006 )
## Likelihood ratio test= 391.2  on 2 df,   p=<2e-16
## Wald test            = 937.5  on 2 df,   p=<2e-16
## Score (logrank) test = 1114  on 2 df,   p=<2e-16