2024-04-22
Question of interest: to find the linear combination coefficient \(\boldsymbol{c}^{\prime}\) such that the VUS (AUC as a special case) associated with univariate scores \(\boldsymbol{c}^{\prime} \boldsymbol{Y}_d\) is improved.
Assume that \(\boldsymbol{Y}_{d}\thicksim\) \(N_p\left(\boldsymbol{\mu}_{d}, \boldsymbol{\Sigma}_{d}\right)\), \(d=1,2,3\).
To minimize \(\boldsymbol{c}^{\prime}\left({\sum_{d=1}^{3}} \boldsymbol{\Sigma}_{d}\right)\boldsymbol{c}\) (similar to total within-group variance) and at the same time,
maximize \(\boldsymbol{c}^{\prime}\sum_{d=1}^{3}\left[ \boldsymbol{\mu}_{d} -\overline{\boldsymbol{\mu}}\right] \left[\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right] ^{T}\boldsymbol{c}\) (similar to between-group variance).
Penalized-Distance: \[\boldsymbol{c}^{\prime}\left\{ \sum\limits_{d=1}^{3}\left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) \left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) ^{T}-\left( \boldsymbol{\Sigma}_{1}+ \boldsymbol{\Sigma}_{2}+\boldsymbol{\Sigma}_{3}\right)\right\}\boldsymbol{c}\]
Scaled-Distance: \[ \boldsymbol{c}^{\prime}\left\{\left( \boldsymbol{\Sigma}_{1}+ \boldsymbol{\Sigma}_{2}+\boldsymbol{\Sigma}_{3}\right) ^{-1} \sum\limits_{d=1}^{3}\left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) \left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) ^{T}\right\}\boldsymbol{c} \] How to find \(\boldsymbol{c}\)?
ssd.func <- function(new.1,new.2,new.3,design='unbal') {
n1 = nrow(new.1)
n2 = nrow(new.2)
n3 = nrow(new.3)
xbar.1 = colMeans(new.1)
xbar.2 = colMeans(new.2)
xbar.3 = colMeans(new.3)
if (design=='unbal') xbar.0=(n1*xbar.1 + n2*xbar.2 + n3*xbar.3)/(n1+n2+n3) else xbar.0=(xbar.1 + xbar.2 + xbar.3)/3
between=cbind(xbar.1-xbar.0, xbar.2-xbar.0, xbar.3-xbar.0)
info.mat=solve(var(new.1)+var(new.2)+var(new.3))%*%between%*%t(between)
est.coef=as.real(eigen(info.mat)$vectors[,1])
if(mean(new.1%*%est.coef)<mean(new.3%*%est.coef)) return(list(coef=est.coef)) else return(list(coef=-est.coef))
}
Stepwise search using empirical distribution-free approach to maximize the Mann-Whitney U statistic.
Order the \(p\) diagnostic tests or biomarkers according to their Mann-Whitney statistic, which is a nonparametric estimation for VUS.
Employ empirical approach to combine two diagnostic tests or biomarkers with first two largest VUSs.
After combining the first two diagnostic tests or biomarkers to get a new biomarker, employ empirical search approach again to combine the newly obtained biomarker with the original biomarker having third largest VUS.
Proceed stepwisely until only one biomarker is obtained, with which we maximize the VUS.
Step-Down? Step-Up?
nonpar.combine2.vus <- function(alpha,rate,data.1,data.2,data.3) {
new.1 = data.1%*%c(alpha,rate)
new.2 = data.2%*%c(alpha,rate)
new.3 = data.3%*%c(alpha,rate)
nonp.vus(new.1,new.2,new.3)
}
nonpar.combine2.coef <- function(dat_1,dat_2,dat_3,evalnum=201) {
rate=seq(-1,1,length=evalnum)
alpha=rev(rate)[-1]
vus.rate_x = sapply(rate, nonpar.combine2.vus, alpha=1,data.1=dat_1,data.2=dat_2,data.3=dat_3)
vus.alpha_x = sapply(alpha, nonpar.combine2.vus, rate=1,data.1=dat_1,data.2=dat_2,data.3=dat_3)
vus.0 = c(vus.rate_x,vus.alpha_x)
vmax.idx = which.max(vus.0)
if(vmax.idx<=evalnum) return(c(alpha=1,rate=rate[vmax.idx],
vus.max=vus.0[vmax.idx]))
if(vmax.idx> evalnum) return(c(alpha=alpha[vmax.idx-evalnum],rate=1,
vus.max=vus.0[vmax.idx]))
}
nonp.vus.check <- function(health,middle,diseas) {
vus.i=numeric(ncol(health))
for (i in 1:ncol(health)) {
new.1=health[,i]
new.2=middle[,i]
new.3=diseas[,i]
vus.i[i]=nonp.vus(new.1,new.2,new.3)}
vus.i
}
step.coef <- function(new.1,new.2,new.3,design='step-down') {
n1 = nrow(new.1)
n2 = nrow(new.2)
n3 = nrow(new.3)
VARnum = ncol(new.1)
combcoef = matrix(0,nrow=VARnum-1,ncol=2)
if (design=='step-down') {
vus.order = sort(nonp.vus.check(health=new.1,middle=new.2,diseas=new.3),
index.return=T,decreasing=T)$ix } else {
vus.order = sort(nonp.vus.check(health=new.1,middle=new.2,diseas=new.3),
index.return=T,decreasing=F)$ix }
combmarker.1=new.1[,vus.order[1]]
combmarker.2=new.2[,vus.order[1]]
combmarker.3=new.3[,vus.order[1]]
final.coef=1
for (i in 2:VARnum) {
combmarker.1 = cbind(combmarker.1,new.1[,vus.order[i]])
combmarker.2 = cbind(combmarker.2,new.2[,vus.order[i]])
combmarker.3 = cbind(combmarker.3,new.3[,vus.order[i]])
temp.info = nonpar.combine2.coef(combmarker.1,combmarker.2,combmarker.3)
combcoef[i-1,] = temp.info[1:2]
final.coef = c(final.coef*combcoef[i-1,1],combcoef[i-1,2])
combmarker.1 = combmarker.1%*%temp.info[1:2]
combmarker.2 = combmarker.2%*%temp.info[1:2]
combmarker.3 = combmarker.3%*%temp.info[1:2]
}
final.coef = final.coef[sort(vus.order,index.return=T)$ix]
return(list(coef=as.numeric(final.coef),
vus.combined=as.numeric(temp.info[3])))
}
cum.logistic <- function(new.1,new.2,new.3) {
n1 = nrow(new.1)
n2 = nrow(new.2)
n3 = nrow(new.3)
dat.clr = rbind(new.1,new.2,new.3)
response = as.factor( rep(c(1,2,3),times=c(n1,n2,n3)) )
obj.clr <- try(polr(response~ ., data = dat.clr),silent=T)
if (class(obj.clr)=="try-error") return(list(coef=rep(0,ncol(new.1)))) else est.coef=as.numeric(obj.clr$coef)
if(mean(new.1%*%est.coef)<mean(new.3%*%est.coef)) return(list(coef=est.coef)) else return(list(coef=-est.coef))
}
What is the rationale?
\(P(Y_{\max}>c)=1-P(Y_{\max}\leq c)=1-P(Y_{i}\leq c, \forall i)\\ ~~~~~~~~~~~~~~~~~~~~~\geq 1-P(Y_{i}\leq c)=P(Y_i>c)\)
\(P(Y_{\min}>c)=P(Y_{i}> c, \forall i)\leq P(Y_{i}> c)\)
\(P(X_{\max}\leq c)=P(X_i \leq c, \forall i)\leq P(X_{i}\leq c)\)
\(P(X_{\min}\leq c)=1-P(X_{\min}> c)=1-P(X_i>c, \forall i)\\ ~~~~~~~~~~~~~~~~~~~~~~\geq 1-P(X_{i}> c)=P(X_i\leq c)\)
It is clear that the max statistic has the largest sensitivity and smallest specificity, also the min statistic has the largest specificity and smallest sensitivity, for any given threshold.
minmax.coef <- function(new.1,new.2,new.3) {
max_min.1=cbind(apply(new.1,1,max),apply(new.1,1,min))
max_min.2=cbind(apply(new.2,1,max),apply(new.2,1,min))
max_min.3=cbind(apply(new.3,1,max),apply(new.3,1,min))
est.coef = nonpar.combine2.coef(max_min.1,max_min.2,max_min.3)
return(list(coef=as.numeric(est.coef[1:2]),
vus.combined=as.numeric(est.coef[3])))
}
nonp.vus <- function(new.1,new.2,new.3) {
n1=length(new.1)
n2=length(new.2)
n3=length(new.3)
usum <- function(u) {
is.small = new.2[u<new.2]
if (length(is.small)>0) return(sum(sapply(is.small, function(v) sum(v<new.3)))) else return(0)
}
return(sum(sapply(new.1,usum))/n1/n2/n3)
}
slow.nonp.vus <- function(new.1,new.2,new.3) {
count=0
n1=length(new.1)
n2=length(new.2)
n3=length(new.3)
for (i in 1:n1) {
for (j in 1:n2) {
for (k in 1:n3) {
count=count+1*(new.1[i]<new.2[j] & new.2[j]<new.3[k])
}}}
return(count/n1/n2/n3)
}
start.time <- Sys.time()
set.seed(2024)
nonp.vus(rnorm(50),rnorm(50),rnorm(50))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
start.time <- Sys.time()
set.seed(2024)
slow.nonp.vus(rnorm(50),rnorm(50),rnorm(50))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken