2024-04-17
Measurement error model
n_P=n_N=50
n_rep=5 # number of replicates
sderr=1
trueAUC=pnorm(1.2/sqrt(1+1.5^2))
for (iter in 1:5000) {
ID=rep(1:n_N,each=n_rep)
true_N=rnorm(n_N,mean=0,sd=1)
observed_N=rep(true_N,each=n_rep)+rnorm(n_N*n_rep,mean=0,sd=sderr) # error corrupted data
avg_N=as.vector(tapply(observed_N,ID,mean)) # naive simple average
ID=rep(1:n_P,each=n_rep)
true_P=rnorm(n_P,mean=1.2,sd=1.5)
observed_P=rep(true_P,each=n_rep)+rnorm(n_P*n_rep,mean=0,sd=sderr) # you may consider different error variance
avg_P=as.vector(tapply(observed_P,ID,mean)) # naive simple average
naiveAUC[iter]=pnorm( (mean(avg_P)-mean(avg_N))/sqrt(var(avg_P)+var(avg_N)) )
sigma_err_N = sum((observed_N-rep(avg_N,each=n_rep))^2)/n_N/(n_rep-1)
sigma_err_P = sum((observed_P-rep(avg_P,each=n_rep))^2)/n_P/(n_rep-1)
sigma_N = var(avg_N)-sigma_err_N/n_rep
sigma_P = var(avg_P)-sigma_err_P/n_rep
correctedAUC[iter]=pnorm( (mean(observed_P)-mean(observed_N))/sqrt(sigma_P+sigma_N) )
}
# relative bias
(mean(naiveAUC)-trueAUC)/trueAUC*100
(mean(correctedAUC)-trueAUC)/trueAUC*100
rm(list=ls())
sims=10000
library(mvtnorm)
compare_estimate <- function(pho_1, pho_2){
mu_N=c(0, 0)
mu_P=c(1,1.2)
sigma_N=matrix(c(1,pho_1,pho_1,1),2,2)
sigma_P=matrix(c(1,pho_2,pho_2,1),2,2)
n_N=5
n_P=15
varAUC_diff_DL_vec <- numeric()
varAUC_diff_U_vec<- numeric()
diff_AUC_vec <- numeric()
for (time in 1:sims){
N=rmvnorm(n_N,mean=mu_N,sigma=sigma_N)
P=rmvnorm(n_P,mean=mu_P,sigma=sigma_P)
S1_N=N[1:n_N, 1]
S2_N=N[1:n_N, 2]
S1_P=P[1:n_P, 1]
S2_P=P[1:n_P, 2]
kernel=function(x,y) (sign(x-y)+1)/2
Kdiff=outer(S1_P,S1_N,kernel)-outer(S2_P,S2_N,kernel)
AUC_diff=mean(Kdiff)
# DeLong
varAUC_diff_DL = var(rowMeans(Kdiff))/n_P + var(colMeans(Kdiff))/n_N
varAUC_diff_DL_vec <- c(varAUC_diff_DL_vec, varAUC_diff_DL)
# U statistic
varAUC_diff_U = AUC_diff^2 - mean(mapply(
function(i,j) Kdiff[i,j]*mean(Kdiff[-i,-j]),rep(1:n_P,each=n_N),rep(1:n_N,times=n_P)))
varAUC_diff_U_vec <- c(varAUC_diff_U_vec, varAUC_diff_U)
# AUC difference
diff_AUC <- AUC_diff
diff_AUC_vec <- c(diff_AUC_vec, diff_AUC)
}
v1 <- var(diff_AUC_vec)
v2 <- mean(varAUC_diff_DL_vec)
v3 <- mean(varAUC_diff_U_vec)
result <- c(v1, v2, v3)
return(result)
}
var.output=compare_estimate(0.88, 0.9)
(var.output[2]-var.output[1])/var.output[1]*100
(var.output[3]-var.output[1])/var.output[1]*100
Note: AUC as a special case.
Question of interest: to find the linear combination coefficient \(\boldsymbol{c}^{\prime}\) such that the VUS associated with univariate scores \(\boldsymbol{c}^{\prime} \boldsymbol{Y}_d\) \((d=1,2,3)\) is improved.
Assume that \(\boldsymbol{Y}_{d}\thicksim\) \(N_p\left(\boldsymbol{\mu}_{d}, \boldsymbol{\Sigma}_{d}\right)\), \(d=1,2,3\).
To minimize \(\boldsymbol{c}^{\prime}\left( {\sum_{d=1}^{3}} \boldsymbol{\Sigma}_{d}\right) \boldsymbol{c}\) (similar to total within-group variance) and maximize \(\boldsymbol{c}^{\prime}\sum_{d=1}^{3}\left[ \boldsymbol{\mu}_{d} -\overline{\boldsymbol{\mu}}\right] \left[ \boldsymbol{\mu}_{d} -\overline{\boldsymbol{\mu}}\right] ^{T}\boldsymbol{c}\) (similar to between-group variance).
Penalized-Distance: \[\boldsymbol{c}^{\prime}\left\{ \sum\limits_{d=1}^{3}\left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) \left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) ^{T}-\left( \boldsymbol{\Sigma}_{1}+ \boldsymbol{\Sigma}_{2}+\boldsymbol{\Sigma}_{3}\right)\right\}\boldsymbol{c}\]
Scaled-Distance: \[ \boldsymbol{c}^{\prime}\left\{\left( \boldsymbol{\Sigma}_{1}+ \boldsymbol{\Sigma}_{2}+\boldsymbol{\Sigma}_{3}\right) ^{-1} \sum\limits_{d=1}^{3}\left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) \left(\boldsymbol{\mu}_{d}-\overline{\boldsymbol{\mu}}\right) ^{T}\right\}\boldsymbol{c} \] How to find \(\boldsymbol{c}\)?
ssd.func <- function(new.1,new.2,new.3,design='unbal') {
n1 = nrow(new.1)
n2 = nrow(new.2)
n3 = nrow(new.3)
xbar.1 = colMeans(new.1)
xbar.2 = colMeans(new.2)
xbar.3 = colMeans(new.3)
if (design=='unbal') xbar.0=(n1*xbar.1 + n2*xbar.2 + n3*xbar.3)/(n1+n2+n3) else xbar.0=(xbar.1 + xbar.2 + xbar.3)/3
between=cbind(xbar.1-xbar.0, xbar.2-xbar.0, xbar.3-xbar.0)
info.mat=solve(var(new.1)+var(new.2)+var(new.3))%*%between%*%t(between)
est.coef=as.real(eigen(info.mat)$vectors[,1])
if(mean(new.1%*%est.coef)<mean(new.3%*%est.coef)) return(list(coef=est.coef)) else return(list(coef=-est.coef))
}
Stepwise search using empirical distribution-free approach to maximize the Mann-Whitney U statistic.
Order the \(p\) diagnostic tests or biomarkers according to their Mann-Whitney statistic, which is a nonparametric estimation for VUS.
Employ empirical approach to combine two diagnostic tests or biomarkers with first two largest VUSs.
After combining the first two diagnostic tests or biomarkers to get a new biomarker, employ empirical search approach again to combine the newly obtained biomarker with the original biomarker having third largest VUS.
Proceed stepwisely until only one biomarker is obtained, with which we maximize the VUS.
Step-Down? Step-Up?
nonpar.combine2.vus <- function(alpha,rate,data.1,data.2,data.3) {
new.1 = data.1%*%c(alpha,rate)
new.2 = data.2%*%c(alpha,rate)
new.3 = data.3%*%c(alpha,rate)
nonp.vus(new.1,new.2,new.3)
}
nonpar.combine2.coef <- function(dat_1,dat_2,dat_3,evalnum=201) {
rate=seq(-1,1,length=evalnum)
alpha=rev(rate)[-1]
vus.rate_x = sapply(rate, nonpar.combine2.vus, alpha=1,data.1=dat_1,data.2=dat_2,data.3=dat_3)
vus.alpha_x = sapply(alpha, nonpar.combine2.vus, rate=1,data.1=dat_1,data.2=dat_2,data.3=dat_3)
vus.0 = c(vus.rate_x,vus.alpha_x)
vmax.idx = which.max(vus.0)
if(vmax.idx<=evalnum) return(c(alpha=1,rate=rate[vmax.idx],
vus.max=vus.0[vmax.idx]))
if(vmax.idx> evalnum) return(c(alpha=alpha[vmax.idx-evalnum],rate=1,
vus.max=vus.0[vmax.idx]))
}
nonp.vus.check <- function(health,middle,diseas) {
vus.i=numeric(ncol(health))
for (i in 1:ncol(health)) {
new.1=health[,i]
new.2=middle[,i]
new.3=diseas[,i]
vus.i[i]=nonp.vus(new.1,new.2,new.3)}
vus.i
}
step.coef <- function(new.1,new.2,new.3,design='step-down') {
n1 = nrow(new.1)
n2 = nrow(new.2)
n3 = nrow(new.3)
VARnum = ncol(new.1)
combcoef = matrix(0,nrow=VARnum-1,ncol=2)
if (design=='step-down') {
vus.order = sort(nonp.vus.check(health=new.1,middle=new.2,diseas=new.3),
index.return=T,decreasing=T)$ix } else {
vus.order = sort(nonp.vus.check(health=new.1,middle=new.2,diseas=new.3),
index.return=T,decreasing=F)$ix }
combmarker.1=new.1[,vus.order[1]]
combmarker.2=new.2[,vus.order[1]]
combmarker.3=new.3[,vus.order[1]]
final.coef=1
for (i in 2:VARnum) {
combmarker.1 = cbind(combmarker.1,new.1[,vus.order[i]])
combmarker.2 = cbind(combmarker.2,new.2[,vus.order[i]])
combmarker.3 = cbind(combmarker.3,new.3[,vus.order[i]])
temp.info = nonpar.combine2.coef(combmarker.1,combmarker.2,combmarker.3)
combcoef[i-1,] = temp.info[1:2]
final.coef = c(final.coef*combcoef[i-1,1],combcoef[i-1,2])
combmarker.1 = combmarker.1%*%temp.info[1:2]
combmarker.2 = combmarker.2%*%temp.info[1:2]
combmarker.3 = combmarker.3%*%temp.info[1:2]
}
final.coef = final.coef[sort(vus.order,index.return=T)$ix]
return(list(coef=as.numeric(final.coef),
vus.combined=as.numeric(temp.info[3])))
}
cum.logistic <- function(new.1,new.2,new.3) {
n1 = nrow(new.1)
n2 = nrow(new.2)
n3 = nrow(new.3)
dat.clr = rbind(new.1,new.2,new.3)
response = as.factor( rep(c(1,2,3),times=c(n1,n2,n3)) )
obj.clr <- try(polr(response~ ., data = dat.clr),silent=T)
if (class(obj.clr)=="try-error") return(list(coef=rep(0,ncol(new.1)))) else est.coef=as.numeric(obj.clr$coef)
if(mean(new.1%*%est.coef)<mean(new.3%*%est.coef)) return(list(coef=est.coef)) else return(list(coef=-est.coef))
}
minmax.coef <- function(new.1,new.2,new.3) {
max_min.1=cbind(apply(new.1,1,max),apply(new.1,1,min))
max_min.2=cbind(apply(new.2,1,max),apply(new.2,1,min))
max_min.3=cbind(apply(new.3,1,max),apply(new.3,1,min))
est.coef = nonpar.combine2.coef(max_min.1,max_min.2,max_min.3)
return(list(coef=as.numeric(est.coef[1:2]),
vus.combined=as.numeric(est.coef[3])))
}