##read data
source("./R/qcFun.R")
dataList=c("./data/2019_protein_normal.txt",
"./data/2019_protein_normal_impute.txt",
"./data/2019_protein_normal_impute_combat.txt",
"./data/2019_protein_no_normal.txt",
"./data/2019_protein_no_normal_combat.txt",
"./data/2019_protein_no_normal_impute.txt",
"./data/2019_protein_no_normal_impute_combat.txt"
)
proFile=dataList[3]
proFile="2018-05-21pppb_prot.txt"
prot=read.table(proFile, as.is = T, header = T)
rownames(prot)=prot[,1]
prot=prot[,-c(1,2)]
PPP0=read.csv("PPPB1_lineup.csv", as.is = T, header = T)
PPP1=PPP0
PPP1$PPPA_ID=tolower(PPP1$PPPA_ID)
datmp=t(prot)
###line up protein matrix and datmp
exDat=matrix(0, nrow(datmp), ncol(datmp))
rID=matrix(0, nrow(datmp), 1)
for (i in 1:nrow(PPP1)) {
idx=which(rownames(datmp) == PPP1$PPPA_ID[i])
rID[i,1]=idx
exDat[i,] = datmp[idx,]
}
rownames(exDat) = PPP1$PPPA_ID[rID[,1]]
colnames(exDat) = rownames(prot)
#evaluate missing value
missCut=0.8
misD=matrix(1, nrow(exDat), 4)
for(i in 1:nrow(misD)) {
naIdx=which(is.na(exDat[i,]))
if (length(naIdx) < ncol(exDat) ) {
misD[i, 1] = length(naIdx)/ncol(exDat)
misD[i, 2] = min(datmp[i,-naIdx])
misD[i, 3] = median(exDat[i, -naIdx])
misD[i, 4] = mean(exDat[i, -naIdx])
}
}
blankInd = which(misD[,1]> missCut)
hist(misD[,1], breaks = 50, main="Missing per sample", xlab="Missing rate")
if(length(blankInd)> 0) {
exDat=exDat[-blankInd,]
PPP1=PPP1[-blankInd,]
}
###quantile normalization
###https://en.wikipedia.org/wiki/Quantile_normalization
library(preprocessCore)
#exDat1=exDat
#qdat=t(normalize.quantiles(t(exDat1)))
qdat=exDat
rownames(qdat)=rownames(exDat)
colnames(qdat)=colnames(exDat)
for(i in 1:ncol(datmp)) {
idx=which(rownames(qdat) == rownames(datmp)[i])
if(length(idx) > 0) {
datmp[i,] = qdat[idx,]
}
}
write.table(t(datmp), "Apr_PPP1.txt", row.names = T, col.names = T, quote=F)
###proQC package
qdat=t(read.table("Apr_PPP1.txt", as.is = T, header = T)) #row for individual, column for protein
PPP1=read.csv("PPPB1_lineup.csv", as.is = T, header = T)
indM=array(0, nrow(qdat))
for(i in 1:nrow(qdat)) {
indM[i]=length(which(is.na(qdat[i,])))
}
idx=which(indM/ncol(qdat) > 0.8)
if(length(idx) > 0) {
qdat=qdat[-idx,]
PPP1=PPP1[-idx,]
}
######Slide 1: density plot
print("Slide 1: A-AQUA, C-CTRL, N-Normal, T-Tumor")
## [1] "Slide 1: A-AQUA, C-CTRL, N-Normal, T-Tumor"
proDensityPlot(qdat, PPP1$Tissue)
Quality score is defined as \[Q=\frac{\sum_{i=1}^\tilde{M}\tilde{s}_i(k-\frac{n_o}{n})^c}{\sum_{i=1}^Ms_i}\] in which \(\tilde{s}_i=[y_i-(E(y_i|n_{o_i})+T_{\alpha/M}\sigma_i)]^2\) if \(y_i > E(y_i|n_{o_i})\) or \(\tilde{s}_i=[y_i-(E(y_i|n_{o_i})-T_{\alpha/M}\sigma_i)]^2\) if \(y_i < E(y_i|n_{o_i})\). In contrast, \(s_i=[y_i-E(y_i|n_{o_i})]^2\).
\(T_{\alpha/M}\) is the z-score given the p-value cutoff \(\alpha/M\), and \(\alpha\) can be for example 0.01 or 0.05.
\(E(y_i|n_{o_i})\) can be predicted from the fitted spline.
\(\alpha\) determins the confidence interval of the spline. The smaller the \(\alpha\), the larger the interval and the smaller \(\tilde{M}\) till zero.
\(K\), between 0 and 1, indicates the direction of the penalty. When it is 1, the penalty is more weighted on the proteins with fewer observations, and nearly no penalty at all for proteins without missing values.
\(C\) is the penalty for the score.
In summary, the quality score is upon \(M\), \(\alpha\), \(K\), and \(C\).
#####Slide 2: Missing plot
print("Slide 2: Qscore")
## [1] "Slide 2: Qscore"
Qalpha=0.05
TAG=c("A", "C", "N", "T")
indMPro=list(qdat[PPP1$Tissue==TAG[1],],
qdat[PPP1$Tissue==TAG[2],],
qdat[PPP1$Tissue==TAG[3],],
qdat[PPP1$Tissue==TAG[4],])
layout(matrix(1:(length(TAG)+1), 1, (length(TAG)+1)))
QStest=array(0, dim=c(6, length(TAG)))
colnames(QStest)=TAG
for(i in 1:length(TAG)) {
refAmean=colMeans(indMPro[[i]], na.rm = T)
refObsPro=array(1, dim=length(refAmean))
for(j in 1:length(refAmean)) {
if(length(which(!is.na(indMPro[[i]][,j])))>0) {
refObsPro[j] = length(which(!is.na(indMPro[[i]][,j])))
}
}
idNA=which(is.na(refAmean))
if(length(idNA)>0) {
ss=smooth.spline(1-refObsPro[-idNA]/nrow(indMPro[[i]]), refAmean[-idNA], df=5)
err=qnorm(1-Qalpha/length(refAmean[-idNA])/2)*sd(ss$x)
} else {
ss=smooth.spline(1-refObsPro/nrow(indMPro[[i]]), refAmean, df=5)
err=qnorm(1-Qalpha/length(refAmean)/2)*sd(ss$x)
}
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=0, C=0.5)
QStest[1,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=0.5)
QStest[2,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=0, C=1)
QStest[3,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=1)
QStest[4,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=0, C=2)
QStest[5,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=2)
QStest[6,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
}
print(QStest)
## A C N T
## [1,] 0.1943176 0.1721030 0.1689077 0.1703303
## [2,] 0.2182414 0.2299983 0.1566001 0.1545109
## [3,] 0.1549254 0.1292517 0.1436103 0.1443521
## [4,] 0.1957597 0.2079745 0.1405350 0.1368648
## [5,] 0.1317600 0.1050205 0.1271915 0.1269831
## [6,] 0.1725943 0.1837433 0.1241161 0.1194958
layout(matrix(1:2, 1, 2))
barplot(main="K=0 (remove low missing rate)", QStest[c(1,3,5),], beside = T, border = NA, ylim=c(0, max(QStest)*1.2))
barplot(main="K=1 (remove high missing rate)", QStest[c(2,4,6),], beside = T, border = NA, ylim=c(0, max(QStest)*1.2))
legend("topright", legend = paste("C=", c(0.5, 1, 2)), pch=15, col=c("grey10", "grey50", "grey80"))
##QS
Qalpha=0.05
TAG=c("A", "C", "N", "T")
indMPro=list(qdat[PPP1$Tissue==TAG[1],],
qdat[PPP1$Tissue==TAG[2],],
qdat[PPP1$Tissue==TAG[3],],
qdat[PPP1$Tissue==TAG[4],])
layout(matrix(1:(length(TAG)+1), 1, (length(TAG)+1)))
qdatAC=qdat[PPP1$Tissue==TAG[1] | PPP1$Tissue==TAG[2],]
refAmean=colMeans(qdatAC, na.rm = T)
refObsPro=array(1, dim=length(refAmean))
for(i in 1:length(refAmean)) {
if(length(which(!is.na(qdatAC[,i])))>0) {
refObsPro[i] = length(which(!is.na(qdatAC[,i])))
}
}
idNA=which(is.na(refAmean))
plot(bty='n', main="A+C", xlab="Missing rate", ylab="Expression",
xlim=c(0, 1), ylim=c(0, 23),
1-refObsPro[-idNA]/nrow(qdatAC[-idNA,]),
refAmean[-idNA],
cex=0.5,pch=16, col="grey")
ssAC=smooth.spline(1-refObsPro[-idNA]/nrow(qdatAC[-idNA,]), refAmean[-idNA], df=5)
err=qnorm(1-Qalpha/length(refAmean)/2)*sd(ssAC$x)
lines(ssAC, col="green", lty=1, lwd=3)
lines(ssAC$x, ssAC$y+err, col="green", lty=3, lwd=2)
lines(ssAC$x, ssAC$y-err, col="green", lty=3, lwd=2)
Qscore=array(0, dim=length(TAG))
for(i in 1:length(TAG)) {
refAmean=colMeans(indMPro[[i]], na.rm = T)
refObsPro=array(1, dim=length(refAmean))
for(j in 1:length(refAmean)) {
if(length(which(!is.na(indMPro[[i]][,j])))>0) {
refObsPro[j] = length(which(!is.na(indMPro[[i]][,j])))
}
}
idNA=which(is.na(refAmean))
ss=smooth.spline(1-refObsPro[-idNA]/nrow(indMPro[[i]]), refAmean[-idNA], df=5)
err=qnorm(1-Qalpha/length(refAmean)/2)*sd(ss$x)
Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=2)
Qscore[i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T)
plot(bty='n', main=paste(TAG[i], ": Qscore=", format(1-Qscore[i], digits = 3)),
xlab="Missing rate", ylab="Expression",
xlim=c(0, 1), ylim=c(0, 23),
1-refObsPro/nrow(indMPro[[i]]), refAmean,cex=0.5,pch=16, col="grey")
if(i==1) {
ssA=smooth.spline(1-refObsPro[-idNA]/nrow(indMPro[[i]]), refAmean[-idNA], df=5)
}
lines(ssA, col="blue", lty=2, lwd=3)
lines(ss, col=ifelse(i!=1, "red", "blue"), lty=1, lwd=3)
lines(ss$x, ss$y+err, col=ifelse(i!=1, "red", "blue"), lty=3, lwd=1)
lines(ss$x, ss$y-err, col=ifelse(i!=1, "red", "blue"), lty=3, lwd=1)
}
alpha=0.05
beta=0.2
qdatT=list(a=qdat[PPP1$Tissue==TAG[1],],
c=qdat[PPP1$Tissue==TAG[2],],
n=qdat[PPP1$Tissue==TAG[3],],
t=qdat[PPP1$Tissue==TAG[4],])
pT=matrix(NA, ncol(qdat), 7)
missT=matrix(NA, ncol(qdat), 7)
naCnt=matrix(NA, ncol(qdat), 2)
for(i in 1:nrow(pT)) {
naCnt[i,1]=naN=length(which(!is.na(qdatT$n[,i])))
naCnt[i,2]=naT=length(which(!is.na(qdatT$t[,i])))
if(naN > 1 && naT > 1) {
tt=t.test(qdatT$t[,i], qdatT$n[,i])
pT[i,c(1,2)] = c(tt$statistic, tt$parameter)
pT[i,3]=-1*pt(abs(pT[i,1]), pT[i,2], lower.tail=FALSE, log.p=T)/log(10) - log10(2)
pT[i,c(4,5)]=tt$estimate
pT[i,6]=var(qdatT$n[,i], na.rm = T)
pT[i,7]=var(qdatT$t[,i], na.rm = T)
mTT=t.test(c(rep(1, naT), rep(0, nrow(qdatT$t))),
c(rep(1, naN), rep(0, nrow(qdatT$n))))
missT[i,c(1,2)] = c(mTT$statistic, mTT$parameter)
missT[i,3]=-1*pt(abs(missT[i,1]), missT[i,2], lower.tail=FALSE, log.p=T)/log(10) - log10(2)
}
}
#
mat=matrix(0, 4, 4)
mat[col(mat)<row(mat)]=seq(1,11,2)
mat=t(mat)
mat[col(mat)<row(mat)]=seq(2,12,2)
diag(mat)=13
mat=t(mat)
layout(mat)
par(mai=c(0.2,0.2,0.2,0.2))
for(i in 1:3) {
for(j in (i+1):4) {
mT=colMeans(qdatT[[i]], na.rm = T)
mN=colMeans(qdatT[[j]], na.rm = T)
id=which(!is.na(mT)&!is.na(mN))
mTclean=mT[id]
mNclean=mN[id]
rkT=order(mTclean)
rkN=order(mNclean)
plot(main=paste(length(mNclean), TAG[i], "vs", TAG[j]), mNclean[rkT], col=j, cex=0.5, bty='n')
points(1:length(mTclean), mTclean[rkT], pch=16, cex=0.5, col=i)
plot(main=paste(length(mTclean), TAG[j], "vs", TAG[i]), mTclean[rkN], col=i, cex=0.5, bty='n')
points(1:length(mNclean), mNclean[rkN], pch=16, cex=0.5, col=j)
}
}
## slide 3: gc
print("Slide 3: GC control")
## [1] "Slide 3: GC control"
print("Bartlett test")
## [1] "Bartlett test"
BTpval=array(1, ncol(qdat))
for(i in 1:length(BTpval)) {
ll=array(0, 4)
for(j in 1:length(TAG)) {
ll[j]=length(which(!is.na(qdat[PPP1$Tissue==TAG[j],i])))
}
if(all(ll >= 2)) {
BTpval[i] = bartlett.test(qdat[,i], PPP1$Tissue)$p.value
}
}
plot(-log10(BTpval), pch=16, cex=0.5, bty='l')
pmat=matrix(NA, 6, ncol(indMPro[[1]]))
pIdx=1
layout(matrix(c(1,2,3,8,4,5,7,7,6), 3, 3))
gc=array(0,6)
for(i in 1:3) {
dn=indMPro[[i]]
for(j in (i+1):4) {
dm=indMPro[[j]]
for(k in 1:ncol(dn)) {
if( (length(which(!is.na(dn[,k]))) > 1) & (length(which(!is.na(dm[,k]))) >1)) {
ft= var.test(dn[,k], dm[,k], na.rm = T)
pmat[pIdx,k] = ft$p.value
}
}
hist(main=paste0(TAG[j], "/", TAG[i]), xlab="p-value (F test)",
pmat[pIdx,which(!is.na(pmat[pIdx,]))],
breaks = 20, col=pIdx)
abline(h=length(which(!is.na(pmat[pIdx,])))/20, lty=2, col="grey")
gc[pIdx]=qchisq(median(pmat[pIdx,], na.rm = T), df=1, lower.tail = F)/0.455
legend("topright", bty = 'n', legend = paste("GC=", format(gc[pIdx], digits = 3)))
pIdx=pIdx+1
}
}
pIdx=1
for(i in 1:3) {
for(j in (i+1):4) {
if(pIdx==1) {
qqplot(bty='n', col=pIdx, ylim=c(0,70), xlab="Expected -log10(p)", ylab="Observed -log10(p)",
-log10(runif(length(which(pmat[pIdx,]<1)))),-log10(pmat[pIdx,]), pch=16, cex=0.5)
abline(a=0, b=1, col="grey", lty=2)
} else {
points(sort(-log10(runif(length(which(pmat[pIdx,]<1))))),sort(-log10(pmat[pIdx,])), col=pIdx, pch=16,cex=0.5)
}
pIdx=pIdx+1
}
}
legend("topleft", bty='n', legend = paste0("GC=", format(gc, digits=3)), pch=15, col=1:6)
## [1] "Slide 4: PCA"
## [,1] [,2] [,3]
## [1,] -0.5004384 0.09965401 0.3500926
## [2,] 0.6110714 1.05428997 1.4276982
## [3,] 0.4640483 0.72234540 0.9376862
## [4,] -0.4652352 0.87182009 1.0882639
## [,1] [,2] [,3]
## [1,] 0.46446991 0.9441532 1.1598855
## [2,] -0.06996770 0.7039145 0.7088100
## [3,] -0.07602626 0.8486921 0.8544721
## [4,] 0.03849055 1.1561785 1.1576600