Teaching notes for the R.
## [1] "Document was last updated at 2018-10-19."
set.seed(2018)getwd() #where are you?## [1] "/Users/gc5k/StatGenet/zjafu/day1"
help("plot")
#data
a=1
b=10
c=a+b
print(c)## [1] 11
#generate matrix and write to a file
dat=matrix(c(1,2,3,4,5,6), 2, 3, byrow = T)
write.table(dat, "myRdata.txt", row.names = F, col.names = F, quote = F)
#generate and read a file
dat1=read.table("myRdata.txt", as.is = T)
print(dat1)## V1 V2 V3
## 1 1 2 3
## 2 4 5 6
ls() #show elements## [1] "a" "b" "c" "dat" "dat1" "Date"
ls.str() #show elements, a strong version## a : num 1
## b : num 10
## c : num 11
## dat : num [1:2, 1:3] 1 4 2 5 3 6
## dat1 : 'data.frame': 2 obs. of 3 variables:
## $ V1: int 1 4
## $ V2: int 2 5
## $ V3: int 3 6
## Date : Date[1:1], format: "2018-10-19"
mode(a)## [1] "numeric"
mode(dat)## [1] "numeric"
mode(dat1)## [1] "list"
rm(dat1) #remove itx=1:30
x1=seq(1, 30, 2)
x2=seq(from=1, to=30, length.out = 6)
print(x)## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30
print(x1)## [1] 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29
print(x2)## [1] 1.0 6.8 12.6 18.4 24.2 30.0
b=0.5
print(paste("ceiling(b): ", ceiling(b)))## [1] "ceiling(b): 1"
print(paste("floor(b): ", floor(b)))## [1] "floor(b): 0"
b1=-0.5
print(paste("ceiling(b1): ", ceiling(b1)))## [1] "ceiling(b1): 0"
print(paste("floor(b1): ", floor(b1)))## [1] "floor(b1): -1"
gl(3, 5)## [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
## Levels: 1 2 3
gl(3, 5, length = 30)## [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
## Levels: 1 2 3
gl(2, 5, labels = c("Male", "Female"))## [1] Male Male Male Male Male Female Female Female Female Female
## Levels: Male Female
#all combinations
expand.grid(h=c(60,80), w=c(100, 300), sex=c("Male", "Female"))## h w sex
## 1 60 100 Male
## 2 80 100 Male
## 3 60 300 Male
## 4 80 300 Male
## 5 60 100 Female
## 6 80 100 Female
## 7 60 300 Female
## 8 80 300 Female
Normal distribtuion wiki page.
\(\chi^2_1\) and \(\chi^2_5\), chi-square distribution wiki page.
pv=0.025
qnorm(1-pv)## [1] 1.959964
qchisq(2*pv, df=1, lower.tail = F)## [1] 3.841459
qchisq(2*pv, df=5, lower.tail = F)## [1] 11.0705
layout(matrix(1:4, 2, 2, byrow = T))
plot(density(rnorm(10000)), main="Normal distribution", bty='l')
plot(density(rt(10000, 30)), main="Student-t distribution (df=30)", bty='l')
plot(density(rchisq(10000, 1)), main="Chisq distribution (df=1)", bty='l')
plot(density(rchisq(10000, 1, ncp = 2)), main="Chisq distribution (df=1, ncp=2)", bty='l')layout(matrix(1:4, 2, 2, byrow = T))
hist(rnorm(10000), breaks=25, main="Normal distribution")
hist(rt(10000, 30), breaks=25, main="Student-t distribution (df=30)", bty='l')
hist(rchisq(10000, 1), breaks=25, main=expression(paste(chi[1]^2, "distribution")), bty='l')
hist(rchisq(10000, 1, ncp = 2), breaks=25, main=expression(paste(chi[1,ncp=2]^2, "distribution")), bty='l')X = c(1, 3, 4, 5, 16, 9)
for(i in 1:length(X)) {
print(X[i])
}## [1] 1
## [1] 3
## [1] 4
## [1] 5
## [1] 16
## [1] 9
NotHere="China"
FIFA=c("Croatia", "France", "Belgien", "England", "Brazil", "Germany", "China", "Japan", "Sweden", "Russia", "Argentina", "Spain", "Chile", "Uruguay")
cnt=1
while (FIFA[cnt] != NotHere & cnt <= length(FIFA)) {
print(FIFA[cnt])
cnt = cnt + 1
}## [1] "Croatia"
## [1] "France"
## [1] "Belgien"
## [1] "England"
## [1] "Brazil"
## [1] "Germany"
idx=which(FIFA==NotHere)
print(idx)## [1] 7
FIFA1=FIFA[order(FIFA)]
print(FIFA1)## [1] "Argentina" "Belgien" "Brazil" "Chile" "China"
## [6] "Croatia" "England" "France" "Germany" "Japan"
## [11] "Russia" "Spain" "Sweden" "Uruguay"
FIFA2=FIFA[order(FIFA, decreasing = T)]
print(FIFA2)## [1] "Uruguay" "Sweden" "Spain" "Russia" "Japan"
## [6] "Germany" "France" "England" "Croatia" "China"
## [11] "Chile" "Brazil" "Belgien" "Argentina"
FIFAR=matrix(c("Brazial", 5, 1.1, "Germany", 4, 0.8, "Italy", 4, 0.7, "Argentina", 2, 4), 4, 3, byrow = T)
print(FIFAR[order(FIFAR[,1], FIFAR[,3]), ])## [,1] [,2] [,3]
## [1,] "Argentina" "2" "4"
## [2,] "Brazial" "5" "1.1"
## [3,] "Germany" "4" "0.8"
## [4,] "Italy" "4" "0.7"
boxplot(matrix(c(rnorm(100), rnorm(100, 1)), 100, 2))hist(rnorm(1000))mat=matrix(c(4,5,16,17, 34, 40), 2, 3)
colnames(mat)=c("G1", "G2", "G3")
barplot(mat, beside = T, border = F, xlab="XLab")barplot(t(mat), beside = T, border = F)z1=rnorm(1000)
z2=rnorm(1000)
layout(matrix(1:9, 3, 3, byrow = T)) #what's the difference if byrow = F
plot(z1, z2, main="1")
plot(z1, z2, main="2", bty='l', pch=16, cex=0.5)
plot(z1, z2, main="3", xlab=expression(z[1]), ylab=expression(z^2))
plot(z1, z2, main="4", xlab=expression(z[1]), ylab=expression(z^2), col="red", pch=16, cex=0.5)
plot(z1, z2, main="5", xlab=expression(z[1]), ylab=expression(z^2), col="red", pch=16, cex=0.5, bty='l')
plot(z1, z2, main="6", xlab=expression(z[1]), ylab=expression(z^2), col=ifelse(z1>0, "red", "blue"), pch=16, cex=0.5, bty='l')
plot(z1, z2, main="7", xlab=expression(z[1]), ylab=expression(z^2), col=ifelse(z1*z2>0, "red", "blue"), pch=16, cex=0.5, bty='l')
plot(z1, z2, main="8", xlim=c(-6, 6), ylim=c(-6,6), xlab=expression(z[1]), ylab=expression(z^2), col=ifelse(z1*z2>0, "red", "blue"), pch=16, cex=0.5, bty='l')
plot(z1, z2, main="9", xlim=c(-6, 6), ylim=c(-6,6), xlab=expression(z[1]), ylab=expression(z^2), cex=0.5, pch=16, col=rgb((max(abs(z1))-abs(z1))/max(abs(z1)), (max(abs(z1))-abs(z1))/max(abs(z1)), (max(abs(z1))-abs(z1))/max(abs(z1))), bty='l')layout(matrix(1:6, 2, 3))
rho=c(0.1, 0.3, 0.5, 0.7, 0.9, 0.99)
for(i in 1:length(rho)) {
x1=z1
x2=z1*rho[i]+sqrt(1-rho[i]^2)*z2
plot(x1, x2, pch=16, cex=0.5, bty='l')
abline(lm(x2~x1), col="red", lwd=2, lty=3)
legend("topleft", legend = c(paste("rho=", rho[i])), pch=1, col="red", bty = 'n')
}layout(matrix(c(1,1,2,1,1,3,1,1,4), 3, 3, byrow = T))
ts=rt(10000, df = 10)
cs1=rchisq(10000, df=1)
cs5=rchisq(10000, df=5, ncp=2)
hist(ts, breaks = 50, xlim=c(-10, 15), ylim=c(0, 5000))
hist(cs1, breaks = 50, add=T, col="red")
hist(cs5, breaks = 50, add=T, col="blue")
hist(pt(ts,df=10), main="p-value (t-test)")
hist(pchisq(cs1, df=1, ncp=0, lower.tail = T), col="red", main=expression(paste("p-value (", chi[1]^2,")")))
hist(pchisq(cs5, df=5, ncp=2, lower.tail = T), col="blue", main=expression(paste("p-value (", chi[5]^2,")")))#linear model
coR=c(0.1, 0.3, 0.5, 0.7)
x1=rnorm(5000)
layout(matrix(1:4, 2, 2))
for(i in 1:length(coR)) {
x2=coR[1]*x1+sqrt(1-coR^2)*rnorm(5000)
plot(x1, x2, cex=0.5, bty='l', pch=16, main="Linear regression", xlab="X", ylab="Y")
abline(lm(x2~x1), col="red", lty=2)
}#logistic model
a=0.1
b=1
sz=1000
x1=rnorm(sz)
layout(matrix(1:6, 2, 2, byrow = T))
err=c(0.1, 0.5, 1)
for(i in 1:length(err)) {
y=array(0, dim=length(x1))
Sp=a+b*x1 +rnorm(sz, sd = err[i])
pro=exp(Sp)/(1+exp(Sp))
y = rbinom(length(x1), 1, pro)
ox=order(x1)
plot(x1[ox], pro[ox], main=paste("S curve, res=", err[i]), bty="l", cex=0.5, pch=16)
glMod=glm(y~x1, family = "binomial")
points(x1[ox], glMod$fitted.values[ox], col="red", cex=0.5, pch=15)
plot(x1, y, cex=0.5, bty='l', pch=16, main="Logistic regression", xlab="X", ylab="Y")
abline(glMod, col="red", lty=2)
}cR=0.1
Sz=500
pm=5000
x1=rnorm(Sz)
y1=cR*x1+sqrt(1-cR^2)*rnorm(Sz)
regB=matrix(0, pm, 4)
mod1=lm(y1~x1)
for(i in 1:pm) {
regB[i,] = summary(lm(y1~sample(x1)))$coefficient[2,]
}
hist(regB[,1], xlim=c(min(c(regB[,1], summary(mod1)$coefficient[2,1])), max(c(regB[,1], summary(mod1)$coefficients[2,1]))), main="Permutation test", xlab=expression(paste(beta)))
abline(v=summary(mod1)$coefficient[2,1], col="red")
ln=length(which(abs(regB[,1])>summary(mod1)$coefficient[2,1]))
ep=ifelse(ln>0, ln/pm, 0)
legend("topleft", legend = paste("p=",ep), bty='n')layout(matrix(1:4, 2,2))
REP=c(1,2,4,256)
n=100
for(i in 1:length(REP)) {
plot(main=paste(REP[i], "shoot"), x=NULL, y=NULL, xlim=c(-4,4), ylim=c(-4,4), axes = F, xlab="", ylab="")
for(j in 1:n) {
points(mean(rnorm(REP[i])), mean(rnorm(REP[i])), pch=16, cex=0.5, col="red")
abline(h=0, v=0)
}
points(rep(0, 6),rep(0,6), cex=seq(5,30, 5))
}Mendel vs Fisher
set.seed(20180625)
layout(matrix(1:4, 2,2))
REP=c(1,2,4,256)
n=1000
for(i in 1:length(REP)) {
xp=array(0, n)
for(j in 1:n) {
xp[j] = mean(rnorm(REP[i]))
}
# qqplot(rnorm(n, 0, sqrt(1/REP[i])), xp, bty='l', ylab="Simulated", xlab="Theoretical", xlim=c(-4, 4), ylim=c(-4,4))
qqplot(rnorm(n, 0, sqrt(1/REP[i])), xp, bty='l', ylab="Simulated", xlab="Theoretical", xlim=c(-4, 4), ylim=c(-4,4), pch=16, cex=0.5)
abline(a=0, b=1, col="red")
}##########################
##########################
#sample size
N=1000
M=20000
h2=0.5
#uniform
freq=runif(M)
#0.5
freq=rep(0.5, M)
#
gmat=matrix(0, N, M)
for(i in 1:M) {
gmat[,i] = rbinom(N, 2, freq[i])
}
eff=rnorm(M, 0, sqrt(1/M))
bv=gmat %*% eff
y=bv+rnorm(N, 0, sd=sqrt(var(bv)/h2*(1-h2)))
layout(matrix(1:4, 2, 2))
freqDat=colMeans(gmat)/2
plot(freqDat, frame.plot = F, xlab="Locus", ylab="Freq", pch=16, cex=0.5)
qqplot(freqDat, rnorm(M, freq, sqrt(freq*(1-freq)/(2*N))), xlab = "Observed", ylab="Expected", frame.plot=F, pch=16, cex=0.5)
abline(a=0, b=1, col="red")
pvN=pnorm((freqDat-0.5)/sqrt(freqDat*(1-freqDat)/(2*N)))
pvB=pbinom(apply(gmat, 2, sum), 2*N, 0.5)
hist(main="p-value distribution", pvN)
hist(main="p-value distribution", pvB)layout(matrix(1:6, 2, 3))
FinaC=read.table("Fina_Genotype.ped", as.is = T, na.strings = "99")[,-c(1:6)]
GCnt=array(0, ncol(FinaC))
for(i in 1:ncol(FinaC)) {
GCnt[i] = length(which(!is.na(FinaC[,i])))
}
FinaSC = matrix(NA, nrow = nrow(FinaC), ncol = ncol(FinaC))
for(i in 1:ncol(FinaC)) {
idx=which(!is.na(FinaC[,i]))
FinaSC[idx,i]=rbinom(length(idx), 1, 0.5)*2
}
FinaFreq=colMeans(FinaC, na.rm = T)/2
FinaSCFreq=colMeans(FinaSC, na.rm = T)/2
plot(FinaFreq, col="grey", bty="l", cex=0.5, pch=16)
qqplot(FinaFreq, FinaSCFreq, xlab = "Observed freq", ylab="Expected freq", bty="l", pch=16, cex=0.5)
abline(a=0, b=1, col="red")
#########t-test
#FinaFreq = FinaSCFreq
tTest=matrix(0, ncol(FinaC), 2)
tTest[,1]=(FinaFreq-0.5)/sqrt(FinaFreq*(1-FinaFreq)/(GCnt))
tTest[,2]=2*pt(-1*abs(tTest[,1]), GCnt-1)
hist(tTest[,2], breaks = 50, main="t test")
#######chi-sq
#FinaC=FinaSC
chiTest=matrix(0, ncol(FinaC), 2)
for(i in 1:ncol(FinaC)) {
gcnt=table(FinaC[,i])
gm=mean(gcnt)
chiTest[i,1] = (gcnt[1]-gm)^2/gm + (gcnt[2]-gm)^2/gm
chiTest[i,2] = pchisq(chiTest[i,1], 1, lower.tail = F)
}
hist(chiTest[,2], breaks = 50, main="Chisq test")
plot(FinaFreq, col=ifelse(chiTest[,2] < (0.05/length(FinaFreq)), "red", "grey"), pch=16, bty="l")layout(matrix(1:2, 1, 2))
qchi=qchisq(runif(10000), df = 1, ncp = 0.1)
xchi=rchisq(10000, df = 1)
qqplot(x=xchi, y=qchi, bty="l", xlab=expression(paste("Theoretical ", chi[1]^2)), ylab=expression(paste("Observed ", chi[1]^2)), pch=16, cex=0.5)
abline(a=0, b=1, col="red")
gc=qchisq(median(pchisq(qchi, df=1, lower.tail = T)), df=1)/qchisq(0.5, df=1, lower.tail = T)
qqplot(x=xchi, y=qchi/gc, bty="l", xlab=expression(paste("Theoretical ", chi[1]^2)), ylab=expression(paste("Observed ", chi[1]^2)), pch=16, cex=0.5)
abline(a=0, b=1, col="red")\(\int_{p_1}^{p_2}2p(1-p)\beta^2dp=\int_{p_1}^{p_2}2p\beta^2dp-\int_{p_1}^{p_2}2p^2\beta^2dp=(p_2^2-p_1^2)\beta^2-\frac{2}{3}(p_2^3-p_1^3)\beta^2\), and given \(p_1=0\) and \(p_2=0.5\), it is \(\frac{1}{6}\beta^2\).
Statistical power is determined by the type I error rate \(\alpha\), by default \(\alpha=0.05\), and the number of markers. Here, the power is calculated from \(\chi^2_1\) distribution, and the noncentrality parameter is \[\lambda=\frac{2pq(1+F_{st})\beta^2N}{\sigma^2_y-2pq(1+F_{st})\beta^2}=\frac{Nh^2}{1-h^2}\] , in which \(\sigma^2_y=2pq\beta^2(1+F_{st})\frac{1-h^2}{h^2}\) and \(F_{st}=1\) for inbred lines and 0 for random mating populaitons. So \(power=\chi^2_{1,\lambda|\alpha,m}\).
H2=c(0.05, 0.1, 0.15, 0.2)
SampleSize=c(100, 200, 500, 750)
Po=matrix(0, length(H2), length(SampleSize))
marker=1e6
for(i in 1:length(H2)) {
p=0.3
q=1-p
beta=1
h2=H2[i]
vb=2*p*q*beta^2
vy=vb/h2*(1-h2)
Ncp=(SampleSize*vb)/(vy-vb)
tcut=qchisq(0.05/marker, 1, lower.tail = F)
Po[i,]=pchisq(tcut, df=1, ncp = Ncp, lower.tail = F)
}
colnames(Po)=SampleSize
barplot(Po, beside = T, border = NA, ylab="Power", xlab="Sample size")Inside each cluster from left to right is \(h^2=(0.05, 0.1, 0.15, 0.2)\). Note, it is irrelevant to inbred or not.
| case | control | |
|---|---|---|
| p | \(\bar{p}=1-p\) | |
| q | \(\bar{q}=1-q\) |
\(OR=\frac{p\bar{q}}{q\bar{p}}\), and if odds ratio is close to 1 then \(ln(OR)=\beta\approx OR-1\), in which \(\beta\) is the coefficient from logistic regression.
p=0.52
q=1-p
p_c=1-p
q_c=1-q
OR=p*q_c/(q*p_c)
H2=c(0.05, 0.1, 0.15, 0.2)
SampleSize=c(100, 200, 500, 750)
Po=matrix(0, length(H2), length(SampleSize))
marker=1e6
for(i in 1:length(H2)) {
p=0.3
q=1-p
beta=OR-1
h2=H2[i]
vb=2*p*q*beta^2
vy=vb/h2*(1-h2)
Ncp=(SampleSize*vb)/(vy-vb)
tcut=qchisq(0.05/marker, 1, lower.tail = F)
Po[i,]=pchisq(tcut, df=1, ncp = Ncp, lower.tail = F)
}
colnames(Po)=SampleSize
barplot(Po, beside = T, border = NA, ylab="Power", xlab="Sample size")rep=10000
ss=array(0, dim=rep)
for(i in 1:rep) {
x=rnorm(100, runif(1, 0.5, 1))
ss[i]=sum(x^2)
}
mean(ss)## [1] 158.6174
var(ss)## [1] 904.7643
set.seed(1000)
CT=4
N=ceiling(rnorm(CT, 3000, 1000))
M=10000
#freq
P=runif(M, 0.05, 0.95)
Freq=matrix(0, M, CT)
for(i in 1:CT) {
Freq[,i] = rnorm(M, P, sd=sqrt((P*(1-P))/(2*N[i])))
}
#mat
h2=0.5
Beta=rnorm(M, 0, sd=0)#=sqrt(h2/sum(2*P*(1-P))))
Bmat=matrix(0, M, CT)
Smat=matrix(0, M, CT)
Pmat=matrix(0, M, CT)
for(i in 1:CT) {
Smat[,i]=1/(N[i]*2*Freq[,i]*(1-Freq[,i]))
Bmat[,i] = rnorm(M, Beta, sd=sqrt(Smat[,i]))
Pmat[,i] = (1-pnorm(abs(Bmat[,i]/sqrt(Smat[,i]))))*2
}
#fixed model
FisherP=matrix(0, M, 1)
Fmeta=matrix(0, M, 3)
Qmeta=matrix(0, M, 6)
for(i in 1:M) {
FisherP[i,1]=-2*sum(log(Pmat[i,]))
v=1/Smat[i,]
Fmeta[i,2]=1/sum(v)
Fmeta[i,1]=sum(Bmat[i,]*v)/sum(v)
Fmeta[i,3]=Fmeta[i,1]/sqrt(Fmeta[i,2])
Qmeta[i,4]=sum(v*(Bmat[i,]-Fmeta[i,1])^2)
Qmeta[i,5]=sum(v)-sum(v^2)/sum(v)
Qmeta[i,6]=(Qmeta[i,4]-(CT-1))/Qmeta[i,5]
if(Qmeta[i,6]<0) {
Qmeta[i,6]=0
}
vr=1/(Smat[i,]+Qmeta[i,6])
Qmeta[i,2]=1/sum(v)+Qmeta[i,6]
Qmeta[i,1]=sum(vr*Bmat[i,])/sum(vr)
Qmeta[i,3]=Qmeta[i,1]^2/Qmeta[i,2]
}
layout(matrix(1:4, 2, 2))
qqplot(main="Fisher's methods", xlab=expression(paste("Theoretical ", chi[8]^2)),
ylab=expression(paste("Observed ", chi[8]^2)),
FisherP, rchisq(M, 8), bty="n", pch=16)
abline(a=0, b=1)
qqplot(main="Fixed model", xlab=expression(paste("Theoretical ", chi[1]^2)),
ylab=expression(paste("Observed ", chi[1]^2)),
rchisq(M, 1), Fmeta[,3]^2, bty='n', pch=16)
abline(a=0, b=1)
qqplot(main="Random model", xlab=expression(paste("Theoretical ", chi[1]^2)),
ylab=expression(paste("Observed ", chi[1]^2)),
rchisq(M,1), Qmeta[,1]^2/Qmeta[,2], bty='n', pch=16)
abline(a=0, b=1)
qqplot(main="Heterogeneity (4 cohorts)", xlab="Q", ylab=expression(paste("Theoretical ", chi[3]^2)), Qmeta[,4], rchisq(M,CT-1), bty='n', pch=16)
abline(a=0, b=1)library(RcppArmadillo)## Warning: package 'RcppArmadillo' was built under R version 3.4.4
y=rnorm(1000)
x=rnorm(1000)
mod1=lm(y~x)
mod2=fastLmPure(cbind(1, x), y)
print(mod1)##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 0.009178 -0.040040
print(mod2)## $coefficients
## [,1]
## [1,] 0.009178328
## [2,] -0.040040368
##
## $stderr
## [,1]
## [1,] 0.03245848
## [2,] 0.03278206
##
## $df.residual
## [1] 998
Principal variance component analysis
\(y=a+\sum b_iE_i +e\)
in which \(E_i\) is the \(i^{th}\) eigenvector associated to the \(i^{th}\) eigenvalue \(\lambda_i\), \(y\) is the value of interest. \(R^2\) tells how much \(y\) is determined by the eigenvectors.
Let see show it in this famous iris data [Fisher, R.A, 1936, Annals of Eugeneics, 7:179-188; Anderson, E, 1935, The irises of the Gaspe Peninsula, Bulletin of the American Iris Society, 59, 2-5]
irisE=eigen(as.matrix(iris[,1:4])%*%t(as.matrix(iris[,1:4])))
plot(main="Iris", irisE$vectors[,1], irisE$vectors[,2], xlab="Eigenvector 1", ylab="Eigenvector 2")barplot(main="Iris eigenvalue", irisE$values[1:4], border = F)irisS=apply(as.matrix(iris[,1:4]), 2, scale)
irisVC=apply(as.matrix(irisE$vectors[, 1:3]), 2, scale)
pvca=matrix(0,ncol(irisS),1)
for(i in 1:ncol(irisS)) {
mod=lm(irisS[,i]~irisVC)
pvca[i,1]=summary(mod)$r.squared
}
rownames(pvca)=names(iris)[1:4]
barplot(t(pvca), border = F)Assuming \(p_i\) the features of the individual, the correlation between individual \(i\) and \(j\) is defined as below \(C_{i,j}=\frac{1}{m_{e,ij}}\sum_k p_{i,k} p_{j,k}\) in which m_{e,ij} counts the features available in both \(i\) and \(j\).
irisT=as.matrix(iris[,1:4]) #using iris data
#randomly set 20 missing values
for(i in 1:20) {
irisT[sample(nrow(irisT), 1), sample(ncol(irisT), 1)]=NA
}
irisTS=matrix(0, nrow(irisT), ncol(irisT))
for(i in 1:ncol(irisT)) {
irisTS[,i] = scale(irisT[,i])
}
exCor=matrix(0, ncol(irisTS), ncol(irisTS))
for(i in 1:ncol(irisTS)) {
for(j in 1:i)
{
idxD=which(!is.na(irisTS[, i]) & !is.na(irisTS[,j]))
exCor[i,j]=exCor[j,i]=mean(irisTS[idxD,i] * irisTS[idxD,j])
}
}
print(exCor)## [,1] [,2] [,3] [,4]
## [1,] 0.9931973 -0.1258679 0.8575409 0.8158430
## [2,] -0.1258679 0.9930556 -0.4005498 -0.3482476
## [3,] 0.8575409 -0.4005498 0.9931034 0.9687636
## [4,] 0.8158430 -0.3482476 0.9687636 0.9931034
irisEg=eigen(exCor)Reference paper for pROC
library(pROC)## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data(aSAH)
rocD1=roc(aSAH$outcome, aSAH$s100b)
plot(rocD1)print(rocD1)##
## Call:
## roc.default(response = aSAH$outcome, predictor = aSAH$s100b)
##
## Data: aSAH$s100b in 72 controls (aSAH$outcome Good) < 41 cases (aSAH$outcome Poor).
## Area under the curve: 0.7314
#cross-validation example for three loci
cv=5
s=100
Sz=cv*s
err=2
Gm=matrix(c(rbinom(Sz, 2, 0.5), rbinom(Sz, 2, 0.5), rbinom(Sz, 2, 0.5)), Sz, 3)
b=matrix(c(1,1,1), 3, 1)
y=Gm %*% b + rnorm(Sz, sd=err)
yClass=ifelse(y>mean(y), 1, 0)
tag=sample(rep(1:cv, s), Sz)
sumStat=array(0, dim=c(cv, 2))
for(i in 1:cv) {
yTr=y[tag!=i]
GmTr=Gm[tag!=i,]
yTt=y[tag==i]
GmTt=Gm[tag==i,]
mod=lm(yTr~GmTr)
b_hat=matrix(summary(mod)$coefficient[2:4, 1], 3, 1)
yPre=apply(GmTt %*% b_hat, 1, sum)
rocRes=roc(yClass[tag==i], yPre)
plot(rocRes)
sumStat[i,1] = rocRes$auc
sumStat[i,2] = cor(yClass[tag==i], yPre)^2
}print(sumStat)## [,1] [,2]
## [1,] 0.6910764 0.1154597
## [2,] 0.7902576 0.2385977
## [3,] 0.7366947 0.1884256
## [4,] 0.7649538 0.2112350
## [5,] 0.7818035 0.2327375
Example from wiki survival
library(survival)## Warning: package 'survival' was built under R version 3.4.4
# sort the aml data by time
aml <- aml[order(aml$time),]
print(aml)## time status x
## 12 5 1 Nonmaintained
## 13 5 1 Nonmaintained
## 14 8 1 Nonmaintained
## 15 8 1 Nonmaintained
## 1 9 1 Maintained
## 16 12 1 Nonmaintained
## 2 13 1 Maintained
## 3 13 0 Maintained
## 17 16 0 Nonmaintained
## 4 18 1 Maintained
## 5 23 1 Maintained
## 18 23 1 Nonmaintained
## 19 27 1 Nonmaintained
## 6 28 0 Maintained
## 20 30 1 Nonmaintained
## 7 31 1 Maintained
## 21 33 1 Nonmaintained
## 8 34 1 Maintained
## 22 43 1 Nonmaintained
## 9 45 0 Maintained
## 23 45 1 Nonmaintained
## 10 48 1 Maintained
## 11 161 0 Maintained
# Create graph of length of time that each subject was in the study
with(aml, plot(time, type="h"))# Create the life table survival object for aml
# The functions survfit() and Surv() create a life table survival object.
# The life table object is passed to the plot() function to create the KM plot.
aml.survfit <- survfit(Surv(time, status == 1) ~ 1, data=aml)
# Plot the Kaplan-Meier curve for aml.
# By default, R includes the confidence interval.
plot(aml.survfit, xlab = "Time (weeks)", ylab="Proportion surviving", main="Survival in AML")# The summary() function displays the life table
summary(aml.survfit)## Call: survfit(formula = Surv(time, status == 1) ~ 1, data = aml)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 5 23 2 0.9130 0.0588 0.8049 1.000
## 8 21 2 0.8261 0.0790 0.6848 0.996
## 9 19 1 0.7826 0.0860 0.6310 0.971
## 12 18 1 0.7391 0.0916 0.5798 0.942
## 13 17 1 0.6957 0.0959 0.5309 0.912
## 18 14 1 0.6460 0.1011 0.4753 0.878
## 23 13 2 0.5466 0.1073 0.3721 0.803
## 27 11 1 0.4969 0.1084 0.3240 0.762
## 30 9 1 0.4417 0.1095 0.2717 0.718
## 31 8 1 0.3865 0.1089 0.2225 0.671
## 33 7 1 0.3313 0.1064 0.1765 0.622
## 34 6 1 0.2761 0.1020 0.1338 0.569
## 43 5 1 0.2208 0.0954 0.0947 0.515
## 45 4 1 0.1656 0.0860 0.0598 0.458
## 48 2 1 0.0828 0.0727 0.0148 0.462
# Create aml life tables and KM plots broken out by treatment (x, "Maintained" vs. "Not maintained")
surv.by.aml.rx <- survfit(Surv(time, status == 1) ~ x, data = aml)
summary(surv.by.aml.rx)## Call: survfit(formula = Surv(time, status == 1) ~ x, data = aml)
##
## x=Maintained
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 9 11 1 0.909 0.0867 0.7541 1.000
## 13 10 1 0.818 0.1163 0.6192 1.000
## 18 8 1 0.716 0.1397 0.4884 1.000
## 23 7 1 0.614 0.1526 0.3769 0.999
## 31 5 1 0.491 0.1642 0.2549 0.946
## 34 4 1 0.368 0.1627 0.1549 0.875
## 48 2 1 0.184 0.1535 0.0359 0.944
##
## x=Nonmaintained
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 5 12 2 0.8333 0.1076 0.6470 1.000
## 8 10 2 0.6667 0.1361 0.4468 0.995
## 12 8 1 0.5833 0.1423 0.3616 0.941
## 23 6 1 0.4861 0.1481 0.2675 0.883
## 27 5 1 0.3889 0.1470 0.1854 0.816
## 30 4 1 0.2917 0.1387 0.1148 0.741
## 33 3 1 0.1944 0.1219 0.0569 0.664
## 43 2 1 0.0972 0.0919 0.0153 0.620
## 45 1 1 0.0000 NaN NA NA
# Plot KM
plot(surv.by.aml.rx, xlab = "Time", ylab="Survival",col=c("black", "red"), lty = 1:2, main="Kaplan-Meier Survival vs. Maintenance in AML")
# Add legend
legend(100, .6, c("Maintained", "Not maintained"), lty = 1:2, col=c("black", "red"))# Perform the log rank test using the R function survdiff().
surv.diff.aml <- survdiff(Surv(time, status == 1) ~ x, data=aml)
surv.diff.aml## Call:
## survdiff(formula = Surv(time, status == 1) ~ x, data = aml)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained 11 7 10.69 1.27 3.4
## x=Nonmaintained 12 11 7.31 1.86 3.4
##
## Chisq= 3.4 on 1 degrees of freedom, p= 0.07
Generate Wishart distribution
## Artificial
S <- toeplitz((10:1)/10)
set.seed(11)
R <- rWishart(1000, 20, S)
dim(R) # 10 10 1000## [1] 10 10 1000
cor(R[,,1])## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000000 0.9892037 0.85009974 0.7349126 0.40302917 -0.10687984
## [2,] 0.9892037 1.0000000 0.90912646 0.7834566 0.41542351 -0.12399023
## [3,] 0.8500997 0.9091265 1.00000000 0.9135839 0.57037342 0.01718272
## [4,] 0.7349126 0.7834566 0.91358394 1.0000000 0.82871062 0.38087610
## [5,] 0.4030292 0.4154235 0.57037342 0.8287106 1.00000000 0.79783054
## [6,] -0.1068798 -0.1239902 0.01718272 0.3808761 0.79783054 1.00000000
## [7,] -0.5194526 -0.5528924 -0.45133366 -0.1292602 0.32855574 0.74949700
## [8,] -0.8033035 -0.8441922 -0.76921814 -0.5102579 -0.05009135 0.45706156
## [9,] -0.9787197 -0.9846968 -0.85143400 -0.7074475 -0.31657491 0.17689287
## [10,] -0.9771913 -0.9736050 -0.85344880 -0.7742355 -0.44638410 0.02528311
## [,7] [,8] [,9] [,10]
## [1,] -0.5194526 -0.80330348 -0.9787197 -0.97719132
## [2,] -0.5528924 -0.84419222 -0.9846968 -0.97360500
## [3,] -0.4513337 -0.76921814 -0.8514340 -0.85344880
## [4,] -0.1292602 -0.51025791 -0.7074475 -0.77423553
## [5,] 0.3285557 -0.05009135 -0.3165749 -0.44638410
## [6,] 0.7494970 0.45706156 0.1768929 0.02528311
## [7,] 1.0000000 0.87737055 0.5566532 0.43394504
## [8,] 0.8773705 1.00000000 0.8480151 0.75653301
## [9,] 0.5566532 0.84801514 1.0000000 0.97486002
## [10,] 0.4339450 0.75653301 0.9748600 1.00000000
mR <- apply(R, 1:2, mean) # ~= E[ Wish(S, 20) ] = 20 * S
stopifnot(all.equal(mR, 20*S, tolerance = .009))
## See Details, the variance is
Va <- 20*(S^2 + tcrossprod(diag(S)))
vR <- apply(R, 1:2, var)
stopifnot(all.equal(vR, Va, tolerance = 1/16))