Teaching notes

Teaching notes for the R.

## [1] "Document was last updated at 2018-10-19."

Table of contents

0 Random seed

set.seed(2018)

1 R elements

getwd() #where are you?
## [1] "/Users/gc5k/StatGenet/zjafu/day1"
help("plot")

#data
a=1
b=10
c=a+b
print(c)
## [1] 11
#generate matrix and write to a file
dat=matrix(c(1,2,3,4,5,6), 2, 3, byrow = T)
write.table(dat, "myRdata.txt", row.names = F, col.names = F, quote = F)

#generate and read a file
dat1=read.table("myRdata.txt", as.is = T)
print(dat1)
##   V1 V2 V3
## 1  1  2  3
## 2  4  5  6
ls() #show elements
## [1] "a"    "b"    "c"    "dat"  "dat1" "Date"
ls.str() #show elements, a strong version
## a :  num 1
## b :  num 10
## c :  num 11
## dat :  num [1:2, 1:3] 1 4 2 5 3 6
## dat1 : 'data.frame': 2 obs. of  3 variables:
##  $ V1: int  1 4
##  $ V2: int  2 5
##  $ V3: int  3 6
## Date :  Date[1:1], format: "2018-10-19"
mode(a)
## [1] "numeric"
mode(dat)
## [1] "numeric"
mode(dat1)
## [1] "list"
rm(dat1) #remove it

2 Generating data

x=1:30
x1=seq(1, 30, 2)
x2=seq(from=1, to=30, length.out = 6)
print(x)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30
print(x1)
##  [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29
print(x2)
## [1]  1.0  6.8 12.6 18.4 24.2 30.0
b=0.5
print(paste("ceiling(b): ", ceiling(b)))
## [1] "ceiling(b):  1"
print(paste("floor(b): ", floor(b)))
## [1] "floor(b):  0"
b1=-0.5
print(paste("ceiling(b1): ", ceiling(b1)))
## [1] "ceiling(b1):  0"
print(paste("floor(b1): ", floor(b1)))
## [1] "floor(b1):  -1"
gl(3, 5)
##  [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
## Levels: 1 2 3
gl(3, 5, length = 30)
##  [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
## Levels: 1 2 3
gl(2, 5, labels = c("Male", "Female"))
##  [1] Male   Male   Male   Male   Male   Female Female Female Female Female
## Levels: Male Female
#all combinations
expand.grid(h=c(60,80), w=c(100, 300), sex=c("Male", "Female"))
##    h   w    sex
## 1 60 100   Male
## 2 80 100   Male
## 3 60 300   Male
## 4 80 300   Male
## 5 60 100 Female
## 6 80 100 Female
## 7 60 300 Female
## 8 80 300 Female

3 Random sequence, distribution

Normal distribtuion wiki page.

\(\chi^2_1\) and \(\chi^2_5\), chi-square distribution wiki page.

pv=0.025
qnorm(1-pv)
## [1] 1.959964
qchisq(2*pv, df=1, lower.tail = F)
## [1] 3.841459
qchisq(2*pv, df=5, lower.tail = F)
## [1] 11.0705
layout(matrix(1:4, 2, 2, byrow = T))
plot(density(rnorm(10000)), main="Normal distribution", bty='l')
plot(density(rt(10000, 30)), main="Student-t distribution (df=30)", bty='l')
plot(density(rchisq(10000, 1)), main="Chisq distribution (df=1)", bty='l')
plot(density(rchisq(10000, 1, ncp = 2)), main="Chisq distribution (df=1, ncp=2)", bty='l')

layout(matrix(1:4, 2, 2, byrow = T))
hist(rnorm(10000), breaks=25, main="Normal distribution")
hist(rt(10000, 30), breaks=25, main="Student-t distribution (df=30)", bty='l')
hist(rchisq(10000, 1), breaks=25, main=expression(paste(chi[1]^2, "distribution")), bty='l')
hist(rchisq(10000, 1, ncp = 2), breaks=25, main=expression(paste(chi[1,ncp=2]^2, "distribution")), bty='l')

4 For & while

X = c(1, 3, 4, 5, 16, 9)
for(i in 1:length(X)) {
  print(X[i])
}
## [1] 1
## [1] 3
## [1] 4
## [1] 5
## [1] 16
## [1] 9
NotHere="China"
FIFA=c("Croatia", "France", "Belgien", "England", "Brazil", "Germany", "China", "Japan", "Sweden", "Russia", "Argentina", "Spain", "Chile", "Uruguay")

cnt=1
while (FIFA[cnt] != NotHere & cnt <= length(FIFA)) {
  print(FIFA[cnt])
  cnt = cnt + 1
}
## [1] "Croatia"
## [1] "France"
## [1] "Belgien"
## [1] "England"
## [1] "Brazil"
## [1] "Germany"
idx=which(FIFA==NotHere)
print(idx)
## [1] 7
FIFA1=FIFA[order(FIFA)]
print(FIFA1)
##  [1] "Argentina" "Belgien"   "Brazil"    "Chile"     "China"    
##  [6] "Croatia"   "England"   "France"    "Germany"   "Japan"    
## [11] "Russia"    "Spain"     "Sweden"    "Uruguay"
FIFA2=FIFA[order(FIFA, decreasing = T)]
print(FIFA2)
##  [1] "Uruguay"   "Sweden"    "Spain"     "Russia"    "Japan"    
##  [6] "Germany"   "France"    "England"   "Croatia"   "China"    
## [11] "Chile"     "Brazil"    "Belgien"   "Argentina"
FIFAR=matrix(c("Brazial", 5, 1.1, "Germany", 4, 0.8, "Italy", 4, 0.7, "Argentina", 2, 4), 4, 3, byrow = T)

print(FIFAR[order(FIFAR[,1], FIFAR[,3]), ])
##      [,1]        [,2] [,3] 
## [1,] "Argentina" "2"  "4"  
## [2,] "Brazial"   "5"  "1.1"
## [3,] "Germany"   "4"  "0.8"
## [4,] "Italy"     "4"  "0.7"

5 Figures

boxplot(matrix(c(rnorm(100), rnorm(100, 1)), 100, 2))

hist(rnorm(1000))

mat=matrix(c(4,5,16,17, 34, 40), 2, 3)
colnames(mat)=c("G1", "G2", "G3")
barplot(mat, beside = T, border = F, xlab="XLab")

barplot(t(mat), beside = T, border = F)

6 Plot demonstration

z1=rnorm(1000)
z2=rnorm(1000)
layout(matrix(1:9, 3, 3, byrow = T)) #what's the difference if byrow = F
plot(z1, z2, main="1")
plot(z1, z2, main="2", bty='l', pch=16, cex=0.5)
plot(z1, z2, main="3", xlab=expression(z[1]), ylab=expression(z^2))
plot(z1, z2, main="4", xlab=expression(z[1]), ylab=expression(z^2), col="red", pch=16, cex=0.5)
plot(z1, z2, main="5", xlab=expression(z[1]), ylab=expression(z^2), col="red", pch=16, cex=0.5, bty='l')
plot(z1, z2, main="6", xlab=expression(z[1]), ylab=expression(z^2), col=ifelse(z1>0, "red", "blue"), pch=16, cex=0.5, bty='l')

plot(z1, z2, main="7", xlab=expression(z[1]), ylab=expression(z^2), col=ifelse(z1*z2>0, "red", "blue"), pch=16, cex=0.5, bty='l')

plot(z1, z2, main="8", xlim=c(-6, 6), ylim=c(-6,6), xlab=expression(z[1]), ylab=expression(z^2), col=ifelse(z1*z2>0, "red", "blue"), pch=16, cex=0.5, bty='l')

plot(z1, z2, main="9", xlim=c(-6, 6), ylim=c(-6,6), xlab=expression(z[1]), ylab=expression(z^2), cex=0.5, pch=16, col=rgb((max(abs(z1))-abs(z1))/max(abs(z1)), (max(abs(z1))-abs(z1))/max(abs(z1)), (max(abs(z1))-abs(z1))/max(abs(z1))), bty='l')

layout(matrix(1:6, 2, 3))
rho=c(0.1, 0.3, 0.5, 0.7, 0.9, 0.99)
for(i in 1:length(rho)) {
  x1=z1
  x2=z1*rho[i]+sqrt(1-rho[i]^2)*z2
  plot(x1, x2, pch=16, cex=0.5, bty='l')
  abline(lm(x2~x1), col="red", lwd=2, lty=3)
  legend("topleft", legend = c(paste("rho=", rho[i])), pch=1, col="red", bty = 'n')
}

7 p-value distribution

layout(matrix(c(1,1,2,1,1,3,1,1,4), 3, 3, byrow = T))
ts=rt(10000, df = 10)
cs1=rchisq(10000, df=1)
cs5=rchisq(10000, df=5, ncp=2)
hist(ts, breaks = 50, xlim=c(-10, 15), ylim=c(0, 5000))
hist(cs1, breaks = 50, add=T, col="red")
hist(cs5, breaks = 50, add=T, col="blue")
hist(pt(ts,df=10), main="p-value (t-test)")
hist(pchisq(cs1, df=1, ncp=0, lower.tail = T), col="red", main=expression(paste("p-value (", chi[1]^2,")")))
hist(pchisq(cs5, df=5, ncp=2, lower.tail = T), col="blue", main=expression(paste("p-value (", chi[5]^2,")")))

8 Linear model

#linear model
coR=c(0.1, 0.3, 0.5, 0.7)
x1=rnorm(5000)
layout(matrix(1:4, 2, 2))
for(i in 1:length(coR)) {
  x2=coR[1]*x1+sqrt(1-coR^2)*rnorm(5000)
  plot(x1, x2, cex=0.5, bty='l', pch=16, main="Linear regression", xlab="X", ylab="Y")
  abline(lm(x2~x1), col="red", lty=2)
}

9 Logistic regression

#logistic model
a=0.1
b=1
sz=1000
x1=rnorm(sz)
layout(matrix(1:6, 2, 2, byrow = T))
err=c(0.1, 0.5, 1)
for(i in 1:length(err)) {
  y=array(0, dim=length(x1))
  Sp=a+b*x1 +rnorm(sz, sd = err[i])
  pro=exp(Sp)/(1+exp(Sp))
  y = rbinom(length(x1), 1, pro)

  ox=order(x1)
  plot(x1[ox], pro[ox], main=paste("S curve, res=", err[i]), bty="l", cex=0.5, pch=16)
  glMod=glm(y~x1, family = "binomial")
  points(x1[ox], glMod$fitted.values[ox], col="red", cex=0.5, pch=15)
  plot(x1, y, cex=0.5, bty='l', pch=16, main="Logistic regression", xlab="X", ylab="Y")
  abline(glMod, col="red", lty=2)
}

10 Permutation

cR=0.1
Sz=500
pm=5000
x1=rnorm(Sz)
y1=cR*x1+sqrt(1-cR^2)*rnorm(Sz)
regB=matrix(0, pm, 4)
mod1=lm(y1~x1)
for(i in 1:pm) {
  regB[i,] = summary(lm(y1~sample(x1)))$coefficient[2,]
}
hist(regB[,1], xlim=c(min(c(regB[,1], summary(mod1)$coefficient[2,1])), max(c(regB[,1], summary(mod1)$coefficients[2,1]))), main="Permutation test", xlab=expression(paste(beta)))
abline(v=summary(mod1)$coefficient[2,1], col="red")

ln=length(which(abs(regB[,1])>summary(mod1)$coefficient[2,1]))
ep=ifelse(ln>0, ln/pm, 0)
legend("topleft", legend = paste("p=",ep), bty='n')

11 Mendel vs Fisher

layout(matrix(1:4, 2,2))
REP=c(1,2,4,256)
n=100
for(i in 1:length(REP)) {
  plot(main=paste(REP[i], "shoot"), x=NULL, y=NULL, xlim=c(-4,4), ylim=c(-4,4), axes = F, xlab="", ylab="")
  for(j in 1:n) {
    points(mean(rnorm(REP[i])), mean(rnorm(REP[i])), pch=16, cex=0.5, col="red")
    abline(h=0, v=0)
  }
  points(rep(0, 6),rep(0,6), cex=seq(5,30, 5))
}
Mendel vs Fisher

Mendel vs Fisher

12 Shoot it again

set.seed(20180625)
layout(matrix(1:4, 2,2))
REP=c(1,2,4,256)
n=1000
for(i in 1:length(REP)) {
  xp=array(0, n)
  for(j in 1:n) {
    xp[j] = mean(rnorm(REP[i]))
  }
#  qqplot(rnorm(n, 0, sqrt(1/REP[i])), xp, bty='l', ylab="Simulated", xlab="Theoretical", xlim=c(-4, 4), ylim=c(-4,4))
    qqplot(rnorm(n, 0, sqrt(1/REP[i])), xp, bty='l', ylab="Simulated", xlab="Theoretical", xlim=c(-4, 4), ylim=c(-4,4), pch=16, cex=0.5)

  abline(a=0, b=1, col="red")
}

13 Simulated F2

##########################
##########################
#sample size
N=1000
M=20000
h2=0.5
#uniform
freq=runif(M)
#0.5
freq=rep(0.5, M)
#

gmat=matrix(0, N, M)
for(i in 1:M) {
  gmat[,i] = rbinom(N, 2, freq[i])
}

eff=rnorm(M, 0, sqrt(1/M))
bv=gmat %*% eff
y=bv+rnorm(N, 0, sd=sqrt(var(bv)/h2*(1-h2)))

layout(matrix(1:4, 2, 2))
freqDat=colMeans(gmat)/2
plot(freqDat, frame.plot = F, xlab="Locus", ylab="Freq", pch=16, cex=0.5)
qqplot(freqDat, rnorm(M, freq, sqrt(freq*(1-freq)/(2*N))), xlab = "Observed", ylab="Expected", frame.plot=F, pch=16, cex=0.5)
abline(a=0, b=1, col="red")
pvN=pnorm((freqDat-0.5)/sqrt(freqDat*(1-freqDat)/(2*N)))
pvB=pbinom(apply(gmat, 2, sum), 2*N, 0.5)
hist(main="p-value distribution", pvN)
hist(main="p-value distribution", pvB)

14 Real maize F2

layout(matrix(1:6, 2, 3))
FinaC=read.table("Fina_Genotype.ped", as.is = T, na.strings = "99")[,-c(1:6)]
GCnt=array(0, ncol(FinaC))
for(i in 1:ncol(FinaC)) {
  GCnt[i] = length(which(!is.na(FinaC[,i])))
}

FinaSC = matrix(NA, nrow = nrow(FinaC), ncol = ncol(FinaC))
for(i in 1:ncol(FinaC)) {
  idx=which(!is.na(FinaC[,i]))
  FinaSC[idx,i]=rbinom(length(idx), 1, 0.5)*2
}

FinaFreq=colMeans(FinaC, na.rm = T)/2
FinaSCFreq=colMeans(FinaSC, na.rm = T)/2
plot(FinaFreq, col="grey", bty="l", cex=0.5, pch=16)
qqplot(FinaFreq, FinaSCFreq, xlab = "Observed freq", ylab="Expected freq", bty="l", pch=16, cex=0.5)
abline(a=0, b=1, col="red")

#########t-test
#FinaFreq = FinaSCFreq
tTest=matrix(0, ncol(FinaC), 2)
tTest[,1]=(FinaFreq-0.5)/sqrt(FinaFreq*(1-FinaFreq)/(GCnt))
tTest[,2]=2*pt(-1*abs(tTest[,1]), GCnt-1)
hist(tTest[,2], breaks = 50, main="t test")

#######chi-sq
#FinaC=FinaSC
chiTest=matrix(0, ncol(FinaC), 2)
for(i in 1:ncol(FinaC)) {
  gcnt=table(FinaC[,i])
  gm=mean(gcnt)
  chiTest[i,1] = (gcnt[1]-gm)^2/gm + (gcnt[2]-gm)^2/gm
  chiTest[i,2] = pchisq(chiTest[i,1], 1, lower.tail = F)
}
hist(chiTest[,2], breaks = 50, main="Chisq test")

plot(FinaFreq, col=ifelse(chiTest[,2] < (0.05/length(FinaFreq)), "red", "grey"), pch=16, bty="l")

15 GC control

layout(matrix(1:2, 1, 2))
qchi=qchisq(runif(10000), df = 1, ncp = 0.1)
xchi=rchisq(10000, df = 1)
qqplot(x=xchi, y=qchi, bty="l", xlab=expression(paste("Theoretical ", chi[1]^2)), ylab=expression(paste("Observed ", chi[1]^2)), pch=16, cex=0.5)

abline(a=0, b=1, col="red")
gc=qchisq(median(pchisq(qchi, df=1, lower.tail = T)), df=1)/qchisq(0.5, df=1, lower.tail = T)
qqplot(x=xchi, y=qchi/gc, bty="l", xlab=expression(paste("Theoretical ", chi[1]^2)), ylab=expression(paste("Observed ", chi[1]^2)), pch=16, cex=0.5)
abline(a=0, b=1, col="red")

16 \(\chi^2_1\) power for QTLs

\(\int_{p_1}^{p_2}2p(1-p)\beta^2dp=\int_{p_1}^{p_2}2p\beta^2dp-\int_{p_1}^{p_2}2p^2\beta^2dp=(p_2^2-p_1^2)\beta^2-\frac{2}{3}(p_2^3-p_1^3)\beta^2\), and given \(p_1=0\) and \(p_2=0.5\), it is \(\frac{1}{6}\beta^2\).

Statistical power is determined by the type I error rate \(\alpha\), by default \(\alpha=0.05\), and the number of markers. Here, the power is calculated from \(\chi^2_1\) distribution, and the noncentrality parameter is \[\lambda=\frac{2pq(1+F_{st})\beta^2N}{\sigma^2_y-2pq(1+F_{st})\beta^2}=\frac{Nh^2}{1-h^2}\] , in which \(\sigma^2_y=2pq\beta^2(1+F_{st})\frac{1-h^2}{h^2}\) and \(F_{st}=1\) for inbred lines and 0 for random mating populaitons. So \(power=\chi^2_{1,\lambda|\alpha,m}\).

H2=c(0.05, 0.1, 0.15, 0.2)
SampleSize=c(100, 200, 500, 750)
Po=matrix(0, length(H2), length(SampleSize))
marker=1e6
for(i in 1:length(H2)) {
  p=0.3
  q=1-p
  beta=1
  h2=H2[i]
  vb=2*p*q*beta^2
  vy=vb/h2*(1-h2)
  Ncp=(SampleSize*vb)/(vy-vb)
  tcut=qchisq(0.05/marker, 1, lower.tail = F)
  Po[i,]=pchisq(tcut, df=1, ncp = Ncp, lower.tail = F)
}
colnames(Po)=SampleSize
barplot(Po, beside = T, border = NA, ylab="Power", xlab="Sample size")

Inside each cluster from left to right is \(h^2=(0.05, 0.1, 0.15, 0.2)\). Note, it is irrelevant to inbred or not.

17 Power for case-control studies

case control
p \(\bar{p}=1-p\)
q \(\bar{q}=1-q\)

\(OR=\frac{p\bar{q}}{q\bar{p}}\), and if odds ratio is close to 1 then \(ln(OR)=\beta\approx OR-1\), in which \(\beta\) is the coefficient from logistic regression.

p=0.52
q=1-p
p_c=1-p
q_c=1-q
OR=p*q_c/(q*p_c)
H2=c(0.05, 0.1, 0.15, 0.2)
SampleSize=c(100, 200, 500, 750)
Po=matrix(0, length(H2), length(SampleSize))
marker=1e6
for(i in 1:length(H2)) {
  p=0.3
  q=1-p
  beta=OR-1
  h2=H2[i]
  vb=2*p*q*beta^2
  vy=vb/h2*(1-h2)
  Ncp=(SampleSize*vb)/(vy-vb)
  tcut=qchisq(0.05/marker, 1, lower.tail = F)
  Po[i,]=pchisq(tcut, df=1, ncp = Ncp, lower.tail = F)
}
colnames(Po)=SampleSize
barplot(Po, beside = T, border = NA, ylab="Power", xlab="Sample size")

18 Chisq properties

rep=10000
ss=array(0, dim=rep)
for(i in 1:rep) {
  x=rnorm(100, runif(1, 0.5, 1))
  ss[i]=sum(x^2)
}
mean(ss)
## [1] 158.6174
var(ss)
## [1] 904.7643

19 Meta-analysis

set.seed(1000)
CT=4
N=ceiling(rnorm(CT, 3000, 1000))
M=10000

#freq
P=runif(M, 0.05, 0.95)
Freq=matrix(0, M, CT)
for(i in 1:CT) {
  Freq[,i] = rnorm(M, P, sd=sqrt((P*(1-P))/(2*N[i])))
}

#mat
h2=0.5
Beta=rnorm(M, 0, sd=0)#=sqrt(h2/sum(2*P*(1-P))))
Bmat=matrix(0, M, CT)
Smat=matrix(0, M, CT)
Pmat=matrix(0, M, CT)
for(i in 1:CT) {
  Smat[,i]=1/(N[i]*2*Freq[,i]*(1-Freq[,i]))
  Bmat[,i] = rnorm(M, Beta, sd=sqrt(Smat[,i]))
  Pmat[,i] = (1-pnorm(abs(Bmat[,i]/sqrt(Smat[,i]))))*2
}

#fixed model
FisherP=matrix(0, M, 1)
Fmeta=matrix(0, M, 3)
Qmeta=matrix(0, M, 6)
for(i in 1:M) {
  FisherP[i,1]=-2*sum(log(Pmat[i,]))

  v=1/Smat[i,]
  Fmeta[i,2]=1/sum(v)
  Fmeta[i,1]=sum(Bmat[i,]*v)/sum(v)
  Fmeta[i,3]=Fmeta[i,1]/sqrt(Fmeta[i,2])

  Qmeta[i,4]=sum(v*(Bmat[i,]-Fmeta[i,1])^2)
  Qmeta[i,5]=sum(v)-sum(v^2)/sum(v)
  Qmeta[i,6]=(Qmeta[i,4]-(CT-1))/Qmeta[i,5]
  if(Qmeta[i,6]<0) {
    Qmeta[i,6]=0
  }
  vr=1/(Smat[i,]+Qmeta[i,6])
  Qmeta[i,2]=1/sum(v)+Qmeta[i,6]
  Qmeta[i,1]=sum(vr*Bmat[i,])/sum(vr)
  Qmeta[i,3]=Qmeta[i,1]^2/Qmeta[i,2]
}
layout(matrix(1:4, 2, 2))
qqplot(main="Fisher's methods", xlab=expression(paste("Theoretical ", chi[8]^2)),
       ylab=expression(paste("Observed ", chi[8]^2)),
       FisherP, rchisq(M, 8), bty="n", pch=16)
abline(a=0, b=1)

qqplot(main="Fixed model", xlab=expression(paste("Theoretical ", chi[1]^2)),
       ylab=expression(paste("Observed ", chi[1]^2)),
       rchisq(M, 1), Fmeta[,3]^2, bty='n', pch=16)
abline(a=0, b=1)

qqplot(main="Random model", xlab=expression(paste("Theoretical ", chi[1]^2)),
       ylab=expression(paste("Observed ", chi[1]^2)),
       rchisq(M,1), Qmeta[,1]^2/Qmeta[,2], bty='n', pch=16)
abline(a=0, b=1)

qqplot(main="Heterogeneity (4 cohorts)", xlab="Q", ylab=expression(paste("Theoretical ", chi[3]^2)), Qmeta[,4], rchisq(M,CT-1), bty='n', pch=16)
abline(a=0, b=1)

20 RcppArmadillo

library(RcppArmadillo)
## Warning: package 'RcppArmadillo' was built under R version 3.4.4
y=rnorm(1000)
x=rnorm(1000)
mod1=lm(y~x)
mod2=fastLmPure(cbind(1, x), y)
print(mod1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##    0.009178    -0.040040
print(mod2)
## $coefficients
##              [,1]
## [1,]  0.009178328
## [2,] -0.040040368
## 
## $stderr
##            [,1]
## [1,] 0.03245848
## [2,] 0.03278206
## 
## $df.residual
## [1] 998

21 PVCA [iris]

Principal variance component analysis

\(y=a+\sum b_iE_i +e\)

in which \(E_i\) is the \(i^{th}\) eigenvector associated to the \(i^{th}\) eigenvalue \(\lambda_i\), \(y\) is the value of interest. \(R^2\) tells how much \(y\) is determined by the eigenvectors.

Let see show it in this famous iris data [Fisher, R.A, 1936, Annals of Eugeneics, 7:179-188; Anderson, E, 1935, The irises of the Gaspe Peninsula, Bulletin of the American Iris Society, 59, 2-5]

irisE=eigen(as.matrix(iris[,1:4])%*%t(as.matrix(iris[,1:4])))
plot(main="Iris", irisE$vectors[,1], irisE$vectors[,2], xlab="Eigenvector 1", ylab="Eigenvector 2")

barplot(main="Iris eigenvalue", irisE$values[1:4], border = F)

irisS=apply(as.matrix(iris[,1:4]), 2, scale)
irisVC=apply(as.matrix(irisE$vectors[, 1:3]), 2, scale)
pvca=matrix(0,ncol(irisS),1)
for(i in 1:ncol(irisS)) {
  mod=lm(irisS[,i]~irisVC)
  pvca[i,1]=summary(mod)$r.squared
}
rownames(pvca)=names(iris)[1:4]
barplot(t(pvca), border = F)

22 PCA for NA [iris]

Assuming \(p_i\) the features of the individual, the correlation between individual \(i\) and \(j\) is defined as below \(C_{i,j}=\frac{1}{m_{e,ij}}\sum_k p_{i,k} p_{j,k}\) in which m_{e,ij} counts the features available in both \(i\) and \(j\).

irisT=as.matrix(iris[,1:4]) #using iris data

#randomly set 20 missing values
for(i in 1:20) {
  irisT[sample(nrow(irisT), 1), sample(ncol(irisT), 1)]=NA
}
irisTS=matrix(0, nrow(irisT), ncol(irisT))
for(i in 1:ncol(irisT)) {
  irisTS[,i] = scale(irisT[,i])
}


exCor=matrix(0, ncol(irisTS), ncol(irisTS))
for(i in 1:ncol(irisTS)) {
  for(j in 1:i)
  {
    idxD=which(!is.na(irisTS[, i]) & !is.na(irisTS[,j]))
    exCor[i,j]=exCor[j,i]=mean(irisTS[idxD,i] * irisTS[idxD,j])
  }
}
print(exCor)
##            [,1]       [,2]       [,3]       [,4]
## [1,]  0.9931973 -0.1258679  0.8575409  0.8158430
## [2,] -0.1258679  0.9930556 -0.4005498 -0.3482476
## [3,]  0.8575409 -0.4005498  0.9931034  0.9687636
## [4,]  0.8158430 -0.3482476  0.9687636  0.9931034
irisEg=eigen(exCor)

23 CV & ROC [pROC]

Reference paper for pROC

library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
data(aSAH)
rocD1=roc(aSAH$outcome, aSAH$s100b)
plot(rocD1)

print(rocD1)
## 
## Call:
## roc.default(response = aSAH$outcome, predictor = aSAH$s100b)
## 
## Data: aSAH$s100b in 72 controls (aSAH$outcome Good) < 41 cases (aSAH$outcome Poor).
## Area under the curve: 0.7314
#cross-validation example for three loci
cv=5
s=100
Sz=cv*s

err=2

Gm=matrix(c(rbinom(Sz, 2, 0.5), rbinom(Sz, 2, 0.5), rbinom(Sz, 2, 0.5)), Sz, 3)
b=matrix(c(1,1,1), 3, 1)
y=Gm %*% b + rnorm(Sz, sd=err)
yClass=ifelse(y>mean(y), 1, 0)
tag=sample(rep(1:cv, s), Sz)

sumStat=array(0, dim=c(cv, 2))
for(i in 1:cv) {
  yTr=y[tag!=i]
  GmTr=Gm[tag!=i,]

  yTt=y[tag==i]
  GmTt=Gm[tag==i,]

  mod=lm(yTr~GmTr)
  b_hat=matrix(summary(mod)$coefficient[2:4, 1], 3, 1)
  yPre=apply(GmTt %*% b_hat, 1, sum)
  rocRes=roc(yClass[tag==i], yPre)
  plot(rocRes)
  sumStat[i,1] = rocRes$auc
  sumStat[i,2] = cor(yClass[tag==i], yPre)^2
}

print(sumStat)
##           [,1]      [,2]
## [1,] 0.6910764 0.1154597
## [2,] 0.7902576 0.2385977
## [3,] 0.7366947 0.1884256
## [4,] 0.7649538 0.2112350
## [5,] 0.7818035 0.2327375

24 Survival

Example from wiki survival

library(survival)
## Warning: package 'survival' was built under R version 3.4.4
# sort the aml data by time
aml <- aml[order(aml$time),]

print(aml)
##    time status             x
## 12    5      1 Nonmaintained
## 13    5      1 Nonmaintained
## 14    8      1 Nonmaintained
## 15    8      1 Nonmaintained
## 1     9      1    Maintained
## 16   12      1 Nonmaintained
## 2    13      1    Maintained
## 3    13      0    Maintained
## 17   16      0 Nonmaintained
## 4    18      1    Maintained
## 5    23      1    Maintained
## 18   23      1 Nonmaintained
## 19   27      1 Nonmaintained
## 6    28      0    Maintained
## 20   30      1 Nonmaintained
## 7    31      1    Maintained
## 21   33      1 Nonmaintained
## 8    34      1    Maintained
## 22   43      1 Nonmaintained
## 9    45      0    Maintained
## 23   45      1 Nonmaintained
## 10   48      1    Maintained
## 11  161      0    Maintained
# Create graph of length of time that each subject was in the study
with(aml, plot(time, type="h"))

# Create the life table survival object for aml
# The functions survfit() and Surv() create a life table survival object.
# The life table object is passed to the plot() function to create the KM plot.
aml.survfit <- survfit(Surv(time, status == 1) ~ 1, data=aml)

# Plot the Kaplan-Meier curve for aml.
# By default, R includes the confidence interval. 
plot(aml.survfit, xlab = "Time (weeks)", ylab="Proportion surviving", main="Survival in AML")

# The summary() function displays the life table
summary(aml.survfit)
## Call: survfit(formula = Surv(time, status == 1) ~ 1, data = aml)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5     23       2   0.9130  0.0588       0.8049        1.000
##     8     21       2   0.8261  0.0790       0.6848        0.996
##     9     19       1   0.7826  0.0860       0.6310        0.971
##    12     18       1   0.7391  0.0916       0.5798        0.942
##    13     17       1   0.6957  0.0959       0.5309        0.912
##    18     14       1   0.6460  0.1011       0.4753        0.878
##    23     13       2   0.5466  0.1073       0.3721        0.803
##    27     11       1   0.4969  0.1084       0.3240        0.762
##    30      9       1   0.4417  0.1095       0.2717        0.718
##    31      8       1   0.3865  0.1089       0.2225        0.671
##    33      7       1   0.3313  0.1064       0.1765        0.622
##    34      6       1   0.2761  0.1020       0.1338        0.569
##    43      5       1   0.2208  0.0954       0.0947        0.515
##    45      4       1   0.1656  0.0860       0.0598        0.458
##    48      2       1   0.0828  0.0727       0.0148        0.462
# Create aml life tables and KM plots broken out by treatment (x,  "Maintained" vs. "Not maintained")
surv.by.aml.rx <- survfit(Surv(time, status == 1) ~ x, data = aml)

summary(surv.by.aml.rx)
## Call: survfit(formula = Surv(time, status == 1) ~ x, data = aml)
## 
##                 x=Maintained 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     9     11       1    0.909  0.0867       0.7541        1.000
##    13     10       1    0.818  0.1163       0.6192        1.000
##    18      8       1    0.716  0.1397       0.4884        1.000
##    23      7       1    0.614  0.1526       0.3769        0.999
##    31      5       1    0.491  0.1642       0.2549        0.946
##    34      4       1    0.368  0.1627       0.1549        0.875
##    48      2       1    0.184  0.1535       0.0359        0.944
## 
##                 x=Nonmaintained 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5     12       2   0.8333  0.1076       0.6470        1.000
##     8     10       2   0.6667  0.1361       0.4468        0.995
##    12      8       1   0.5833  0.1423       0.3616        0.941
##    23      6       1   0.4861  0.1481       0.2675        0.883
##    27      5       1   0.3889  0.1470       0.1854        0.816
##    30      4       1   0.2917  0.1387       0.1148        0.741
##    33      3       1   0.1944  0.1219       0.0569        0.664
##    43      2       1   0.0972  0.0919       0.0153        0.620
##    45      1       1   0.0000     NaN           NA           NA
# Plot KM 
plot(surv.by.aml.rx, xlab = "Time", ylab="Survival",col=c("black", "red"), lty = 1:2, main="Kaplan-Meier Survival vs. Maintenance in AML")

# Add legend
legend(100, .6, c("Maintained", "Not maintained"), lty = 1:2, col=c("black", "red"))

# Perform the log rank test using the R function survdiff().

surv.diff.aml <- survdiff(Surv(time, status == 1) ~ x, data=aml)

surv.diff.aml
## Call:
## survdiff(formula = Surv(time, status == 1) ~ x, data = aml)
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained    11        7    10.69      1.27       3.4
## x=Nonmaintained 12       11     7.31      1.86       3.4
## 
##  Chisq= 3.4  on 1 degrees of freedom, p= 0.07

25 Wishart distribution

Generate Wishart distribution

## Artificial
S <- toeplitz((10:1)/10)
set.seed(11)
R <- rWishart(1000, 20, S)
dim(R)  #  10 10  1000
## [1]   10   10 1000
cor(R[,,1])
##             [,1]       [,2]        [,3]       [,4]        [,5]        [,6]
##  [1,]  1.0000000  0.9892037  0.85009974  0.7349126  0.40302917 -0.10687984
##  [2,]  0.9892037  1.0000000  0.90912646  0.7834566  0.41542351 -0.12399023
##  [3,]  0.8500997  0.9091265  1.00000000  0.9135839  0.57037342  0.01718272
##  [4,]  0.7349126  0.7834566  0.91358394  1.0000000  0.82871062  0.38087610
##  [5,]  0.4030292  0.4154235  0.57037342  0.8287106  1.00000000  0.79783054
##  [6,] -0.1068798 -0.1239902  0.01718272  0.3808761  0.79783054  1.00000000
##  [7,] -0.5194526 -0.5528924 -0.45133366 -0.1292602  0.32855574  0.74949700
##  [8,] -0.8033035 -0.8441922 -0.76921814 -0.5102579 -0.05009135  0.45706156
##  [9,] -0.9787197 -0.9846968 -0.85143400 -0.7074475 -0.31657491  0.17689287
## [10,] -0.9771913 -0.9736050 -0.85344880 -0.7742355 -0.44638410  0.02528311
##             [,7]        [,8]       [,9]       [,10]
##  [1,] -0.5194526 -0.80330348 -0.9787197 -0.97719132
##  [2,] -0.5528924 -0.84419222 -0.9846968 -0.97360500
##  [3,] -0.4513337 -0.76921814 -0.8514340 -0.85344880
##  [4,] -0.1292602 -0.51025791 -0.7074475 -0.77423553
##  [5,]  0.3285557 -0.05009135 -0.3165749 -0.44638410
##  [6,]  0.7494970  0.45706156  0.1768929  0.02528311
##  [7,]  1.0000000  0.87737055  0.5566532  0.43394504
##  [8,]  0.8773705  1.00000000  0.8480151  0.75653301
##  [9,]  0.5566532  0.84801514  1.0000000  0.97486002
## [10,]  0.4339450  0.75653301  0.9748600  1.00000000
mR <- apply(R, 1:2, mean)  # ~= E[ Wish(S, 20) ] = 20 * S
stopifnot(all.equal(mR, 20*S, tolerance = .009))


## See Details, the variance is
Va <- 20*(S^2 + tcrossprod(diag(S)))
vR <- apply(R, 1:2, var)
stopifnot(all.equal(vR, Va, tolerance = 1/16))