1 Setting
2 Random matrix theory
> M = 100
> K = 500
> S = matrix(rnorm(M*K,sd=sqrt(1/K)), M, K)
> SS = tcrossprod(S, S)
> heatmap(SS, symm = T, revC = T,
+ Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "S-by-t(S)",
+ col = hcl.colors(250, "Blues", rev = T))
> heatmap(diag(M), symm = T, revC = T,
+ Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "Identity matrix",
+ col = hcl.colors(250, "Blues", rev = T))
>
> n1=20
> n2=25
> freq=runif(M, 0.05, 0.5)
> X1 = GenerateGenoMatrix(n1, freq, F)
> X2 = GenerateGenoMatrix(n2, freq, F)
> X1SSX2 = X1 %*% SS %*% t(X2)
> X1X2 = X1 %*% t(X2)
> heatmap(X1SSX2, revC = T,
+ Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "X1SSX2",
+ col = hcl.colors(250, "Blues", rev = T))
> heatmap(X1X2, revC = T,
+ Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "X1X2",
+ col = hcl.colors(250, "Blues", rev = T))3 encGRM
3.1 var(GRM)
- population: 2 cohorts with (0-3)-degree relations under increasing number of markers.
- LD: without LD
> ScaleRow = T
> n1 = 1000
> n2 = 1000
> n_cp = n1
> n1_0 = n1-n_cp
> n2_0 = n2-n_cp
> Mvec = seq(n1, n1*2, n1/5)
> Mnum = length(Mvec)
> dt0 = data.frame()
> for(r in 0:3)
+ {
+ rst = vector()
+ for(i in 1:Mnum)
+ {
+ M = Mvec[i]
+ freq = runif(M, 0.05, 0.5)
+ ## r-degree relatives
+ Gr = GenerateGeno_r(freq, n_cp, r, FALSE)
+ X1 = Gr[[1]]
+ X2 = Gr[[2]]
+ X1 = rbind(X1, GenerateGeno(freq, n1_0))
+ X2 = rbind(X2, GenerateGeno(freq, n2_0))
+ if (ScaleRow)
+ {
+ X1 = t(apply(X1, 1, scale))
+ X2 = t(apply(X2, 1, scale))
+ } else {
+ X1 = scale(X1)
+ X2 = scale(X2)
+ }
+
+ ## real relationship calculated from individual genotype
+ K12 = tcrossprod(X1, X2) / M
+ rst = c(rst, var(diag(K12)))
+ }
+ dt = data.frame(grp = rep(c("Theoretical", "Observed"), each = Mnum),
+ r = r,
+ M = rep(Mvec, 2),
+ var = c((1-(0.5)^r)/Mvec, rst))
+ dt0 = rbind(dt0, dt)
+ }
> dt0$r = factor(dt0$r, labels = paste0("Degree:",0:3))
> ggplot(data = dt0, aes(x=M, y=var, grp=grp))+
+ geom_point(aes(shape=grp, color=grp), size = 2)+
+ scale_color_lancet(name="") +
+ scale_shape_manual(values = c(16,17), name="")+
+ ylab("Var(GRM)")+
+ facet_wrap(~r, ncol = 2)+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))3.2 var(encGRM)
- population: 2 cohorts with (0-3)-degree relations under increasing number of markers.
- LD: without LD
> ScaleRow = T
> n1 = 1000
> n2 = 1000
> n_cp = n1
> n1_0 = n1-n_cp
> n2_0 = n2-n_cp
> Mvec = seq(n1, n1*2, n1/5)
> Kvec = ceiling(Mvec/(1/0.8-1))
> Mnum = length(Mvec)
> dt0 = data.frame()
> for(r in 0:3)
+ {
+ rst = vector()
+ for(i in 1:Mnum)
+ {
+ M = Mvec[i]
+ K = Kvec[i]
+ freq = runif(M, 0.05, 0.5)
+ ## r-degree relatives
+ Gr = GenerateGeno_r(freq, n_cp, r, FALSE)
+ X1 = Gr[[1]]
+ X2 = Gr[[2]]
+ X1 = rbind(X1, GenerateGeno(freq, n1_0))
+ X2 = rbind(X2, GenerateGeno(freq, n2_0))
+ if (ScaleRow)
+ {
+ X1 = t(apply(X1, 1, scale))
+ X2 = t(apply(X2, 1, scale))
+ } else {
+ X1 = scale(X1)
+ X2 = scale(X2)
+ }
+
+ ## encryption
+ S = matrix(rnorm(M*K,sd=sqrt(1/M)), M, K)
+ X1hat = tcrossprod(X1, t(S))
+ X2hat = tcrossprod(X2, t(S))
+ if (ScaleRow)
+ {
+ X1hat = t(apply(X1hat, 1, scale))
+ X2hat = t(apply(X2hat, 1, scale))
+ } else {
+ X1hat = scale(X1hat)
+ X2hat = scale(X2hat)
+ }
+
+ ## realized relationship calculated from encrypted genotype
+ K12hat = tcrossprod(X1hat, X2hat) / K
+ rst = c(rst, var(diag(K12hat)))
+ }
+ dt = data.frame(grp = rep(c("Theoretical", "Observed"), each = Mnum),
+ r = r,
+ M = rep(Mvec, 2),
+ var = c((1-(0.5)^r)/Mvec+(1-(0.5)^r)/Kvec, rst))
+ dt0 = rbind(dt0, dt)
+ }
> dt0$r = factor(dt0$r, labels = paste0("Degree:",0:3))
> ggplot(data = dt0, aes(x=M, y=var, grp=grp))+
+ geom_point(aes(shape=grp, color=grp), size = 2)+
+ scale_color_lancet(name="") +
+ scale_shape_manual(values = c(16,17), name="")+
+ ylab("Var(encGRM)")+
+ facet_wrap(~r, ncol = 2)+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))3.3 M table
- Required number of markers in distinguishing relatives
- Upon the number of markers included, the statistical power to identify r-degree relatives is as below.
> n1 = n2 = 200
> m = c(50,500,5000,10000)
> alpha = 0.05 # type I error
> Kseta = c(0.95, 0.45, 0.2, 0.075)
> D = length(Kseta)
> M = length(m)
> P = matrix(NA, D, M)
> P = as.data.frame(P)
> for(i in 1:D)
+ {
+ for(j in 1:M)
+ {
+ P[i,j] = pnorm((sqrt(m[j])*Kseta[i]-qnorm(1-alpha/n1/n2/2))/sqrt(1-Kseta[i]^2)) #???
+ }
+ }
> rownames(P) = c("identity","1-degree","2-degree","3-degree")
> colnames(P) = paste0("M=",m)
> P %>%
+ kbl(caption = paste0("Table1 Statistical power to identify r-degree relatives ($\\alpha$=", alpha, ", n1=",n1,", n2=",n2,")"),
+ escape = FALSE, digits = 3) %>%
+ kable_classic(html_font = "Cambria", font_size = 15)| M=50 | M=500 | M=5000 | M=10000 | |
|---|---|---|---|---|
| identity | 1.000 | 1.000 | 1.000 | 1.000 |
| 1-degree | 0.031 | 1.000 | 1.000 | 1.000 |
| 2-degree | 0.000 | 0.351 | 1.000 | 1.000 |
| 3-degree | 0.000 | 0.001 | 0.676 | 0.996 |
3.4 K table
- Minimal column number of random matrix
> n1 = n2 = 200
> alpha = 0.05
> beta = 0.2
> Kseta = c(0.95, 0.45, 0.2, 0.075, 0)
> D = length(Kseta)
> K = matrix(NA, 1, D-1)
> for(i in 1:(D-1))
+ {
+ tmp = ( ( qnorm(1-beta)*sqrt(1-Kseta[i]^2) + qnorm(1-alpha/n1/n2/2) ) / Kseta[i] )^2
+ K[1,i] = as.character(round(tmp))
+ }
> colnames(K) = c("identity","1-degree","2-degree","3-degree")
> rownames(K) = "unrelated"
>
> K %>%
+ kbl(caption = paste0("Table2 Minimal column number of random matrix(K) ($\\alpha$=", alpha,", $\\beta$=", beta, ", n1=",n1,", n2=",n2,")"),escape = FALSE) %>%
+ kable_classic(html_font = "Cambria", font_size = 15)| identity | 1-degree | 2-degree | 3-degree | |
|---|---|---|---|---|
| unrelated | 29 | 155 | 804 | 5749 |
3.5 Rsq
Purpose: to measure the encryption accuracy between real GRM and encrypted GRM, and computational cost under different encryption levels.
- R2(G,Ghat)
- population: 2 cohorts of unrelated individuals
- LD: with or without LD
> n1 = 200
> n2 = 200
> M = 500 # number of markers
> freq = runif(M, 0.05, 0.5)
> Kvec = seq(2*M, 10*M, 2*M)
> knum = length(Kvec)
> count = 20
> ############### LE ##################
> ## genotype
> X1 = GenerateGeno(freq, n1)
> X2 = GenerateGeno(freq, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
> K21 = tcrossprod(X2, X1) / M
> K11 = tcrossprod(X1, X1) / M
> Me = 1/var(K11[lower.tri(K11)])
> ## Encryption
> error = matrix(NA, knum, count)
> corrK21 = matrix(NA, knum, count)
> time_LE = matrix(NA, knum, count)
> for(c in 1:count)
+ {
+ for(i in 1:knum)
+ {
+ t0 = proc.time()
+ K = Kvec[i]
+ A = RandomMatrixEncryption(X1, X2, M, K, FALSE)
+ K21hat = tcrossprod(A[[2]], A[[1]])/ K
+ t1 = proc.time()
+ time_LE[i,c] = (t1-t0)[1]
+ error[i,c] = sqrt(sum((as.vector(K21)-as.vector(K21hat))^2))
+ corrK21[i,c] = cor(as.vector(K21), as.vector(K21hat))^2
+ }
+ }
> corrK21.t = 1/(1+Me/Kvec)
> corr2 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)),
+ rsq=c(apply(corrK21, 1, mean), corrK21.t),
+ sd=c(apply(corrK21, 1, sd), rep(0,knum)))
> error.t=sqrt(n1*n2/Kvec)
> error2 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)),
+ error=c(apply(error, 1, mean), error.t),
+ sd=c(apply(error, 1, sd), rep(0,knum)))
>
> ############### LD ##################
> ## genotype
> Dprime = runif(M, 0.5, 0.9)
> ld = DprimetoD(freq, Dprime)
> X1 = GenerateLDGeno(freq, ld, n1)
> X2 = GenerateLDGeno(freq, ld, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
> K21 = tcrossprod(X2, X1) / M
> K11 = tcrossprod(X1, X1) / M
> Me = 1/var(K11[lower.tri(K11)])
> ## Encryption
> error = matrix(NA, knum, count)
> corrK21 = matrix(NA, knum, count)
> time_LD = matrix(NA, knum, count)
> for(c in 1:count)
+ {
+ for(i in 1:knum)
+ {
+ t0 = proc.time()
+ K = Kvec[i]
+ A = RandomMatrixEncryption(X1, X2, M, K, FALSE)
+ K21hat = tcrossprod(A[[2]], A[[1]])/ K
+ error[i,c] = sqrt(sum((as.vector(K21)-as.vector(K21hat))^2))
+ corrK21[i,c] = cor(as.vector(K21), as.vector(K21hat))^2
+ t1 = proc.time()
+ time_LD[i,c] = (t1-t0)[1]
+ }
+ }
> corrK21.t = 1/(1+Me/Kvec)
> corr1 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)),
+ rsq=c(apply(corrK21, 1, mean), corrK21.t),
+ sd=c(apply(corrK21, 1, sd), rep(0,knum)))
> error.t = sqrt(n1*n2/Kvec)
> error1 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)),
+ error=c(apply(error, 1, mean), error.t),
+ sd=c(apply(error, 1, sd), rep(0,knum)))
>
> ################# Plot ################
> color = c("olivedrab3","lightyellow3")
> ## GRM accuracy
> dt = rbind(corr2,corr1)
> dt$ld = rep(c("without LD", "with LD"), each=2*knum)
> ggplot(dt, aes(x=K, y=rsq, fill=grp, color=grp))+
+ geom_bar(stat="identity", width = 0.6, position=position_dodge(width = 0.8))+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=0.2, position=position_dodge(0.8))+
+ geom_text(aes(label=round(rsq,4)),vjust=-1.2, color="black", size=2.5, position=position_dodge(0.8))+
+ scale_color_manual(values=color, name="") +
+ scale_fill_manual(values=color, name="") +
+ labs(title = expression(paste("Accuracy between G and ", hat(G))),
+ subtitle = paste("M=",M," n1=",n1," n2=", n2),
+ y=expression(R^2))+
+ facet_wrap(~ld)+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
> ## error
> dt = rbind(error2,error1)
> dt$ld = rep(c("without LD", "with LD"), each=2*knum)
> ggplot(dt, aes(x=K, y=error, fill=grp, color=grp))+
+ geom_bar(stat="identity", width = 0.6, position=position_dodge(width = 0.8))+
+ geom_errorbar(aes(ymin=error-sd, ymax=error+sd), color="black", width=0.2, position=position_dodge(0.8))+
+ geom_text(aes(label=round(error,4)),vjust=-1.2, color="black", size=2.5, position=position_dodge(0.8))+
+ scale_color_manual(values=color, name="") +
+ scale_fill_manual(values=color, name="") +
+ labs(title = expression(paste("Error between G and ", hat(G))),
+ subtitle = paste("M=",M," n1=",n1," n2=", n2),
+ y="error")+
+ facet_wrap(~ld)+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
>
> ## CPU time
> time = data.frame(grp=rep(c("without LD", "with LD"), each=knum), K=as.factor(rep(Kvec, 2)),
+ t=c(apply(time_LE, 1, mean), apply(time_LD, 1, mean)),
+ sd=c(apply(time_LE, 1, sd), apply(time_LD, 1, sd)))
> ggplot(time, aes(x=K, y=t, group = grp))+
+ geom_point()+
+ geom_line()+
+ geom_errorbar(aes(ymin=t-sd, ymax=t+sd), width=.2,
+ position=position_dodge(0.05))+
+ scale_color_lancet(name="")+
+ labs(title = "CPU time for Encryption",
+ subtitle = paste("M=",M," n1=",n1," n2=", n2),
+ y="CPU time(secs)")+
+ facet_wrap(~grp)+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 3.6 M vs K
- population: 2 cohorts with identity, 1-degree, 2-degree and 3-degree relations
- LD: without LD
> color = c("indianred","sienna1","goldenrod2","olivedrab3","lightyellow3")
> n1 = 200 # sample size in cohort1
> n2 = 200 # sample size in cohort2
> alpha = 0.05/(n1*n2) # type I error
> beta = 0.2 # type II error two tail test
> D = 4 # top degree of relations included in simulation
> r = c(0:(D-1))
> n_cp = rep(10, D) # numbers for each degree of relations
> n1_0 = n1-sum(n_cp)
> n2_0 = n2-sum(n_cp)
>
> p=1
> dt = list()
> Mvec=c(50, 200, 900, 6200)
> setavec=c(0.95, 0.45, 0.2, 0.075)
> for(m in 1:length(Mvec))
+ {
+ for(k in 1:length(setavec))
+ {
+ # Determine K
+ seta = setavec[k]
+ K = ( ( qnorm(1-beta)*sqrt(1-seta^2) + qnorm(1-alpha/2) ) / seta )^2
+ K = ceiling(K)
+ # Genotypes
+ M = Mvec[m]
+ freq = rep(0.5, M)
+ X1 = matrix(NA, 0, M)
+ X2 = matrix(NA, 0, M)
+ for(i in 1:D)
+ {
+ Gr = GenerateGeno_r(freq, n_cp[i], r[i], FALSE)
+ X1 = rbind(X1, Gr[[1]])
+ X2 = rbind(X2, Gr[[2]])
+ }
+ X1 = rbind(X1, GenerateGeno(freq, n1_0))
+ X2 = rbind(X2, GenerateGeno(freq, n2_0))
+ X1 = scale(X1)
+ X2 = scale(X2)
+
+ ## Encryption
+ A = RandomMatrixEncryption(X1, X2, M, K, FALSE)
+ vG21hat = as.vector(tcrossprod(A[[2]], A[[1]])/ K)
+ vG21 = as.vector( tcrossprod(X2, X1) / M)
+
+ ## Save data
+ yin = qnorm(1/n1/n2)*sqrt(1/M)
+ xin = qnorm(1/n1/n2)*sqrt(1/M+1/K)
+ lab = lab_relations_true(n1, n2, n_cp)
+ dt[[p]] = data.frame(x=vG21hat, y=vG21, lab=as.factor(lab), M=M, K=K, yin=yin, xin=xin)
+
+ p=p+1
+ }
+ }
>
> dt0 = do.call("rbind", dt)
> dt0$K = as.factor(dt0$K)
> dt0$M = as.factor(dt0$M)
> ggplot(dt0, aes(x=x, y=y)) +
+ geom_point(aes(color=lab), size=0.3) +
+ geom_hline(aes(yintercept = yin), color="blue", alpha=0.5, linetype = 5) +
+ geom_hline(aes(yintercept = -yin), color="blue", alpha=0.5, linetype = 5) +
+ geom_vline(aes(xintercept = xin), color="red", alpha=0.5, linetype = 5) +
+ geom_vline(aes(xintercept = -xin), color="red", alpha=0.5, linetype = 5) +
+ scale_x_continuous(limits=c(-1, 1.15)) + xlab("Estimated relatedness") +
+ scale_y_continuous(limits=c(-1, 1.15)) + ylab("True relatedness") +
+ scale_color_manual(values=color, name="Relations",
+ labels=c("identity","1-degree","2-degree","3-degree","unrelated")) +
+ facet_grid(M~K, labeller = label_both) +
+ theme_article()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))4 encBLUP
4.1 “chain rule”
- Using GBLUP and encGBLUP as an example to examine the relationship between three squared correlation.
- “chain rule”: E(R_p2)=E(R_(p_enc)2)/P
- population: 2 cohorts of unrelated individuals
- LD: with LD
> ## genotype
> h2 = 0.5 # 0.3 and 0.8
> n1 = 200
> n2 = 200
> M = 500
> freq = rep(0.5, M)
> Dprime = runif(M, 0.5, 0.9)
> ld = DprimetoD(freq, Dprime)
> X1 = GenerateLDGeno(freq, ld, n1)
> X2 = GenerateLDGeno(freq, ld, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
>
> ## choose encP
> encP = c(0.5, 0.7, 0.8, 0.9, 0.95, 0.99)
> encPnum = length(encP)
> G11 = tcrossprod(X1) / M
> Me = 1/var(G11[lower.tri(G11)])
> ERsq1 = h2/(1+Me/n1/h2)
>
> count = 20
> Rsq1 = matrix(NA, 1, count)
> Rsq3 = matrix(NA, encPnum, count)
> for(c in 1:count){
+ ## simulate y
+ # Polygenic Model
+ sigma_g = h2
+ b = rnorm(M, sd=sqrt(sigma_g/M))
+ g1 = X1 %*% b
+ g2 = X2 %*% b
+ y1 = g1 + rnorm(n1, sd=sqrt((1/h2-1)*var(g1)))
+ y2 = g2 + rnorm(n2, sd=sqrt((1/h2-1)*var(g2)))
+
+ # Omnigenic Model
+ # sigma_g = h2
+ # b = runif(0.1*M, 10, 15)
+ # qtl = sample(1:M, 0.1*M)
+ # g1 = X1[,qtl] %*% b
+ # g2 = X2[,qtl] %*% b
+ # y1 = g1 + rnorm(n1, sd=sqrt((1/h2-1)*var(g1)))
+ # y2 = g2 + rnorm(n2, sd=sqrt((1/h2-1)*var(g2)))
+
+ ## GBLUP (Rsq1)
+ gblup = GBLUP(X1, X2, y1)
+ Rsq1[1,c] = cor(gblup, y2)^2
+
+ ## encGBLUP (Rsq3)
+ encgblup = list()
+ for(i in 1:encPnum)
+ {
+ encgblup[[i]] = encGBLUP(X1, X2, y1, encP = encP[i], Me)
+ }
+ encgblup = matrix(unlist(encgblup), n2, encPnum)
+ Rsq3[,c] = apply(encgblup, 2, function(x) cor(x, y2)^2)
+
+ # ## gwasBLUP
+ # gwas = gwasBLUP(X1, X2, y1)
+ # Rsq1[1,c] = cor(gwas, y2)^2
+ #
+ # ## encgwasBLUP (Rsq3)
+ # encgwas = list()
+ # for(i in 1:encPnum)
+ # {
+ # encgwas[[i]] = encgwasBLUP(X1, X2, y1, encP[i], Me)
+ # }
+ # encgwas = matrix(unlist(encgwas), n2, encPnum)
+ # Rsq3[,c] = apply(encgwas, 2, function(x) cor(x, y2)^2)
+ print(c)
+ }
>
> ## plot tranformed Rsq1 by dividing Rsq3 by encP
> layout(matrix(1:2, 1,2))
> dt = as.data.frame(t(Rsq3))
> boxplot(dt, ylim=c(0,1), xlab = "encP", ylab = "predicted accuracy", main = "Before transformation", names = encP)
> abline(h=ERsq1, col="blue")
> abline(h=mean(Rsq1[1,]), col="red")
> dt = as.data.frame(t(apply(Rsq3, 2, function(x) x/encP )))
> boxplot(dt, ylim=c(0,1), xlab = "encP", ylab = "predicted accuracy", main = "After transformation", names = encP)
> abline(h=ERsq1, col="blue")
> abline(h=mean(Rsq1[1,]), col="red")5 UKB encBLUP
5.1 Phenotypes
- Use ukbconv (converter) & ukb40268.enc_ukb (phenotype source data)
> ## copy tools to my own directory
+ cd /data4/zqx/encBLUP/2-phenoukb
+ ## r:format -S:the Category ID(50) for STANDING HEIGHT
+ nohup /data4/xuhm/data/ukb/update_Phe/40268/ukbconv /data4/xuhm/data/ukb/update_Phe/40268/ukb40268.enc_ukb r -S50 &
+ # Form a plink phenotype file: ukb50.phe
+ ID=50
+ echo -ne "FID\tIID\tukb${ID}\n" > ukb${ID}.phe
+ awk '(NR>1){print $1,$1,$2}' ukb40268.tab >> ukb${ID}.phe5.2 Test data
- UKB-Center British
- Output: user1.bed/.bim/.fam and user2.bed/.bim/.fam
> ## Three major cohorts in central British
+ cp /data4/xuhm/data/ukb/british/entire/cohort/Liverpool.txt .
+ cp /data4/xuhm/data/ukb/british/entire/cohort/Leeds.txt .
+ cp /data4/xuhm/data/ukb/british/entire/cohort/Manchester.txt .
+ cat Leeds.txt Liverpool.txt Manchester.txt > center.txt
+ plink --bfile /data4/xuhm/data/ukb/british/unrelated_british/urltd_British --keep center.txt --make-bed --out center
+
+ ## SNP QC (589876 variants and 53051 people pass filters and QC.)
+ plink --bfile center --maf 0.01 --make-bed --out center # 131586 variants removed
+ plink --bfile center --hwe 1e-7 --make-bed --out center # 20701 variants removed
+ plink --bfile center --geno 0.05 --make-bed --out center # 42093 variants removed
+ plink --bfile center --pheno ../2-phenoukb/ukb50.phe --make-bed --out center # merge phenotypes
+
+ ## Pruning and clumping (37734 variants remained)
+ plink --bfile center --indep-pairwise 50 5 0.8 --out center
+ plink2 --bfile center --extract center.prune.in --glm allow-no-covars --pfilter 0.05 --out center
+ awk '(NR>1){print $3}' center.PHENO1.glm.linear > candidates.txt
+ rm *~
+
+ ## Test Plink files for two users
+ awk '{print $1,$2}' center.fam > urltd-center.txt
+
+ dir=sample
+ mkdir $dir
+ shuf -n53051 urltd-center.txt > ./$dir/test.txt
+ head -n42440 ./$dir/test.txt > ./$dir/user1.txt
+ tail -n10611 ./$dir/test.txt > ./$dir/user2.txt
+ plink --bfile center --keep ./$dir/user1.txt --extract candidates.txt --make-bed --out ./$dir/user1
+ plink --bfile center --keep ./$dir/user2.txt --extract candidates.txt --make-bed --out ./$dir/user2
+ cd $dir5.3 Protocol-std
> ################# GLOBAL #################
> t0=Sys.time()
> sc = 1
> u1 = "user1" # default user1 name
> u2 = "user2" # default user2 name
> flg.me = F # if TRUE, then Me is already been given in Me.txt
> flg.std = T # if TRUE, genotypes will be standardized
> flg.u1.1 = T # Step1-User1: Me and K calculation, if R2 changed this flag should be TRUE
> flg.u1.2 = T # Step2-User1: GWAS
> flg.sv.3 = T # Step3-Server: generate random matrix
> flg.u1.4 = T # Step4-User1: Encryption beta
> flg.u2.5 = T # Step5-User2: Encryption genotypes matrix
> flg.sv.6 = T # Step6-Server: Genomic Prediction
> arg = commandArgs(T); R2 = as.numeric(arg[1]) # encryption precision
> M = system(paste0("awk 'END{print NR}' ", u1, ".bim"), intern = T)
> write.table(M, "M.txt", quote = F, col.names = F, row.names = F)
> plink = system("which plink",intern=T) # path to plink
> plink2 = system("which plink2",intern=T) # path to plink2
>
> ################# Step1-User1: Me and K calculation #################
> # Input: user1.bed/.bim/.fam
> # Output: K.txt and Me.txt
> if(flg.u1.1)
+ {
+ if(flg.me)
+ {
+ # if Me has already been calculated
+ Me = as.numeric(read.table("Me.txt"))
+ }
+ else
+ {
+ # if Me is not calculated
+ grm_cmd = paste0(plink, " --bfile ", u1, " --make-grm-gz --out ",u1)
+ system(grm_cmd)
+ gz = gzfile(paste0(u1,".grm.gz"))
+ grm = read.table(gz, as.is = T)
+ offDiag = grm[grm[,1]!=grm[,2], 4]
+ Me = 1/var(offDiag/sc, na.rm = TRUE)
+ write.table(Me, "Me.txt", quote = F, col.names = F, row.names = F)
+ }
+ K = round(Me/(1/R2-1))
+ write.table(K, "K.txt", quote = F, col.names = F, row.names = F)
+ print("========== Step1-User1: Me and K calculation finished!! ==============")
+ }
>
> #################### Step2-User1: GWAS ###################
> # Input: user1.bed/.bim/.fam
> # Output: A1.txt ( SNPID | A1 ) and beta.txt ( Beta )
> if(flg.u1.2)
+ {
+ if(flg.std)
+ {
+ cmd1 = paste0(plink, " --bfile ", u1, " --linear standard-beta --out ",u1)
+ cmd2 = paste0("awk '(NR>1){print $2,$4}' ",u1, ".assoc.linear > A1.txt")
+ cmd3 = paste0("awk '(NR>1){print $7}' ", u1, ".assoc.linear > beta.txt")
+ }else{
+ cmd1 = paste0(plink2, " --bfile ", u1, " --glm allow-no-covars --variance-standardize --out ",u1)
+ cmd2 = paste0("awk '(NR>1){print $3,$6}' ",u1, ".PHENO1.glm.linear > A1.txt")
+ cmd3 = paste0("awk '(NR>1){print $9}' ", u1, ".PHENO1.glm.linear > beta.txt")
+ }
+ system(cmd1)
+ system(cmd2)
+ system(cmd3)
+ print("========== Step2-User1: GWAS finished!! ====================")
+ }
>
> ############## Step3-Server: generate random matrix ##############
> # Input: M.txt and K.txt
> # Output: key.txt
> if(flg.sv.3)
+ {
+ M = as.numeric(read.table("M.txt"))
+ K = as.numeric(read.table("K.txt"))
+ W = matrix(rnorm(M*K, sd=sqrt(1/M)), M, K)
+ write.table(format(W, digits = 3), file="key.txt", quote = F, col.names = F, row.names = F)
+ print("========== Step3-Server: generate random matrix finished!! ==========")
+ }
>
> ##################### Step4-User1: Encryption beta #####################
> # Input: key.txt and beta.txt
> # Output: locked-beta.txt
> if(flg.u1.4)
+ {
+ W = as.matrix(read.table("key.txt"))
+ beta = read.table("beta.txt")
+ locked_beta= crossprod(as.matrix(beta), W)
+ write.table(format(t(locked_beta), digits = 4), file="locked-beta.txt", quote = F, col.names = F, row.names = F)
+ print("========== Step4-User1: Encryption beta finished!! ================")
+ }
>
> ##################### Step5-User2: Encryption Genotype Matrix #####################
> # Input: K.txt, key.txt and user2.bed/.bim/.fam
> # Output: user2.sscore
> if(flg.u2.5)
+ {
+ K = as.numeric(read.table("K.txt"))
+ cmd1 = "paste -d \" \" A1.txt key.txt > A1-key.txt"
+ if(flg.std){
+ cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-key.txt 1 2 variance-standardize --score-col-nums 3-",K+2," --variance-standardize --out ", u2)
+ }else{
+ cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-key.txt 1 2 --score-col-nums 3-",K+2," --variance-standardize --out ", u2)
+ }
+ cmd3 = paste0("rm A1-key.txt")
+ system(cmd1)
+ system(cmd2)
+
+ print("========== Step5-User2: Encryption Genotype Matrix finished!! ===============")
+ }
>
> ##################### Step6-Server: Prediction #####################
> # Input: user2.sscore and locked-beta.txt
> # Output: yhat
> if(flg.sv.6)
+ {
+ Xenc = read.table(paste0(u2, ".sscore"))
+ benc = read.table("locked-beta.txt")
+ y2 = Xenc[,3]
+ yhat = tcrossprod(as.matrix(Xenc[,-c(1:5)]), t(as.matrix(benc)))
+ write.table(cor(y2, yhat)^2, paste0("Accuracy.",R2,".txt"), quote = F, col.names = F, row.names = F)
+ print("========== Step6-Server: Prediction finished!! ===============")
+ }
>
> t1=Sys.time()
> print(t1-t0)5.4 Open-std
> ################# GLOBAL #################
> t0=Sys.time()
> u1 = "user1" # default user1 name
> u2 = "user2" # default user2 name
> flg.std = T # if TRUE, genotypes will be standardized
> plink = system("which plink",intern=T) # path to plink
> plink2 = system("which plink2",intern=T) # path to plink2
>
> ## GWAS
> # if(flg.std)
> # {
> # cmd1 = paste0(plink, " --bfile ", u1, " --linear standard-beta --out ",u1)
> # cmd2 = paste0("awk '(NR>1){print $2,$4}' ",u1, ".assoc.linear > A1.txt")
> # cmd3 = paste0("awk '(NR>1){print $7}' ", u1, ".assoc.linear > beta.txt")
> # }else{
> # cmd1 = paste0(plink2, " --bfile ", u1, " --glm allow-no-covars --variance-standardize --out ",u1)
> # cmd2 = paste0("awk '(NR>1){print $3,$6}' ",u1, ".PHENO1.glm.linear > A1.txt")
> # cmd3 = paste0("awk '(NR>1){print $9}' ", u1, ".PHENO1.glm.linear > beta.txt")
> # }
> # system(cmd1)
> # system(cmd2)
> # system(cmd3)
>
> ## Predict
> cmd1 = "paste -d \" \" A1.txt beta.txt > A1-beta.txt"
> if(flg.std){
+ cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-beta.txt 1 2 3 variance-standardize --variance-standardize --out ", u2, ".open")
+ }else{
+ cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-beta.txt 1 2 3 --variance-standardize --out ", u2, ".open")
+ }
> system(cmd1)
> system(cmd2)
> Xenc = read.table(paste0(u2, ".open.sscore"))
> y2 = Xenc[,3]
> yhat = Xenc[,6]
> write.table(cor(y2, yhat)^2, paste0("Accuracy.open.txt"), quote = F, col.names = F, row.names = F)
>
>
> t1=Sys.time()
> print(t1-t0)- old test version
- this was tested on 133server
> ## Input
> setwd("~/Desktop/Cryptography/ukb")
> geno.opts = mPhen.options("geno.input")
> # geno.opts$mPhen.batch = 5000
> Xinput = read.plink("test",opts = geno.opts)
> n = dim(Xinput)[1]
> M = dim(Xinput)[2]
> pheno.opts = mPhen.options("pheno.input")
> pheno.opts$mPhen.sep.pheno=" "
> pheno.opts$mPhen.numHeaderRows.pheno = 0
> pheno = mPhen.readPhenoFiles("test.fam", opts = pheno.opts)
> y = as.matrix(pheno$pheno[,3])
>
> ## randomly impute genotype NA
> Xinput[is.na(Xinput)] = sample(c(0,1,2), length(which(is.na(as.vector(Xinput)))), replace = T)
> X = scale(Xinput)
> y = scale(y)
> colnames(X) = colnames(Xinput)
> rm(Xinput)
>
> ## 5-fold cross-validation
> iid = rownames(y)
> cvsize=5
> cv2mat = matrix(sample(c(iid, rep(NA,cvsize-n%%cvsize)), n+cvsize-n%%cvsize), cvsize, (n+cvsize-n%%cvsize)/cvsize )
> R2 = c(0.5, 0.7, 0.8, 0.9, 0.95)
> knum = length(R2)
> rst = matrix(NA, cvsize, (knum+1)*2)
> for(c in 2:cvsize)
+ {
+ t0=Sys.time()
+ ## divide training and testing
+ cv2 = cv2mat[c,]
+ if(length(which(is.na(cv2)))>0) cv2 = cv2[-which(is.na(cv2))]
+ cv1 = iid[-which(iid %in% cv2)]
+ X1 = X[cv1,]
+ X2 = X[cv2,]
+ y1 = as.matrix(y[cv1,])
+ y2 = as.matrix(y[cv2,])
+ n1 = length(cv1)
+ n2 = length(cv2)
+ ## SBLUP
+ bhat = apply(X1, 2, function(x) cov(x, y1)/var(x))
+ y2hat = X2 %*% bhat
+ rsq1 = as.numeric(cor(y2hat, y2)^2)
+
+ ## GBLUP
+ y2hat = GBLUP(X1, X2, y1)
+ rsq2 = as.numeric(cor(y2hat, y2)^2)
+
+ Me = 1/var(K21[lower.tri(K21)])
+ Kvec = round(Me/(1/R2-1))
+
+ rsq3 = matrix(NA, 1, knum)
+ rsq4 = matrix(NA, 1, knum)
+ for(i in 1:knum)
+ {
+ K = Kvec[i]
+ ## encSBLUP
+ A = RandomMatrixEncryption(X2, bhat, M, K)
+ y2hat = A[[1]] %*% t(A[[2]])
+ rsq3[1,i] = as.numeric(cor(y2hat, y2)^2)
+ ## encGBLUP
+ A = RandomMatrixEncryption(X1, X2, M, K)
+ K21hat = A[[2]] %*% t(A[[1]]) / K
+ y2hat_enc = K21hat %*% solV %*% as.matrix(y1)
+ rsq4[1,i] = as.numeric(cor(y2hat_enc, y2)^2)
+ print(i)
+ }
+ rst[c,] = c(rsq1, rsq3, rsq2, rsq4)
+ t1=Sys.time()
+ print(c)
+ print("CV Time:")
+ print(t1-t0)
+ }
> load(f="./result.m37734.Rdata")
> df = data.frame(rsq=apply(rst, 2, mean),grp=rep(c("SBLUP","GBLUP"),each=6), ratio=rep(c(1,R2),2))
> df$ratio = as.factor(df$ratio)
> ggplot(data=df,aes(x=ratio, y=rsq, grp=grp, color=grp, fill=grp))+
+ geom_bar(stat="identity", width = 0.6, position="dodge")+
+ labs(title = "Center British",
+ subtitle = paste0("M=5000, n1=3253"),
+ x = "Encryption Precision",
+ y = "Prediction accuracy")+
+ theme_bw()+
+ theme(legend.position = "bottom",plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))6 UKB relatives
6.1 Prepare
- count paired 1-degree relatives between cities (“British_sib” may not be reliable)
> cd /data4/xuhm/data/ukb/british/sib
+ nohup awk '{if($8>0.354) print $0}' British_sib_tab.kin0 > 0-degree.txt &
+ nohup awk '{if($8>0.177 && $8<0.354) print $2,$4,$8}' British_sib_tab.kin0 > 1-degree.txt &
+ ## Extract city information
+ line=`awk 'END{print NR}' 1-degree.txt`
+ for i in `seq $line`
+ do
+ {
+ id1=`awk -v i=$i '(NR==i){print $1}' 1-degree.txt`
+ city1=`grep $id1 ../entire/cohort/*.txt | cut -d ' ' -f3`
+ if [ -z $city1 ]; then city1="NA"; fi
+ id2=`awk -v i=$i '(NR==i){print $2}' 1-degree.txt`
+ city2=`grep $id2 ../entire/cohort/*.txt | cut -d ' ' -f3`
+ if [ -z $city2 ]; then city2="NA"; fi
+ value=`awk -v i=$i '(NR==i){print $3}' 1-degree.txt`
+ echo -e "${id1}\t${id2}\t${city1}\t${city2}\t${value}" >> 1-degree-city.txt
+ }
+ done
+
+ ## Calculate cross-city 1-degree pairs
+ cohort_file=(`cat ../entire/cohort/cohort.txt`)
+ cohort_name=(${cohort_file[@]%.*})
+ cohort_num=${#cohort_file[@]}
+ for i in ${cohort_name[@]}
+ do
+ {
+ for j in ${cohort_name[@]}
+ do
+ {
+ if [ "$i" != "$j" ] ; then grep $i 1-degree-city.txt | grep $j | awk -vi=$i -vj=$j 'END{print i,j,NR}' >> pair_counts.txt ; fi
+ }
+ done
+ }
+ done6.2 Method2-KING
- Generate sample data
- city1=Manchester
- city2=Oxford
> ## form plink file including two cities
+ cd /data4/zqx/king
+ city1='Manchester'
+ city2='Oxford'
+ pwd='/data4/xuhm/data/ukb/british/entire/cohort/'
+ cat ${pwd}${city1}.txt ${pwd}${city2}.txt > indi.txt
+ plink --bfile ${pwd}ukb --keep indi.txt --maf 0.01 --hwe 1e-7 --geno 0.05 --make-bed --out ${city1}-${city2}.qc
+
+ ## make king table
+ nohup plink2 --bfile ${city1}-${city2}.qc --make-king-table --out ${city1}-${city2}.qc &
+
+ ## extract 1-degree relatives information between two cities
+ nohup awk '{if($8>0.177 && $8<0.354) print $2,$4,$8}' ${city1}-${city2}.qc.kin0 > 1-degree.txt &
+ line=`awk 'END{print NR}' 1-degree.txt`
+ ## list their city names
+ for i in `seq $line`
+ do
+ {
+ id1=`awk -v i=$i '(NR==i){print $1}' 1-degree.txt`
+ city1=`grep $id1 indi.txt | cut -d ' ' -f3`
+ if [ -z $city1 ]; then city1="NA"; fi
+ id2=`awk -v i=$i '(NR==i){print $2}' 1-degree.txt`
+ city2=`grep $id2 indi.txt | cut -d ' ' -f3`
+ if [ -z $city2 ]; then city2="NA"; fi
+ value=`awk -v i=$i '(NR==i){print $3}' 1-degree.txt`
+ echo -e "${id1}\t${id2}\t${city1}\t${city2}\t${value}" >> 1-degree-city.txt
+ }
+ done6.3 Method1-encGRM
- Generate sample data
- city1=Manchester
- city2=Oxford
> cd /data4/zqx/related
+ M=5000
+ city1='Manchester'
+ city2='Oxford'
+ pwd='/data4/xuhm/data/ukb/british/entire/cohort/'
+ plink --bfile ${pwd}ukb --chr 1 --maf 0.01 --hwe 1e-7 --geno 0.05 --make-bed --out ukb.chr1.qc
+ awk '{print $1,$2}' ukb.chr1.qc.bim | shuf -n${M} > snp${M}.txt
+
+ # generate plink file
+ plink --bfile ukb.chr1.qc --keep ${pwd}${city1}.txt --extract snp${M}.txt --make-bed --out $city1
+ plink --bfile ukb.chr1.qc --keep ${pwd}${city2}.txt --extract snp${M}.txt --make-bed --out $city2
+
+ # determine A1 allele
+ plink --bfile ${city1} --freq --out ${city1}
+ awk '(NR>1){print $2,$3}' ${city1}.frq > A1.txt
+
+ # generate .raw file
+ plink --bfile ${city1} --recode A --recode-allele A1.txt --out ${city1}
+ plink --bfile ${city2} --recode A --recode-allele A1.txt --out ${city2}
+
+ # generate .ind file
+ awk '{print $2}' ${city1}.fam > ${city1}.ind
+ awk '{print $2}' ${city2}.fam > ${city2}.ind- encryption
> source("/Users/apple/Desktop/Cryptography/source.R")
> setwd("/Users/apple/Desktop/Cryptography/ukb/grm/")
> ## function
> RandomMatrixEncryption = function(Mat1, Mat2, M, K)
+ {
+ if(ncol(Mat1)!=M & ncol(Mat1)!=M) return(NULL)
+ W = matrix(rnorm(M*K,sd=sqrt(1/M)), M, K)
+ RandomMat1 = Mat1 %*% W
+ RandomMat2 = Mat2 %*% W
+ RandomMat1 = t(apply(RandomMat1, 1, scale)) # scale by row
+ RandomMat2 = t(apply(RandomMat2, 1, scale)) # scale by row
+ return(list(RandomMat1,RandomMat2))
+ }
>
> c1 = "Manchester"
> c2 = "Oxford"
> X1 = read.table(paste0(c1,".raw"), header = T)
> X2 = read.table(paste0(c2,".raw"), header = T)
> X1 = X1[,-c(1:6)]
> X2 = X2[,-c(1:6)]
> name1 = read.table(paste0(c1,".ind"), header = F)
> name2 = read.table(paste0(c2,".ind"), header = F)
> rownames(X1) = name1[,1]
> rownames(X2) = name2[,1]
> n1=nrow(X1)
> n2=nrow(X2)
> M=ncol(X1)
>
> # fill NA
> X1[is.na(X1)] = sample(c(0,1,2), length(which(is.na(as.vector(X1)))), replace = T)
> X2[is.na(X2)] = sample(c(0,1,2), length(which(is.na(as.vector(X2)))), replace = T)
> X1 = apply(X1, 2, scale)
> X2 = apply(X2, 2, scale)
>
> # Encryption
> alpha = 0.05/(n1*n2) # type I error
> beta = 0.2 # type II error two tail test
> seta = 0.25
> K = ( ( qnorm(1-beta)*sqrt(1-seta^2) + qnorm(1-alpha/2) ) / seta )^2
> K = ceiling(K) # theoretical needed number of PC
> A = RandomMatrixEncryption(X1, X2, M, K)
> grm_hat = tcrossprod(A[[1]],A[[2]])/K
> grm = tcrossprod(X1,X2)/M
> Me = 1/var(grm[lower.tri(grm)])
>
> # summary
> r2_o = cor(as.vector(grm),as.vector(grm_hat))^2 # observed correlation
> r2_t = 1/(1 + Me/K) # theoretical correlation
> yin = qnorm(1/n1/n2)*sqrt(1/Me)
> xin = qnorm(1/n1/n2)*sqrt(1/Me+1/K)
>
> ## confirm
> thrd = -xin
> sign = which(as.vector(grm_hat)>thrd)
> value = round(as.vector(grm_hat)[sign], 4)
> row = sign %% n1
> col = sign %/% n1 + 1
> # output individual id
> id = matrix(NA, length(sign), 2)
> id[,1] = name1[row,1]
> id[,2] = name2[col,1]
> colnames(id) = c(c1,c2)
> id = cbind(id, value)
> write.table(id, file = "identified.txt", quote = F, row.names = F, col.names = F)
>
> ## plot.Rdata
> lab = array(9, dim = n1*n2)
> lab[sign] = 1
> dt = data.frame(x=as.vector(grm_hat), lab = lab, y=as.vector(grm), M=M, K=K, yin=yin, xin=xin)
> dt$K = as.factor(dt$K)
> dt$M = as.factor(dt$M)
> dt$lab = as.factor(dt$lab)
> # save(dt, file = "plot.Rdata")
>
> ## plot(on server)
> library(egg)
> library(ggplot2)
> load(file = "plot.Rdata")
> color = c("sienna1","lightyellow3")
> p = ggplot(dt[c(1:1000,11226754,15072985),], aes(x=x, y=y)) +
+ geom_point(aes(color=lab), size=0.3) +
+ geom_hline(aes(yintercept = yin), color="blue", alpha=0.5, linetype = 5) +
+ geom_hline(aes(yintercept = -yin), color="blue", alpha=0.5, linetype = 5) +
+ geom_vline(aes(xintercept = xin), color="red", alpha=0.5, linetype = 5) +
+ geom_vline(aes(xintercept = -xin), color="red", alpha=0.5, linetype = 5) +
+ scale_x_continuous(limits=c(-1, 1.15)) + xlab("Estimated relatedness") +
+ scale_y_continuous(limits=c(-1, 1.15)) + ylab("True relatedness") +
+ facet_grid(M~K, labeller = label_both) +
+ scale_color_manual(values=color, name="Relations", labels=c("1-degree","unrelated")) +
+ theme_article()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))
>
> ggsave(p, filename = "Manchester-Oxford-reduce.pdf", dpi = 600, width = 12.4, height = 13.3)
> ggsave(p, filename = "Manchester-Oxford-reduce.jpeg",dpi = 600, width = 12.4, height = 13.3)7 Appendix
7.1 Appendix1: Me
For 100 equifrequent biallelic loci , if the correlation for each pair of consecutive markers is 0,0.25,0.5,0.75, the effective number of markers is approximately 102, 90, 62, 30
> n1=50
> n2=50
> M=100
> freq = rep(0.5, M)
> count = 50
> for(r in c(0, 0.25, 0.5, 0.75))
+ {
+ rho = rep(r, M)
+ ld = freq^2*rho
+ me = c()
+ for(c in 1:count)
+ {
+ X1 = GenerateLDGeno(freq, ld, n1)
+ X2 = GenerateLDGeno(freq, ld, n2)
+ G = rbind(X1,X2)
+ G = t(apply(G, 1, scale)) # scale by row
+ X1 = G[1:n1,]
+ X2 = G[(n1+1):(n1+n2),]
+
+ grm = X1 %*% t(X2) / M
+ me = c(me,1/var(grm[lower.tri(grm)]))
+ }
+ print(mean(me))
+ }7.2 Appendix2: scale
What is the difference between scaling by row and by column
> n1 = 30 # sample size in cohort1
> n2 = 30 # sample size in cohort2
> n0 = 30
> alpha = 0.05/(n1*n2) # type I error
> beta = 0.2 # type II error two tail test
> seta = 0.45
> M = 100
> K = ( ( qnorm(1-beta/2)*sqrt(1-seta^2) + qnorm(1-alpha) ) / seta )^2
> K = round(K)
> freq = rep(0.5, M) # MAF of each marker
>
> ## generate genotypes of identity pairs
> Gr = GenerateGeno_r(freq, n0, 0)
> X1 = Gr[[1]]
> X2 = Gr[[2]]
> X1 = rbind(X1, GenerateGeno(freq, n1-n0))
> X2 = rbind(X2, GenerateGeno(freq, n1-n0))
> G = rbind(X1,X2)
> Gstd = list()
> Gstd[[1]] = scale(G) # scale by column
> Gstd[[2]] = t(apply(G, 1, scale)) # scale by row
>
> unrelated = list()
> identity = list()
> for(i in 1:2)
+ {
+ X1 = Gstd[[i]][1:n1,]
+ X2 = Gstd[[i]][(n1+1):(n1+n2),]
+ grm = X1 %*% t(X2) / M
+ vgrm = as.vector(grm)
+ ## seperate unrelated and related pairs in grm
+ unrelated[[i]] = vgrm[-seq(1,n1*n2,n2+1)]
+ identity[[i]] = vgrm[seq(1,n1*n2,n2+1)]
+ }
>
> layout(matrix(1:2,1,2))
> plot(identity[[2]], identity[[1]], ylim = c(-0.5,1.1), xlim = c(-0.5,1.1),
+ ylab = "Scale by column", xlab = "Scale by row", main = "Identity pairs")
> plot(unrelated[[2]], unrelated[[1]], ylim = c(-0.5,0.5), xlim = c(-0.5,0.5),
+ ylab = "Scale by column", xlab = "Scale by row", main = "Unrelated pairs")7.3 Appendix3: Rsq(relative)
- population: 2 cohorts with 1-degree relations
- LD: without LD
- x-asis: K
> flag = "simu"
> color = c("indianred","sienna1","goldenrod2","olivedrab3","lightyellow3")
> n1 = n2 = 300
> seta = 0.5
> M = 500 # number of markers
> freq = rep(0.5, M) # MAF of each marker
> D = 4
> r = c(0, 1, 2, 3)
> n_cp = c(0, n1, 0, 0) # numbers for each degree of relations
> n1_0 = n1-sum(n_cp)
> n2_0 = n2-sum(n_cp)
> X1 = matrix(NA, 0, M)
> X2 = matrix(NA, 0, M)
> for(i in 1:D)
+ {
+ Gr = GenerateGeno_r(freq, n_cp[i], r[i], FALSE)
+ X1 = rbind(X1, Gr[[1]])
+ X2 = rbind(X2, Gr[[2]])
+ }
> X1 = rbind(X1, GenerateGeno(freq, n1_0))
> X2 = rbind(X2, GenerateGeno(freq, n2_0))
> X = rbind(X1,X2)
> X = t(apply(X, 1, scale)) # scale by row
> X1 = X[1:n1,]
> X2 = X[(n1+1):(n1+n2),]
> ## real relationship calculated from individual genotype
> grm = X1 %*% t(X2) / M
> vgrm = as.vector(grm)
>
> pic = list()
> count = 20
> Kvec = seq(M, 10*M, M)
> knum = length(Kvec)
> rsqr.o = matrix(NA, knum, count)
> yvar.o = matrix(NA, knum, count)
> xvar.o = matrix(NA, knum, count)
> for(i in 1:knum)
+ {
+ K = Kvec[i]
+ for(c in 1:count)
+ {
+ ## generate random weight matrix W(M*K) w_ij~N(0,1/M)
+ W = matrix(rnorm(M*K,sd=sqrt(1/M)), M, K)
+
+ ## calculate individual PPS A1(n1*K) A2(n2*K)
+ A1 = X1 %*% W
+ A2 = X2 %*% W
+ A1 = t(apply(A1, 1, scale)) # scale by row
+ A2 = t(apply(A2, 1, scale)) # scale by row
+
+ ## Estimated GRM
+ grm_hat = A1 %*% t(A2) / K
+ vgrm_hat = as.vector(grm_hat)
+
+ ## separate unrelated and related pairs in grm(y) and grm_hat(x)
+ u.yvec = vgrm[-seq(1,n1*n2,n2+1)]
+ r.yvec = vgrm[seq(1,n1*n2,n2+1)]
+ u.xvec = vgrm_hat[-seq(1,n1*n2,n2+1)]
+ r.xvec = vgrm_hat[seq(1,n1*n2,n2+1)]
+
+ ## correlation
+ rsqr.o[i,c] = cor(r.xvec, r.yvec)^2
+ yvar.o[i,c] = var(r.yvec)
+ xvar.o[i,c] = var(r.xvec)
+ yci = qnorm(1/n1/n2)*sqrt(yvar.o)
+ xci = qnorm(1/n1/n2)*sqrt(xvar.o)
+ }
+ print(K)
+
+ # label simulated relations
+ # lab = lab_relations_true(n1, n2, n_cp)
+ # dt = data.frame(x=vgrm_hat, y=vgrm, lab=as.character(lab))
+ # s = 2
+ # pic[[k]] = ggplot(dt, aes(x=x, y=y, color=lab)) +
+ # geom_point(aes(color=lab), size=0.3) +
+ # geom_hline(aes(yintercept = u.yci), color=color[5], alpha=0.5, linetype=5, size =0.3) +
+ # geom_hline(aes(yintercept = -u.yci), color=color[5], alpha=0.5, linetype=5, size =0.3) +
+ # geom_vline(aes(xintercept = u.xci), color=color[5], alpha=0.5, linetype=5, size =0.5) +
+ # geom_vline(aes(xintercept = -u.xci), color=color[5], alpha=0.5, linetype=5, size =0.5) +
+ # geom_hline(aes(yintercept = seta+r.yci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+ # geom_hline(aes(yintercept = seta-r.yci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+ # geom_vline(aes(xintercept = seta+r.xci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+ # geom_vline(aes(xintercept = seta-r.xci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+ # scale_x_continuous(limits=c(-1.5, 1.5)) + xlab("PPSR coefficient") +
+ # scale_y_continuous(limits=c(-1.5, 1.5)) + ylab("True relatedness") +
+ # scale_color_manual(values=color[c(s,5)], name="Relations",
+ # labels=c("0-degree","1-degree","2-degree","3-degree","unrelated")[c(s,5)]) +
+ # annotate("text", x = -1, y = -1, size = 2,
+ # label = paste0("M=",M,", K=",K,"\nn1=",n1,", n2=",n2,
+ # "\nrsq.o=",round(r.rsq.o,6),"\nrsq.t=",round(r.rsq.t,6),
+ # "\nvar(x.o)=",round(r.xvar.o,6),"\nvar(x.t)=",round(r.xvar.t,6),
+ # "\nvar(y.o)=",round(r.yvar.o,6),"\nvar(y.t)=",round(r.yvar.t,6))) +
+ # theme_article()+
+ # theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))
+ #
+ # pic[[k]] = ggMarginal(pic[[k]], groupColour = T, groupFill = T,type = "density")
+ }
>
> # grid.arrange(pic[[1]],pic[[2]],pic[[3]], ncol=3)
>
> rsqr.t = 1/(1 + M/Kvec)
> yvar.t = rep((1+seta^2)/M, knum)
> xvar.t = (1+seta^2)/Kvec+(1+seta^2)/M
> apply(yvar.o, 1, mean)
> dt = data.frame(grp=rep(c("observed","theoretical"), each = knum),
+ K=rep(Kvec, 2),
+ rsq=c(apply(xvar.o, 1, mean), xvar.t),
+ sd= c(apply(xvar.o, 1, sd),rep(0,knum)))
> ggplot(dt, aes(x=K, y=rsq, group = grp, color=grp))+
+ geom_point()+
+ geom_line()+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), width=.2,
+ position=position_dodge(0.05))+
+ scale_color_lancet(name="")+
+ labs(subtitle = paste0("M=",M," n1=",n1," n2=", n2),
+ y="Variance of Ghat")+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
>
> dt = data.frame(grp=rep(c("observed","theoretical"), each = knum),
+ K=rep(Kvec, 2),
+ rsq=c(apply(rsqr.o, 1, mean), rsqr.t),
+ sd= c(apply(rsqr.o, 1, sd),rep(0,knum)))
> ggplot(dt, aes(x=K, y=rsq, group = grp, color=grp))+
+ geom_point()+
+ geom_line()+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), width=.2,
+ position=position_dodge(0.05))+
+ scale_color_lancet(name="")+
+ labs(subtitle = paste0("M=",M," n1=",n1," n2=", n2),
+ y=expression(R^2))+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 7.4 BLUP
- without LD
- no overlapping
> ## genotype
> h2 = 0.5
> n1 = 200
> n2 = 200
> M = 500 # number of markers
> freq = rep(0.5, M) # MAF of each marker
> X1 = GenerateGeno(freq, n1)
> X2 = GenerateGeno(freq, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
>
> ## K = M/(1/R2-1)
> R2 = c(0.5, 0.7, 0.8, 0.9, 0.95, 0.99)
> Kvec = round(M/(1/R2-1))
> knum = length(Kvec)
> K11 = A.mat(X1)
>
> ## phenotype
> count = 20
> rsq0 = matrix(NA, 1, count)
> rsq1 = matrix(NA, 1, count)
> rsq2 = matrix(NA, 1, count)
> rsq3 = matrix(NA, knum, count)
> t0=Sys.time()
> for(c in 1:count)
+ {
+ sigma_g = h2
+ b = rnorm(M, sd=sqrt(sigma_g/M))
+ g = X %*% b
+ sigma_e = (1/h2-1)*var(g)
+ error = rnorm(n1+n2, sd=sqrt(sigma_e))
+ y = g + error
+ y = scale(y)
+ y1 = as.data.frame(y[1:n1], row.names = rownames(X1))
+ y2 = as.data.frame(y[(n1+1):(n1+n2)], row.names = rownames(X2))
+
+ ## gwasBLUP
+ y2hat = gwasBLUP(X1, X2, y1)
+ rsq1[1,c] = as.numeric(cor(y2hat, y2)^2)
+ rsq0[1,c] = h2/(1+M/n1/h2)
+
+ ## encGBLUP
+ dt = data.frame(y1, id=rownames(y1))
+ colnames(dt)[1]="y1"
+ mod = mmer(y1~1,
+ random = ~vs(id, Gu=K11),
+ rcov = ~units, data = dt, verbose = FALSE, date.warning = FALSE)
+ sigmahat = mod$sigmaVector
+ # V
+ V = K11+diag(sigmahat[2]/sigmahat[1], n1, n1)
+ solV = solve(V)
+ # K21
+ K21 = X2 %*% t(X1) / M
+ y2hat = K21 %*% solV %*% as.matrix(y1)
+ rsq2[1,c] = as.numeric(cor(y2hat, y2)^2)
+ # K21 hat
+ for(i in 1:knum)
+ {
+ K = Kvec[i]
+ A = RandomMatrixEncryption(X1, X2, M, K, ScaleRow = T)
+ K21hat = tcrossprod(A[[2]], A[[1]]) / K
+ y2hat_enc = K21hat %*% solV %*% as.matrix(y1)
+ rsq3[i,c] = as.numeric(cor(y2hat_enc, y2)^2)
+ }
+ t1=Sys.time()
+ print(paste("Repeat Time:",c,"Collapse Time:"))
+ print(t1-t0)
+ }
>
> dt = data.frame(grp=rep(c("Upper Bound","sBLUP","GBLUP","encGBLUP"), each=knum), K=rep(R2, 4),
+ rsq=c(rep(mean(rsq0),knum), rep(mean(rsq1), knum), rep(mean(rsq2), knum), apply(rsq3, 1, mean)),
+ sd=c(rep(0,knum),rep(sd(rsq1), knum), rep(sd(rsq2), knum), apply(rsq3, 1, sd)))
> dt$K = as.factor(dt$K)
> ggplot(dt, aes(x=K, y=rsq, group = grp, color=grp))+
+ geom_point()+
+ geom_line()+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), width=.2,
+ position=position_dodge(0.05))+
+ scale_color_lancet(name="")+
+ labs(title = "Prediction accuracy",
+ subtitle = paste0("M=",M," n1=",n1," n2=", n2, " h2=", h2),
+ x = expression(1/(1+M/K)),
+ y = "Prediction accuracy")+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) > setwd("~/Desktop/Cryptography/results")
> n1=2000
> M=3000
> dt0 = data.frame()
> for(h2 in c(0.5))
+ {
+ for(ratio in c(1:3))
+ {
+ n2=n1*ratio
+ f = paste0("BLUP_n2_ratio_",ratio,"_h2_",h2,".RData")
+ load(file = f)
+ # dt$h2=rep(h2, nrow(dt))
+ # dt$n1=rep(n1, nrow(dt))
+ # dt$n2=rep(n2, nrow(dt))
+ dt0 = rbind(dt0,dt)
+ }
+ }
> color = c("sienna1","goldenrod2","olivedrab3","lightyellow3")
> ggplot(dt0, aes(x=K, y=rsq, fill = grp, color=grp))+
+ geom_bar(stat="identity", width = 0.8 , position=position_dodge(width = 0.8))+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="grey40", width=0.2, position=position_dodge(0.8))+
+ scale_color_manual(values=color, name="") +
+ scale_fill_manual(values=color, name="") +
+ labs(title = "Prediction accuracy",
+ subtitle = paste0("M=",M," n1=", n1),
+ x = expression(R^2),
+ y = "Prediction accuracy")+
+ theme_bw()+
+ facet_grid(h2~n2, labeller = label_both)+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))7.5 Appendix4: geom_point() vs geom_bar()
> Me_m1=Me11
> corrG21hat.t = apply(Me_m2,1,mean)/Me_m1/(1+Me_m1/K)/(1+apply(Me_m2,1,mean)/K)
> dt = data.frame(m2=rep(m2vec,2),
+ grp=rep(c("observed","theoretical"), each=m2num),
+ rsq=c(apply(corrG21hat, 1, mean), corrG21hat.t),
+ sd=c(apply(corrG21hat, 1, sd), rep(0, m2num)))
> ## geom_point()
> ggplot(dt, aes(x=m2,y=rsq, grp=grp, color=grp))+
+ geom_point()+
+ geom_line()+
+ scale_color_lancet(name="")+
+ labs(title = expression(paste("Accuracy between ",hat(G[21])(m1)," and ", hat(G[21])(m2))),
+ subtitle = paste("K=",K, " m1=",m1," n1=",n1," n2=", n2),
+ y=expression(R^2))+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
> ## geom_bar()
> ggplot(dt, aes(x=m2,y=rsq, fill=grp, color=grp))+
+ geom_bar(stat="identity", width = 60 , position=position_dodge(width = 80))+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=20, position=position_dodge(80))+
+ geom_text(aes(label=round(rsq,4)),vjust=-1.1, color="black", size=2.5, position=position_dodge(80))+
+ scale_color_manual(values=color, name="") +
+ scale_fill_manual(values=color, name="") +
+ labs(title = expression(paste("Accuracy between ",hat(G[21])(m1)," and ", hat(G[21])(m2))),
+ subtitle = paste("K=",K, " m1=",m1," n1=",n1," n2=", n2),
+ y=expression(R^2))+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 8 SNP sets
8.1 SNP set(G)
- R2(G, Ghat)
- SNP set: m2 not equal to m1 and {SNP2} belong to {SNP1}
- population: 2 cohorts of unrelated individuals
- LD: with LD
> n1 = 200
> n2 = 200
> m1 = 500 # the snp number for training sample
> SNPset1 = 1:m1
> freq = rep(0.5, m1) # MAF of each marker
> rho = rep(0.75, m1) # correlation coefficients between adjacent markers
> ld = freq^2*rho # ld = rho*sqrt(f1*(1-f1)*f2*(1-f2))
> X1_m1 = t(apply(GenerateLDGeno(freq, ld, n1), 1, scale)) # scale by row
> X2_m1 = t(apply(GenerateLDGeno(freq, ld, n2), 1, scale)) # scale by row
> rownames(X1_m1) = paste0("TRN", 1:n1)
> rownames(X2_m1) = paste0("TST", 1:n2)
> colnames(X1_m1) = colnames(X2_m1) = SNPset1
> ## m2
> m2vec = round(seq(0.2,1,0.2)*m1) # the snp number for testing sample
> m2num = length(m2vec)
> count =20
>
> ########## realized GRM ############
> ## G21_m1
> G21_m1 = tcrossprod(X2_m1,X1_m1) / m1
> ## Me11
> G11 = tcrossprod(X1_m1,X1_m1) / m1
> Me11 = 1/var(G11[lower.tri(G11)])
> rho11 = m1^2/Me11
>
> ## under different number of m2
> corrG21.o = matrix(NA, m2num, count)
> Me22 = matrix(NA, m2num, count)
> Me = matrix(NA, m2num, count)
> rho21 = matrix(NA, m2num, count)
> rho22 = matrix(NA, m2num, count)
> rho = matrix(NA, m2num, count)
> for(c in 1:count)
+ {
+ for(i in 1:m2num)
+ {
+ m2 = m2vec[i]
+ SNPset2 = sort.int(sample(SNPset1, m2))
+ X1_m2 = X1_m1[,SNPset2]
+ X2_m2 = X2_m1[,SNPset2]
+
+ G22 = tcrossprod(X1_m2, X1_m2) / m2
+ Me22[i,c] = 1/var(G22[lower.tri(G22)])
+ rho22[i,c] = m2^2/Me22[i,c]
+
+ X1_m1m2 = cbind(X1_m1,X1_m2)
+ X2_m1m2 = cbind(X2_m1,X2_m2)
+ G = tcrossprod(X1_m1m2, X1_m1m2) / (m1+m2)
+ Me[i,c] = 1/var(G[lower.tri(G)])
+ rho[i,c] = (m1+m2)^2/Me[i,c]
+
+ G21_m2 = tcrossprod(X2_m2, X1_m2) / m2
+ corrG21.o[i,c] = cor(as.vector(G21_m1),as.vector(G21_m2))^2
+ rho21[i,c] = ((m1+m2)^2/Me[i,c]-m1^2/Me11-m2^2/Me22[i,c])/2
+ }
+ print(c)
+ }
>
> # corrG21.t = apply(rho21, 1, mean)/(rho11+apply(rho21, 1, mean))*(1+m2vec/m1)
> corrG21.t = apply(Me22, 1, mean)/Me11
> dt = data.frame(m2=rep(m2vec,2),
+ grp=rep(c("observed","theoretical"), each=m2num),
+ rsq=c(apply(corrG21.o, 1, mean), corrG21.t),
+ sd=c(apply(corrG21.o, 1, sd), rep(0,m2num)))
> ## plot
> color = c("goldenrod2","lightyellow3")
> ggplot(dt, aes(x=m2,y=rsq, fill=grp, color=grp))+
+ geom_bar(stat="identity", width = 60 , position=position_dodge(width = 80))+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=20, position=position_dodge(80))+
+ geom_text(aes(label=round(rsq,4)),vjust=-1.1, color="black", size=2.5, position=position_dodge(80))+
+ scale_color_manual(values=color, name="") +
+ scale_fill_manual(values=color, name="") +
+ labs(title = expression(paste("Accuracy between ",G[21](m[1])," and ", G[21](m[2]))),
+ subtitle = paste("m1=",m1," n1=",n1," n2=", n2),
+ y=expression(R^2))+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
>
> ########## realized GRM ############
> K = 5*m1
> ## G21hat_m1
> Xhat_m1 = RandomMatrixEncryption(X1_m1, X2_m1, m1, K)
> G21hat_m1 = Xhat_m1[[2]] %*% t(Xhat_m1[[1]]) / K
> ## Encryption
> corrG21hat = matrix(NA, m2num, count)
> Me_m2 = matrix(NA, m2num, count)
> for(c in 1:count)
+ {
+ for(i in 1:m2num)
+ {
+ m2 = m2vec[i]
+ SNPset2 = sort.int(sample(SNPset1, m2))
+ X1_m2 = X1_m1[,SNPset2]
+ X2_m2 = X2_m1[,SNPset2]
+
+ G21_m2 = X2_m2 %*% t(X1_m2) / m2
+ Me_m2[i,c] = 1/var(G21_m2[lower.tri(G21_m2)])
+
+ Xhat_m2 = RandomMatrixEncryption(X1_m2, X2_m2, m2, K)
+ G21hat_m2 = Xhat_m2[[2]] %*% t(Xhat_m2[[1]]) / K
+ corrG21hat[i,c] = cor(as.vector(G21hat_m1), as.vector(G21hat_m2))^2
+ }
+ print(c)
+ }
>
> ## plot
> Me_m1=Me11
> corrG21hat.t = apply(Me_m2,1,mean)/Me_m1/(1+Me_m1/K)/(1+apply(Me_m2,1,mean)/K)
> dt = data.frame(m2=rep(m2vec,2),
+ grp=rep(c("observed","theoretical"), each=m2num),
+ rsq=c(apply(corrG21hat, 1, mean), corrG21hat.t),
+ sd=c(apply(corrG21hat, 1, sd), rep(0, m2num)))
> ggplot(dt, aes(x=m2,y=rsq, fill=grp, color=grp))+
+ geom_bar(stat="identity", width = 60 , position=position_dodge(width = 80))+
+ geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=20, position=position_dodge(80))+
+ geom_text(aes(label=round(rsq,4)),vjust=-1.1, color="black", size=2.5, position=position_dodge(80))+
+ scale_color_manual(values=color, name="") +
+ scale_fill_manual(values=color, name="") +
+ labs(title = expression(paste("Accuracy between ",hat(G[21])(m1)," and ", hat(G[21])(m2))),
+ subtitle = paste("K=",K, " m1=",m1," n1=",n1," n2=", n2),
+ y=expression(R^2))+
+ theme_bw()+
+ theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 8.2 SNP set(y)
- GBLUP vs gwasBLUP
- LD: with vs without LD
- SNP set: {SNP2} belong to {SNP1} , and {SNP2} do not contribute to phenotype
> for(s in 1:2)
+ {
+ n1 = 200
+ n2 = 200
+ m1 = 500 # the snp number for training sample
+ m2 = m1/2
+ SNPset1 = 1:m1
+ SNPset2 = seq(2,m1,2)
+ freq = rep(0.5, m1) # MAF of each marker
+ if(s==1) { ## Scenario1
+ X1_m1 = GenerateGeno(freq, n1)
+ X2_m1 = GenerateGeno(freq, n2)
+ } else { ## Scenario2
+ Dprime = rep(0.8, m1) # correlation coefficients between adjacent markers
+ ld = Dprime*freq^2 # ld = Dprime*min(PaPB,PAPb)
+ X1_m1 = GenerateLDGeno(freq, ld, n1)
+ X2_m1 = GenerateLDGeno(freq, ld, n2)
+ }
+ X_m1 = rbind(X1_m1,X2_m1)
+ X_m1 = t(apply(X_m1, 1, scale)) # scale by row
+ X1_m1 = X_m1[1:n1,]
+ X2_m1 = X_m1[(n1+1):(n1+n2),]
+ rownames(X1_m1) = paste0("TRN", 1:n1)
+ rownames(X2_m1) = paste0("TST", 1:n2)
+ colnames(X1_m1) = colnames(X2_m1) = SNPset1
+ X1_m2 = X1_m1[,SNPset2]
+ X2_m2 = X2_m1[,SNPset2]
+
+ ## simulate y
+ h2 = 0.5
+ b = rnorm(m1-m2, sd=sqrt(h2/(m1-m2)))
+ g = X_m1[,-SNPset2] %*% b
+ sigma_e = (1/h2-1)*var(g)
+ error = rnorm(n1+n2, sd=sqrt(sigma_e))
+ y = g + error
+ y = scale(y)
+ y1 = as.data.frame(y[1:n1], row.names = rownames(X1_m1))
+ y2 = as.data.frame(y[(n1+1):(n1+n2)], row.names = rownames(X2_m1))
+
+ rst = matrix(NA, 1, 4)
+ ## gwasBLUP
+ y2hat=gwasBLUP(X1_m1,X2_m1,y1)
+ rst[1,1]=cor(y2hat,y2)^2
+ y2hat=gwasBLUP(X1_m2,X2_m2,y1)
+ rst[1,2]=cor(y2hat,y2)^2
+ ## GBLUP
+ y2hat=GBLUP(X1_m1,X2_m1,y1)
+ rst[1,3]=cor(y2hat,y2)^2
+ y2hat=GBLUP2(X1_m1,X1_m2,X2_m2,y1)
+ rst[1,4]=cor(y2hat,y2)^2
+
+ ## Output
+ if(s==1) Scenario1 = rst
+ if(s==2) Scenario2 = rst
+ }
>
> rst = rbind(Scenario1,Scenario2)
> colnames(rst) = c("gwasBLUP-m1","gwasBLUP-m2","GBLUP-m1","GBLUP-m2")
> rownames(rst) = c("ScenarioI(LE)","ScenarioII(LD)")
> rst9 Session Information
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] MultiPhen_2.0.3 meta_4.13-0 epitools_0.5-10.1 abind_1.4-5
## [5] sommer_4.1.3 crayon_1.3.4 lattice_0.20-41 MASS_7.3-53
## [9] Matrix_1.2-18 kableExtra_1.3.4 egg_0.4.5 viridis_0.5.1
## [13] viridisLite_0.3.0 gridExtra_2.3 ggExtra_0.9 ggsci_2.9
## [17] ggplot2_3.3.5 rmdformats_1.0.2 knitr_1.30
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 tidyr_1.1.2 HardyWeinberg_1.6.3
## [4] splines_4.0.0 shiny_1.5.0 statmod_1.4.34
## [7] highr_0.8 metafor_2.4-0 yaml_2.2.1
## [10] gdtools_0.2.2 pillar_1.4.4 backports_1.1.7
## [13] glue_1.4.1 digest_0.6.25 RColorBrewer_1.1-2
## [16] promises_1.1.1 rvest_0.3.6 minqa_1.2.4
## [19] colorspace_1.4-1 htmltools_0.5.0 httpuv_1.5.4
## [22] pkgconfig_2.0.3 broom_0.7.0 bookdown_0.22
## [25] purrr_0.3.4 xtable_1.8-4 scales_1.1.1
## [28] webshot_0.5.2 svglite_1.2.3.2 later_1.1.0.1
## [31] lme4_1.1-23 tibble_3.0.1 generics_0.0.2
## [34] ellipsis_0.3.1 withr_2.2.0 magrittr_1.5
## [37] mime_0.9 evaluate_0.14 mice_3.11.0
## [40] nlme_3.1-148 xml2_1.3.2 truncnorm_1.0-8
## [43] tools_4.0.0 lifecycle_0.2.0 stringr_1.4.0
## [46] munsell_0.5.0 compiler_4.0.0 systemfonts_0.3.2
## [49] rlang_0.4.11 grid_4.0.0 nloptr_1.2.2.2
## [52] rstudioapi_0.11 CompQuadForm_1.4.3 Rsolnp_1.16
## [55] miniUI_0.1.1.1 rmarkdown_2.10 boot_1.3-25
## [58] gtable_0.3.0 R6_2.4.1 dplyr_1.0.0
## [61] fastmap_1.0.1 stringi_1.4.6 parallel_4.0.0
## [64] Rcpp_1.0.6 png_0.1-7 vctrs_0.3.1
## [67] tidyselect_1.1.0 xfun_0.25