DNA encryption

qixin

2021-11-11

1 Setting

2 Random matrix theory

> M = 100
> K = 500
> S = matrix(rnorm(M*K,sd=sqrt(1/K)), M, K)
> SS = tcrossprod(S, S)
> heatmap(SS, symm = T, revC = T, 
+         Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "S-by-t(S)",
+         col = hcl.colors(250, "Blues", rev = T))
> heatmap(diag(M), symm = T, revC = T, 
+         Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "Identity matrix",
+         col = hcl.colors(250, "Blues", rev = T))
> 
> n1=20
> n2=25
> freq=runif(M, 0.05, 0.5)
> X1 = GenerateGenoMatrix(n1, freq, F)
> X2 = GenerateGenoMatrix(n2, freq, F)
> X1SSX2 = X1 %*% SS %*% t(X2)
> X1X2 = X1 %*% t(X2)
> heatmap(X1SSX2, revC = T, 
+         Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "X1SSX2",
+         col = hcl.colors(250, "Blues", rev = T))
> heatmap(X1X2, revC = T, 
+         Rowv = NA, Colv = NA, labRow = NA, labCol = NA, main = "X1X2",
+         col = hcl.colors(250, "Blues", rev = T))

3 encGRM

3.1 var(GRM)

  • population: 2 cohorts with (0-3)-degree relations under increasing number of markers.
  • LD: without LD
> ScaleRow = T
> n1 = 1000
> n2 = 1000
> n_cp = n1
> n1_0 = n1-n_cp
> n2_0 = n2-n_cp
> Mvec = seq(n1, n1*2, n1/5)
> Mnum = length(Mvec)
> dt0 = data.frame()
> for(r in 0:3)
+ {
+   rst = vector()
+   for(i in 1:Mnum)
+   {
+     M = Mvec[i]
+     freq = runif(M, 0.05, 0.5)
+     ## r-degree relatives
+     Gr = GenerateGeno_r(freq, n_cp, r, FALSE)
+     X1 = Gr[[1]]
+     X2 = Gr[[2]]
+     X1 = rbind(X1, GenerateGeno(freq, n1_0))
+     X2 = rbind(X2, GenerateGeno(freq, n2_0))
+     if (ScaleRow)
+     {
+       X1 = t(apply(X1, 1, scale))
+       X2 = t(apply(X2, 1, scale))      
+     } else {
+       X1 = scale(X1)
+       X2 = scale(X2)
+     }
+     
+     ## real relationship calculated from individual genotype
+     K12 = tcrossprod(X1, X2) / M
+     rst = c(rst, var(diag(K12)))
+   }
+   dt = data.frame(grp = rep(c("Theoretical", "Observed"), each = Mnum),
+                   r = r,
+                   M = rep(Mvec, 2),
+                   var = c((1-(0.5)^r)/Mvec, rst))
+   dt0 = rbind(dt0, dt)
+ }
> dt0$r = factor(dt0$r, labels = paste0("Degree:",0:3))
> ggplot(data = dt0, aes(x=M, y=var, grp=grp))+
+   geom_point(aes(shape=grp, color=grp), size = 2)+
+   scale_color_lancet(name="") +
+   scale_shape_manual(values = c(16,17), name="")+
+   ylab("Var(GRM)")+
+   facet_wrap(~r, ncol = 2)+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))

3.2 var(encGRM)

  • population: 2 cohorts with (0-3)-degree relations under increasing number of markers.
  • LD: without LD
> ScaleRow = T
> n1 = 1000
> n2 = 1000
> n_cp = n1
> n1_0 = n1-n_cp
> n2_0 = n2-n_cp
> Mvec = seq(n1, n1*2, n1/5)
> Kvec = ceiling(Mvec/(1/0.8-1))
> Mnum = length(Mvec)
> dt0 = data.frame()
> for(r in 0:3)
+ {
+   rst = vector()
+   for(i in 1:Mnum)
+   {
+     M = Mvec[i]
+     K = Kvec[i]
+     freq = runif(M, 0.05, 0.5)
+     ## r-degree relatives
+     Gr = GenerateGeno_r(freq, n_cp, r, FALSE)
+     X1 = Gr[[1]]
+     X2 = Gr[[2]]
+     X1 = rbind(X1, GenerateGeno(freq, n1_0))
+     X2 = rbind(X2, GenerateGeno(freq, n2_0))
+     if (ScaleRow)
+     {
+       X1 = t(apply(X1, 1, scale))
+       X2 = t(apply(X2, 1, scale))      
+     } else {
+       X1 = scale(X1)
+       X2 = scale(X2)
+     }
+     
+     ## encryption
+     S = matrix(rnorm(M*K,sd=sqrt(1/M)), M, K)
+     X1hat = tcrossprod(X1, t(S))
+     X2hat = tcrossprod(X2, t(S))
+     if (ScaleRow)
+     {
+       X1hat = t(apply(X1hat, 1, scale))
+       X2hat = t(apply(X2hat, 1, scale))      
+     } else {
+       X1hat = scale(X1hat)
+       X2hat = scale(X2hat)
+     }   
+     
+     ## realized relationship calculated from encrypted genotype
+     K12hat = tcrossprod(X1hat, X2hat) / K
+     rst = c(rst, var(diag(K12hat)))
+   }
+   dt = data.frame(grp = rep(c("Theoretical", "Observed"), each = Mnum),
+                   r = r,
+                   M = rep(Mvec, 2),
+                   var = c((1-(0.5)^r)/Mvec+(1-(0.5)^r)/Kvec, rst))
+   dt0 = rbind(dt0, dt)
+ }
> dt0$r = factor(dt0$r, labels = paste0("Degree:",0:3))
> ggplot(data = dt0, aes(x=M, y=var, grp=grp))+
+   geom_point(aes(shape=grp, color=grp), size = 2)+
+   scale_color_lancet(name="") +
+   scale_shape_manual(values = c(16,17), name="")+
+   ylab("Var(encGRM)")+
+   facet_wrap(~r, ncol = 2)+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))

3.3 M table

  • Required number of markers in distinguishing relatives
  • Upon the number of markers included, the statistical power to identify r-degree relatives is as below.
> n1 = n2 = 200
> m = c(50,500,5000,10000)
> alpha = 0.05 # type I error
> Kseta = c(0.95, 0.45, 0.2, 0.075)
> D = length(Kseta)
> M = length(m)
> P = matrix(NA, D, M)
> P = as.data.frame(P)
> for(i in 1:D)
+ {
+   for(j in 1:M)
+   {
+     P[i,j] = pnorm((sqrt(m[j])*Kseta[i]-qnorm(1-alpha/n1/n2/2))/sqrt(1-Kseta[i]^2)) #???
+   }
+ }
> rownames(P) = c("identity","1-degree","2-degree","3-degree")
> colnames(P) = paste0("M=",m)
> P %>%
+   kbl(caption = paste0("Table1 Statistical power to identify r-degree relatives ($\\alpha$=", alpha, ", n1=",n1,", n2=",n2,")"),
+       escape = FALSE, digits = 3) %>%
+   kable_classic(html_font = "Cambria", font_size = 15)
Table 3.1: Table1 Statistical power to identify r-degree relatives (\(\alpha\)=0.05, n1=200, n2=200)
M=50 M=500 M=5000 M=10000
identity 1.000 1.000 1.000 1.000
1-degree 0.031 1.000 1.000 1.000
2-degree 0.000 0.351 1.000 1.000
3-degree 0.000 0.001 0.676 0.996

3.4 K table

  • Minimal column number of random matrix
> n1 = n2 = 200
> alpha = 0.05 
> beta = 0.2
> Kseta = c(0.95, 0.45, 0.2, 0.075, 0)
> D = length(Kseta)
> K = matrix(NA, 1, D-1)
> for(i in 1:(D-1))
+ {
+   tmp = ( ( qnorm(1-beta)*sqrt(1-Kseta[i]^2) + qnorm(1-alpha/n1/n2/2) ) / Kseta[i] )^2
+   K[1,i] = as.character(round(tmp))
+ }
> colnames(K) = c("identity","1-degree","2-degree","3-degree")
> rownames(K) = "unrelated"
> 
> K %>%
+   kbl(caption = paste0("Table2 Minimal column number of random matrix(K) ($\\alpha$=", alpha,", $\\beta$=", beta, ", n1=",n1,", n2=",n2,")"),escape = FALSE) %>%
+   kable_classic(html_font = "Cambria", font_size = 15)
Table 3.2: Table2 Minimal column number of random matrix(K) (\(\alpha\)=0.05, \(\beta\)=0.2, n1=200, n2=200)
identity 1-degree 2-degree 3-degree
unrelated 29 155 804 5749

3.5 Rsq

Purpose: to measure the encryption accuracy between real GRM and encrypted GRM, and computational cost under different encryption levels.
- R2(G,Ghat)
- population: 2 cohorts of unrelated individuals
- LD: with or without LD

> n1 = 200
> n2 = 200
> M = 500           # number of markers
> freq = runif(M, 0.05, 0.5)
> Kvec = seq(2*M, 10*M, 2*M)
> knum = length(Kvec)
> count = 20
> ############### LE ##################
> ## genotype
> X1 = GenerateGeno(freq, n1)
> X2 = GenerateGeno(freq, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
> K21 = tcrossprod(X2, X1) / M
> K11 = tcrossprod(X1, X1) / M
> Me = 1/var(K11[lower.tri(K11)])
> ## Encryption
> error = matrix(NA, knum, count)
> corrK21 = matrix(NA, knum, count)
> time_LE = matrix(NA, knum, count)
> for(c in 1:count)
+ {
+   for(i in 1:knum)
+   {
+     t0 = proc.time()
+     K = Kvec[i]
+     A = RandomMatrixEncryption(X1, X2, M, K, FALSE)
+     K21hat = tcrossprod(A[[2]], A[[1]])/ K
+     t1 = proc.time()
+     time_LE[i,c] = (t1-t0)[1]
+     error[i,c] = sqrt(sum((as.vector(K21)-as.vector(K21hat))^2))
+     corrK21[i,c] = cor(as.vector(K21), as.vector(K21hat))^2
+   }
+ }
> corrK21.t = 1/(1+Me/Kvec)
> corr2 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)), 
+                  rsq=c(apply(corrK21, 1, mean), corrK21.t),
+                  sd=c(apply(corrK21, 1, sd), rep(0,knum)))
> error.t=sqrt(n1*n2/Kvec)
> error2 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)), 
+                  error=c(apply(error, 1, mean), error.t),
+                  sd=c(apply(error, 1, sd), rep(0,knum)))
> 
> ############### LD ##################
> ## genotype
> Dprime = runif(M, 0.5, 0.9)
> ld = DprimetoD(freq, Dprime)
> X1 = GenerateLDGeno(freq, ld, n1)
> X2 = GenerateLDGeno(freq, ld, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
> K21 = tcrossprod(X2, X1) / M
> K11 = tcrossprod(X1, X1) / M
> Me = 1/var(K11[lower.tri(K11)])
> ## Encryption
> error = matrix(NA, knum, count)
> corrK21 = matrix(NA, knum, count)
> time_LD = matrix(NA, knum, count)
> for(c in 1:count)
+ {
+   for(i in 1:knum)
+   {
+     t0 = proc.time()
+     K = Kvec[i]
+     A = RandomMatrixEncryption(X1, X2, M, K, FALSE)
+     K21hat = tcrossprod(A[[2]], A[[1]])/ K
+     error[i,c] = sqrt(sum((as.vector(K21)-as.vector(K21hat))^2))
+     corrK21[i,c] = cor(as.vector(K21), as.vector(K21hat))^2
+     t1 = proc.time()
+     time_LD[i,c] = (t1-t0)[1]
+   }
+ }
> corrK21.t = 1/(1+Me/Kvec)
> corr1 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)), 
+                  rsq=c(apply(corrK21, 1, mean), corrK21.t),
+                  sd=c(apply(corrK21, 1, sd), rep(0,knum)))
> error.t = sqrt(n1*n2/Kvec)
> error1 = data.frame(grp=rep(c("observed","theoretical"), each=knum), K=as.factor(rep(Kvec, 2)), 
+                  error=c(apply(error, 1, mean), error.t),
+                  sd=c(apply(error, 1, sd), rep(0,knum)))
> 
> ################# Plot ################
> color = c("olivedrab3","lightyellow3")
> ## GRM accuracy
> dt = rbind(corr2,corr1)
> dt$ld = rep(c("without LD", "with LD"), each=2*knum)
> ggplot(dt, aes(x=K, y=rsq, fill=grp, color=grp))+
+   geom_bar(stat="identity", width = 0.6, position=position_dodge(width = 0.8))+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=0.2, position=position_dodge(0.8))+
+   geom_text(aes(label=round(rsq,4)),vjust=-1.2, color="black", size=2.5, position=position_dodge(0.8))+
+   scale_color_manual(values=color, name="") +
+   scale_fill_manual(values=color, name="") +
+   labs(title = expression(paste("Accuracy between G and ", hat(G))),
+        subtitle = paste("M=",M," n1=",n1," n2=", n2),
+        y=expression(R^2))+
+   facet_wrap(~ld)+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))  
> ## error
> dt = rbind(error2,error1)
> dt$ld = rep(c("without LD", "with LD"), each=2*knum)
> ggplot(dt, aes(x=K, y=error, fill=grp, color=grp))+
+   geom_bar(stat="identity", width = 0.6, position=position_dodge(width = 0.8))+
+   geom_errorbar(aes(ymin=error-sd, ymax=error+sd), color="black", width=0.2, position=position_dodge(0.8))+
+   geom_text(aes(label=round(error,4)),vjust=-1.2, color="black", size=2.5, position=position_dodge(0.8))+
+   scale_color_manual(values=color, name="") +
+   scale_fill_manual(values=color, name="") +
+   labs(title = expression(paste("Error between G and ", hat(G))),
+        subtitle = paste("M=",M," n1=",n1," n2=", n2),
+        y="error")+
+   facet_wrap(~ld)+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 
> 
> ## CPU time
> time = data.frame(grp=rep(c("without LD", "with LD"), each=knum), K=as.factor(rep(Kvec, 2)),
+                    t=c(apply(time_LE, 1, mean), apply(time_LD, 1, mean)),
+                    sd=c(apply(time_LE, 1, sd), apply(time_LD, 1, sd)))
> ggplot(time, aes(x=K, y=t, group = grp))+
+   geom_point()+
+   geom_line()+
+   geom_errorbar(aes(ymin=t-sd, ymax=t+sd), width=.2,
+                 position=position_dodge(0.05))+
+   scale_color_lancet(name="")+
+   labs(title = "CPU time for Encryption",
+        subtitle = paste("M=",M," n1=",n1," n2=", n2),
+        y="CPU time(secs)")+
+   facet_wrap(~grp)+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 

3.6 M vs K

  • population: 2 cohorts with identity, 1-degree, 2-degree and 3-degree relations
  • LD: without LD
> color = c("indianred","sienna1","goldenrod2","olivedrab3","lightyellow3")
> n1 = 200   # sample size in cohort1
> n2 = 200   # sample size in cohort2
> alpha = 0.05/(n1*n2) # type I error
> beta = 0.2           # type II error two tail test
> D = 4 # top degree of relations included in simulation
> r = c(0:(D-1))  
> n_cp = rep(10, D) # numbers for each degree of relations
> n1_0 = n1-sum(n_cp)
> n2_0 = n2-sum(n_cp)
> 
> p=1
> dt = list()
> Mvec=c(50, 200, 900, 6200)
> setavec=c(0.95, 0.45, 0.2, 0.075)
> for(m in 1:length(Mvec))
+ {
+   for(k in 1:length(setavec))
+   {
+     # Determine K
+     seta = setavec[k]
+     K = ( ( qnorm(1-beta)*sqrt(1-seta^2) + qnorm(1-alpha/2) ) / seta )^2 
+     K = ceiling(K)
+     # Genotypes 
+     M = Mvec[m]
+     freq = rep(0.5, M)
+     X1 = matrix(NA, 0, M)
+     X2 = matrix(NA, 0, M)
+     for(i in 1:D)
+     {
+       Gr = GenerateGeno_r(freq, n_cp[i], r[i], FALSE)
+       X1 = rbind(X1, Gr[[1]])
+       X2 = rbind(X2, Gr[[2]])
+     }
+     X1 = rbind(X1, GenerateGeno(freq, n1_0))
+     X2 = rbind(X2, GenerateGeno(freq, n2_0))
+     X1 = scale(X1)
+     X2 = scale(X2)
+ 
+     ## Encryption
+     A = RandomMatrixEncryption(X1, X2, M, K, FALSE)
+     vG21hat = as.vector(tcrossprod(A[[2]], A[[1]])/ K)
+     vG21 = as.vector( tcrossprod(X2, X1) / M)
+     
+     ## Save data
+     yin = qnorm(1/n1/n2)*sqrt(1/M)
+     xin = qnorm(1/n1/n2)*sqrt(1/M+1/K)
+     lab = lab_relations_true(n1, n2, n_cp)
+     dt[[p]] = data.frame(x=vG21hat, y=vG21, lab=as.factor(lab), M=M, K=K, yin=yin, xin=xin)
+     
+     p=p+1
+   }
+ }
> 
> dt0 = do.call("rbind", dt)
> dt0$K = as.factor(dt0$K)
> dt0$M = as.factor(dt0$M)
> ggplot(dt0, aes(x=x, y=y)) + 
+   geom_point(aes(color=lab), size=0.3) +
+   geom_hline(aes(yintercept = yin), color="blue", alpha=0.5, linetype = 5) +
+   geom_hline(aes(yintercept = -yin), color="blue", alpha=0.5, linetype = 5) +
+   geom_vline(aes(xintercept = xin), color="red", alpha=0.5, linetype = 5) +
+   geom_vline(aes(xintercept = -xin), color="red", alpha=0.5, linetype = 5) +
+   scale_x_continuous(limits=c(-1, 1.15)) + xlab("Estimated relatedness") +
+   scale_y_continuous(limits=c(-1, 1.15)) + ylab("True relatedness") +
+   scale_color_manual(values=color, name="Relations", 
+                      labels=c("identity","1-degree","2-degree","3-degree","unrelated")) +
+   facet_grid(M~K, labeller = label_both) + 
+   theme_article()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))

4 encBLUP

4.1 “chain rule”

  • Using GBLUP and encGBLUP as an example to examine the relationship between three squared correlation.
  • “chain rule”: E(R_p2)=E(R_(p_enc)2)/P
  • population: 2 cohorts of unrelated individuals
  • LD: with LD
> ## genotype
> h2 = 0.5 # 0.3 and 0.8
> n1 = 200
> n2 = 200
> M = 500
> freq = rep(0.5, M)
> Dprime = runif(M, 0.5, 0.9)
> ld = DprimetoD(freq, Dprime)
> X1 = GenerateLDGeno(freq, ld, n1)
> X2 = GenerateLDGeno(freq, ld, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
> 
> ## choose encP
> encP = c(0.5, 0.7, 0.8, 0.9, 0.95, 0.99)
> encPnum = length(encP)
> G11 = tcrossprod(X1) / M
> Me = 1/var(G11[lower.tri(G11)])
> ERsq1 = h2/(1+Me/n1/h2)
> 
> count = 20
> Rsq1 = matrix(NA, 1, count)
> Rsq3 = matrix(NA, encPnum, count)
> for(c in 1:count){
+   ## simulate y
+   # Polygenic Model
+   sigma_g = h2
+   b = rnorm(M, sd=sqrt(sigma_g/M))
+   g1 = X1 %*% b
+   g2 = X2 %*% b
+   y1 = g1 + rnorm(n1, sd=sqrt((1/h2-1)*var(g1)))
+   y2 = g2 + rnorm(n2, sd=sqrt((1/h2-1)*var(g2)))
+   
+   # Omnigenic Model
+   # sigma_g = h2
+   # b = runif(0.1*M, 10, 15)
+   # qtl = sample(1:M, 0.1*M)
+   # g1 = X1[,qtl] %*% b
+   # g2 = X2[,qtl] %*% b
+   # y1 = g1 + rnorm(n1, sd=sqrt((1/h2-1)*var(g1)))
+   # y2 = g2 + rnorm(n2, sd=sqrt((1/h2-1)*var(g2)))
+   
+   ## GBLUP (Rsq1)
+   gblup = GBLUP(X1, X2, y1)
+   Rsq1[1,c] = cor(gblup, y2)^2
+ 
+   ## encGBLUP (Rsq3)
+   encgblup = list()
+   for(i in 1:encPnum)
+   {
+     encgblup[[i]] = encGBLUP(X1, X2, y1, encP = encP[i], Me)
+   }
+   encgblup = matrix(unlist(encgblup), n2, encPnum)
+   Rsq3[,c] = apply(encgblup, 2, function(x) cor(x, y2)^2)
+   
+   # ## gwasBLUP
+   # gwas = gwasBLUP(X1, X2, y1)
+   # Rsq1[1,c] = cor(gwas, y2)^2
+   # 
+   # ## encgwasBLUP (Rsq3)
+   # encgwas = list()
+   # for(i in 1:encPnum)
+   # {
+   #   encgwas[[i]] = encgwasBLUP(X1, X2, y1, encP[i], Me)
+   # }
+   # encgwas = matrix(unlist(encgwas), n2, encPnum)
+   # Rsq3[,c] = apply(encgwas, 2, function(x) cor(x, y2)^2)
+   print(c)
+ }
> 
> ## plot tranformed Rsq1 by dividing Rsq3 by encP
> layout(matrix(1:2, 1,2))
> dt = as.data.frame(t(Rsq3))
> boxplot(dt, ylim=c(0,1), xlab = "encP", ylab = "predicted accuracy", main = "Before transformation", names = encP)
> abline(h=ERsq1, col="blue")
> abline(h=mean(Rsq1[1,]), col="red")
> dt = as.data.frame(t(apply(Rsq3, 2, function(x) x/encP )))
> boxplot(dt, ylim=c(0,1), xlab = "encP", ylab = "predicted accuracy", main = "After transformation", names = encP)
> abline(h=ERsq1, col="blue")
> abline(h=mean(Rsq1[1,]), col="red")

5 UKB encBLUP

5.1 Phenotypes

  • Use ukbconv (converter) & ukb40268.enc_ukb (phenotype source data)
> ## copy tools to my own directory
+ cd /data4/zqx/encBLUP/2-phenoukb
+ ## r:format -S:the Category ID(50) for STANDING HEIGHT
+ nohup /data4/xuhm/data/ukb/update_Phe/40268/ukbconv /data4/xuhm/data/ukb/update_Phe/40268/ukb40268.enc_ukb r -S50 &
+ # Form a plink phenotype file: ukb50.phe
+ ID=50
+ echo -ne "FID\tIID\tukb${ID}\n" > ukb${ID}.phe
+ awk '(NR>1){print $1,$1,$2}' ukb40268.tab >> ukb${ID}.phe

5.2 Test data

  • UKB-Center British
  • Output: user1.bed/.bim/.fam and user2.bed/.bim/.fam
> ## Three major cohorts in central British
+ cp /data4/xuhm/data/ukb/british/entire/cohort/Liverpool.txt .
+ cp /data4/xuhm/data/ukb/british/entire/cohort/Leeds.txt .
+ cp /data4/xuhm/data/ukb/british/entire/cohort/Manchester.txt .
+ cat Leeds.txt Liverpool.txt Manchester.txt > center.txt
+ plink --bfile /data4/xuhm/data/ukb/british/unrelated_british/urltd_British --keep center.txt --make-bed --out center
+ 
+ ## SNP QC (589876 variants and 53051 people pass filters and QC.)
+ plink --bfile center --maf 0.01 --make-bed --out center # 131586 variants removed
+ plink --bfile center --hwe 1e-7 --make-bed --out center # 20701 variants removed
+ plink --bfile center --geno 0.05 --make-bed --out center # 42093 variants removed
+ plink --bfile center --pheno ../2-phenoukb/ukb50.phe --make-bed --out center # merge phenotypes
+ 
+ ## Pruning and clumping (37734 variants remained)
+ plink --bfile center --indep-pairwise 50 5 0.8 --out center
+ plink2 --bfile center --extract center.prune.in --glm allow-no-covars --pfilter 0.05 --out center 
+ awk '(NR>1){print $3}' center.PHENO1.glm.linear > candidates.txt 
+ rm *~
+ 
+ ## Test Plink files for two users
+ awk '{print $1,$2}' center.fam > urltd-center.txt
+ 
+ dir=sample
+ mkdir $dir
+ shuf -n53051 urltd-center.txt > ./$dir/test.txt
+ head -n42440 ./$dir/test.txt > ./$dir/user1.txt
+ tail -n10611 ./$dir/test.txt > ./$dir/user2.txt
+ plink --bfile center --keep ./$dir/user1.txt --extract candidates.txt --make-bed --out ./$dir/user1
+ plink --bfile center --keep ./$dir/user2.txt --extract candidates.txt --make-bed --out ./$dir/user2
+ cd $dir

5.3 Protocol-std

> ################# GLOBAL #################
> t0=Sys.time()
> sc = 1
> u1 = "user1"      # default user1 name
> u2 = "user2"      # default user2 name
> flg.me = F        # if TRUE, then Me is already been given in Me.txt
> flg.std = T       # if TRUE, genotypes will be standardized
> flg.u1.1 = T      # Step1-User1: Me and K calculation, if R2 changed this flag should be TRUE
> flg.u1.2 = T      # Step2-User1: GWAS
> flg.sv.3 = T      # Step3-Server: generate random matrix
> flg.u1.4 = T      # Step4-User1: Encryption beta
> flg.u2.5 = T      # Step5-User2: Encryption genotypes matrix
> flg.sv.6 = T      # Step6-Server: Genomic Prediction
> arg = commandArgs(T); R2 = as.numeric(arg[1]) # encryption precision
> M = system(paste0("awk 'END{print NR}' ", u1, ".bim"), intern = T)
> write.table(M, "M.txt", quote = F, col.names = F, row.names = F)
> plink = system("which plink",intern=T) # path to plink
> plink2 = system("which plink2",intern=T) # path to plink2
> 
> ################# Step1-User1: Me and K calculation #################
> # Input: user1.bed/.bim/.fam
> # Output: K.txt and Me.txt
> if(flg.u1.1)
+ {
+   if(flg.me)
+   {
+     # if Me has already been calculated
+     Me = as.numeric(read.table("Me.txt"))
+   }
+   else
+   {
+     # if Me is not calculated
+     grm_cmd = paste0(plink, " --bfile ", u1, " --make-grm-gz --out ",u1)
+     system(grm_cmd)
+     gz = gzfile(paste0(u1,".grm.gz"))
+     grm = read.table(gz, as.is = T)
+     offDiag = grm[grm[,1]!=grm[,2], 4]
+     Me = 1/var(offDiag/sc, na.rm = TRUE)
+     write.table(Me, "Me.txt", quote = F, col.names = F, row.names = F)
+   }
+   K = round(Me/(1/R2-1))
+   write.table(K, "K.txt", quote = F, col.names = F, row.names = F)
+   print("========== Step1-User1: Me and K calculation finished!! ==============")
+ }
> 
> #################### Step2-User1: GWAS ###################
> # Input: user1.bed/.bim/.fam
> # Output: A1.txt ( SNPID | A1 ) and beta.txt ( Beta )
> if(flg.u1.2)
+ {
+   if(flg.std)
+   {
+     cmd1 = paste0(plink, " --bfile ", u1, " --linear standard-beta --out ",u1)
+     cmd2 = paste0("awk '(NR>1){print $2,$4}' ",u1, ".assoc.linear > A1.txt")
+     cmd3 = paste0("awk '(NR>1){print $7}' ", u1, ".assoc.linear > beta.txt")
+   }else{
+     cmd1 = paste0(plink2, " --bfile ", u1, " --glm allow-no-covars --variance-standardize --out ",u1)
+     cmd2 = paste0("awk '(NR>1){print $3,$6}' ",u1, ".PHENO1.glm.linear > A1.txt")
+     cmd3 = paste0("awk '(NR>1){print $9}' ", u1, ".PHENO1.glm.linear > beta.txt")
+   }
+   system(cmd1)
+   system(cmd2)
+   system(cmd3)
+   print("========== Step2-User1: GWAS finished!! ====================")
+ }
> 
> ############## Step3-Server: generate random matrix ##############
> # Input: M.txt and K.txt
> # Output: key.txt
> if(flg.sv.3)
+ {
+   M = as.numeric(read.table("M.txt"))
+   K = as.numeric(read.table("K.txt"))
+   W = matrix(rnorm(M*K, sd=sqrt(1/M)), M, K)
+   write.table(format(W, digits = 3), file="key.txt", quote = F, col.names = F, row.names = F)
+   print("========== Step3-Server: generate random matrix finished!! ==========")
+ }
> 
> ##################### Step4-User1: Encryption beta #####################
> # Input: key.txt and beta.txt
> # Output: locked-beta.txt
> if(flg.u1.4)
+ {
+   W = as.matrix(read.table("key.txt"))
+   beta = read.table("beta.txt")
+   locked_beta= crossprod(as.matrix(beta), W)
+   write.table(format(t(locked_beta), digits = 4), file="locked-beta.txt", quote = F, col.names = F, row.names = F)
+   print("========== Step4-User1: Encryption beta finished!! ================")
+ }
> 
> ##################### Step5-User2: Encryption Genotype Matrix  #####################
> # Input: K.txt, key.txt and user2.bed/.bim/.fam
> # Output: user2.sscore
> if(flg.u2.5)
+ {
+   K = as.numeric(read.table("K.txt"))
+   cmd1 = "paste -d \" \" A1.txt key.txt > A1-key.txt"
+   if(flg.std){
+     cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-key.txt 1 2 variance-standardize --score-col-nums 3-",K+2," --variance-standardize --out ", u2)
+   }else{
+     cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-key.txt 1 2 --score-col-nums 3-",K+2," --variance-standardize --out ", u2)
+   }
+   cmd3 = paste0("rm A1-key.txt")
+   system(cmd1)
+   system(cmd2)
+ 
+   print("========== Step5-User2: Encryption Genotype Matrix finished!! ===============")
+ }
> 
> ##################### Step6-Server: Prediction #####################
> # Input: user2.sscore and locked-beta.txt
> # Output: yhat
> if(flg.sv.6)
+ {
+   Xenc = read.table(paste0(u2, ".sscore"))
+   benc = read.table("locked-beta.txt")
+   y2 = Xenc[,3]
+   yhat = tcrossprod(as.matrix(Xenc[,-c(1:5)]), t(as.matrix(benc)))
+   write.table(cor(y2, yhat)^2, paste0("Accuracy.",R2,".txt"), quote = F, col.names = F, row.names = F)
+   print("========== Step6-Server: Prediction finished!! ===============")
+ }
> 
> t1=Sys.time()
> print(t1-t0)

5.4 Open-std

> ################# GLOBAL #################
> t0=Sys.time()
> u1 = "user1"      # default user1 name
> u2 = "user2"      # default user2 name
> flg.std = T       # if TRUE, genotypes will be standardized
> plink = system("which plink",intern=T) # path to plink
> plink2 = system("which plink2",intern=T) # path to plink2
> 
> ## GWAS
> # if(flg.std)
> # {
> #   cmd1 = paste0(plink, " --bfile ", u1, " --linear standard-beta --out ",u1)
> #   cmd2 = paste0("awk '(NR>1){print $2,$4}' ",u1, ".assoc.linear > A1.txt")
> #   cmd3 = paste0("awk '(NR>1){print $7}' ", u1, ".assoc.linear > beta.txt")
> # }else{
> #   cmd1 = paste0(plink2, " --bfile ", u1, " --glm allow-no-covars --variance-standardize --out ",u1)
> #   cmd2 = paste0("awk '(NR>1){print $3,$6}' ",u1, ".PHENO1.glm.linear > A1.txt")
> #   cmd3 = paste0("awk '(NR>1){print $9}' ", u1, ".PHENO1.glm.linear > beta.txt")
> # }
> # system(cmd1)
> # system(cmd2)
> # system(cmd3)
> 
> ## Predict
> cmd1 = "paste -d \" \" A1.txt beta.txt > A1-beta.txt"
> if(flg.std){
+   cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-beta.txt 1 2 3 variance-standardize --variance-standardize --out ", u2, ".open")
+ }else{
+   cmd2 = paste0(plink2, " --bfile ", u2, " --score A1-beta.txt 1 2 3 --variance-standardize --out ", u2, ".open")
+ }
> system(cmd1)
> system(cmd2)
> Xenc = read.table(paste0(u2, ".open.sscore"))
> y2 = Xenc[,3]
> yhat = Xenc[,6]
> write.table(cor(y2, yhat)^2, paste0("Accuracy.open.txt"), quote = F, col.names = F, row.names = F)
> 
> 
> t1=Sys.time()
> print(t1-t0)
  • old test version
  • this was tested on 133server
> ## Input
> setwd("~/Desktop/Cryptography/ukb")
> geno.opts = mPhen.options("geno.input")
> # geno.opts$mPhen.batch = 5000
> Xinput = read.plink("test",opts = geno.opts)
> n = dim(Xinput)[1]
> M = dim(Xinput)[2]
> pheno.opts = mPhen.options("pheno.input")
> pheno.opts$mPhen.sep.pheno=" "
> pheno.opts$mPhen.numHeaderRows.pheno = 0
> pheno = mPhen.readPhenoFiles("test.fam", opts = pheno.opts)
> y = as.matrix(pheno$pheno[,3])
> 
> ## randomly impute genotype NA
> Xinput[is.na(Xinput)] = sample(c(0,1,2), length(which(is.na(as.vector(Xinput)))), replace = T)
> X = scale(Xinput)
> y = scale(y)
> colnames(X) = colnames(Xinput)
> rm(Xinput)
> 
> ## 5-fold cross-validation
> iid = rownames(y)
> cvsize=5
> cv2mat = matrix(sample(c(iid, rep(NA,cvsize-n%%cvsize)), n+cvsize-n%%cvsize), cvsize, (n+cvsize-n%%cvsize)/cvsize )
> R2 = c(0.5, 0.7, 0.8, 0.9, 0.95)
> knum = length(R2)
> rst = matrix(NA, cvsize, (knum+1)*2)
> for(c in 2:cvsize)
+ {
+   t0=Sys.time()
+   ## divide training and testing
+   cv2 = cv2mat[c,]
+   if(length(which(is.na(cv2)))>0) cv2 = cv2[-which(is.na(cv2))]
+   cv1 = iid[-which(iid %in% cv2)]
+   X1 = X[cv1,]
+   X2 = X[cv2,]
+   y1 = as.matrix(y[cv1,])
+   y2 = as.matrix(y[cv2,])
+   n1 = length(cv1)
+   n2 = length(cv2)
+   ## SBLUP
+   bhat = apply(X1, 2, function(x) cov(x, y1)/var(x))
+   y2hat = X2 %*% bhat
+   rsq1 = as.numeric(cor(y2hat, y2)^2)
+   
+   ## GBLUP
+   y2hat = GBLUP(X1, X2, y1)
+   rsq2 = as.numeric(cor(y2hat, y2)^2)
+   
+   Me = 1/var(K21[lower.tri(K21)])
+   Kvec = round(Me/(1/R2-1))
+ 
+   rsq3 = matrix(NA, 1, knum)
+   rsq4 = matrix(NA, 1, knum)
+   for(i in 1:knum)
+   {
+     K = Kvec[i]
+     ## encSBLUP
+     A = RandomMatrixEncryption(X2, bhat, M, K)
+     y2hat = A[[1]] %*% t(A[[2]])
+     rsq3[1,i] = as.numeric(cor(y2hat, y2)^2)
+     ## encGBLUP
+     A = RandomMatrixEncryption(X1, X2, M, K)
+     K21hat = A[[2]] %*% t(A[[1]]) / K
+     y2hat_enc = K21hat %*% solV %*% as.matrix(y1)
+     rsq4[1,i] = as.numeric(cor(y2hat_enc, y2)^2)
+     print(i)
+   }
+   rst[c,] = c(rsq1, rsq3, rsq2, rsq4)
+   t1=Sys.time()
+   print(c)
+   print("CV Time:")
+   print(t1-t0)
+ }
> load(f="./result.m37734.Rdata")
> df = data.frame(rsq=apply(rst, 2, mean),grp=rep(c("SBLUP","GBLUP"),each=6), ratio=rep(c(1,R2),2))
> df$ratio = as.factor(df$ratio)
> ggplot(data=df,aes(x=ratio, y=rsq, grp=grp, color=grp, fill=grp))+
+   geom_bar(stat="identity", width = 0.6, position="dodge")+
+   labs(title = "Center British",
+        subtitle = paste0("M=5000, n1=3253"), 
+        x = "Encryption Precision",
+        y = "Prediction accuracy")+
+   theme_bw()+
+   theme(legend.position = "bottom",plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))

6 UKB relatives

6.1 Prepare

  • count paired 1-degree relatives between cities (“British_sib” may not be reliable)
> cd /data4/xuhm/data/ukb/british/sib
+ nohup awk '{if($8>0.354) print $0}' British_sib_tab.kin0 > 0-degree.txt &
+ nohup awk '{if($8>0.177 && $8<0.354) print $2,$4,$8}' British_sib_tab.kin0 > 1-degree.txt &
+ ## Extract city information
+ line=`awk 'END{print NR}' 1-degree.txt`
+ for i in `seq $line`
+ do
+ {
+   id1=`awk -v i=$i '(NR==i){print $1}' 1-degree.txt`
+   city1=`grep $id1 ../entire/cohort/*.txt | cut -d ' ' -f3`
+   if [ -z $city1 ]; then city1="NA"; fi
+   id2=`awk -v i=$i '(NR==i){print $2}' 1-degree.txt`
+   city2=`grep $id2 ../entire/cohort/*.txt | cut -d ' ' -f3`
+   if [ -z $city2 ]; then city2="NA"; fi
+   value=`awk -v i=$i '(NR==i){print $3}' 1-degree.txt`
+   echo -e "${id1}\t${id2}\t${city1}\t${city2}\t${value}" >> 1-degree-city.txt
+ }
+ done
+ 
+ ## Calculate cross-city 1-degree pairs
+ cohort_file=(`cat ../entire/cohort/cohort.txt`)
+ cohort_name=(${cohort_file[@]%.*})
+ cohort_num=${#cohort_file[@]}
+ for i in ${cohort_name[@]}
+ do
+ {
+   for j in ${cohort_name[@]}
+   do
+   {
+     if [ "$i" != "$j" ] ; then grep $i 1-degree-city.txt | grep $j | awk -vi=$i -vj=$j 'END{print i,j,NR}' >> pair_counts.txt ; fi
+   }
+   done
+ }
+ done

6.2 Method2-KING

  • Generate sample data
  • city1=Manchester
  • city2=Oxford
> ## form plink file including two cities
+ cd /data4/zqx/king
+ city1='Manchester'
+ city2='Oxford'
+ pwd='/data4/xuhm/data/ukb/british/entire/cohort/'
+ cat ${pwd}${city1}.txt ${pwd}${city2}.txt > indi.txt
+ plink --bfile ${pwd}ukb --keep indi.txt --maf 0.01 --hwe 1e-7 --geno 0.05 --make-bed --out ${city1}-${city2}.qc
+ 
+ ## make king table
+ nohup plink2 --bfile ${city1}-${city2}.qc --make-king-table --out ${city1}-${city2}.qc &
+ 
+ ## extract 1-degree relatives information between two cities
+ nohup awk '{if($8>0.177 && $8<0.354) print $2,$4,$8}' ${city1}-${city2}.qc.kin0 > 1-degree.txt &
+ line=`awk 'END{print NR}' 1-degree.txt`
+ ## list their city names
+ for i in `seq $line`
+ do
+ {
+   id1=`awk -v i=$i '(NR==i){print $1}' 1-degree.txt`
+   city1=`grep $id1 indi.txt | cut -d ' ' -f3`
+   if [ -z $city1 ]; then city1="NA"; fi
+   id2=`awk -v i=$i '(NR==i){print $2}' 1-degree.txt`
+   city2=`grep $id2 indi.txt | cut -d ' ' -f3`
+   if [ -z $city2 ]; then city2="NA"; fi
+   value=`awk -v i=$i '(NR==i){print $3}' 1-degree.txt`
+   echo -e "${id1}\t${id2}\t${city1}\t${city2}\t${value}" >> 1-degree-city.txt
+ }
+ done

6.3 Method1-encGRM

  • Generate sample data
  • city1=Manchester
  • city2=Oxford
> cd /data4/zqx/related
+ M=5000
+ city1='Manchester'
+ city2='Oxford'
+ pwd='/data4/xuhm/data/ukb/british/entire/cohort/'
+ plink --bfile ${pwd}ukb --chr 1 --maf 0.01 --hwe 1e-7 --geno 0.05 --make-bed --out ukb.chr1.qc
+ awk '{print $1,$2}' ukb.chr1.qc.bim | shuf -n${M} > snp${M}.txt
+ 
+ # generate plink file
+ plink --bfile ukb.chr1.qc --keep ${pwd}${city1}.txt --extract snp${M}.txt --make-bed --out $city1
+ plink --bfile ukb.chr1.qc --keep ${pwd}${city2}.txt --extract snp${M}.txt --make-bed --out $city2
+ 
+ # determine A1 allele
+ plink --bfile ${city1} --freq --out ${city1}
+ awk '(NR>1){print $2,$3}' ${city1}.frq > A1.txt
+ 
+ # generate .raw file
+ plink --bfile ${city1} --recode A --recode-allele A1.txt --out ${city1}
+ plink --bfile ${city2} --recode A --recode-allele A1.txt --out ${city2}
+ 
+ # generate .ind file
+ awk '{print $2}' ${city1}.fam > ${city1}.ind
+ awk '{print $2}' ${city2}.fam > ${city2}.ind
  • encryption
> source("/Users/apple/Desktop/Cryptography/source.R")
> setwd("/Users/apple/Desktop/Cryptography/ukb/grm/")
> ## function
> RandomMatrixEncryption = function(Mat1, Mat2, M, K)
+ {
+   if(ncol(Mat1)!=M & ncol(Mat1)!=M) return(NULL)
+   W = matrix(rnorm(M*K,sd=sqrt(1/M)), M, K)
+   RandomMat1 = Mat1 %*% W
+   RandomMat2 = Mat2 %*% W
+   RandomMat1 = t(apply(RandomMat1, 1, scale)) # scale by row
+   RandomMat2 = t(apply(RandomMat2, 1, scale)) # scale by row
+   return(list(RandomMat1,RandomMat2))
+ }
> 
> c1 = "Manchester"
> c2 = "Oxford"
> X1 = read.table(paste0(c1,".raw"), header = T)
> X2 = read.table(paste0(c2,".raw"), header = T)
> X1 = X1[,-c(1:6)]
> X2 = X2[,-c(1:6)]
> name1 = read.table(paste0(c1,".ind"), header = F)
> name2 = read.table(paste0(c2,".ind"), header = F)
> rownames(X1) = name1[,1]
> rownames(X2) = name2[,1]
> n1=nrow(X1)
> n2=nrow(X2)
> M=ncol(X1)
> 
> # fill NA
> X1[is.na(X1)] = sample(c(0,1,2), length(which(is.na(as.vector(X1)))), replace = T)
> X2[is.na(X2)] = sample(c(0,1,2), length(which(is.na(as.vector(X2)))), replace = T)
> X1 = apply(X1, 2, scale)
> X2 = apply(X2, 2, scale)
> 
> # Encryption
> alpha = 0.05/(n1*n2) # type I error
> beta = 0.2           # type II error two tail test
> seta = 0.25         
> K = ( ( qnorm(1-beta)*sqrt(1-seta^2) + qnorm(1-alpha/2) ) / seta )^2 
> K = ceiling(K) # theoretical needed number of PC
> A = RandomMatrixEncryption(X1, X2, M, K)
> grm_hat = tcrossprod(A[[1]],A[[2]])/K
> grm = tcrossprod(X1,X2)/M
> Me = 1/var(grm[lower.tri(grm)])
> 
> # summary
> r2_o = cor(as.vector(grm),as.vector(grm_hat))^2  # observed correlation
> r2_t = 1/(1 + Me/K)                              # theoretical correlation
> yin = qnorm(1/n1/n2)*sqrt(1/Me)
> xin = qnorm(1/n1/n2)*sqrt(1/Me+1/K)
> 
> ## confirm
> thrd = -xin
> sign = which(as.vector(grm_hat)>thrd)
> value = round(as.vector(grm_hat)[sign], 4)
> row = sign %% n1 
> col = sign %/% n1 + 1
> # output individual id
> id = matrix(NA, length(sign), 2)
> id[,1] = name1[row,1]
> id[,2] = name2[col,1]
> colnames(id) = c(c1,c2)
> id = cbind(id, value)
> write.table(id, file = "identified.txt", quote = F, row.names = F, col.names = F)
> 
> ## plot.Rdata
> lab = array(9, dim = n1*n2)
> lab[sign] = 1
> dt = data.frame(x=as.vector(grm_hat), lab = lab, y=as.vector(grm), M=M, K=K, yin=yin, xin=xin)
> dt$K = as.factor(dt$K)
> dt$M = as.factor(dt$M)
> dt$lab = as.factor(dt$lab)
> # save(dt, file = "plot.Rdata")
> 
> ## plot(on server)
> library(egg)
> library(ggplot2)
> load(file = "plot.Rdata")
> color = c("sienna1","lightyellow3")
> p = ggplot(dt[c(1:1000,11226754,15072985),], aes(x=x, y=y)) +
+   geom_point(aes(color=lab), size=0.3) +
+   geom_hline(aes(yintercept = yin), color="blue", alpha=0.5, linetype = 5) +
+   geom_hline(aes(yintercept = -yin), color="blue", alpha=0.5, linetype = 5) +
+   geom_vline(aes(xintercept = xin), color="red", alpha=0.5, linetype = 5) +
+   geom_vline(aes(xintercept = -xin), color="red", alpha=0.5, linetype = 5) +
+   scale_x_continuous(limits=c(-1, 1.15)) + xlab("Estimated relatedness") +
+   scale_y_continuous(limits=c(-1, 1.15)) + ylab("True relatedness") +
+   facet_grid(M~K, labeller = label_both) +
+   scale_color_manual(values=color, name="Relations", labels=c("1-degree","unrelated")) +
+   theme_article()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))
> 
> ggsave(p, filename = "Manchester-Oxford-reduce.pdf", dpi = 600, width = 12.4, height = 13.3)
> ggsave(p, filename = "Manchester-Oxford-reduce.jpeg",dpi = 600, width = 12.4, height = 13.3)

7 Appendix

7.1 Appendix1: Me

For 100 equifrequent biallelic loci , if the correlation for each pair of consecutive markers is 0,0.25,0.5,0.75, the effective number of markers is approximately 102, 90, 62, 30

> n1=50
> n2=50
> M=100
> freq = rep(0.5, M)
> count = 50
> for(r in c(0, 0.25, 0.5, 0.75))
+ {
+   rho = rep(r, M)
+   ld = freq^2*rho
+   me = c()
+   for(c in 1:count)
+   {
+     X1 = GenerateLDGeno(freq, ld, n1)
+     X2 = GenerateLDGeno(freq, ld, n2)
+     G = rbind(X1,X2)
+     G = t(apply(G, 1, scale)) # scale by row
+     X1 = G[1:n1,]
+     X2 = G[(n1+1):(n1+n2),]
+     
+     grm = X1 %*% t(X2) / M
+     me = c(me,1/var(grm[lower.tri(grm)]))
+   }
+   print(mean(me))
+ }

7.2 Appendix2: scale

What is the difference between scaling by row and by column

> n1 = 30   # sample size in cohort1
> n2 = 30   # sample size in cohort2
> n0 = 30
> alpha = 0.05/(n1*n2) # type I error
> beta = 0.2        # type II error two tail test
> seta = 0.45
> M = 100
> K = ( ( qnorm(1-beta/2)*sqrt(1-seta^2) + qnorm(1-alpha) ) / seta )^2 
> K = round(K)
> freq = rep(0.5, M)   # MAF of each marker
> 
> ## generate genotypes of identity pairs
> Gr = GenerateGeno_r(freq, n0, 0)
> X1 = Gr[[1]]
> X2 = Gr[[2]]
> X1 = rbind(X1, GenerateGeno(freq, n1-n0))
> X2 = rbind(X2, GenerateGeno(freq, n1-n0))
> G = rbind(X1,X2)
> Gstd = list()
> Gstd[[1]] = scale(G)   # scale by column
> Gstd[[2]] = t(apply(G, 1, scale)) # scale by row
> 
> unrelated = list()
> identity = list()
> for(i in 1:2)
+ {
+   X1 = Gstd[[i]][1:n1,]
+   X2 = Gstd[[i]][(n1+1):(n1+n2),]
+   grm = X1 %*% t(X2) / M
+   vgrm = as.vector(grm)
+   ## seperate unrelated and related pairs in grm
+   unrelated[[i]] = vgrm[-seq(1,n1*n2,n2+1)]
+   identity[[i]] = vgrm[seq(1,n1*n2,n2+1)]
+ }
> 
> layout(matrix(1:2,1,2))
> plot(identity[[2]], identity[[1]], ylim = c(-0.5,1.1), xlim = c(-0.5,1.1),
+      ylab = "Scale by column", xlab = "Scale by row", main = "Identity pairs")
> plot(unrelated[[2]], unrelated[[1]], ylim = c(-0.5,0.5), xlim = c(-0.5,0.5),
+      ylab = "Scale by column", xlab = "Scale by row", main = "Unrelated pairs")

7.3 Appendix3: Rsq(relative)

  • population: 2 cohorts with 1-degree relations
  • LD: without LD
  • x-asis: K
> flag = "simu"
> color = c("indianred","sienna1","goldenrod2","olivedrab3","lightyellow3")
> n1 = n2 = 300
> seta = 0.5
> M = 500              # number of markers
> freq = rep(0.5, M)   # MAF of each marker
> D = 4 
> r = c(0, 1, 2, 3)  
> n_cp = c(0, n1, 0, 0) # numbers for each degree of relations
> n1_0 = n1-sum(n_cp)
> n2_0 = n2-sum(n_cp)
> X1 = matrix(NA, 0, M)
> X2 = matrix(NA, 0, M)
> for(i in 1:D)
+ {
+   Gr = GenerateGeno_r(freq, n_cp[i], r[i], FALSE)
+   X1 = rbind(X1, Gr[[1]])
+   X2 = rbind(X2, Gr[[2]])
+ }
> X1 = rbind(X1, GenerateGeno(freq, n1_0))
> X2 = rbind(X2, GenerateGeno(freq, n2_0))
> X = rbind(X1,X2)
> X = t(apply(X, 1, scale)) # scale by row
> X1 = X[1:n1,]
> X2 = X[(n1+1):(n1+n2),]
> ## real relationship calculated from individual genotype
> grm = X1 %*% t(X2) / M
> vgrm = as.vector(grm)
> 
> pic = list()
> count = 20
> Kvec = seq(M, 10*M, M)
> knum = length(Kvec)
> rsqr.o = matrix(NA, knum, count)
> yvar.o = matrix(NA, knum, count)
> xvar.o = matrix(NA, knum, count)
> for(i in 1:knum)
+ {
+   K  = Kvec[i]
+   for(c in 1:count)
+   {
+     ## generate random weight matrix W(M*K) w_ij~N(0,1/M) 
+     W = matrix(rnorm(M*K,sd=sqrt(1/M)), M, K)
+     
+     ## calculate individual PPS A1(n1*K) A2(n2*K)
+     A1 = X1 %*% W
+     A2 = X2 %*% W
+     A1 = t(apply(A1, 1, scale)) # scale by row
+     A2 = t(apply(A2, 1, scale)) # scale by row
+     
+     ## Estimated GRM
+     grm_hat = A1 %*% t(A2) / K
+     vgrm_hat = as.vector(grm_hat)
+ 
+     ## separate unrelated and related pairs in grm(y) and grm_hat(x)
+     u.yvec = vgrm[-seq(1,n1*n2,n2+1)]
+     r.yvec = vgrm[seq(1,n1*n2,n2+1)]
+     u.xvec = vgrm_hat[-seq(1,n1*n2,n2+1)]
+     r.xvec = vgrm_hat[seq(1,n1*n2,n2+1)]
+     
+     ## correlation
+     rsqr.o[i,c] = cor(r.xvec, r.yvec)^2
+     yvar.o[i,c] = var(r.yvec)
+     xvar.o[i,c] = var(r.xvec)
+     yci = qnorm(1/n1/n2)*sqrt(yvar.o)
+     xci = qnorm(1/n1/n2)*sqrt(xvar.o)    
+   }
+   print(K)
+   
+   # label simulated relations
+   # lab = lab_relations_true(n1, n2, n_cp)
+   # dt = data.frame(x=vgrm_hat, y=vgrm, lab=as.character(lab))
+   # s = 2
+   # pic[[k]] = ggplot(dt, aes(x=x, y=y, color=lab)) +
+   #   geom_point(aes(color=lab), size=0.3) +
+   #   geom_hline(aes(yintercept = u.yci), color=color[5], alpha=0.5, linetype=5, size =0.3) +
+   #   geom_hline(aes(yintercept = -u.yci), color=color[5], alpha=0.5, linetype=5, size =0.3) +
+   #   geom_vline(aes(xintercept = u.xci), color=color[5], alpha=0.5, linetype=5, size =0.5) +
+   #   geom_vline(aes(xintercept = -u.xci), color=color[5], alpha=0.5, linetype=5, size =0.5) +
+   #   geom_hline(aes(yintercept = seta+r.yci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+   #   geom_hline(aes(yintercept = seta-r.yci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+   #   geom_vline(aes(xintercept = seta+r.xci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+   #   geom_vline(aes(xintercept = seta-r.xci), color=color[s], alpha=0.5, linetype=5, size =0.5) +
+   #   scale_x_continuous(limits=c(-1.5, 1.5)) + xlab("PPSR coefficient") +
+   #   scale_y_continuous(limits=c(-1.5, 1.5)) + ylab("True relatedness") +
+   #   scale_color_manual(values=color[c(s,5)], name="Relations",
+   #                      labels=c("0-degree","1-degree","2-degree","3-degree","unrelated")[c(s,5)]) +
+   #   annotate("text", x = -1, y = -1, size = 2,
+   #            label = paste0("M=",M,", K=",K,"\nn1=",n1,", n2=",n2,
+   #                           "\nrsq.o=",round(r.rsq.o,6),"\nrsq.t=",round(r.rsq.t,6),
+   #                           "\nvar(x.o)=",round(r.xvar.o,6),"\nvar(x.t)=",round(r.xvar.t,6),
+   #                           "\nvar(y.o)=",round(r.yvar.o,6),"\nvar(y.t)=",round(r.yvar.t,6))) +
+   #   theme_article()+
+   #   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, face = "bold"))
+   # 
+   # pic[[k]] = ggMarginal(pic[[k]], groupColour = T, groupFill = T,type = "density")  
+ }
> 
> # grid.arrange(pic[[1]],pic[[2]],pic[[3]], ncol=3)
> 
> rsqr.t = 1/(1 + M/Kvec)
> yvar.t = rep((1+seta^2)/M,  knum)
> xvar.t = (1+seta^2)/Kvec+(1+seta^2)/M
> apply(yvar.o, 1, mean)
> dt = data.frame(grp=rep(c("observed","theoretical"), each = knum),
+                  K=rep(Kvec, 2),
+                  rsq=c(apply(xvar.o, 1, mean), xvar.t),
+                  sd= c(apply(xvar.o, 1, sd),rep(0,knum)))
> ggplot(dt, aes(x=K, y=rsq, group = grp, color=grp))+
+   geom_point()+
+   geom_line()+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), width=.2,
+                 position=position_dodge(0.05))+
+   scale_color_lancet(name="")+
+   labs(subtitle = paste0("M=",M," n1=",n1," n2=", n2),
+        y="Variance of Ghat")+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 
> 
> dt = data.frame(grp=rep(c("observed","theoretical"), each = knum),
+                  K=rep(Kvec, 2),
+                  rsq=c(apply(rsqr.o, 1, mean), rsqr.t),
+                  sd= c(apply(rsqr.o, 1, sd),rep(0,knum)))
> ggplot(dt, aes(x=K, y=rsq, group = grp, color=grp))+
+   geom_point()+
+   geom_line()+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), width=.2,
+                 position=position_dodge(0.05))+
+   scale_color_lancet(name="")+
+   labs(subtitle = paste0("M=",M," n1=",n1," n2=", n2),
+        y=expression(R^2))+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))  

7.4 BLUP

  • without LD
  • no overlapping
> ## genotype
> h2 = 0.5
> n1 = 200
> n2 = 200
> M = 500           # number of markers
> freq = rep(0.5, M)    # MAF of each marker
> X1 = GenerateGeno(freq, n1)
> X2 = GenerateGeno(freq, n2)
> X1 = scale(X1)
> X2 = scale(X2)
> rownames(X1) = paste0("TRN", 1:n1)
> rownames(X2) = paste0("TST", 1:n2)
> colnames(X1) = colnames(X2) = paste0("SNP", 1:M)
> 
> ## K = M/(1/R2-1)
> R2 = c(0.5, 0.7, 0.8, 0.9, 0.95, 0.99)
> Kvec = round(M/(1/R2-1))
> knum = length(Kvec)
> K11 = A.mat(X1)
> 
> ## phenotype 
> count = 20
> rsq0 = matrix(NA, 1, count)
> rsq1 = matrix(NA, 1, count)
> rsq2 = matrix(NA, 1, count)
> rsq3 = matrix(NA, knum, count)
> t0=Sys.time()
> for(c in 1:count)
+ {
+   sigma_g = h2
+   b = rnorm(M, sd=sqrt(sigma_g/M))
+   g = X %*% b
+   sigma_e = (1/h2-1)*var(g)
+   error = rnorm(n1+n2, sd=sqrt(sigma_e))
+   y = g + error
+   y = scale(y)
+   y1 = as.data.frame(y[1:n1], row.names = rownames(X1))
+   y2 = as.data.frame(y[(n1+1):(n1+n2)], row.names = rownames(X2))
+   
+   ## gwasBLUP
+   y2hat = gwasBLUP(X1, X2, y1)
+   rsq1[1,c] = as.numeric(cor(y2hat, y2)^2)
+   rsq0[1,c] = h2/(1+M/n1/h2)
+   
+   ## encGBLUP
+   dt = data.frame(y1, id=rownames(y1))
+   colnames(dt)[1]="y1"
+   mod = mmer(y1~1,
+               random = ~vs(id, Gu=K11),
+               rcov = ~units, data = dt, verbose = FALSE, date.warning = FALSE)
+   sigmahat = mod$sigmaVector
+   # V
+   V = K11+diag(sigmahat[2]/sigmahat[1], n1, n1)
+   solV = solve(V)
+   # K21
+   K21 = X2 %*% t(X1) / M
+   y2hat = K21 %*% solV %*% as.matrix(y1)
+   rsq2[1,c] = as.numeric(cor(y2hat, y2)^2)
+   # K21 hat
+   for(i in 1:knum)
+   {
+     K = Kvec[i]
+     A = RandomMatrixEncryption(X1, X2, M, K, ScaleRow = T)
+     K21hat = tcrossprod(A[[2]], A[[1]]) / K
+     y2hat_enc = K21hat %*% solV %*% as.matrix(y1)
+     rsq3[i,c] = as.numeric(cor(y2hat_enc, y2)^2)
+   }
+   t1=Sys.time()
+   print(paste("Repeat Time:",c,"Collapse Time:"))
+   print(t1-t0)
+ }
> 
> dt = data.frame(grp=rep(c("Upper Bound","sBLUP","GBLUP","encGBLUP"), each=knum), K=rep(R2, 4), 
+                  rsq=c(rep(mean(rsq0),knum), rep(mean(rsq1), knum), rep(mean(rsq2), knum), apply(rsq3, 1, mean)),
+                  sd=c(rep(0,knum),rep(sd(rsq1), knum), rep(sd(rsq2), knum), apply(rsq3, 1, sd)))
> dt$K = as.factor(dt$K)
> ggplot(dt, aes(x=K, y=rsq, group = grp, color=grp))+
+   geom_point()+
+   geom_line()+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), width=.2,
+                  position=position_dodge(0.05))+
+   scale_color_lancet(name="")+
+   labs(title = "Prediction accuracy",
+        subtitle = paste0("M=",M," n1=",n1," n2=", n2, " h2=", h2), 
+        x = expression(1/(1+M/K)),
+        y = "Prediction accuracy")+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))  
> setwd("~/Desktop/Cryptography/results")
> n1=2000
> M=3000
> dt0 = data.frame()
> for(h2 in c(0.5))
+ {
+   for(ratio in c(1:3))
+   {
+     n2=n1*ratio
+     f = paste0("BLUP_n2_ratio_",ratio,"_h2_",h2,".RData")
+     load(file = f)
+     # dt$h2=rep(h2, nrow(dt))
+     # dt$n1=rep(n1, nrow(dt))
+     # dt$n2=rep(n2, nrow(dt))
+     dt0 = rbind(dt0,dt)
+   }
+ }
> color = c("sienna1","goldenrod2","olivedrab3","lightyellow3")
> ggplot(dt0, aes(x=K, y=rsq, fill = grp, color=grp))+
+   geom_bar(stat="identity", width = 0.8 , position=position_dodge(width = 0.8))+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="grey40", width=0.2, position=position_dodge(0.8))+
+   scale_color_manual(values=color, name="") +
+   scale_fill_manual(values=color, name="") +
+   labs(title = "Prediction accuracy",
+        subtitle = paste0("M=",M," n1=", n1), 
+        x = expression(R^2),
+        y = "Prediction accuracy")+
+   theme_bw()+
+   facet_grid(h2~n2, labeller = label_both)+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))

7.5 Appendix4: geom_point() vs geom_bar()

> Me_m1=Me11
> corrG21hat.t = apply(Me_m2,1,mean)/Me_m1/(1+Me_m1/K)/(1+apply(Me_m2,1,mean)/K)
> dt = data.frame(m2=rep(m2vec,2),
+                  grp=rep(c("observed","theoretical"), each=m2num),
+                  rsq=c(apply(corrG21hat, 1, mean), corrG21hat.t),
+                  sd=c(apply(corrG21hat, 1, sd), rep(0, m2num)))
> ## geom_point() 
> ggplot(dt, aes(x=m2,y=rsq, grp=grp, color=grp))+
+   geom_point()+
+   geom_line()+
+   scale_color_lancet(name="")+
+   labs(title = expression(paste("Accuracy between ",hat(G[21])(m1)," and ", hat(G[21])(m2))),
+        subtitle = paste("K=",K, " m1=",m1," n1=",n1," n2=", n2),
+        y=expression(R^2))+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 
> ## geom_bar()
> ggplot(dt, aes(x=m2,y=rsq, fill=grp, color=grp))+
+   geom_bar(stat="identity", width = 60 , position=position_dodge(width = 80))+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=20, position=position_dodge(80))+
+   geom_text(aes(label=round(rsq,4)),vjust=-1.1, color="black", size=2.5, position=position_dodge(80))+
+   scale_color_manual(values=color, name="") +
+   scale_fill_manual(values=color, name="") +
+   labs(title = expression(paste("Accuracy between ",hat(G[21])(m1)," and ", hat(G[21])(m2))),
+        subtitle = paste("K=",K, " m1=",m1," n1=",n1," n2=", n2),
+        y=expression(R^2))+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 

8 SNP sets

8.1 SNP set(G)

  • R2(G, Ghat)
  • SNP set: m2 not equal to m1 and {SNP2} belong to {SNP1}
  • population: 2 cohorts of unrelated individuals
  • LD: with LD
> n1 = 200
> n2 = 200
> m1 = 500 # the snp number for training sample
> SNPset1 = 1:m1
> freq = rep(0.5, m1)    # MAF of each marker
> rho = rep(0.75, m1)   # correlation coefficients between adjacent markers
> ld = freq^2*rho      # ld = rho*sqrt(f1*(1-f1)*f2*(1-f2))
> X1_m1 = t(apply(GenerateLDGeno(freq, ld, n1), 1, scale)) # scale by row
> X2_m1 = t(apply(GenerateLDGeno(freq, ld, n2), 1, scale)) # scale by row
> rownames(X1_m1) = paste0("TRN", 1:n1)
> rownames(X2_m1) = paste0("TST", 1:n2)
> colnames(X1_m1) = colnames(X2_m1) = SNPset1
> ## m2
> m2vec = round(seq(0.2,1,0.2)*m1) # the snp number for testing sample
> m2num = length(m2vec)
> count =20
> 
> ########## realized GRM ############
> ## G21_m1
> G21_m1 = tcrossprod(X2_m1,X1_m1) / m1
> ## Me11
> G11 = tcrossprod(X1_m1,X1_m1) / m1
> Me11 = 1/var(G11[lower.tri(G11)])
> rho11 = m1^2/Me11
> 
> ## under different number of m2
> corrG21.o = matrix(NA, m2num, count)
> Me22 = matrix(NA, m2num, count)
> Me = matrix(NA, m2num, count)
> rho21 = matrix(NA, m2num, count)
> rho22 = matrix(NA, m2num, count)
> rho = matrix(NA, m2num, count)
> for(c in 1:count)
+ {
+   for(i in 1:m2num)
+   {
+     m2 = m2vec[i]
+     SNPset2 = sort.int(sample(SNPset1, m2))
+     X1_m2 = X1_m1[,SNPset2]
+     X2_m2 = X2_m1[,SNPset2]
+ 
+     G22 = tcrossprod(X1_m2, X1_m2) / m2
+     Me22[i,c] = 1/var(G22[lower.tri(G22)])
+     rho22[i,c] = m2^2/Me22[i,c]
+ 
+     X1_m1m2 = cbind(X1_m1,X1_m2)
+     X2_m1m2 = cbind(X2_m1,X2_m2)
+     G = tcrossprod(X1_m1m2, X1_m1m2) / (m1+m2)
+     Me[i,c] = 1/var(G[lower.tri(G)])
+     rho[i,c] = (m1+m2)^2/Me[i,c]
+ 
+     G21_m2 = tcrossprod(X2_m2, X1_m2) / m2
+     corrG21.o[i,c] = cor(as.vector(G21_m1),as.vector(G21_m2))^2
+     rho21[i,c] = ((m1+m2)^2/Me[i,c]-m1^2/Me11-m2^2/Me22[i,c])/2
+   }
+   print(c)
+ }
> 
> # corrG21.t = apply(rho21, 1, mean)/(rho11+apply(rho21, 1, mean))*(1+m2vec/m1)
> corrG21.t = apply(Me22, 1, mean)/Me11
> dt = data.frame(m2=rep(m2vec,2),
+                  grp=rep(c("observed","theoretical"), each=m2num),
+                  rsq=c(apply(corrG21.o, 1, mean), corrG21.t),
+                  sd=c(apply(corrG21.o, 1, sd), rep(0,m2num)))
> ## plot
> color = c("goldenrod2","lightyellow3")
> ggplot(dt, aes(x=m2,y=rsq, fill=grp, color=grp))+
+   geom_bar(stat="identity", width = 60 , position=position_dodge(width = 80))+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=20, position=position_dodge(80))+
+   geom_text(aes(label=round(rsq,4)),vjust=-1.1, color="black", size=2.5, position=position_dodge(80))+
+   scale_color_manual(values=color, name="") +
+   scale_fill_manual(values=color, name="") +
+   labs(title = expression(paste("Accuracy between ",G[21](m[1])," and ", G[21](m[2]))),
+        subtitle = paste("m1=",m1," n1=",n1," n2=", n2),
+        y=expression(R^2))+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 
> 
> ########## realized GRM ############
> K = 5*m1
> ## G21hat_m1
> Xhat_m1 = RandomMatrixEncryption(X1_m1, X2_m1, m1, K)
> G21hat_m1 = Xhat_m1[[2]] %*% t(Xhat_m1[[1]]) / K
> ## Encryption
> corrG21hat = matrix(NA, m2num, count)
> Me_m2 = matrix(NA, m2num, count)
> for(c in 1:count)
+ {
+   for(i in 1:m2num)
+   {
+     m2 = m2vec[i]
+     SNPset2 = sort.int(sample(SNPset1, m2))
+     X1_m2 = X1_m1[,SNPset2]
+     X2_m2 = X2_m1[,SNPset2]
+ 
+     G21_m2 = X2_m2 %*% t(X1_m2) / m2
+     Me_m2[i,c] = 1/var(G21_m2[lower.tri(G21_m2)])
+ 
+     Xhat_m2 = RandomMatrixEncryption(X1_m2, X2_m2, m2, K)
+     G21hat_m2 = Xhat_m2[[2]] %*% t(Xhat_m2[[1]]) / K
+     corrG21hat[i,c] = cor(as.vector(G21hat_m1), as.vector(G21hat_m2))^2
+   }
+   print(c)
+ }
> 
> ## plot
> Me_m1=Me11
> corrG21hat.t = apply(Me_m2,1,mean)/Me_m1/(1+Me_m1/K)/(1+apply(Me_m2,1,mean)/K)
> dt = data.frame(m2=rep(m2vec,2),
+                  grp=rep(c("observed","theoretical"), each=m2num),
+                  rsq=c(apply(corrG21hat, 1, mean), corrG21hat.t),
+                  sd=c(apply(corrG21hat, 1, sd), rep(0, m2num)))
> ggplot(dt, aes(x=m2,y=rsq, fill=grp, color=grp))+
+   geom_bar(stat="identity", width = 60 , position=position_dodge(width = 80))+
+   geom_errorbar(aes(ymin=rsq-sd, ymax=rsq+sd), color="black", width=20, position=position_dodge(80))+
+   geom_text(aes(label=round(rsq,4)),vjust=-1.1, color="black", size=2.5, position=position_dodge(80))+
+   scale_color_manual(values=color, name="") +
+   scale_fill_manual(values=color, name="") +
+   labs(title = expression(paste("Accuracy between ",hat(G[21])(m1)," and ", hat(G[21])(m2))),
+        subtitle = paste("K=",K, " m1=",m1," n1=",n1," n2=", n2),
+        y=expression(R^2))+
+   theme_bw()+
+   theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) 

8.2 SNP set(y)

  • GBLUP vs gwasBLUP
  • LD: with vs without LD
  • SNP set: {SNP2} belong to {SNP1} , and {SNP2} do not contribute to phenotype
> for(s in 1:2)
+ {
+   n1 = 200
+   n2 = 200
+   m1 = 500 # the snp number for training sample
+   m2 = m1/2
+   SNPset1 = 1:m1
+   SNPset2 = seq(2,m1,2)
+   freq = rep(0.5, m1)    # MAF of each marker
+   if(s==1) { ## Scenario1
+     X1_m1 = GenerateGeno(freq, n1)
+     X2_m1 = GenerateGeno(freq, n2)
+   } else { ## Scenario2
+     Dprime = rep(0.8, m1)   # correlation coefficients between adjacent markers
+     ld = Dprime*freq^2      # ld = Dprime*min(PaPB,PAPb)
+     X1_m1 = GenerateLDGeno(freq, ld, n1)
+     X2_m1 = GenerateLDGeno(freq, ld, n2)
+   }
+   X_m1 = rbind(X1_m1,X2_m1)
+   X_m1 = t(apply(X_m1, 1, scale)) # scale by row
+   X1_m1 = X_m1[1:n1,]
+   X2_m1 = X_m1[(n1+1):(n1+n2),]
+   rownames(X1_m1) = paste0("TRN", 1:n1)
+   rownames(X2_m1) = paste0("TST", 1:n2)
+   colnames(X1_m1) = colnames(X2_m1) = SNPset1
+   X1_m2 = X1_m1[,SNPset2]
+   X2_m2 = X2_m1[,SNPset2]
+   
+   ## simulate y
+   h2 = 0.5
+   b = rnorm(m1-m2, sd=sqrt(h2/(m1-m2)))
+   g = X_m1[,-SNPset2] %*% b
+   sigma_e = (1/h2-1)*var(g)
+   error = rnorm(n1+n2, sd=sqrt(sigma_e))
+   y = g + error
+   y = scale(y)
+   y1 = as.data.frame(y[1:n1], row.names = rownames(X1_m1))
+   y2 = as.data.frame(y[(n1+1):(n1+n2)], row.names = rownames(X2_m1))
+   
+   rst = matrix(NA, 1, 4)
+   ## gwasBLUP
+   y2hat=gwasBLUP(X1_m1,X2_m1,y1)
+   rst[1,1]=cor(y2hat,y2)^2
+   y2hat=gwasBLUP(X1_m2,X2_m2,y1)
+   rst[1,2]=cor(y2hat,y2)^2
+   ## GBLUP
+   y2hat=GBLUP(X1_m1,X2_m1,y1)
+   rst[1,3]=cor(y2hat,y2)^2
+   y2hat=GBLUP2(X1_m1,X1_m2,X2_m2,y1)
+   rst[1,4]=cor(y2hat,y2)^2
+   
+   ## Output
+   if(s==1) Scenario1 = rst
+   if(s==2) Scenario2 = rst
+ }
> 
> rst = rbind(Scenario1,Scenario2)
> colnames(rst) = c("gwasBLUP-m1","gwasBLUP-m2","GBLUP-m1","GBLUP-m2")
> rownames(rst) = c("ScenarioI(LE)","ScenarioII(LD)")
> rst

9 Session Information

> sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] MultiPhen_2.0.3   meta_4.13-0       epitools_0.5-10.1 abind_1.4-5      
##  [5] sommer_4.1.3      crayon_1.3.4      lattice_0.20-41   MASS_7.3-53      
##  [9] Matrix_1.2-18     kableExtra_1.3.4  egg_0.4.5         viridis_0.5.1    
## [13] viridisLite_0.3.0 gridExtra_2.3     ggExtra_0.9       ggsci_2.9        
## [17] ggplot2_3.3.5     rmdformats_1.0.2  knitr_1.30       
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2          tidyr_1.1.2         HardyWeinberg_1.6.3
##  [4] splines_4.0.0       shiny_1.5.0         statmod_1.4.34     
##  [7] highr_0.8           metafor_2.4-0       yaml_2.2.1         
## [10] gdtools_0.2.2       pillar_1.4.4        backports_1.1.7    
## [13] glue_1.4.1          digest_0.6.25       RColorBrewer_1.1-2 
## [16] promises_1.1.1      rvest_0.3.6         minqa_1.2.4        
## [19] colorspace_1.4-1    htmltools_0.5.0     httpuv_1.5.4       
## [22] pkgconfig_2.0.3     broom_0.7.0         bookdown_0.22      
## [25] purrr_0.3.4         xtable_1.8-4        scales_1.1.1       
## [28] webshot_0.5.2       svglite_1.2.3.2     later_1.1.0.1      
## [31] lme4_1.1-23         tibble_3.0.1        generics_0.0.2     
## [34] ellipsis_0.3.1      withr_2.2.0         magrittr_1.5       
## [37] mime_0.9            evaluate_0.14       mice_3.11.0        
## [40] nlme_3.1-148        xml2_1.3.2          truncnorm_1.0-8    
## [43] tools_4.0.0         lifecycle_0.2.0     stringr_1.4.0      
## [46] munsell_0.5.0       compiler_4.0.0      systemfonts_0.3.2  
## [49] rlang_0.4.11        grid_4.0.0          nloptr_1.2.2.2     
## [52] rstudioapi_0.11     CompQuadForm_1.4.3  Rsolnp_1.16        
## [55] miniUI_0.1.1.1      rmarkdown_2.10      boot_1.3-25        
## [58] gtable_0.3.0        R6_2.4.1            dplyr_1.0.0        
## [61] fastmap_1.0.1       stringi_1.4.6       parallel_4.0.0     
## [64] Rcpp_1.0.6          png_0.1-7           vctrs_0.3.1        
## [67] tidyselect_1.1.0    xfun_0.25