setwd("D:/Federated Learning/")
rm(list=ls())
library(qvalue)
library(data.table)
library(gwaR)
library(regress)
library(Matrix)
library(knitr)
library(magrittr)
library(ggplot2)
source("Toolkit_function.R")
source("Toolkit_function_Get.R")Transfer and federated learning using GBLUP-GWA
Basic assumptions
Genotype and phenotype data are available from several populations. In this examples we will download the data to perform computations locally and then transfer results. But in practice, each node can perform their own analyses and transfer only the quantities needed for federated learning or transfer learning.
Setting up code
Loading needed packages, in this case gwaR is the core mixed model package used.Current transfer learning and federated learning functions are available in the Toolkit scripts and they will be integrated in a package later.
Example dataset
We will load only one data and split it into two parts to do transfer learning. first input parameters are provided in a config file:
source_pop3 = 'comm'
pheno_name3= 'https://www.dropbox.com/s/1e2bw6yb9viha1w/pheno_comm.csv?dl=1'
geno_name3 = 'https://www.dropbox.com/s/oas827ngl39yxrd/genos_commercial.csv?dl=1'
map_name3 = 'https://www.dropbox.com/s/ym39ln7qii84gfe/map.csv?dl=1'
cg_name3 = 'cg' #variable for random effects/contemporary groups
id_name3 = 'animal' #name of the animal ID
phenotype3 = 'plantA' #name of phenotype
na_geno3 = NA #missing value character
na_pheno3 = NA #missing value character
na_per_row3 = 0.05 #maximum prop. missing value
na_per_col3 = 0.05 #idem
MAF3=0.0 #maf
Then data are read and setup. Some summaries are presented by the input functions
comm_genot<-read_genotypes(geno_name3)-------------------------------
Read genotype file https://www.dropbox.com/s/oas827ngl39yxrd/genos_commercial.csv?dl=1
sample output:
MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1 1 2 1 1 1
10 0 2 2 2 2
1001 2 2 0 0 0
1003 0 0 0 0 0
1004 1 1 1 1 1
number of subjects: 1892
number of SNP: 37069
-------------------------------
comm_phenot<-read_pheno(pheno_name3, source_pop3, id_name3,
cg_name=cg_name3, phenotype = phenotype3)-------------------------------
Read phenotype file https://www.dropbox.com/s/1e2bw6yb9viha1w/pheno_comm.csv?dl=1
sample output:
cg plantA
1 7003 -2.2747
2 7003 -2.4326
3 7005 -0.0843
4 7003 0.8251
5 7005 -1.5025
6 7003 -2.3384
number of subjects: 1920
deleting rows with missing values
-------------------------------
number of subjects with complete observations: 1919
-------------------------------
Phenotypic summary
cg plantA
31 :304 Min. :-6.12230
41 :303 1st Qu.:-1.91655
11 :150 Median :-1.08580
12 :150 Mean :-0.91278
23 :141 3rd Qu.:-0.06025
21 :102 Max. : 6.28180
(Other):769
-------------------------------
comm_mpt<- read_map(map_name3)..................................
Summary of map:
chr pos
MARC0044150 1 286933
ASGA0000014 1 342481
ASGA0000021 1 489855
ALGA0000009 1 538161
ALGA0000014 1 565627
H3GA0000032 1 573088
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416
17 18
1211 837
.................................
comm_kt <- filter_and_match(comm_genot, comm_phenot, map=comm_mpt, n_maf = MAF3,
n_row=na_per_row3, n_col=na_per_col3)-------------------------------
number of deleted row: 0
number of deleted column: 0
number of deleted MAF: 0
-------------------------------
Sample Output for genotype:
[1] 1891 37069
MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1 1 2 1 1 1
10 0 2 2 2 2
1001 2 2 0 0 0
1003 0 0 0 0 0
1004 1 1 1 1 1
Sample Output for phenotype:
cg plantA
1 7003 -2.2747
10 7003 -2.1520
1001 12 -1.7266
1003 12 -0.8032
1004 12 -1.3983
number of subjects for genotype: 1891
number of subjects for phenotype: 1891
-------------------------------
[1] "A map file was provided!"
chr pos
MARC0044150 1 286933
ASGA0000014 1 342481
ASGA0000021 1 489855
ALGA0000009 1 538161
ALGA0000014 1 565627
H3GA0000032 1 573088
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416
17 18
1211 837
comm_matr <- read_G_matrix(comm_kt$geno)---------------------------------
G matrix computed from genotypes
Summary of diagonal:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.7096 0.7983 0.8257 0.9355 1.1573 1.6005
[1] 1891 1891
---------------------------------
Summary off-diagonal:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.367810 -0.120544 0.008824 -0.000495 0.056939 1.121283
---------------------------------
In this example only, we partition the data into two parts for the transfer learning, then use the toolkit functions to re-arrange the genotype and phenotype file
set.seed(1973)
id<-sample(x = 1:2,size = nrow(comm_phenot),replace = T)
table(id)id
1 2
951 968
comm_phenot1<-comm_phenot[id==1,]
comm_phenot2<-comm_phenot[id==2,]
comm_kt1<- filter_and_match(comm_genot, comm_phenot1, map=comm_mpt, n_maf = MAF3,
n_row=na_per_row3, n_col=na_per_col3)-------------------------------
number of deleted row: 0
number of deleted column: 0
number of deleted MAF: 0
-------------------------------
Sample Output for genotype:
[1] 942 37069
MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1 1 2 1 1 1
10 0 2 2 2 2
1001 2 2 0 0 0
1003 0 0 0 0 0
1006 1 1 1 1 1
Sample Output for phenotype:
cg plantA
1 7003 -2.2747
10 7003 -2.1520
1001 12 -1.7266
1003 12 -0.8032
1006 12 -0.4731
number of subjects for genotype: 942
number of subjects for phenotype: 942
-------------------------------
[1] "A map file was provided!"
chr pos
MARC0044150 1 286933
ASGA0000014 1 342481
ASGA0000021 1 489855
ALGA0000009 1 538161
ALGA0000014 1 565627
H3GA0000032 1 573088
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416
17 18
1211 837
comm_kt2<- filter_and_match(comm_genot, comm_phenot2, map=comm_mpt, n_maf = MAF3,
n_row=na_per_row3, n_col=na_per_col3)-------------------------------
number of deleted row: 0
number of deleted column: 0
number of deleted MAF: 0
-------------------------------
Sample Output for genotype:
[1] 949 37069
MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1004 1 1 1 1.0000 1.0000
1005 0 2 2 2.0000 2.0000
101 1 2 1 1.0000 1.0000
1011 0 1 0 0.0000 0.0000
1012 2 2 0 1.0034 1.0034
Sample Output for phenotype:
cg plantA
1004 12 -1.3983
1005 12 -1.3345
101 999 -1.6654
1011 12 -1.1395
1012 12 -1.6189
number of subjects for genotype: 949
number of subjects for phenotype: 949
-------------------------------
[1] "A map file was provided!"
chr pos
MARC0044150 1 286933
ASGA0000014 1 342481
ASGA0000021 1 489855
ALGA0000009 1 538161
ALGA0000014 1 565627
H3GA0000032 1 573088
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416
17 18
1211 837
comm_matr1 <- read_G_matrix(comm_kt1$geno)---------------------------------
G matrix computed from genotypes
Summary of diagonal:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.7100 0.8000 0.8296 0.9371 1.1534 1.5956
[1] 942 942
---------------------------------
Summary off-diagonal:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.3548028 -0.1217803 0.0082645 -0.0009959 0.0573448 1.0519618
---------------------------------
comm_matr2 <- read_G_matrix(comm_kt2$geno)---------------------------------
G matrix computed from genotypes
Summary of diagonal:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.7284 0.7974 0.8225 0.9334 1.1604 1.6046
[1] 949 949
---------------------------------
Summary off-diagonal:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.3523270 -0.1203371 0.0083564 -0.0009846 0.0557958 0.8581609
---------------------------------
Each station then fits a gblup model to their data. and performs GWA (obtain SNP effects)
Station 1
comm_gbt1<- model_gb(comm_matr1, comm_kt$pheno, y_name=phenotype3,random=cg_name3)..........................................................
summary gblup analysis of trait: plantA
fixed effects equation:
y ~ 1
random effects equation:
~G + cg + In
log-likelihood: -578.8135 converged in: 5 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 942 first 5 animals shown:
[,1]
1 -0.02451225
10 -0.07362341
1001 -0.15491582
1003 0.19934545
1006 0.23993607
1007 -0.06806581
..........................................................
varcomp(comm_gbt1) Estimate StdError prop.var se
G 0.2349349 0.09991544 0.1032521 0.04604754
cg 1.0346197 0.37724424 0.4547076 0.09427433
In 1.0057972 0.08427155 0.4420403 0.08536277
comm_Gwt1<-run_gwa(comm_gbt1, comm_kt1$geno).........................................
Estimated Genetic Effect and its variance:
ghat var_ghat
MARC0044150 1.786788410 19.18701
ASGA0000014 -1.792135398 13.42618
ASGA0000021 -6.028165270 19.84738
ALGA0000009 0.003734211 18.02730
ALGA0000014 0.004092145 18.02721
.........................................
Summary of GWA:
<0.001 <0.01 <0.025 <0.05 <0.1
signif.values 0 0 0 0 0
attr(,"class")
[1] "summary.gwas"
.........................................
head(comm_Gwt1) ghat var_ghat
MARC0044150 1.786788410 19.18701
ASGA0000014 -1.792135398 13.42618
ASGA0000021 -6.028165270 19.84738
ALGA0000009 0.003734211 18.02730
ALGA0000014 0.004092145 18.02721
H3GA0000032 3.606499649 13.86291
pv1<-getpvalue(comm_Gwt1,log.p = F)
qv1<-qvalue(pv1)
manhattan_plot(qv1$qvalues,map = comm_mpt)Station 2
comm_gbt2<- model_gb(comm_matr2, comm_kt$pheno, y_name=phenotype3,random=cg_name3)..........................................................
summary gblup analysis of trait: plantA
fixed effects equation:
y ~ 1
random effects equation:
~G + cg + In
log-likelihood: -554.7967 converged in: 7 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 949 first 5 animals shown:
[,1]
1004 -0.31632547
1005 0.05550869
101 0.04673985
1011 -0.05442077
1012 -0.10448062
1013 -0.27930277
..........................................................
varcomp(comm_gbt2) Estimate StdError prop.var se
G 0.2225149 0.09414325 0.1058684 0.04692323
cg 0.9300038 0.34470243 0.4424783 0.09518403
In 0.9492876 0.07941447 0.4516532 0.08661636
comm_Gwt2<-run_gwa(comm_gbt2, comm_kt2$geno).........................................
Estimated Genetic Effect and its variance:
ghat var_ghat
MARC0044150 -2.8897587 16.91784
ASGA0000014 -1.5374774 13.48458
ASGA0000021 4.1358112 17.31271
ALGA0000009 0.1580059 15.33220
ALGA0000014 0.1576774 15.33208
.........................................
Summary of GWA:
<0.001 <0.01 <0.025 <0.05 <0.1
signif.values 0 0 0 0 0
attr(,"class")
[1] "summary.gwas"
.........................................
head(comm_Gwt2) ghat var_ghat
MARC0044150 -2.8897587 16.91784
ASGA0000014 -1.5374774 13.48458
ASGA0000021 4.1358112 17.31271
ALGA0000009 0.1580059 15.33220
ALGA0000014 0.1576774 15.33208
H3GA0000032 1.3756023 12.73988
pv2<-getpvalue(comm_Gwt2,log.p = F)
qv2<-qvalue(pv2)
manhattan_plot(qv2$qvalues,map = comm_mpt)GWA results (SNP effects) are transmitted between Stations and the polygenic scores are computed and added to the phenotypic dataset
comm1_comm2_p <- get_polys(comm_kt1$geno, comm_Gwt2)
Pheno_P1.1 <- add_poly(comm_kt1$pheno, comm1_comm2_p)
head(Pheno_P1.1) cg plantA poly1
1 7003 -2.2747 -13.802173
10 7003 -2.1520 -16.106521
1001 12 -1.7266 -1.448680
1003 12 -0.8032 -30.234787
1006 12 -0.4731 1.286598
1007 12 -1.7173 2.307809
comm2_comm1_p <- get_polys(comm_kt2$geno, comm_Gwt1)
Pheno_P1.2 <- add_poly(comm_kt2$pheno, comm2_comm1_p)
head(Pheno_P1.2) cg plantA poly1
1004 12 -1.3983 -30.639754
1005 12 -1.3345 -3.620011
101 999 -1.6654 13.921584
1011 12 -1.1395 17.587270
1012 12 -1.6189 -50.472837
1013 12 -2.5077 -8.481851
Gblup is re-fit including polygenic scores
population 1
comm_gb_p1<- model_gb(G=comm_matr1, pheno=Pheno_P1.1, y_name=phenotype3,fixed=c("poly1"), random=cg_name3)..........................................................
summary gblup analysis of trait: plantA
fixed effects equation:
y ~ poly1
random effects equation:
~G + cg + In
log-likelihood: -571.5995 converged in: 5 iterations
estimated fixed effects:
Estimate StdError test.st p.value
(Intercept) -1.000643670 0.242332458 -4.129219 3.639984e-05
poly1 0.009733358 0.002615501 3.721413 1.981115e-04
estimated variance components:
Estimate StdError prop.var
G 0.2266953 0.09851946 0.1025966
cg 0.9863568 0.36041016 0.4464002
In 0.9965275 0.08325468 0.4510032
breeding value predicted for 942 first 5 animals shown:
[,1]
1 -0.01354543
10 -0.03252721
1001 -0.20816901
1003 0.35448994
1006 0.28312932
1007 -0.08778297
..........................................................
summary(comm_gb_p1,fe=T)summary gblup analysis of trait: plantA
fixed effects equation:
y ~ poly1
random effects equation:
~G + cg + In
log-likelihood: -571.5995 converged in: 5 iterations
estimated fixed effects:
Estimate StdError test.st p.value
(Intercept) -1.000643670 0.242332458 -4.129219 3.639984e-05
poly1 0.009733358 0.002615501 3.721413 1.981115e-04
estimated variance components:
Estimate StdError prop.var
G 0.2266953 0.09851946 0.1025966
cg 0.9863568 0.36041016 0.4464002
In 0.9965275 0.08325468 0.4510032
breeding value predicted for 942 first 5 animals shown:
[,1]
1 -0.01354543
10 -0.03252721
1001 -0.20816901
1003 0.35448994
1006 0.28312932
1007 -0.08778297
varcomp(comm_gb_p1) Estimate StdError prop.var se
G 0.2266953 0.09851946 0.1025966 0.04652951
cg 0.9863568 0.36041016 0.4464002 0.09415488
In 0.9965275 0.08325468 0.4510032 0.08611436
population 2
comm_gb_p2<- model_gb(G=comm_matr2, pheno=Pheno_P1.2, y_name=phenotype3,fixed=c("poly1"), random=cg_name3)..........................................................
summary gblup analysis of trait: plantA
fixed effects equation:
y ~ poly1
random effects equation:
~G + cg + In
log-likelihood: -549.1349 converged in: 7 iterations
estimated fixed effects:
Estimate StdError test.st p.value
(Intercept) -0.999466420 0.238773824 -4.185829 2.841271e-05
poly1 0.007190564 0.002188899 3.285014 1.019774e-03
estimated variance components:
Estimate StdError prop.var
G 0.2131445 0.09297474 0.1023778
cg 0.9242567 0.34255210 0.4439399
In 0.9445400 0.07868958 0.4536824
breeding value predicted for 949 first 5 animals shown:
[,1]
1004 -0.215469793
1005 0.037903266
101 -0.001960094
1011 -0.146138523
1012 0.027425418
1013 -0.242736152
..........................................................
summary(comm_gb_p2,fe=T)summary gblup analysis of trait: plantA
fixed effects equation:
y ~ poly1
random effects equation:
~G + cg + In
log-likelihood: -549.1349 converged in: 7 iterations
estimated fixed effects:
Estimate StdError test.st p.value
(Intercept) -0.999466420 0.238773824 -4.185829 2.841271e-05
poly1 0.007190564 0.002188899 3.285014 1.019774e-03
estimated variance components:
Estimate StdError prop.var
G 0.2131445 0.09297474 0.1023778
cg 0.9242567 0.34255210 0.4439399
In 0.9445400 0.07868958 0.4536824
breeding value predicted for 949 first 5 animals shown:
[,1]
1004 -0.215469793
1005 0.037903266
101 -0.001960094
1011 -0.146138523
1012 0.027425418
1013 -0.242736152
varcomp(comm_gb_p2) Estimate StdError prop.var se
G 0.2131445 0.09297474 0.1023778 0.04664792
cg 0.9242567 0.34255210 0.4439399 0.09525035
In 0.9445400 0.07868958 0.4536824 0.08711103
Finally prediction accuracies are obtained through cross validation
comm_cv1.1 <- cross_validation(comm_gbt1$model$G, iterate = 10, comm_kt1$pheno, y_name=phenotype3, rand_var = cg_name3)
comm_cv1.2 <- cross_validation(comm_gbt2$model$G, iterate = 10, comm_kt2$pheno, y_name=phenotype3, rand_var = cg_name3)
comm_cv_p1 <- cross_validation(comm_gb_p1$model$G, Pheno_P1.1, y_name=phenotype3, fixed=c("poly1"), rand_var = cg_name3,poly = T)
comm_cv_p2 <- cross_validation(comm_gb_p2$model$G, Pheno_P1.2, y_name=phenotype3, fixed=c("poly1"), rand_var = cg_name3,poly = T)
cbind(mean(comm_cv1.1),
mean(comm_cv1.2),
mean(comm_cv_p1),
mean(comm_cv_p2)
)
cbind(sd(comm_cv1.1),
sd(comm_cv1.2),
sd(comm_cv_p1),
sd(comm_cv_p2)
)