Transfer and federated learning using GBLUP-GWA

Author

Juan Steibel

Basic assumptions

Genotype and phenotype data are available from several populations. In this examples we will download the data to perform computations locally and then transfer results. But in practice, each node can perform their own analyses and transfer only the quantities needed for federated learning or transfer learning.

Setting up code

Loading needed packages, in this case gwaR is the core mixed model package used.Current transfer learning and federated learning functions are available in the Toolkit scripts and they will be integrated in a package later.

setwd("D:/Federated Learning/")
rm(list=ls())
library(qvalue)
library(data.table)
library(gwaR)
library(regress)
library(Matrix)
library(knitr)
library(magrittr)
library(ggplot2)
source("Toolkit_function.R")
source("Toolkit_function_Get.R")

Example dataset

We will load only one data and split it into two parts to do transfer learning. first input parameters are provided in a config file:

source_pop3 = 'comm'
pheno_name3= 'https://www.dropbox.com/s/1e2bw6yb9viha1w/pheno_comm.csv?dl=1'
geno_name3 = 'https://www.dropbox.com/s/oas827ngl39yxrd/genos_commercial.csv?dl=1'
map_name3 = 'https://www.dropbox.com/s/ym39ln7qii84gfe/map.csv?dl=1'
cg_name3 = 'cg'     #variable for random effects/contemporary groups
id_name3 = 'animal' #name of the animal ID
phenotype3 = 'plantA'  #name of phenotype
na_geno3 = NA          #missing value character
na_pheno3 = NA         #missing value character
na_per_row3 = 0.05     #maximum prop. missing value
na_per_col3 = 0.05     #idem
MAF3=0.0               #maf

Then data are read and setup. Some summaries are presented by the input functions

comm_genot<-read_genotypes(geno_name3)

------------------------------- 
Read genotype file https://www.dropbox.com/s/oas827ngl39yxrd/genos_commercial.csv?dl=1 
sample output: 
     MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1              1           2           1           1           1
10             0           2           2           2           2
1001           2           2           0           0           0
1003           0           0           0           0           0
1004           1           1           1           1           1
number of subjects: 1892 
number of SNP: 37069 
-------------------------------

comm_phenot<-read_pheno(pheno_name3, source_pop3, id_name3, 
                       cg_name=cg_name3, phenotype = phenotype3)

------------------------------- 
Read phenotype file https://www.dropbox.com/s/1e2bw6yb9viha1w/pheno_comm.csv?dl=1 
sample output: 
    cg  plantA
1 7003 -2.2747
2 7003 -2.4326
3 7005 -0.0843
4 7003  0.8251
5 7005 -1.5025
6 7003 -2.3384
number of subjects: 1920 
deleting rows with missing values 
------------------------------- 
number of subjects with complete observations: 1919 
------------------------------- 
Phenotypic summary 
       cg          plantA        
 31     :304   Min.   :-6.12230  
 41     :303   1st Qu.:-1.91655  
 11     :150   Median :-1.08580  
 12     :150   Mean   :-0.91278  
 23     :141   3rd Qu.:-0.06025  
 21     :102   Max.   : 6.28180  
 (Other):769                     
-------------------------------

comm_mpt<- read_map(map_name3)

.................................. 
Summary of map: 
            chr    pos
MARC0044150   1 286933
ASGA0000014   1 342481
ASGA0000021   1 489855
ALGA0000009   1 538161
ALGA0000014   1 565627
H3GA0000032   1 573088

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416 
  17   18 
1211  837 
.................................

comm_kt <- filter_and_match(comm_genot, comm_phenot, map=comm_mpt, n_maf = MAF3,
                            n_row=na_per_row3, n_col=na_per_col3)

------------------------------- 
number of deleted row: 0 
number of deleted column: 0 
number of deleted MAF: 0 
------------------------------- 
Sample Output for genotype: 
[1]  1891 37069
     MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1              1           2           1           1           1
10             0           2           2           2           2
1001           2           2           0           0           0
1003           0           0           0           0           0
1004           1           1           1           1           1
Sample Output for phenotype: 
       cg  plantA
1    7003 -2.2747
10   7003 -2.1520
1001   12 -1.7266
1003   12 -0.8032
1004   12 -1.3983
number of subjects for genotype: 1891 
number of subjects for phenotype: 1891 
------------------------------- 
[1] "A map file was provided!"
            chr    pos
MARC0044150   1 286933
ASGA0000014   1 342481
ASGA0000021   1 489855
ALGA0000009   1 538161
ALGA0000014   1 565627
H3GA0000032   1 573088

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416 
  17   18 
1211  837

comm_matr <- read_G_matrix(comm_kt$geno)

--------------------------------- 
G matrix computed from genotypes 
Summary of diagonal: 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.7096  0.7983  0.8257  0.9355  1.1573  1.6005 
[1] 1891 1891
--------------------------------- 
Summary off-diagonal: 
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-0.367810 -0.120544  0.008824 -0.000495  0.056939  1.121283 
---------------------------------

In this example only, we partition the data into two parts for the transfer learning, then use the toolkit functions to re-arrange the genotype and phenotype file

set.seed(1973)
id<-sample(x = 1:2,size = nrow(comm_phenot),replace = T)
table(id)

id
  1   2 
951 968

comm_phenot1<-comm_phenot[id==1,]
comm_phenot2<-comm_phenot[id==2,]

comm_kt1<- filter_and_match(comm_genot, comm_phenot1, map=comm_mpt, n_maf = MAF3,
                            n_row=na_per_row3, n_col=na_per_col3)

------------------------------- 
number of deleted row: 0 
number of deleted column: 0 
number of deleted MAF: 0 
------------------------------- 
Sample Output for genotype: 
[1]   942 37069
     MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1              1           2           1           1           1
10             0           2           2           2           2
1001           2           2           0           0           0
1003           0           0           0           0           0
1006           1           1           1           1           1
Sample Output for phenotype: 
       cg  plantA
1    7003 -2.2747
10   7003 -2.1520
1001   12 -1.7266
1003   12 -0.8032
1006   12 -0.4731
number of subjects for genotype: 942 
number of subjects for phenotype: 942 
------------------------------- 
[1] "A map file was provided!"
            chr    pos
MARC0044150   1 286933
ASGA0000014   1 342481
ASGA0000021   1 489855
ALGA0000009   1 538161
ALGA0000014   1 565627
H3GA0000032   1 573088

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416 
  17   18 
1211  837

comm_kt2<- filter_and_match(comm_genot, comm_phenot2, map=comm_mpt, n_maf = MAF3,
                            n_row=na_per_row3, n_col=na_per_col3)

------------------------------- 
number of deleted row: 0 
number of deleted column: 0 
number of deleted MAF: 0 
------------------------------- 
Sample Output for genotype: 
[1]   949 37069
     MARC0044150 ASGA0000014 ASGA0000021 ALGA0000009 ALGA0000014
1004           1           1           1      1.0000      1.0000
1005           0           2           2      2.0000      2.0000
101            1           2           1      1.0000      1.0000
1011           0           1           0      0.0000      0.0000
1012           2           2           0      1.0034      1.0034
Sample Output for phenotype: 
      cg  plantA
1004  12 -1.3983
1005  12 -1.3345
101  999 -1.6654
1011  12 -1.1395
1012  12 -1.6189
number of subjects for genotype: 949 
number of subjects for phenotype: 949 
------------------------------- 
[1] "A map file was provided!"
            chr    pos
MARC0044150   1 286933
ASGA0000014   1 342481
ASGA0000021   1 489855
ALGA0000009   1 538161
ALGA0000014   1 565627
H3GA0000032   1 573088

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
4409 2545 1867 2648 1611 2370 2283 1901 2293 1368 1335 1217 2905 2849 2004 1416 
  17   18 
1211  837

comm_matr1 <- read_G_matrix(comm_kt1$geno)

--------------------------------- 
G matrix computed from genotypes 
Summary of diagonal: 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.7100  0.8000  0.8296  0.9371  1.1534  1.5956 
[1] 942 942
--------------------------------- 
Summary off-diagonal: 
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-0.3548028 -0.1217803  0.0082645 -0.0009959  0.0573448  1.0519618 
---------------------------------

comm_matr2 <- read_G_matrix(comm_kt2$geno)

--------------------------------- 
G matrix computed from genotypes 
Summary of diagonal: 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.7284  0.7974  0.8225  0.9334  1.1604  1.6046 
[1] 949 949
--------------------------------- 
Summary off-diagonal: 
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-0.3523270 -0.1203371  0.0083564 -0.0009846  0.0557958  0.8581609 
---------------------------------

Each station then fits a gblup model to their data. and performs GWA (obtain SNP effects)

Station 1

comm_gbt1<- model_gb(comm_matr1, comm_kt$pheno, y_name=phenotype3,random=cg_name3)

.......................................................... 
summary gblup analysis of trait: plantA 

fixed effects equation:
y ~ 1

random effects equation:
~G + cg + In

log-likelihood: -578.8135 converged in: 5 iterations 

estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)

estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)

breeding value predicted for 942 first 5 animals shown:
            [,1]
1    -0.02451225
10   -0.07362341
1001 -0.15491582
1003  0.19934545
1006  0.23993607
1007 -0.06806581
..........................................................

varcomp(comm_gbt1)

    Estimate   StdError  prop.var         se
G  0.2349349 0.09991544 0.1032521 0.04604754
cg 1.0346197 0.37724424 0.4547076 0.09427433
In 1.0057972 0.08427155 0.4420403 0.08536277

comm_Gwt1<-run_gwa(comm_gbt1, comm_kt1$geno)

......................................... 
Estimated Genetic Effect and its variance: 
                    ghat var_ghat
MARC0044150  1.786788410 19.18701
ASGA0000014 -1.792135398 13.42618
ASGA0000021 -6.028165270 19.84738
ALGA0000009  0.003734211 18.02730
ALGA0000014  0.004092145 18.02721
......................................... 
Summary of GWA: 
              <0.001 <0.01 <0.025 <0.05 <0.1
signif.values      0     0      0     0    0
attr(,"class")
[1] "summary.gwas"
.........................................

head(comm_Gwt1)

                    ghat var_ghat
MARC0044150  1.786788410 19.18701
ASGA0000014 -1.792135398 13.42618
ASGA0000021 -6.028165270 19.84738
ALGA0000009  0.003734211 18.02730
ALGA0000014  0.004092145 18.02721
H3GA0000032  3.606499649 13.86291

pv1<-getpvalue(comm_Gwt1,log.p = F)
qv1<-qvalue(pv1)
manhattan_plot(qv1$qvalues,map = comm_mpt)

Station 2

comm_gbt2<- model_gb(comm_matr2, comm_kt$pheno, y_name=phenotype3,random=cg_name3)

.......................................................... 
summary gblup analysis of trait: plantA 

fixed effects equation:
y ~ 1

random effects equation:
~G + cg + In

log-likelihood: -554.7967 converged in: 7 iterations 

estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)

estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)

breeding value predicted for 949 first 5 animals shown:
            [,1]
1004 -0.31632547
1005  0.05550869
101   0.04673985
1011 -0.05442077
1012 -0.10448062
1013 -0.27930277
..........................................................

varcomp(comm_gbt2)

    Estimate   StdError  prop.var         se
G  0.2225149 0.09414325 0.1058684 0.04692323
cg 0.9300038 0.34470243 0.4424783 0.09518403
In 0.9492876 0.07941447 0.4516532 0.08661636

comm_Gwt2<-run_gwa(comm_gbt2, comm_kt2$geno)

......................................... 
Estimated Genetic Effect and its variance: 
                  ghat var_ghat
MARC0044150 -2.8897587 16.91784
ASGA0000014 -1.5374774 13.48458
ASGA0000021  4.1358112 17.31271
ALGA0000009  0.1580059 15.33220
ALGA0000014  0.1576774 15.33208
......................................... 
Summary of GWA: 
              <0.001 <0.01 <0.025 <0.05 <0.1
signif.values      0     0      0     0    0
attr(,"class")
[1] "summary.gwas"
.........................................

head(comm_Gwt2)

                  ghat var_ghat
MARC0044150 -2.8897587 16.91784
ASGA0000014 -1.5374774 13.48458
ASGA0000021  4.1358112 17.31271
ALGA0000009  0.1580059 15.33220
ALGA0000014  0.1576774 15.33208
H3GA0000032  1.3756023 12.73988

pv2<-getpvalue(comm_Gwt2,log.p = F)
qv2<-qvalue(pv2)
manhattan_plot(qv2$qvalues,map = comm_mpt)

GWA results (SNP effects) are transmitted between Stations and the polygenic scores are computed and added to the phenotypic dataset

comm1_comm2_p <- get_polys(comm_kt1$geno, comm_Gwt2)
Pheno_P1.1 <- add_poly(comm_kt1$pheno, comm1_comm2_p)
head(Pheno_P1.1)

       cg  plantA      poly1
1    7003 -2.2747 -13.802173
10   7003 -2.1520 -16.106521
1001   12 -1.7266  -1.448680
1003   12 -0.8032 -30.234787
1006   12 -0.4731   1.286598
1007   12 -1.7173   2.307809

comm2_comm1_p <- get_polys(comm_kt2$geno, comm_Gwt1)
Pheno_P1.2 <- add_poly(comm_kt2$pheno, comm2_comm1_p)
head(Pheno_P1.2)

      cg  plantA      poly1
1004  12 -1.3983 -30.639754
1005  12 -1.3345  -3.620011
101  999 -1.6654  13.921584
1011  12 -1.1395  17.587270
1012  12 -1.6189 -50.472837
1013  12 -2.5077  -8.481851

Gblup is re-fit including polygenic scores

population 1

comm_gb_p1<- model_gb(G=comm_matr1, pheno=Pheno_P1.1, y_name=phenotype3,fixed=c("poly1"), random=cg_name3)

.......................................................... 
summary gblup analysis of trait: plantA 

fixed effects equation:
y ~ poly1

random effects equation:
~G + cg + In

log-likelihood: -571.5995 converged in: 5 iterations 

estimated fixed effects:
                Estimate    StdError   test.st      p.value
(Intercept) -1.000643670 0.242332458 -4.129219 3.639984e-05
poly1        0.009733358 0.002615501  3.721413 1.981115e-04

estimated variance components:
    Estimate   StdError  prop.var
G  0.2266953 0.09851946 0.1025966
cg 0.9863568 0.36041016 0.4464002
In 0.9965275 0.08325468 0.4510032

breeding value predicted for 942 first 5 animals shown:
            [,1]
1    -0.01354543
10   -0.03252721
1001 -0.20816901
1003  0.35448994
1006  0.28312932
1007 -0.08778297
..........................................................

summary(comm_gb_p1,fe=T)

summary gblup analysis of trait: plantA 

fixed effects equation:
y ~ poly1

random effects equation:
~G + cg + In

log-likelihood: -571.5995 converged in: 5 iterations 

estimated fixed effects:
                Estimate    StdError   test.st      p.value
(Intercept) -1.000643670 0.242332458 -4.129219 3.639984e-05
poly1        0.009733358 0.002615501  3.721413 1.981115e-04

estimated variance components:
    Estimate   StdError  prop.var
G  0.2266953 0.09851946 0.1025966
cg 0.9863568 0.36041016 0.4464002
In 0.9965275 0.08325468 0.4510032

breeding value predicted for 942 first 5 animals shown:
            [,1]
1    -0.01354543
10   -0.03252721
1001 -0.20816901
1003  0.35448994
1006  0.28312932
1007 -0.08778297

varcomp(comm_gb_p1)

    Estimate   StdError  prop.var         se
G  0.2266953 0.09851946 0.1025966 0.04652951
cg 0.9863568 0.36041016 0.4464002 0.09415488
In 0.9965275 0.08325468 0.4510032 0.08611436

population 2

comm_gb_p2<- model_gb(G=comm_matr2, pheno=Pheno_P1.2, y_name=phenotype3,fixed=c("poly1"), random=cg_name3)

.......................................................... 
summary gblup analysis of trait: plantA 

fixed effects equation:
y ~ poly1

random effects equation:
~G + cg + In

log-likelihood: -549.1349 converged in: 7 iterations 

estimated fixed effects:
                Estimate    StdError   test.st      p.value
(Intercept) -0.999466420 0.238773824 -4.185829 2.841271e-05
poly1        0.007190564 0.002188899  3.285014 1.019774e-03

estimated variance components:
    Estimate   StdError  prop.var
G  0.2131445 0.09297474 0.1023778
cg 0.9242567 0.34255210 0.4439399
In 0.9445400 0.07868958 0.4536824

breeding value predicted for 949 first 5 animals shown:
             [,1]
1004 -0.215469793
1005  0.037903266
101  -0.001960094
1011 -0.146138523
1012  0.027425418
1013 -0.242736152
..........................................................

summary(comm_gb_p2,fe=T)

summary gblup analysis of trait: plantA 

fixed effects equation:
y ~ poly1

random effects equation:
~G + cg + In

log-likelihood: -549.1349 converged in: 7 iterations 

estimated fixed effects:
                Estimate    StdError   test.st      p.value
(Intercept) -0.999466420 0.238773824 -4.185829 2.841271e-05
poly1        0.007190564 0.002188899  3.285014 1.019774e-03

estimated variance components:
    Estimate   StdError  prop.var
G  0.2131445 0.09297474 0.1023778
cg 0.9242567 0.34255210 0.4439399
In 0.9445400 0.07868958 0.4536824

breeding value predicted for 949 first 5 animals shown:
             [,1]
1004 -0.215469793
1005  0.037903266
101  -0.001960094
1011 -0.146138523
1012  0.027425418
1013 -0.242736152

varcomp(comm_gb_p2)

    Estimate   StdError  prop.var         se
G  0.2131445 0.09297474 0.1023778 0.04664792
cg 0.9242567 0.34255210 0.4439399 0.09525035
In 0.9445400 0.07868958 0.4536824 0.08711103

Finally prediction accuracies are obtained through cross validation

comm_cv1.1 <- cross_validation(comm_gbt1$model$G, iterate = 10, comm_kt1$pheno, y_name=phenotype3, rand_var = cg_name3)
comm_cv1.2 <- cross_validation(comm_gbt2$model$G, iterate = 10, comm_kt2$pheno, y_name=phenotype3, rand_var = cg_name3)

comm_cv_p1 <- cross_validation(comm_gb_p1$model$G, Pheno_P1.1, y_name=phenotype3, fixed=c("poly1"), rand_var = cg_name3,poly = T)
comm_cv_p2 <- cross_validation(comm_gb_p2$model$G, Pheno_P1.2, y_name=phenotype3, fixed=c("poly1"), rand_var = cg_name3,poly = T)

cbind(mean(comm_cv1.1),
mean(comm_cv1.2),
mean(comm_cv_p1),
mean(comm_cv_p2)
)


cbind(sd(comm_cv1.1),
      sd(comm_cv1.2),
      sd(comm_cv_p1),
      sd(comm_cv_p2)
)