Code
raw_pheno <- read_xlsx("Raw_phenotypes.xlsx")
dim(raw_pheno)[1] 11424 19
Code
#raw_pheno[1:5, 1:5]A summary of variance components, SNP effects, transfer learning, and cross-validation of wheat traits
raw_pheno <- read_xlsx("Raw_phenotypes.xlsx")
dim(raw_pheno)[1] 11424 19
#raw_pheno[1:5, 1:5]adj_pheno <- read.csv("adj_phenotypes.csv")
adj_pheno <- adj_pheno[,-c(8:20)]
dim(adj_pheno)[1] 11424 20
#Unique phenotype
unique_pheno <- adj_pheno[!duplicated(adj_pheno$ID), ]
dim(unique_pheno)[1] 10375 20
Pheno_dup <- adj_pheno|>
group_by(ID) |>
summarise(count = n()) |>
filter((count >1)) #adj_phenotypes w/ dupllicates
Pheno_dup# A tibble: 8 × 2
ID count
<chr> <int>
1 AYT-O-1994 150
2 AYT-O-2008 150
3 AYT-O-2021 150
4 AYT-O-2031 150
5 AYT-O-2033 150
6 AYT-O-2048 150
7 AYT-O-2052 7
8 AYT-O-2062 150
#Phenotype without IDs in Pheno_dup(ID==NAs)
y_check <- adj_pheno[!(adj_pheno$ID%in%Pheno_dup$ID),]
dim(y_check)[1] 10367 20
summary(y_check) X ID Range Row
Min. : 1 Length:10367 Min. : 1.00 Min. : 1.0
1st Qu.: 2854 Class :character 1st Qu.: 6.00 1st Qu.:120.0
Median : 5707 Mode :character Median :12.00 Median :238.0
Mean : 5709 Mean :12.49 Mean :238.5
3rd Qu.: 8564 3rd Qu.:18.00 3rd Qu.:357.0
Max. :11423 Max. :24.00 Max. :476.0
Location Year GrowthHabit adj_NDVI
Length:10367 Min. :2014 Min. :1.000 Min. :-0.495391
Class :character 1st Qu.:2014 1st Qu.:2.000 1st Qu.:-0.048276
Mode :character Median :2014 Median :2.000 Median : 0.004227
Mean :2014 Mean :2.365 Mean :-0.000125
3rd Qu.:2014 3rd Qu.:3.000 3rd Qu.: 0.050554
Max. :2014 Max. :8.000 Max. : 0.266729
NA's :9 NA's :6
adj_PlantHeight adj_LeafLoss adj_Glaucousness
Min. :-4.064010 Min. :-4.538660 Min. :-4.027074
1st Qu.:-0.627596 1st Qu.:-1.150594 1st Qu.:-1.485844
Median :-0.026288 Median :-0.113439 Median :-0.373189
Mean :-0.000589 Mean : 0.000217 Mean : 0.001182
3rd Qu.: 0.727739 3rd Qu.: 1.175993 3rd Qu.: 1.251936
Max. : 3.513630 Max. : 4.686780 Max. : 6.144128
adj_Greenness adj_LeafWidth adj_Biomass adj_Yellows
Min. :-4.768577 Min. :-3.942787 Min. :-4.87902 Min. :-1.417275
1st Qu.:-1.097327 1st Qu.:-0.949922 1st Qu.:-0.69445 1st Qu.:-0.605954
Median : 0.084101 Median : 0.018754 Median : 0.04168 Median :-0.402283
Mean : 0.000695 Mean : 0.000468 Mean :-0.00068 Mean : 0.000276
3rd Qu.: 1.049599 3rd Qu.: 0.950542 3rd Qu.: 0.72300 3rd Qu.: 0.415370
Max. : 3.755015 Max. : 3.388017 Max. : 3.33733 Max. : 7.033529
NA's :107
adj_Maturity adj_GrainYield adj_TestWeight
Min. :-22.252293 Min. :-2643.450 Min. :-10.367262
1st Qu.: -4.352765 1st Qu.: -365.178 1st Qu.: -0.984365
Median : 0.584405 Median : 27.237 Median : 0.185297
Mean : -0.001202 Mean : 0.168 Mean : -0.000197
3rd Qu.: 4.641852 3rd Qu.: 389.405 3rd Qu.: 1.165534
Max. : 15.284852 Max. : 2080.314 Max. : 4.942738
NA's :123 NA's :26
adj_GrainSizeTKW adj_GrainProtein
Min. :-14.646842 Min. :-2.85376
1st Qu.: -2.922147 1st Qu.:-0.51878
Median : -0.136602 Median :-0.02669
Mean : 0.000636 Mean : 0.00060
3rd Qu.: 2.790503 3rd Qu.: 0.49063
Max. : 20.142392 Max. : 3.73942
NA's :27 NA's :72
R_file <- readRDS("Genotypes.rds")
R_file[1:5, 1:5] #non-standardized markers AXAFFSNP-00001 AXAFFSNP-00002 AXAFFSNP-00003 AXAFFSNP-00004
AYT-O-1848 1 -1 -1 1
AYT-O-0974 -1 -1 -1 1
AYT-S-0579 1 -1 -1 1
AYT-S-0299 1 -1 -1 1
AYT-S-0664 1 -1 -1 1
AXAFFSNP-00005
AYT-O-1848 1
AYT-O-0974 1
AYT-S-0579 1
AYT-S-0299 1
AYT-S-0664 1
length(unique(adj_pheno$ID))[1] 10375
all(rownames(R_file)%in%adj_pheno$ID)[1] TRUE
all(unique(adj_pheno$ID)%in%rownames(R_file))[1] TRUE
table(table(adj_pheno$ID))
1 7 150
10367 1 7
table(table(rownames(R_file)))
1
10375
#Genotype with phenotypes that contain some variables
Geno_n_pheo <- R_file[(rownames(R_file)%in%y_check$ID),]
dim(Geno_n_pheo)[1] 10367 17181
set.seed(234)
smp <- sample(1:nrow(y_check), 2000)
yp <- y_check[smp, ]
smp2 <- sample(1:ncol(Geno_n_pheo), 1000)
Gp <- Geno_n_pheo[yp$ID, smp2]
all.equal(rownames(Gp), yp$ID)[1] TRUE
\(P = 2(p_i - 0.5)\) as pclust_all. Check Efficient methods to compute genomic predictions for more details.
\(Z = M - P\), where \(M\) is the marker allele individual inherited \(\{-1/0/1\}\)
\(G = \frac{ZZ'}{2pq}\)
Pc <- colMeans(Gp+1)/2
summary(Pc) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.1940 0.5208 0.5048 0.8110 1.0000
Qc <- 1 - Pc
summary(Qc) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.1890 0.4792 0.4952 0.8060 1.0000
MFc <- pmin(Pc, 1-Pc)
MFc_1 <- MFc > 0.05
Gpc <- Gp[, MFc_1]
Pc_1 <- colMeans(Gpc+1)/2
summary(Pc_1) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.05033 0.26970 0.51969 0.50428 0.74035 0.94971
Qc_1 <- 1 - Pc_1
Pz <- 2*(Pc_1 - 0.5)
Zc <- sweep(Gpc, 2, Pz, FUN = "-")
Zt <- tcrossprod(Zc)
s2pq <- sum(2*Pc_1*Qc_1)
G_all <- Zt/s2pq
summary(diag(G_all)) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8887 1.6445 1.9015 1.8592 2.0794 2.5086
set.seed(78)
ks <- kmeans(Zc, centers = 3, nstart = 10)
mk_rand1 <- sample(ncol(Zc), 350)
mk_rand2 <- sample(ncol(Zc), 670)
mk_rand3 <- sample(ncol(Zc), 490)
gh1 <- Zc[ks$cluster ==1,mk_rand1]
gh2 <- Zc[ks$cluster ==2,mk_rand2]
gh3 <- Zc[ks$cluster ==3,mk_rand3]
cat("Zcombined:", dim(Zc), "\n") #Z completeZcombined: 2000 762
cat("Z for pop 1:", dim(gh1), "\n") #Z1 for population 1Z for pop 1: 523 350
cat("Z2:", dim(gh2), "\n") #Z2 for population 2Z2: 781 670
cat("Z3:", dim(gh3), "\n") #Z3 for population 3Z3: 696 490
\[T_i = (Z'_iZ_i)^{-1}Z'_iZ_c\]
\[Z_c = Z_iT_i\]
T <- solve(t(gh1)%*%gh1)%*%t(gh1)%*%Zc[rownames(gh1),] #T for population 1
cat("T1:", dim(T), "\n") #T1 for population 1T1: 350 762
T2 <- solve(t(gh2)%*%gh2)%*%t(gh2)%*%Zc[rownames(gh2),] #T for population 2
cat("T2:", dim(T2), "\n") #T2 for population 2T2: 670 762
T3 <- solve(t(gh3)%*%gh3)%*%t(gh3)%*%Zc[rownames(gh3),] #T for population 3
cat("T3:", dim(T3), "\n") #T3 for population 3T3: 490 762
#Check
Z_c <- gh1%*%T
cat("Z_complete for individuals in pop 1:", dim(Z_c), "\n") #Z completeZ_complete for individuals in pop 1: 523 762
Z_c2 <- gh2%*%T2
Z_c3 <- gh3%*%T3P1 <- Pc_1[mk_rand1]
P2 <- Pc_1[mk_rand2]
P3 <- Pc_1[mk_rand3]
Gm1 <- tcrossprod(gh1)/sum(2*P1*(1-P1))
Gm2 <- tcrossprod(gh2)/sum(2*P2*(1-P2))
Gm3 <- tcrossprod(gh3)/sum(2*P3*(1-P3))
Gm1t <- tcrossprod(Z_c)/sum(2*P1*(1-P1))
Gm2t <- tcrossprod(Z_c2)/sum(2*P2*(1-P2))
Gm3t <- tcrossprod(Z_c3)/sum(2*P3*(1-P3))y_pheno <- data.frame(yp[, -1], row.names = 1)
y_pheno$GrowthHabit <- as.factor(y_pheno$GrowthHabit)
yc <- y_pheno[ks$cluster ==1,]
yc2 <- y_pheno[ks$cluster ==2,]
yc3 <- y_pheno[ks$cluster ==3,]gpred <- model_gb(Gm1, yc, y_name = 'adj_PlantHeight',random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -282.1665 converged in: 13 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 520 first 5 animals shown:
[,1]
AYT-O-0747 -0.44074298
PYT-S-4057 0.00796764
PYT-S-5267 0.12953880
PYT-S-3457 0.34140883
PYT-S-5009 0.07658197
PYT-S-1578 0.42420852
..........................................................
gpred2 <- model_gb(Gm2, yc2, y_name = 'adj_PlantHeight',random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -326.0185 converged in: 8 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 781 first 5 animals shown:
[,1]
AYT-S-0214 0.03764242
PYT-S-0872 0.45664402
PYT-S-0310 -0.12313513
PYT-S-0643 0.02194633
PYT-S-2639 0.28402193
PYT-S-0675 0.09190713
..........................................................
gpred3 <- model_gb(Gm3, yc3, y_name = 'adj_PlantHeight',random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -298.8649 converged in: 7 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 696 first 5 animals shown:
[,1]
PYT-S-2817 -1.19681095
PYT-S-3520 -0.36418352
PYT-S-4630 -0.57914896
PYT-S-3700 -0.76634901
PYT-S-3915 -0.02313317
PYT-S-2556 1.08935045
..........................................................
gpred_A <- model_gb(G_all , y_pheno, y_name = 'adj_PlantHeight', random = 'GrowthHabit') #combined..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -909.2252 converged in: 7 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 1997 first 5 animals shown:
[,1]
AYT-O-0747 -0.54099792
AYT-S-0214 0.04623541
PYT-S-4057 0.67847766
PYT-S-0872 0.30137969
PYT-S-5267 -0.19742503
PYT-S-0310 -0.40657306
..........................................................
gpd <- list(gpred, gpred2, gpred3, gpred_A)
lapply(gpd, varcomp)[[1]]
Estimate StdError prop.var se
G 0.179209063 0.048377077 0.167278925 0.043521348
GrowthHabit -0.005157945 0.001443757 -0.004814576 0.001325488
In 0.897267722 0.068530585 0.837535651 0.058339582
[[2]]
Estimate StdError prop.var se
G 0.25888580 0.04361968 0.27168586 0.04533680
GrowthHabit 0.08656524 0.07154837 0.09084528 0.06855478
In 0.60743548 0.04005889 0.63746886 0.06591133
[[3]]
Estimate StdError prop.var se
G 0.23405155 0.05035106 0.24339387 0.04685682
GrowthHabit 0.03816556 0.03608073 0.03968896 0.03617382
In 0.68939934 0.04504291 0.71691717 0.05962710
[[4]]
Estimate StdError prop.var se
G 0.19344007 0.02597810 0.19490686 0.02405537
GrowthHabit 0.03947198 0.03029067 0.03977128 0.02935576
In 0.75956235 0.02729989 0.76532186 0.03570723
gwa_c1 <- gwas(gpred, t(gh1))
gwa_c2 <- gwas(gpred2, t(gh2))
gwa_c3 <- gwas(gpred3, t(gh3))
gwa_cA <- gwas(gpred_A, t(Zc))gpredt <- model_gb(Gm1t, yc, y_name = 'adj_PlantHeight',random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -279.0025 converged in: 21 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 520 first 5 animals shown:
[,1]
AYT-O-0747 -0.246604850
PYT-S-4057 0.385375632
PYT-S-5267 -0.003119352
PYT-S-3457 0.934870667
PYT-S-5009 0.131613753
PYT-S-1578 0.398211385
..........................................................
gpred2t <- model_gb(Gm2t, yc2, y_name = 'adj_PlantHeight',random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -324.685 converged in: 8 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 781 first 5 animals shown:
[,1]
AYT-S-0214 0.0002249325
PYT-S-0872 0.4971828702
PYT-S-0310 -0.1587074104
PYT-S-0643 -0.0049202925
PYT-S-2639 0.2260634653
PYT-S-0675 0.0557953340
..........................................................
gpred3t <- model_gb(Gm3t, yc3, y_name = 'adj_PlantHeight',random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ 1
random effects equation:
~G + GrowthHabit + In
log-likelihood: -295.5092 converged in: 7 iterations
estimated fixed effects:
[1] test.st p.value
<0 rows> (or 0-length row.names)
estimated variance components:
[1] prop.var
<0 rows> (or 0-length row.names)
breeding value predicted for 696 first 5 animals shown:
[,1]
PYT-S-2817 -1.33857179
PYT-S-3520 -0.24700235
PYT-S-4630 -0.70179429
PYT-S-3700 -0.74068871
PYT-S-3915 0.09333683
PYT-S-2556 1.06080448
..........................................................
gpd <- list(gpredt, gpred2t, gpred3t)
lapply(gpd, varcomp)[[1]]
Estimate StdError prop.var se
G 0.107359293 0.0212844839 0.112238399 0.022932896
GrowthHabit -0.006068053 0.0003846691 -0.006343825 0.000183565
In 0.855237849 0.0629864682 0.894105426 0.033623549
[[2]]
Estimate StdError prop.var se
G 0.24323157 0.03994221 0.26213591 0.04427342
GrowthHabit 0.09122674 0.07490737 0.09831703 0.07309824
In 0.59342513 0.03976594 0.63954706 0.06799320
[[3]]
Estimate StdError prop.var se
G 0.17814204 0.03592232 0.20390673 0.03951195
GrowthHabit 0.03610586 0.03440114 0.04132785 0.03788353
In 0.65939683 0.04450621 0.75476542 0.05671778
gwa_1 <- gwas(gpredt, t(Z_c))
gwa_2 <- gwas(gpred2t, t(Z_c2))
gwa_3 <- gwas(gpred3t, t(Z_c3))P1_2 <- get_polys(gh1, gwa_c2, scale = FALSE)
P1_3 <- get_polys(gh1, gwa_c3, scale = FALSE)
yp1 <- add_poly(yc, P1_2, P1_3)
gpred1_2 <- model_gb(Gm1, yp1, y_name = 'adj_PlantHeight',fixed = c('poly1', 'poly2'), random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ poly1 + poly2
random effects equation:
~G + GrowthHabit + In
log-likelihood: -280.6772 converged in: 13 iterations
estimated fixed effects:
Estimate StdError test.st p.value
(Intercept) 1.544094e-01 0.125050015 1.23478143 0.2169119
poly1 3.652112e-05 0.001049638 0.03479401 0.9722440
poly2 1.571549e-03 0.001668810 0.94171851 0.3463368
estimated variance components:
Estimate StdError prop.var
G 0.183759671 0.049644864 0.171103869
GrowthHabit -0.004934129 0.001773975 -0.004594308
In 0.895140065 0.068670058 0.833490439
breeding value predicted for 520 first 5 animals shown:
[,1]
AYT-O-0747 -0.44167260
PYT-S-4057 -0.03756636
PYT-S-5267 0.16189930
PYT-S-3457 0.34638072
PYT-S-5009 0.05965172
PYT-S-1578 0.43251649
..........................................................
P1_2t <- get_polys(Z_c, gwa_2, scale = FALSE)
P1_3t <- get_polys(Z_c, gwa_3, scale = FALSE)
ypt <- add_poly(yc, P1_2t, P1_3t)
gpred1_2t <- model_gb(Gm1t, ypt, y_name = 'adj_PlantHeight',fixed = c('poly1', 'poly2'), random = 'GrowthHabit')..........................................................
summary gblup analysis of trait: adj_PlantHeight
fixed effects equation:
y ~ poly1 + poly2
random effects equation:
~G + GrowthHabit + In
log-likelihood: -277.165 converged in: 15 iterations
estimated fixed effects:
Estimate StdError test.st p.value
(Intercept) 0.0954107382 0.1046169949 0.9120004 0.3617685
poly1 0.0005411233 0.0006501459 0.8323105 0.4052337
poly2 0.0011662603 0.0010952842 1.0648015 0.2869657
estimated variance components:
Estimate StdError prop.var
G 0.093744758 0.0240417164 0.097367522
GrowthHabit -0.005976068 0.0004393901 -0.006207013
In 0.875024200 0.0679212324 0.908839491
breeding value predicted for 520 first 5 animals shown:
[,1]
AYT-O-0747 -0.2577357
PYT-S-4057 0.2294913
PYT-S-5267 0.1271897
PYT-S-3457 0.6905762
PYT-S-5009 0.2131915
PYT-S-1578 0.3875762
..........................................................
gw1_2t <- gwas(gpred1_2t, t(Z_c))
gw_TL1 <- gpred1_2t$coefm[2]*gwa_2$ghat + gpred1_2t$coefm[3]*gwa_3$ghat + gwa_1$ghat cr1 <- cor(gwa_1$ghat, gw_TL1)
tibble(x= gwa_1[,1], y = gw_TL1) %>%
ggplot(., aes(x,y))+
geom_point(color = "gold")+
geom_abline(color = "firebrick")+
geom_label(x = -3.7, y = 4, label = paste0("r: ", round(cr1, 2)))+
labs(title = "Individual complete SNP effects with Transfer complete SNP Effects",
subtitle = "Group 1",
x = expression(hat(g)[Ind]),
y = expression(hat(g)[Transfer]))+
theme(plot.title = element_text(color = "firebrick"), plot.subtitle = element_text(color = "goldenrod4"), axis.title.x = element_text(colour = "firebrick"),axis.title.y = element_text(colour = "goldenrod4"))