October 26, 2018
Example dataset from the SoyNAM package. We query two of the forty biparental families, both sharing the hub parent IA3023, grown in 18 environments.
Data = SoyNAM::BLUP(trait = 'yield', family = 2:3)
## solving BLUE of checks
## solving BLUP of phenotypes
## No redundant SNPs found
## There are 312 markers with MAF below the threshold
## Removing markers with more than 50% missing values
## Imputing with expectation (based on transition prob)
## removing repeated genotypes
## solving identity matrix
## indiviual 1 had 37 duplicate(s)
## indiviual 169 had 1 duplicate(s)
## indiviual 182 had 1 duplicate(s)
y = Data$Phen
M = Data$Gen
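A quick sanity check of what the query returned (Data$Fam holds the family labels used for coloring later):
length(y)       # number of recombinant inbred lines
dim(M)          # lines x SNPs
table(Data$Fam) # lines per family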
#
Z = apply(M,2,function(snp) snp-mean(snp)) # center each SNP at its mean (2p)
ZZ = tcrossprod(Z)                         # Z %*% t(Z)
Sum2pq = sum(apply(M,2,function(snp){p=mean(snp)/2; return(2*p*(1-p))})) # 2*sum(p*q)
G = ZZ/Sum2pq                              # genomic relationship matrix (GRM)
This is the kernel most commonly deployed, described in VanRaden (2008), where \(P\) holds the column means of \(M\) (twice the allele frequencies \(p_j\)):
\[G=\frac{(M-P)(M-P)'}{2\sum^{J}_{j=1}{p_j(1-p_j)}}\]
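As a cross-check, rrBLUP's A.mat implements the same VanRaden kernel but expects the {-1,0,1} coding, hence the M-1 shift below (a quick sketch, assuming rrBLUP is installed):
G2 = rrBLUP::A.mat(M - 1) # VanRaden G as computed by rrBLUP
cor(c(G), c(G2))          # agreement with the hand-rolled G should be near 1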
image(G[,241:1], main='GRM heatmap',xaxt='n',yaxt='n')
Spectral = eigen(G, symmetric = TRUE)
PCs = Spectral$vectors[,1:5]
plot(PCs, xlab='PC 1', ylab='PC 2', main='Population on eigen spaces', col=Data$Fam, pch=20)
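The eigenvalues tell how much of the relationship structure each axis captures; a quick look at the leading five:
round(Spectral$values[1:5]/sum(Spectral$values), 3) # variance share per PC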
GeneticDistance = Gdist(M, method=6) # Gdist() from the NAM package; method 6 = Modified Rogers' distance
## Modified Rogers' distance
Tree = hclust(GeneticDistance, method = 'ward.D2')
plot(Tree, labels = FALSE)
Clst = factor(cutree(Tree,k=2))
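Because the true family labels are available here, a cross-tabulation shows how well the two clusters recover the two biparental families:
table(Cluster = Clst, Family = Data$Fam)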
Marker = M[,117]
fit = lm( y ~ Marker )
anova( fit )
## Analysis of Variance Table
## 
## Response: y
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## Marker      1  476321  476321  20.172 1.102e-05 ***
## Residuals 239 5643504   23613                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-log(anova(fit)$`Pr(>F)`[1],base = 10)
## [1] 4.957736
reduced_model = lm( y ~ PCs )
full_model = lm( y ~ PCs + Marker )
anova( reduced_model, full_model )
## Analysis of Variance Table
## 
## Model 1: y ~ PCs
## Model 2: y ~ PCs + Marker
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    235 4060362                                  
## 2    234 3562067  1    498295 32.734 3.215e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-log((anova( reduced_model, full_model ))$`Pr(>F)`[2],base = 10)
## [1] 7.492813
reduced_model = lm( y ~ Clst )
full_model = lm( y ~ Clst + Marker )
anova( reduced_model, full_model )
## Analysis of Variance Table
## 
## Model 1: y ~ Clst
## Model 2: y ~ Clst + Marker
##   Res.Df     RSS Df Sum of Sq      F  Pr(>F)    
## 1    239 4275698                                
## 2    238 3652041  1    623657 40.643 9.4e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-log( anova(reduced_model,full_model)$`Pr(>F)`[2],base = 10)
## [1] 9.026884
Q = model.matrix(~Clst)
reduced_model = reml( y=y, X=Q, K=G ) # reml() from the NAM package
full_model = reml( y=y, X=cbind(Q, Marker), K=G )
LRT = -2*(full_model$loglik - reduced_model$loglik)
-log(pchisq(LRT, 1, lower.tail=FALSE), base=10)
## [1] 10.80903
reduced_model = lm( y ~ Clst )
glm_pval = apply(M,2,function(Marker){
pval = anova(reduced_model, lm(y~Clst+Marker) )$`Pr(>F)`[2]
return(-log(pval,base = 10))})
plot(glm_pval,pch=20,xlab='SNP',main='My first GLM GWAS')
NAM random model, with the marker effect nested within population: \(y=\mu+\text{Marker}\times\text{Pop}+Zu+e\)
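Before calling gwas3, a toy fixed-effect analogue of the marker-within-population term can make the idea concrete (a simplified sketch that omits the polygenic term \(Zu\), not the NAM implementation):
# One marker slope per population, using the cluster assignment as population
fit_nested = lm( y ~ Clst + Marker:Clst )
anova( fit_nested )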
fit_gwa = gwas3(y=y, gen=M, fam=c(Clst), chr=Data$Chrom)
## Calculating G matrix
## Solving polygenic model
## Starting Eigendecomposition
## Starting Marker Analysis
## Calculations were performed successfully
plot(fit_gwa, pch=20, main = "My first MLM GWAS")
require(rrBLUP, quietly = TRUE)
COL = fit_gwa$MAP[,1]%%2+1 # Color chromosomes
geno = data.frame(colnames(M), fit_gwa$MAP[,1:2], t(M-1), row.names=NULL)
pheno = data.frame(line=colnames(geno)[-c(1:3)], Pheno=y, Clst, row.names=NULL)
fit_another_gwa = GWAS(pheno, geno, fixed='Clst', plot=FALSE)
## [1] "GWAS for trait: Pheno" ## [1] "Variance components estimated. Testing markers."
mlm_pval = fit_another_gwa$Pheno
mlm_pval[mlm_pval==0] = NA # drop zero scores before comparing
plot(glm_pval, mlm_pval, xlab='GLM', ylab='MLM', main='Compare')
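A one-number summary of the agreement between the two scans (dropping the markers set to NA above):
cor(glm_pval, mlm_pval, use='complete.obs')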
nam_pval = fit_gwa$PolyTest$pval
par(mfrow=c(1,3))
qqman::qq(10^-glm_pval, main='GLM') # qq() expects raw p-values, so back-transform the -log10 scores
qqman::qq(10^-mlm_pval, main='MLM')
qqman::qq(10^-nam_pval, main='RLM') # assuming PolyTest$pval is also on the -log10 scale
Wikipedia: In statistics, the multiple comparisons, multiplicity or multiple testing problem occurs when one considers a set of statistical inferences simultaneously or infers a subset of parameters selected based on the observed values. In certain fields it is known as the look-elsewhere effect: the more inferences are made, the more likely erroneous inferences are to occur. Several statistical techniques have been developed to prevent this from happening, allowing significance levels for single and multiple comparisons to be directly compared. These techniques generally require a stricter significance threshold for individual comparisons, so as to compensate for the number of inferences being made.
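A quick simulation makes the point concrete: scanning many true nulls at \(\alpha=0.05\) still flags about 5% of them.
set.seed(42)
null_p = replicate(10000, t.test(rnorm(20))$p.value) # 10,000 tests, all null
mean(null_p < 0.05)                                  # ~0.05, i.e. ~500 false hits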
Base significance threshold: \(\alpha=0.05\)
plot(fit_gwa, alpha=0.05, main = "No correction")
Bonferroni: \(\alpha=0.05/m\)
plot(fit_gwa, alpha=0.05/ncol(M), main = "Bonferroni correction")
Benjamini-Hochberg FDR: \(\alpha=\frac{0.05}{m\times (1-FDR)}\)
plot(fit_gwa, alpha=0.05/(ncol(M)*.75), main = "FDR 25%")
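For comparison with this fixed-threshold shortcut, the textbook Benjamini-Hochberg procedure is available through p.adjust; applied to the GLM scan from earlier:
raw_p = 10^(-glm_pval)              # back to raw p-values
bh_p = p.adjust(raw_p, method='BH') # Benjamini-Hochberg adjustment
sum(bh_p < 0.25)                    # markers retained at FDR 25%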
Unique segments based on eigenvalues: \(m^* = \#\{j : d_j > 1\}\), the number of eigenvalues of \(G\) above 1
m_star = sum(Spectral$values > 1)
plot(fit_gwa, alpha=0.05/m_star, main="Bonferroni on unique segments")
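Comparing the nominal and effective numbers of tests shows how much the correction relaxes:
c(markers = ncol(M), effective_tests = m_star)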
fit_wgr = bWGR::BayesDpi(y=y, X=M, it=3000)
par(mfrow=c(1,2))
plot(fit_wgr$PVAL, col=COL, pch=20, ylab='-log(p)', main='GWA')
plot(fit_wgr$b, col=COL, pch=20, ylab='Marker effect', main='GWP')
plot(fit_wgr$hat,y,pch=20)
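The fit can be summarized as the correlation between predicted and observed values; note this is in-sample and therefore optimistic compared to cross-validation:
cor(fit_wgr$hat, y) # in-sample predictive correlation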
thr_bonf = -log(0.05/ncol(M), base=10) # Bonferroni threshold on the -log10 scale
thr_none = -log(0.05, base=10)         # uncorrected threshold
par(mfrow=c(1,2))
plot(fit_gwa, alpha=NULL, main="MLM", pch=20); abline(h=thr_bonf, col=3)
plot(fit_wgr$PVAL, col=COL, ylab='-log(p)', main="WGR", pch=20); abline(h=thr_none, col=3)
fit_rf = ranger::ranger(y~., data=data.frame(y=y,M), importance='impurity')
plot(fit_rf$variable.importance, ylab='Importance', main='Random Forest', col=COL+7, pch=20)
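The top-ranked SNPs by impurity importance can be checked against the GWAS peaks above:
head(sort(fit_rf$variable.importance, decreasing=TRUE), 10)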