#setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015") # Set current working directory
setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015") # Set current working directory
rm(list=ls()) # Remove all objects from work environment
library(synbreed)
## Warning: package 'synbreed' was built under R version 3.0.3
## Loading required package: doBy
## Warning: package 'doBy' was built under R version 3.0.3
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Warning: package 'regress' was built under R version 3.0.3
## Loading required package: abind
library(synbreedData)
library(rrBLUP)
In the codeGeno call below, notice that label.heter is just "AB" because that is the only heterozygous genotype label. Also notice the "naive" (random) imputation.
data(cattle)
summary(cattle)
## object of class 'gpData'
## covar
## No. of individuals 1929
## phenotyped 500
## genotyped 500
## pheno
## No. of traits: 2
##
## Phenotype1 Phenotype2
## Min. :-50.07000 Min. :-424.43
## 1st Qu.:-10.25750 1st Qu.: -94.40
## Median : 0.13500 Median : -0.84
## Mean : -0.00104 Mean : 0.05
## 3rd Qu.: 10.35250 3rd Qu.: 96.99
## Max. : 49.41000 Max. : 422.63
##
## geno
## No. of markers 7250
## genotypes AA AB BB
## frequencies 0.2889283 0.3456006 0.3627126
## NA's 0.276 %
## map
## No. of mapped markers 7250
## No. of chromosomes 29
##
## markers per chromosome
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250
## 19 20 21 22 23 24 25 26 27 28 29
## 250 250 250 250 250 250 250 250 250 250 250
##
## pedigree
## Number of
## individuals 1929
## Par 1 376
## Par 2 1053
## generations 6
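Before recoding, we can verify the note above about "AB" being the only heterozygous label by tabulating the raw genotype calls (a quick sketch; the 100-marker subset is arbitrary):
#tabulate raw genotype labels on an arbitrary subset of markers
table(as.vector(cattle$geno[,1:100]),useNA="ifany")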
cattleC<-codeGeno(cattle,label.heter="AB",impute=TRUE,impute.type="random",nmiss=0.01,maf=0.05)
##
## Summary of imputation
## total number of missing values : 9145
## number of random imputations : 9145
#function to check whether there have been errors in recoding
check_recode<-function(i,M1,M2){ #takes a column name and two matrices
M<-as.matrix(table(M1[,i],M2[,i])) #frequency table of the column in the two matrices
ds<-sum(diag(M)) #sum of the diagonal
cds<-sum(diag(M[nrow(M):1,])) #sum of the counter-diagonal
ts<-sum(M) #sum of the whole matrix
return(ts==max(ds,cds)) #if recoding is 1:1, the total equals the diagonal or counter-diagonal sum
}
#applies to all the columns of the recoded matrix
eqs<-sapply(colnames(cattleC$geno),check_recode,cattle$geno,cattleC$geno)
summary(eqs) #all OK
## Mode TRUE NA's
## logical 6688 0
dim(cattleC$geno) #markers remaining after the MAF and missingness filters
## [1] 500 6688
G1<-kin(cattleC,ret="realized") #realized genomic relationship matrix (markers scaled by the summed expected variance)
dim(G1)
## [1] 500 500
#compute genotype matrix with -1,0,1 coding for rrBLUP
genot<-cattleC$geno-1
#system.time({G1b<-A.mat(genot)}) #run this to compare performance of single core
#A matrix from imputed genotypes
#try doing the same with unimputed genotypes, letting A.mat impute (see the sketch below)
#check imputation accuracy
system.time({G1b<-A.mat(genot,n.core=4)})
## user system elapsed
## 1.92 0.06 2.03
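Following the suggestion in the comments above, here is one possible sketch of an imputation-accuracy check: mask a fraction of the known genotypes, let A.mat impute them (mean imputation), and count how many are recovered. The 1% masking rate and the rounding rule are my assumptions, and the sketch assumes no markers get dropped by A.mat's filters (they already passed the MAF filter in codeGeno):
#hypothetical masking experiment to estimate imputation accuracy
set.seed(1)
Gmask<-genot
idx<-sample(length(Gmask),round(0.01*length(Gmask))) #mask 1% of the calls
Gmask[idx]<-NA
out<-A.mat(Gmask,impute.method="mean",return.imputed=TRUE)
mean(round(out$imputed[idx])==genot[idx]) #proportion of masked calls recovered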
#comparison of the two relationship matrices
plot(diag(G1),diag(G1b))
abline(0,1,col="red")
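To demystify what kin(...,ret="realized") and A.mat compute, here is a minimal sketch of VanRaden's (2008) genomic relationship matrix built directly from the -1/0/1 matrix; exact scaling conventions may differ slightly between implementations:
p<-colMeans(genot+1)/2 #allele frequencies from 0/1/2 counts
Z<-sweep(genot,2,2*(p-0.5)) #center each marker at 2p-1
Gman<-tcrossprod(Z)/(2*sum(p*(1-p))) #G = ZZ'/(2*sum(p(1-p)))
plot(diag(G1b),diag(Gman)); abline(0,1,col="red") #should fall near the 1:1 line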
The following analysis, an eigendecomposition of G, is useful for exploring population structure.
eigenan<-eigen(G1b)
names(eigenan) #values and vectors (loadings)
## [1] "values" "vectors"
#eigenvalues give the proportion of variance explained by each component
eigenvalues<-eigenan$values
barplot(eigenvalues/sum(eigenvalues),ylab="proportion of var. explained")
summary(eigenvalues/sum(eigenvalues))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0007489 0.0013090 0.0020000 0.0024770 0.0192500
head(eigenvalues/sum(eigenvalues))
## [1] 0.01925430 0.01394084 0.01343662 0.01252472 0.01085000 0.01004555
#principal component plots (an MDS alternative is sketched below)
pairs(eigenan$vectors[,1:4])
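As suggested in the comment above, an MDS view can be obtained from a distance derived from the relationship matrix; converting G to a distance this way is one common heuristic, not the only option:
D<-max(G1b)-G1b #heuristic conversion of similarities into distances
mds<-cmdscale(as.dist(D),k=2) #classical multidimensional scaling
plot(mds,xlab="MDS 1",ylab="MDS 2",pch=19,cex=0.5)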
The conclusion is that there is very little, if any, population substructure in this population.
Notice the small proportion of variation explained by the first principal components.
Moreover, there is little need to include PCs as fixed effects if we already fit the relationship matrix (see paper in D2L).
### Preparing the data for rrBLUP input
dim(genot)
## [1] 500 6688
#check for order of genotypes and maps
sum(colnames(genot)!=rownames(cattleC$map))
## [1] 0
#build genotype matrix for rrBLUP
genotas<-cbind(1:ncol(genot),cattleC$map[1],cattleC$map[2],t(genot))
dim(t(genot))
## [1] 6688 500
colnames(genotas)[1]<-"marker"
genotas[1:10,1:5]
## marker chr pos ID11430 ID11431
## SNP_1 1 1 0.000001 0 -1
## SNP_2 2 1 0.202580 -1 0
## SNP_4 3 1 0.668932 -1 1
## SNP_5 4 1 0.763913 -1 -1
## SNP_6 5 1 0.828652 -1 -1
## SNP_7 6 1 1.002623 -1 0
## SNP_8 7 1 1.079444 -1 0
## SNP_10 8 1 1.417374 0 -1
## SNP_11 9 1 1.546943 0 -1
## SNP_12 10 1 1.692001 -1 0
#prepare phenotype file
pheno<-cattleC$pheno[,,1]
pheno<-data.frame(rownames(pheno),pheno)
head(pheno)
## rownames.pheno. Phenotype1 Phenotype2
## ID11430 ID11430 -23.43 263.90
## ID11431 ID11431 15.48 182.22
## ID11432 ID11432 -19.18 6.64
## ID11433 ID11433 -10.43 42.55
## ID11434 ID11434 -14.07 326.41
## ID11435 ID11435 -6.62 -34.01
dim(pheno)
## [1] 500 3
#check phenotype and genotype order
sum(rownames(pheno)!=rownames(genot))
## [1] 0
#run GWA
system.time({
ans1<-GWAS(pheno=pheno,geno=genotas,
P3D=TRUE,K=G1b,
n.core=1,plot=F)})
## [1] "GWAS for trait: Phenotype1"
## [1] "Variance components estimated. Testing markers."
## [1] "GWAS for trait: Phenotype2"
## [1] "Variance components estimated. Testing markers."
## user system elapsed
## 170.01 7.22 180.58
head(ans1)
## marker chr pos Phenotype1 Phenotype2
## SNP_1 1 1 0.000001 1.3222535 0.217297287
## SNP_2 2 1 0.202580 0.4809035 0.146944675
## SNP_4 3 1 0.668932 0.1847823 0.004335523
## SNP_5 4 1 0.763913 0.3751974 0.899686291
## SNP_6 5 1 0.828652 0.3212045 0.391514571
## SNP_7 6 1 1.002623 0.6713966 0.687508889
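As an aside connecting back to the PC discussion: if we did want principal components as fixed effects, rrBLUP's GWAS can include them directly through its n.PC argument. A sketch for comparison only, since the relationship matrix already accounts for structure here:
#hypothetical comparison run: include the first two PCs as fixed effects
#ans.pc<-GWAS(pheno=pheno,geno=genotas,P3D=TRUE,K=G1b,n.PC=2,n.core=1,plot=F)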
Note that GWAS reports scores on the -log10(p) scale, so we first convert them back to p-values. Is there any really significant association?
pv1<-10^-ans1[,4]
pv2<-10^-ans1[,5]
hist(pv1)
summary(pv1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0006736 0.2427000 0.4901000 0.4943000 0.7457000 0.9999000
quantile(pv1,c(0.001,0.005,0.01))
## 0.1% 0.5% 1%
## 0.001933584 0.005137266 0.010873387
qqplot(-log(runif(100000),10),ans1[,4],
xlab="expected quantile -log10(pv)",
ylab="observed quantile -log10(pv)",
pch=19,cex=0.1)
abline(0,1,col="red")
hist(pv2)
summary(pv2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000151 0.2469000 0.4946000 0.4979000 0.7490000 1.0000000
quantile(pv2,c(0.001,0.005,0.01))
## 0.1% 0.5% 1%
## 0.001707315 0.006113482 0.009361335
qqplot(-log(runif(100000),10),ans1[,5],
xlab="expected quantile -log10(pv)",
ylab="observed quantile -log10(pv)",
pch=19,cex=0.1)
abline(0,1,col="red")
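Before drawing conclusions, a quick benchmark against the simplest multiple-testing correction (a sketch; Bonferroni is conservative, and FDR via p.adjust(...,method="BH") is a common, less stringent alternative):
m<-nrow(ans1) #number of tested markers (6688)
0.05/m #Bonferroni threshold, about 7.5e-06
min(pv1); min(pv2) #smallest observed p-values
min(p.adjust(pv2,method="bonferroni")) #about 0.10, still above 0.05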
The conclusion is that no significant association is detected for either trait.
This raises a very important question: how small does a p-value need to be to be declared significant when we perform thousands of tests?
Let's create a function to plot p-values and then use it in a simulation.
Notice that this function takes a vector of p-values and generates a log-uniform Q-Q plot.
It can either create a new plot or "add the points" to an existing one.
It also passes any graphical options through to the plot or points functions.
qqgplot = function(pvector, main=NULL, add=F,...) {
o = -log10(sort(pvector,decreasing=F)) #observed quantiles
e = -log10( 1:length(o)/length(o) ) #expected quantiles under a uniform null
if (!add){
plot(e,o, main=main, ...,
xlab=expression(Expected~~-log[10](italic(p))),
ylab=expression(Observed~~-log[10](italic(p))),
xlim=c(0,max(e)), ylim=c(0,max(o)+1))
} else {
points(e,o,...) #add points to an existing plot
}
lines(e,e,col="red") #reference line: observed = expected
}
pvs<-matrix(0,1000,500)
for (i in 1:500){
pvs[,i]<-sort(runif(1000)) #uniform p-values under the null
#qqplot: open a new plot on the first iteration (add=F), then overlay (add=T)
qqgplot(pvs[,i],cex=.3,pch=19,add=c(T,F)[(i==1)+1],type="l",col="lightgrey")
}
#compute the median of each order statistic across the 500 simulations
qs<-apply(pvs,1,quantile,probs=0.5)
#add to qqplot
qqgplot(qs,cex=.3,pch=19,add=T,type="l",lty="dashed",lwd=3)
#compute the 5% and 95% quantiles of each order statistic (a pointwise 90% band)
qs<-apply(pvs,1,quantile,probs=c(0.05,.95))
#compute minimum and maximum of each quantile
qmin<-apply(pvs,1,min)
qmax<-apply(pvs,1,max)
#add them to qqplot
qqgplot(qmin,cex=.3,pch=19,add=T,type="l",lty="dashed",lwd=1)
qqgplot(qmax,cex=.3,pch=19,add=T,type="l",lty="dashed",lwd=1)
#prepare polygon coordinates for the confidence band
x1<- -log10( 1:length(qs[1,])/length(qs[1,]))
x2<- -log10( 1:length(qs[2,])/length(qs[2,]))
X.Vec <- c(x1, tail(x1, 1), rev(x2), x2[1])
Y.Vec <- c(-log(qs[1,],10), -log(tail(qs[1,],1), 10), -log(rev(qs[2,]),10), -log(qs[2,1],10))
#add the pointwise 90% confidence band for each quantile
polygon(X.Vec, Y.Vec, border = NA,col=rgb(0.33,.33,0.33,.3))
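To connect the simulated band back to the GWAS results, the observed p-values can be overlaid on it (a sketch; strictly, the band should be re-simulated with runif(nrow(ans1)) draws per replicate to match the number of tests, and points beyond the original x-range will be clipped):
#overlay the observed Phenotype2 p-values on the simulated null band
qqgplot(pv2,cex=.3,pch=19,add=T,col="blue")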