Set up files

#setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015")    # Set current working directory
setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015")    # Set current working directory
rm(list=ls())        # Remove all objects from work environment

library(synbreed)
## Warning: package 'synbreed' was built under R version 3.0.3
## Loading required package: doBy
## Warning: package 'doBy' was built under R version 3.0.3
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Warning: package 'regress' was built under R version 3.0.3
## Loading required package: abind
library(synbreedData)
library(rrBLUP)

Load cattle dataset and impute it

Notice that the label.heter argument is just “AB” because that is the only heterozygous genotype in this dataset. Also notice the “naive” (random) imputation.

data(cattle)
summary(cattle)
## object of class 'gpData' 
## covar 
##   No. of individuals 1929 
##           phenotyped 500 
##            genotyped 500 
## pheno 
##   No. of traits:        2 
## 
##    Phenotype1          Phenotype2     
##  Min.   :-50.07000   Min.   :-424.43  
##  1st Qu.:-10.25750   1st Qu.: -94.40  
##  Median :  0.13500   Median :  -0.84  
##  Mean   : -0.00104   Mean   :   0.05  
##  3rd Qu.: 10.35250   3rd Qu.:  96.99  
##  Max.   : 49.41000   Max.   : 422.63  
## 
## geno 
##   No. of markers 7250 
##   genotypes AA AB BB 
##   frequencies 0.2889283 0.3456006 0.3627126 
##   NA's 0.276 %
## map 
##   No. of mapped markers  7250 
##   No. of chromosomes     29 
## 
##   markers per chromosome 
##  
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 
##  19  20  21  22  23  24  25  26  27  28  29 
## 250 250 250 250 250 250 250 250 250 250 250 
## 
## pedigree 
## Number of 
##   individuals  1929 
##   Par 1        376 
##   Par 2        1053 
##   generations  6
cattleC<-codeGeno(cattle, label.heter = "AB",impute=T,impute.type="random",nmiss=0.01,maf=0.05)
## 
##      Summary of imputation 
##     total number of missing values                : 9145 
##     number of random imputations                  : 9145
#function to check whether any errors occurred during recoding
check_recode<-function(i,M1,M2){ #takes a column name and two genotype matrices
  M<-as.matrix(table(M1[,i],M2[,i])) # cross-tabulates that column of the two matrices
  ds<-sum(diag(M))                   #sum of the diagonal
  cds<-sum(diag(M[nrow(M):1,]))      #sum of the counter-diagonal
  ts<-sum(M)                        #sum of the whole table
  return(ts==max(ds,cds))            #if the recoding is 1:1, the total equals the diagonal or counter-diagonal sum
}

#applies to all the columns of the recoded matrix
eqs<-sapply(colnames(cattleC$geno),check_recode,cattle$geno,cattleC$geno)
summary(eqs) #all OK
##    Mode    TRUE    NA's 
## logical    6688       0
dim(cattleC$geno) #fewer markers than the original 7250 because of the nmiss/maf filters
## [1]  500 6688
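The recoded matrix has fewer markers than the original 7250. A quick check of how many were removed by the nmiss/maf filters (a sketch, assuming cattle$geno still holds the full marker set):

ncol(cattle$geno)-ncol(cattleC$geno)                         #number of markers dropped by codeGeno
head(setdiff(colnames(cattle$geno),colnames(cattleC$geno)))  #names of some of the dropped markers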

Compute relationship matrix with synbreed

G1<-kin(cattleC,ret="realized") # realized relationship matrix: markers centered and divided by the sum of expected variances
dim(G1)
## [1] 500 500
#compute genotype matrix coded as -1,0,1 for rrBLUP
genot<-cattleC$geno-1

#system.time({G1b<-A.mat(genot)}) #run this to compare single-core performance
#A matrix from imputed genotypes
#exercise: try coding the genotypes without imputing and letting A.mat do the imputation instead,
#then check the imputation accuracy
system.time({G1b<-A.mat(genot,n.core=4)})
##    user  system elapsed 
##    1.92    0.06    2.03
#comparison of the two relationship matrices
plot(diag(G1),diag(G1b))
abline(0,1,col="red")
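The comment on kin() above indicates VanRaden-style scaling (centering by allele frequencies and dividing by the sum of expected marker variances). As a cross-check, here is a minimal sketch of the same matrix built by hand, assuming cattleC$geno is coded 0/1/2 and that A.mat() estimates allele frequencies from the data:

p<-colMeans(cattleC$geno)/2            #allele frequency of the coded allele at each marker
Z<-sweep(cattleC$geno,2,2*p)           #center every marker column by twice its allele frequency
Gman<-tcrossprod(Z)/(2*sum(p*(1-p)))   #ZZ' divided by the sum of expected marker variances
range(Gman-G1b)                        #should be close to zero if the scaling assumptions hold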

Eigenvalue analysis

This analysis is useful for exploring population structure.

eigenan<-eigen(G1b)
names(eigenan) #values and vectors (loadings)
## [1] "values"  "vectors"
#values used to see % of variance explained by each component
eigenvalues<-eigenan$values
barplot(eigenvalues/sum(eigenvalues),ylab="% var. explained")

summary(eigenvalues/sum(eigenvalues))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0007489 0.0013090 0.0020000 0.0024770 0.0192500
head(eigenvalues/sum(eigenvalues))
## [1] 0.01925430 0.01394084 0.01343662 0.01252472 0.01085000 0.01004555
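The cumulative share of variance captured by the leading components can also be checked directly (a quick check using the eigenvalues computed above):

cumsum(eigenvalues/sum(eigenvalues))[c(1,5,10)] #variance captured by the first 1, 5 and 10 components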
#principal component plots (try MDS as well; a sketch follows below)
pairs(eigenan$vectors[,1:4])
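As suggested in the comment above, MDS gives a complementary view. A minimal sketch, assuming it is acceptable to derive a crude dissimilarity directly from the realized relationship matrix:

Dmat<-as.dist(max(G1b)-G1b)   #crude dissimilarity: closely related animals get small distances
mdsc<-cmdscale(Dmat,k=2)      #classical multidimensional scaling to two dimensions
plot(mdsc,xlab="MDS 1",ylab="MDS 2",pch=19,cex=0.5)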

The conclusion is that there is very little, if any, population substructure in this population.
Notice the small % of variation explained by the first principal components.
Moreover, there is little need to use PCs as fixed effects if we include the relationship matrix (see the paper on D2L).
Preparing the data for rrBLUP input

dim(genot)
## [1]  500 6688
#check for order of genotypes and maps
sum(colnames(genot)!=rownames(cattleC$map))
## [1] 0
#build genotype matrix for rrBLUP
genotas<-cbind(1:ncol(genot),cattleC$map[1],cattleC$map[2],t(genot))
dim(t(genot))
## [1] 6688  500
colnames(genotas)[1]<-"marker"
genotas[1:10,1:5]
##        marker chr      pos ID11430 ID11431
## SNP_1       1   1 0.000001       0      -1
## SNP_2       2   1 0.202580      -1       0
## SNP_4       3   1 0.668932      -1       1
## SNP_5       4   1 0.763913      -1      -1
## SNP_6       5   1 0.828652      -1      -1
## SNP_7       6   1 1.002623      -1       0
## SNP_8       7   1 1.079444      -1       0
## SNP_10      8   1 1.417374       0      -1
## SNP_11      9   1 1.546943       0      -1
## SNP_12     10   1 1.692001      -1       0
#prepare phenotype file
pheno<-cattleC$pheno[,,1]
pheno<-data.frame(rownames(pheno),pheno)
head(pheno)
##         rownames.pheno. Phenotype1 Phenotype2
## ID11430         ID11430     -23.43     263.90
## ID11431         ID11431      15.48     182.22
## ID11432         ID11432     -19.18       6.64
## ID11433         ID11433     -10.43      42.55
## ID11434         ID11434     -14.07     326.41
## ID11435         ID11435      -6.62     -34.01
dim(pheno)
## [1] 500   3
#check phenotype and genotype order
sum(rownames(pheno)!=rownames(genot))
## [1] 0
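Both checks return 0, so no reordering is needed here. If they did not, a minimal sketch (kept commented out) for aligning rows by name would be:

#pheno<-pheno[rownames(genot),]              #align phenotype rows to the genotype row order
#genotas<-genotas[rownames(cattleC$map),]    #align marker rows to the map order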

Run GWA for two traits

#run GWA
system.time({
  ans1<-GWAS(pheno=pheno,geno=genotas,
             P3D=TRUE,K=G1b,
             n.core=1,plot=F)})
## [1] "GWAS for trait: Phenotype1"
## [1] "Variance components estimated. Testing markers."
## [1] "GWAS for trait: Phenotype2"
## [1] "Variance components estimated. Testing markers."
##    user  system elapsed 
##  170.01    7.22  180.58
head(ans1)
##       marker chr      pos Phenotype1  Phenotype2
## SNP_1      1   1 0.000001  1.3222535 0.217297287
## SNP_2      2   1 0.202580  0.4809035 0.146944675
## SNP_4      3   1 0.668932  0.1847823 0.004335523
## SNP_5      4   1 0.763913  0.3751974 0.899686291
## SNP_6      5   1 0.828652  0.3212045 0.391514571
## SNP_7      6   1 1.002623  0.6713966 0.687508889
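Because the GWAS() call above used plot=F, no Manhattan plots were produced. A minimal sketch of one for Phenotype1, using the fact that GWAS() reports scores on the -log10(p) scale:

plot(ans1[,4],col=ans1[,2]%%2+1,pch=19,cex=0.3,
     xlab="marker index",ylab=expression(-log[10](italic(p))),main="Phenotype1")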

Exploratory analysis of results

Is there any really significant association?

pv1<-10^-ans1[,4] #GWAS() reports -log10(p), so convert back to p-values
pv2<-10^-ans1[,5]
hist(pv1)

summary(pv1)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0006736 0.2427000 0.4901000 0.4943000 0.7457000 0.9999000
quantile(pv1,c(0.001,0.005,0.01))
##        0.1%        0.5%          1% 
## 0.001933584 0.005137266 0.010873387
qqplot(-log(runif(100000),10),ans1[,4],
       xlab="expected quantile -log10(pv)",
       ylab="observed quantile -log10(pv)",
       pch=19,cex=0.1)
abline(0,1,col="red")

hist(pv2)

summary(pv2)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000151 0.2469000 0.4946000 0.4979000 0.7490000 1.0000000
quantile(pv2,c(0.001,0.005,0.01))
##        0.1%        0.5%          1% 
## 0.001707315 0.006113482 0.009361335
qqplot(-log(runif(100000),10),ans1[,5],
       xlab="expected quantile -log10(pv)",
       ylab="observed quantile -log10(pv)",
       pch=19,cex=0.1)
abline(0,1,col="red")

The conclusion is that no significant association is detected for either of these traits.
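A single-number check that supports this conclusion is the genomic inflation factor; a minimal sketch, where values near 1 indicate well-calibrated test statistics:

lambda1<-median(qchisq(1-pv1,df=1))/qchisq(0.5,df=1)   #Phenotype1
lambda2<-median(qchisq(1-pv2,df=1))/qchisq(0.5,df=1)   #Phenotype2
c(lambda1,lambda2)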

Assessing significance with thousands of tests

This is a very important question: how small does a p-value need to be to be called significant when we perform thousands of tests?
Let’s create a function to plot p-values and then use it with a simulation.
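Before simulating, two standard reference answers can be computed from the p-values above (a quick sketch): the Bonferroni cut-off for this many tests and the smallest FDR-adjusted p-values.

0.05/length(pv1)                   #Bonferroni threshold for ~6700 tests
min(p.adjust(pv1,method="BH"))     #smallest FDR-adjusted p-value for Phenotype1
min(p.adjust(pv2,method="BH"))     #smallest FDR-adjusted p-value for Phenotype2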

A function to build Q-Q plots of p-values

Notice that this function takes a vector of p-values and generates a log-uniform Q-Q plot.
It can either create a new plot or add the points to an existing one.
It also passes any extra graphical options on to the plot or points functions.

qqgplot = function(pvector, main=NULL, add=F,...) {
  o = -log10(sort(pvector,decreasing=F))   # observed quantiles
  e = -log10( 1:length(o)/length(o) )      # expected quantiles under a uniform distribution
  if (!add){
    plot(e,o, main=main, ...,
         xlab=expression(Expected~~-log[10](italic(p))),
         ylab=expression(Observed~~-log[10](italic(p))),
         xlim=c(0,max(e)), ylim=c(0,max(o)+1))
  } else {
    points(e,o,...)                        # add points to an existing plot
  }
  lines(e,e,col="red")                     # reference line: observed = expected
}
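As a usage example, the function can be applied directly to the GWAS p-values computed earlier:

qqgplot(pv1,main="Phenotype1",pch=19,cex=0.3)
qqgplot(pv2,add=T,pch=19,cex=0.3,col="blue")   #overlay the second trait on the same plot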

Let’s simulate and plot several sets of p-values

pvs<-matrix(0,1000,500)

for (i in 1:500){
  pvs[,i]<-sort(runif(1000)) #uniform p-values under the null
  #qqplot: open a new plot on the first iteration, then add to it
  qqgplot(pvs[,i],cex=.3,pch=19,add=c(T,F)[(i==1)+1],type="l",col="lightgrey")  
}

#compute median across each quantile
qs<-apply(pvs,1,quantile,probs=0.5)
#add to qqplot
qqgplot(qs,cex=.3,pch=19,add=T,type="l",lty="dashed",lwd=3)
#compute the 5th and 95th percentiles of each quantile (a 90% band)
qs<-apply(pvs,1,quantile,probs=c(0.05,.95))

#compute minimum and maximum of each quantile
qmin<-apply(pvs,1,min)
qmax<-apply(pvs,1,max)
#add them to qqplot
qqgplot(qmin,cex=.3,pch=19,add=T,type="l",lty="dashed",lwd=1)
qqgplot(qmax,cex=.3,pch=19,add=T,type="l",lty="dashed",lwd=1)

#prepare polygon with the band limits
x1<- -log10( 1:length(qs[1,])/length(qs[1,])) 
x2<- -log10( 1:length(qs[2,])/length(qs[2,]))
X.Vec <- c(x1, tail(x1, 1), rev(x2), x2[1])
Y.Vec <- c(-log(qs[1,],10), -log(tail(qs[1,],1), 10), -log(rev(qs[2,]),10), -log(qs[2,1],10))
#add the 90% band for each quantile 
polygon(X.Vec, Y.Vec, border = NA,col=rgb(0.33,.33,0.33,.3))
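The same simulation also gives a rough empirical answer to the question posed at the start of this section: each column of pvs is sorted, so its first row holds the minimum p-value of every simulated scan, and the 5th percentile of those minima is an empirical genome-wide threshold for 1000 tests.

quantile(pvs[1,],0.05)   #only about 5% of null scans of 1000 tests yield a smaller minimum p-value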