Set up files

setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2017")    # Set current working directory
rm(list=ls())        # Remove all objects from work environment

library(synbreed)
library(synbreedData)
library(gwaR)
library(regress)

Load cattle dataset and impute it

Notice that the label.heter function is just “AB” because that is the only heterozygote genotype Also notice the “naive” imputation.

data(cattle)
summary(cattle)

## object of class 'gpData' 
## covar 
##   No. of individuals 1929 
##           phenotyped 500 
##            genotyped 500 
## pheno 
##   No. of traits:        2 
## 
##    Phenotype1          Phenotype2     
##  Min.   :-50.07000   Min.   :-424.43  
##  1st Qu.:-10.25750   1st Qu.: -94.40  
##  Median :  0.13500   Median :  -0.84  
##  Mean   : -0.00104   Mean   :   0.05  
##  3rd Qu.: 10.35250   3rd Qu.:  96.99  
##  Max.   : 49.41000   Max.   : 422.63  
## 
## geno 
##   No. of markers 7250 
##   genotypes AA AB BB 
##   frequencies 0.2889283 0.3456006 0.3627126 
##   NA's 0.276 %
## map 
##   No. of mapped markers  7250 
##   No. of chromosomes     29 
## 
##   markers per chromosome 
##  
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 
##  19  20  21  22  23  24  25  26  27  28  29 
## 250 250 250 250 250 250 250 250 250 250 250 
## 
## pedigree 
## Number of 
##   individuals  1929 
##   Par 1        376 
##   Par 2        1053 
##   generations  6

cattleC<-codeGeno(cattle, label.heter = "AB",impute=T,impute.type="random",nmiss=0.01,maf=0.05)

## 
##      Summary of imputation 
##     total number of missing values                : 9145 
##     number of random imputations                  : 9145

#function to check if there has been errors in recoding
check_recode<-function(i,M1,M2){ #gets two matrices and a column name
  M<-as.matrix(table(M1[,i],M2[,i])) # creates a frequency table of the column in two matrices
  ds<-sum(diag(M))                   #sums diagonal
  cds<-sum(diag(M[nrow(M):1,]))      #sums counter diagonal
  ts<-sum(M)                        #sums all matrix
  return(ts==max(ds,cds))            #if recoding is 1:1 total should equal sum of diagonal or counter diagonal
}

#applies to all the columns of the recoded matrix
eqs<-sapply(colnames(cattleC$geno),check_recode,cattle$geno,cattleC$geno)
summary(eqs) #all OK

##    Mode    TRUE    NA's 
## logical    6688       0

dim(cattleC$geno) #all OK

## [1]  500 6688

compute relationship matrix with synbreed

G1<-kin(cattleC,ret="realized") # divides all columns by sum of expected variances
dim(G1)

## [1] 500 500

#computegenotype matrix as -1,0,1 code for rrBLUP
genot<-cattleC$geno-1

eigenvalue analysis

This analysis is useful to explore population structure

eigenan<-eigen(G1)
names(eigenan) #values and vectors (loadings)

## [1] "values"  "vectors"

#values used to see % of variance explained by each component
eigenvalues<-eigenan$values
barplot(eigenvalues/sum(eigenvalues),ylab="% var. explained")

summary(eigenvalues/sum(eigenvalues))

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0007503 0.0013080 0.0020000 0.0024770 0.0192600

head(eigenvalues/sum(eigenvalues))

## [1] 0.01926397 0.01394292 0.01342923 0.01251122 0.01085052 0.01005249

#principal component plots (try MDS as well)
pairs(eigenan$vectors[,1:4])

The conclussion is that there is very little, if any, population substructure in this population.
Notice the small % of variation explained by the first principal components.
Plus… there is very little need of using PC as fixed effects if we include the relationship matrix (see paper in D2L)

Run Gblup and GWA for two one trait

#run GWA
system.time({
  gb<-gb<-gblup(rsp="Phenotype1",data=cattleC,
          design=c(y~1),G=G1,pos=c(T,T))
})

##    user  system elapsed 
##    1.73    0.06    1.86

print(gb)

## gblup analysis of trait: Phenotype1 
## 
## fixed effects equation:
## y ~ 1
## 
## random effects equation:
## ~G + In
## 
## log-likelihood: -1613.968 converged in: 6 iterations 
## 
## estimated fixed effects:
##             Estimate  StdError      test.st   p.value
## (Intercept) -0.00104 0.6068364 -0.001713806 0.9986326
## 
## estimated variance components:
##     Estimate StdError prop.var
## G   58.12153 22.20662 0.239927
## In 184.12519 21.84083 0.760073

dim(genot)

## [1]  500 6688

colnames(genot)[1:10]

##  [1] "SNP_1"  "SNP_2"  "SNP_4"  "SNP_5"  "SNP_6"  "SNP_7"  "SNP_8" 
##  [8] "SNP_10" "SNP_11" "SNP_12"

rownames(genot)[1:10]

##  [1] "ID11430" "ID11431" "ID11432" "ID11433" "ID11434" "ID11435" "ID11436"
##  [8] "ID11437" "ID11438" "ID11439"

system.time({
gw<-gwas(gblup = gb,x = t(genot))
})

##    user  system elapsed 
##    1.76    0.05    1.86

head(gw)

##            ghat  var_ghat
## SNP_1 -61.70659  974.6122
## SNP_2 -50.31240 2681.5229
## SNP_4 -21.75763 2453.4327
## SNP_5  36.63304 2112.3916
## SNP_6 -19.83039  767.1457
## SNP_7  33.88330  734.9028

pv1<-getpvalue(gw)
head(cbind(gw,pv1))

##            ghat  var_ghat       pv1
## SNP_1 -61.70659  974.6122 1.3179587
## SNP_2 -50.31240 2681.5229 0.4798399
## SNP_4 -21.75763 2453.4327 0.1801462
## SNP_5  36.63304 2112.3916 0.3711803
## SNP_6 -19.83039  767.1457 0.3242102
## SNP_7  33.88330  734.9028 0.6750166

Exploratory analysis of results

Is there any really significant association?

hist(10^-pv1)

summary(10^-pv1)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000673 0.241800 0.488100 0.493900 0.744900 1.000000

quantile(10^-pv1,c(0.001,0.005,0.01))

##        0.1%        0.5%          1% 
## 0.002113294 0.005141932 0.011009730

qqgplot(10^-pv1)

manhattan_plot(10^-pv1,map=cattleC$map,threshold=1E-6)