setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2017") # Set current working directory
rm(list=ls()) # Remove all objects from work environment
library(synbreed)
library(synbreedData)
library(gwaR)
library(regress)
Notice that the label.heter function is just “AB” because that is the only heterozygote genotype Also notice the “naive” imputation.
data(cattle)
summary(cattle)
## object of class 'gpData'
## covar
## No. of individuals 1929
## phenotyped 500
## genotyped 500
## pheno
## No. of traits: 2
##
## Phenotype1 Phenotype2
## Min. :-50.07000 Min. :-424.43
## 1st Qu.:-10.25750 1st Qu.: -94.40
## Median : 0.13500 Median : -0.84
## Mean : -0.00104 Mean : 0.05
## 3rd Qu.: 10.35250 3rd Qu.: 96.99
## Max. : 49.41000 Max. : 422.63
##
## geno
## No. of markers 7250
## genotypes AA AB BB
## frequencies 0.2889283 0.3456006 0.3627126
## NA's 0.276 %
## map
## No. of mapped markers 7250
## No. of chromosomes 29
##
## markers per chromosome
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250
## 19 20 21 22 23 24 25 26 27 28 29
## 250 250 250 250 250 250 250 250 250 250 250
##
## pedigree
## Number of
## individuals 1929
## Par 1 376
## Par 2 1053
## generations 6
cattleC<-codeGeno(cattle, label.heter = "AB",impute=T,impute.type="random",nmiss=0.01,maf=0.05)
##
## Summary of imputation
## total number of missing values : 9145
## number of random imputations : 9145
#function to check if there has been errors in recoding
check_recode<-function(i,M1,M2){ #gets two matrices and a column name
M<-as.matrix(table(M1[,i],M2[,i])) # creates a frequency table of the column in two matrices
ds<-sum(diag(M)) #sums diagonal
cds<-sum(diag(M[nrow(M):1,])) #sums counter diagonal
ts<-sum(M) #sums all matrix
return(ts==max(ds,cds)) #if recoding is 1:1 total should equal sum of diagonal or counter diagonal
}
#applies to all the columns of the recoded matrix
eqs<-sapply(colnames(cattleC$geno),check_recode,cattle$geno,cattleC$geno)
summary(eqs) #all OK
## Mode TRUE NA's
## logical 6688 0
dim(cattleC$geno) #all OK
## [1] 500 6688
G1<-kin(cattleC,ret="realized") # divides all columns by sum of expected variances
dim(G1)
## [1] 500 500
#computegenotype matrix as -1,0,1 code for rrBLUP
genot<-cattleC$geno-1
This analysis is useful to explore population structure
eigenan<-eigen(G1)
names(eigenan) #values and vectors (loadings)
## [1] "values" "vectors"
#values used to see % of variance explained by each component
eigenvalues<-eigenan$values
barplot(eigenvalues/sum(eigenvalues),ylab="% var. explained")
summary(eigenvalues/sum(eigenvalues))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0007503 0.0013080 0.0020000 0.0024770 0.0192600
head(eigenvalues/sum(eigenvalues))
## [1] 0.01926397 0.01394292 0.01342923 0.01251122 0.01085052 0.01005249
#principal component plots (try MDS as well)
pairs(eigenan$vectors[,1:4])
The conclussion is that there is very little, if any, population substructure in this population.
Notice the small % of variation explained by the first principal components.
Plus… there is very little need of using PC as fixed effects if we include the relationship matrix (see paper in D2L)
#run GWA
system.time({
gb<-gb<-gblup(rsp="Phenotype1",data=cattleC,
design=c(y~1),G=G1,pos=c(T,T))
})
## user system elapsed
## 1.73 0.06 1.86
print(gb)
## gblup analysis of trait: Phenotype1
##
## fixed effects equation:
## y ~ 1
##
## random effects equation:
## ~G + In
##
## log-likelihood: -1613.968 converged in: 6 iterations
##
## estimated fixed effects:
## Estimate StdError test.st p.value
## (Intercept) -0.00104 0.6068364 -0.001713806 0.9986326
##
## estimated variance components:
## Estimate StdError prop.var
## G 58.12153 22.20662 0.239927
## In 184.12519 21.84083 0.760073
dim(genot)
## [1] 500 6688
colnames(genot)[1:10]
## [1] "SNP_1" "SNP_2" "SNP_4" "SNP_5" "SNP_6" "SNP_7" "SNP_8"
## [8] "SNP_10" "SNP_11" "SNP_12"
rownames(genot)[1:10]
## [1] "ID11430" "ID11431" "ID11432" "ID11433" "ID11434" "ID11435" "ID11436"
## [8] "ID11437" "ID11438" "ID11439"
system.time({
gw<-gwas(gblup = gb,x = t(genot))
})
## user system elapsed
## 1.76 0.05 1.86
head(gw)
## ghat var_ghat
## SNP_1 -61.70659 974.6122
## SNP_2 -50.31240 2681.5229
## SNP_4 -21.75763 2453.4327
## SNP_5 36.63304 2112.3916
## SNP_6 -19.83039 767.1457
## SNP_7 33.88330 734.9028
pv1<-getpvalue(gw)
head(cbind(gw,pv1))
## ghat var_ghat pv1
## SNP_1 -61.70659 974.6122 1.3179587
## SNP_2 -50.31240 2681.5229 0.4798399
## SNP_4 -21.75763 2453.4327 0.1801462
## SNP_5 36.63304 2112.3916 0.3711803
## SNP_6 -19.83039 767.1457 0.3242102
## SNP_7 33.88330 734.9028 0.6750166
Is there any really significant association?
hist(10^-pv1)
summary(10^-pv1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000673 0.241800 0.488100 0.493900 0.744900 1.000000
quantile(10^-pv1,c(0.001,0.005,0.01))
## 0.1% 0.5% 1%
## 0.002113294 0.005141932 0.011009730
qqgplot(10^-pv1)
manhattan_plot(10^-pv1,map=cattleC$map,threshold=1E-6)
The conclussion is: No significant association is detected for this trait.