setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015") # Set current working directory
#setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015") # Set current working directory
rm(list=ls()) # Remove all objects from work environment
library(synbreed)
## Loading required package: doBy
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Loading required package: abind
library(synbreedData)
library(regress)
let’s load mice dataset and impute
data(mice)
is.heter<-function(x){
substr(x,1,1)!=substr(x,3,3)
}
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
maf=0.05,nmiss=0.01,verbose=T)
## step 1 : 400 marker(s) removed with > 1 % missing values
## step 2 : Recoding alleles
## step 4 : 2148 marker(s) removed with maf < 0.05
## step 7 : Imputing of missing values
## step 7d : Random imputing of missing values
## step 8 : Recode alleles due to imputation
## step 9 : 0 marker(s) removed with maf < 0.05
## step 10 : No duplicated markers removed
## End : 9997 marker(s) remain after the check
##
## Summary of imputation
## total number of missing values : 20141
## number of random imputations : 20141
summary(miceCimp)
## object of class 'gpData'
## covar
## No. of individuals 2527
## phenotyped 2527
## genotyped 1940
## pheno
## No. of traits: 2
##
## weight growth.slope
## Min. :11.9 Min. :-0.08889
## 1st Qu.:17.8 1st Qu.: 0.04556
## Median :19.9 Median : 0.08024
## Mean :20.3 Mean : 0.08659
## 3rd Qu.:22.6 3rd Qu.: 0.12569
## Max. :30.2 Max. : 0.26408
## NA's :16 NA's :53
##
## geno
## No. of markers 9997
## genotypes 1 0 2
## frequencies 0.367 0.532 0.101
## NA's 0.000 %
## map
## No. of mapped markers 9997
## No. of chromosomes 20
##
## markers per chromosome
##
## 1 10 11 12 13 14 15 16 17 18 19 2 3 4 5 6 7 8
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479
## 9 X
## 527 6
##
## pedigree
## NULL
Let’s compute genomic relationship matrix, and check dimension of phenotypes and genotypes
Gmat<-kin(miceCimp,ret="realized")
plot(Gmat)
summary(Gmat)
## dimension 1940 x 1940
## rank 1939
## range of off-diagonal values -0.2796866 -- 1.219104
## mean off-diagonal values -0.0005252561
## range of diagonal values 0.812 -- 1.325
## mean diagonal values 1.018472
## number of unique values 1873911
dim(Gmat)
## [1] 1940 1940
dim(miceCimp$pheno)
## [1] 2527 2 1
phenotypes<-miceCimp$pheno[,1,1]
summary(phenotypes)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.9 17.8 19.9 20.3 22.6 30.2 16
length(phenotypes)
## [1] 2527
phenotypes<-phenotypes[!is.na(phenotypes)]
sum(rownames(Gmat)%in%names(phenotypes))
## [1] 1928
Use positional indexes
idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)
## [1] 1928 1928
idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))
## [1] 0
idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))
## [1] 0
Use positional indexes
idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)
## [1] 1928 1928
idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))
## [1] 0
idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))
## [1] 0
lmmg<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex,~Gpheno)
lmmg
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity)3+as.factor(cvg$CageDensity)4+as.factor(cvg$CageDensity)5+as.factor(cvg$CageDensity)6+as.factor(cvg$CageDensity)7+cvg$sexM
##
## Maximized log likelihood with kernel K is -2559.913
##
## Linear Coefficients:
## Estimate Std. Error
## (Intercept) 19.326 0.492
## as.factor(cvg$CageDensity)3 -0.958 0.525
## as.factor(cvg$CageDensity)4 -0.946 0.497
## as.factor(cvg$CageDensity)5 -1.193 0.501
## as.factor(cvg$CageDensity)6 -1.389 0.500
## as.factor(cvg$CageDensity)7 -2.353 0.584
## cvg$sexM 4.208 0.103
##
## Variance Coefficients:
## Estimate Std. Error
## Gpheno 3.182 0.351
## In 3.900 0.158
lmmg$sigma[1]/(sum(lmmg$sigma)) #estimate heritability
## Gpheno
## 0.4493161
EBVg<-BLUP(lmmg)
Compare GEBV to phenotypes and to corrected phenotypes
plot(EBVg$Mean,phenoG)
abline(0,1)
cor(phenoG,EBVg$Mean)
## [1] 0.5988551
corrected<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex)
Ycor<-phenoG-corrected$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)
cor(Ycor,EBVg$Mean)
## [,1]
## [1,] 0.7454928
Let’s set aside some observations to prediction and refit model.
#random vector of missing values
Nobs<-length(phenoG)
Nmis<-round(length(phenoG)*0.1) #10% missing
tomissing<-sample(1:Nobs,Nmis)
phenoGna<-phenoG[-tomissing]
lmmgna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing],
~Gpheno[-tomissing,-tomissing])
lmmgna
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity[-tomissing])3+as.factor(cvg$CageDensity[-tomissing])4+as.factor(cvg$CageDensity[-tomissing])5+as.factor(cvg$CageDensity[-tomissing])6+as.factor(cvg$CageDensity[-tomissing])7+cvg$sex[-tomissing]M
##
## Maximized log likelihood with kernel K is -2323.978
##
## Linear Coefficients:
## Estimate Std. Error
## (Intercept) 19.256 0.509
## as.factor(cvg$CageDensity[-tomissing])3 -0.876 0.544
## as.factor(cvg$CageDensity[-tomissing])4 -0.840 0.514
## as.factor(cvg$CageDensity[-tomissing])5 -1.165 0.519
## as.factor(cvg$CageDensity[-tomissing])6 -1.278 0.519
## as.factor(cvg$CageDensity[-tomissing])7 -2.237 0.604
## cvg$sex[-tomissing]M 4.164 0.109
##
## Variance Coefficients:
## Estimate Std. Error
## Gpheno[-tomissing, -tomissing] 3.223 0.371
## In 3.959 0.171
lmmgna$sigma[1]/(sum(lmmgna$sigma)) #estimate heritability
## Gpheno[-tomissing, -tomissing]
## 0.4487875
EBVgna<-BLUP(lmmgna)
plot(EBVgna$Mean,phenoGna)
abline(0,1)
cor(phenoGna,EBVgna$Mean)
## [1] 0.6108896
correctedna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing])
Ycorna<-phenoGna-correctedna$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)
cor(Ycorna,EBVgna$Mean)
## [,1]
## [1,] 0.7493676
EBV_norec<- Gpheno[tomissing,-tomissing]%*%solve(Gpheno[-tomissing,-tomissing])%*%EBVgna$Mean
head(sort(EBV_norec[,1],decreasing=T),10)
## A063867785 A084126609 A064096308 A084289027 A067106356 A053073028
## 3.547340 3.411871 2.425728 2.329499 2.317289 2.241341
## A063528111 A064094852 A084286885 A067043333
## 2.211431 2.066039 2.046795 1.912738
plot(EBV_norec,phenoG[tomissing])
cor(EBV_norec,phenoG[tomissing])
## [,1]
## [1,] 0.3761723
plot(EBV_norec,Ycor[tomissing])
abline(0,1)
cor(EBV_norec,Ycor[tomissing])
## [,1]
## [1,] 0.5311374